@slowdini/slow-powers-opencode 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -65
- package/bootstrap.md +1 -7
- package/opencode/plugins/slow-powers.js +1 -1
- package/package.json +14 -13
- package/skills/evaluating-skills/SKILL.md +91 -337
- package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
- package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
- package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
- package/skills/verifying-development-work/SKILL.md +17 -6
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
- package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
- package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +34 -2
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
- package/skills/evaluating-skills/harness-details/claude.md +0 -158
- package/skills/evaluating-skills/runner/README.md +0 -154
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -263
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -146
- package/skills/evaluating-skills/runner/aggregate.test.ts +0 -264
- package/skills/evaluating-skills/runner/aggregate.ts +0 -248
- package/skills/evaluating-skills/runner/context.test.ts +0 -181
- package/skills/evaluating-skills/runner/context.ts +0 -90
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -103
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -192
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
- package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
- package/skills/evaluating-skills/runner/grade.test.ts +0 -347
- package/skills/evaluating-skills/runner/grade.ts +0 -603
- package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
- package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
- package/skills/evaluating-skills/runner/guard/install.ts +0 -147
- package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -71
- package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
- package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -230
- package/skills/evaluating-skills/runner/promote-baseline.ts +0 -186
- package/skills/evaluating-skills/runner/run.test.ts +0 -1180
- package/skills/evaluating-skills/runner/run.ts +0 -1029
- package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -74
- package/skills/evaluating-skills/runner/types.ts +0 -112
- package/skills/evaluating-skills/runner/validate-all.ts +0 -54
- package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
- package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
- package/skills/evaluating-skills/runner/validate.test.ts +0 -56
- package/skills/evaluating-skills/runner/validate.ts +0 -21
- package/skills/evaluating-skills/schema/evals.schema.json +0 -105
- package/skills/evaluating-skills/schema/grading.schema.json +0 -84
- package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
- package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -68
- package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -67
- package/skills/evaluating-skills/templates/evals.json.example +0 -17
- package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
|
@@ -1,1029 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bun
|
|
2
|
-
import { randomBytes } from "node:crypto";
|
|
3
|
-
import {
|
|
4
|
-
cpSync,
|
|
5
|
-
existsSync,
|
|
6
|
-
mkdirSync,
|
|
7
|
-
mkdtempSync,
|
|
8
|
-
readdirSync,
|
|
9
|
-
readFileSync,
|
|
10
|
-
rmSync,
|
|
11
|
-
statSync,
|
|
12
|
-
writeFileSync,
|
|
13
|
-
} from "node:fs";
|
|
14
|
-
import { tmpdir } from "node:os";
|
|
15
|
-
import { basename, dirname, join } from "node:path";
|
|
16
|
-
import {
|
|
17
|
-
renderAvailableSkillsBlock,
|
|
18
|
-
renderPlanModeContext,
|
|
19
|
-
} from "./adapters/claude-code-session";
|
|
20
|
-
import { detectRunContext, type Harness, type RunContext } from "./context";
|
|
21
|
-
import { installGuard, teardownGuard } from "./guard/install";
|
|
22
|
-
import {
|
|
23
|
-
detectPluginShadows,
|
|
24
|
-
formatShadowBanner,
|
|
25
|
-
resolveConfigDir,
|
|
26
|
-
} from "./plugin-shadow";
|
|
27
|
-
import type {
|
|
28
|
-
AvailableSkill,
|
|
29
|
-
ConditionsRecord,
|
|
30
|
-
Eval,
|
|
31
|
-
EvalsConfig,
|
|
32
|
-
} from "./types";
|
|
33
|
-
import { validateEvalsConfig } from "./validate";
|
|
34
|
-
|
|
35
|
-
/**
 * Directory-name prefix given to staged copies of the skill under test.
 * `cleanupStagedSkills` removes every entry in `.claude/skills` whose name
 * starts with this prefix.
 */
export const STAGED_SKILL_PREFIX = "slow-powers-eval-";
/**
 * File name of the JSON manifest, kept inside `.claude/skills`, that records
 * which staged entries cleanup must remove or restore.
 */
export const STAGED_SIBLING_MANIFEST = ".slow-powers-eval-manifest.json";
|
|
37
|
-
|
|
38
|
-
/**
 * Writes `opts.content` as `SKILL.md` into a staged skill directory under
 * `<repoRoot>/.claude/skills/` (created if missing) and returns the name of
 * that directory.
 *
 * The directory name is `stageNameOverride` when given; otherwise it is
 * `<STAGED_SKILL_PREFIX><iteration>-<condition>__<skillName>`.
 */
export function stageSkillForCC(opts: {
  content: string;
  iteration: number;
  condition: string;
  skillName: string;
  repoRoot: string;
  /**
   * Verbatim directory name to stage under, replacing the conspicuous
   * `slow-powers-eval-…` slug. Used by `--stage-name` to A/B a natural name
   * against the eval-flagged one (issue #144 Step 2). The prefix scan in
   * `cleanupStagedSkills` does not match a custom name, so the caller must
   * also call `registerStagedSkillForCleanup` to have it removed on the next
   * run.
   */
  stageNameOverride?: string;
}): string {
  const { content, iteration, condition, skillName, repoRoot } = opts;

  const stagedName =
    opts.stageNameOverride ??
    `${STAGED_SKILL_PREFIX}${iteration}-${condition}__${skillName}`;

  const stagedDir = join(repoRoot, ".claude", "skills", stagedName);
  mkdirSync(stagedDir, { recursive: true });

  const skillFile = join(stagedDir, "SKILL.md");
  writeFileSync(skillFile, content);

  return stagedName;
}
|
|
61
|
-
|
|
62
|
-
/**
 * Records a custom-named staged skill directory (one created through
 * `stageNameOverride`) in the sibling manifest's `created_entries`, so that
 * the next `cleanupStagedSkills` call deletes it; the prefix scan alone only
 * matches `slow-powers-eval-…` names.
 *
 * When no manifest file exists yet, a new one is started with `name` as its
 * `staged_under_test`. The call is idempotent: if `name` is already listed,
 * nothing is written.
 */
export function registerStagedSkillForCleanup(
  repoRoot: string,
  name: string,
): void {
  const manifestFile = join(
    repoRoot,
    ".claude",
    "skills",
    STAGED_SIBLING_MANIFEST,
  );

  const manifest: SiblingManifest = existsSync(manifestFile)
    ? JSON.parse(readFileSync(manifestFile, "utf8"))
    : {
        created_at: new Date().toISOString(),
        staged_under_test: name,
        created_entries: [],
      };

  // Already recorded: leave the manifest on disk untouched.
  for (const recorded of manifest.created_entries) {
    if (recorded.name === name) return;
  }

  manifest.created_entries.push({ name, preexisting: false });
  writeFileSync(manifestFile, JSON.stringify(manifest, null, 2) + "\n");
}
|
|
88
|
-
|
|
89
|
-
/**
 * Shape of the JSON manifest stored in `.claude/skills` that lists what a run
 * staged there, so a later cleanup can undo it.
 */
type SiblingManifest = {
  /** ISO-8601 timestamp taken when the manifest object was created. */
  created_at: string;
  /** Name recorded as the skill under test for the run. */
  staged_under_test: string;
  /** One record per staged directory name. */
  created_entries: {
    /** Directory name inside `.claude/skills`. */
    name: string;
    /** True when a directory of that name already existed before staging. */
    preexisting: boolean;
    /** Where the pre-existing directory was copied to before being replaced. */
    backup_path?: string;
  }[];
};
|
|
98
|
-
|
|
99
|
-
/**
 * Copies every sibling skill into `<repoRoot>/.claude/skills` and writes a
 * manifest describing what was staged.
 *
 * A sibling is any directory in `skillsSourceDir`, other than
 * `skillUnderTest`, that contains a `SKILL.md`. Each sibling is copied without
 * its `evals` subdirectory. If the destination directory already exists it is
 * first copied to a fresh temp directory (recorded as `backup_path`) and then
 * removed, so cleanup can restore it later.
 *
 * @returns The manifest that was written to `STAGED_SIBLING_MANIFEST`.
 */
export function stageSiblingSkills(opts: {
  skillUnderTest: string;
  skillsSourceDir: string;
  repoRoot: string;
}): SiblingManifest {
  const { skillUnderTest, skillsSourceDir, repoRoot } = opts;

  const stagedSkillsDir = join(repoRoot, ".claude", "skills");
  mkdirSync(stagedSkillsDir, { recursive: true });

  // Collect sibling names up front, before anything is copied.
  const siblingNames: string[] = [];
  for (const candidate of readdirSync(skillsSourceDir)) {
    if (candidate === skillUnderTest) continue;
    const candidateDir = join(skillsSourceDir, candidate);
    if (!statSync(candidateDir).isDirectory()) continue;
    if (existsSync(join(candidateDir, "SKILL.md"))) siblingNames.push(candidate);
  }

  const manifest: SiblingManifest = {
    created_at: new Date().toISOString(),
    staged_under_test: skillUnderTest,
    created_entries: [],
  };

  for (const name of siblingNames) {
    const from = join(skillsSourceDir, name);
    const to = join(stagedSkillsDir, name);
    const excludedEvals = join(from, "evals");

    const record: SiblingManifest["created_entries"][number] = {
      name,
      preexisting: false,
    };

    if (existsSync(to)) {
      // Preserve whatever was already there so cleanup can put it back.
      const backupParent = mkdtempSync(
        join(tmpdir(), "slow-powers-eval-backup-"),
      );
      const backupDir = join(backupParent, name);
      record.preexisting = true;
      record.backup_path = backupDir;
      cpSync(to, backupDir, { recursive: true });
      rmSync(to, { recursive: true, force: true });
    }

    cpSync(from, to, {
      recursive: true,
      filter: (src) =>
        !(src === excludedEvals || src.startsWith(`${excludedEvals}/`)),
    });

    manifest.created_entries.push(record);
  }

  const manifestFile = join(stagedSkillsDir, STAGED_SIBLING_MANIFEST);
  writeFileSync(manifestFile, JSON.stringify(manifest, null, 2) + "\n");
  return manifest;
}
|
|
155
|
-
|
|
156
|
-
/**
 * Removes everything a previous eval run staged under
 * `<repoRoot>/.claude/skills`. Does nothing when that directory is absent.
 *
 * 1. Deletes every entry whose name starts with `STAGED_SKILL_PREFIX`.
 * 2. If the sibling manifest exists, deletes each directory it lists, restores
 *    the backup of any entry marked `preexisting` whose backup still exists
 *    (then deletes the backup's temp directory), and finally deletes the
 *    manifest itself.
 *
 * A manifest that cannot be used — unreadable, invalid JSON, or valid JSON
 * without a `created_entries` array — is deleted and otherwise ignored, so a
 * damaged manifest cannot make cleanup throw. Individual entries that lack a
 * non-empty string `name` are skipped: an empty name would resolve to the
 * skills directory itself and delete all of it.
 *
 * @param repoRoot Directory that contains the `.claude/skills` staging area.
 */
export function cleanupStagedSkills(repoRoot: string): void {
  const skillsDir = join(repoRoot, ".claude", "skills");
  if (!existsSync(skillsDir)) return;

  for (const entry of readdirSync(skillsDir)) {
    if (!entry.startsWith(STAGED_SKILL_PREFIX)) continue;
    rmSync(join(skillsDir, entry), { recursive: true, force: true });
  }

  const manifestPath = join(skillsDir, STAGED_SIBLING_MANIFEST);
  if (!existsSync(manifestPath)) return;

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(manifestPath, "utf8"));
  } catch {
    parsed = undefined;
  }

  // Valid JSON is not necessarily a valid manifest (`null`, `{}`, a string…).
  const entries: unknown =
    typeof parsed === "object" && parsed !== null
      ? (parsed as { created_entries?: unknown }).created_entries
      : undefined;
  if (!Array.isArray(entries)) {
    rmSync(manifestPath, { force: true });
    return;
  }

  for (const raw of entries) {
    if (typeof raw !== "object" || raw === null) continue;
    const e = raw as Partial<SiblingManifest["created_entries"][number]>;
    if (typeof e.name !== "string" || e.name === "") continue;

    const target = join(skillsDir, e.name);
    rmSync(target, { recursive: true, force: true });

    if (
      e.preexisting &&
      typeof e.backup_path === "string" &&
      e.backup_path !== "" &&
      existsSync(e.backup_path)
    ) {
      cpSync(e.backup_path, target, { recursive: true });
      // The backup lives in its own mkdtemp directory; remove that parent.
      rmSync(dirname(e.backup_path), { recursive: true, force: true });
    }
  }
  rmSync(manifestPath, { force: true });
}
|
|
184
|
-
|
|
185
|
-
/** Comparison mode of a run. */
type Mode = "new-skill" | "revision";

/** Parsed command-line arguments. */
type Args = {
  /** Sub-command to execute. */
  command: "run" | "snapshot" | "teardown-guard";

  // Value flags (absent when the flag was not given).
  /** Value of `--mode`. */
  mode?: Mode;
  /** Value of `--baseline`. */
  baseline?: string;
  /** Value of `--label`. */
  label?: string;
  /** Value of `--iteration`, as an integer. */
  iteration?: number;
  /** Ids from the comma-separated `--only` list. */
  only?: string[];
  /** Ids from the comma-separated `--skip` list. */
  skip?: string[];
  /** Value of `--stage-name`. */
  stageName?: string;

  // Boolean switches (true when the flag is present).
  /** `--dry-run` */
  dryRun: boolean;
  /** `--no-stage` */
  noStage: boolean;
  /** `--guard` */
  guard: boolean;
  /** `--plan-mode` */
  planMode: boolean;
};
|
|
201
|
-
|
|
202
|
-
/**
 * Prints `error: <msg>` to stderr and terminates the process with exit
 * code 1. Never returns.
 */
function die(msg: string): never {
  const line = "error: " + msg;
  console.error(line);
  return process.exit(1);
}
|
|
206
|
-
|
|
207
|
-
/**
 * Turns a raw argument vector into an `Args` record.
 *
 * The command is taken from the first token that does not start with `--`:
 * `snapshot` or `teardown-guard` select those commands, anything else (or no
 * such token) means `run`. Value flags are read as `--name value`; a value
 * flag followed by nothing or by another `--…` token aborts via `die`, as
 * does a non-integer `--iteration`. `--only` / `--skip` are comma-separated
 * lists whose items are trimmed and whose empty items are dropped.
 */
function parseArgs(argv: string[]): Args {
  const firstPositional = argv.find((token) => !token.startsWith("--"));
  let command: Args["command"] = "run";
  if (firstPositional === "snapshot" || firstPositional === "teardown-guard") {
    command = firstPositional;
  }

  /** Value following `--name`, or undefined when the flag is absent. */
  function valueOf(name: string): string | undefined {
    const at = argv.indexOf("--" + name);
    if (at === -1) return undefined;
    const next = argv[at + 1];
    if (next !== undefined && !next.startsWith("--")) return next;
    return die(`flag --${name} requires a value`);
  }

  /** True when the bare switch `--name` appears anywhere in argv. */
  const isSet = (name: string): boolean => argv.indexOf(`--${name}`) !== -1;

  /** Splits a comma-separated id list, trimming items and dropping blanks. */
  const splitIds = (raw: string | undefined): string[] | undefined => {
    if (raw === undefined) return undefined;
    const ids: string[] = [];
    for (const piece of raw.split(",")) {
      const id = piece.trim();
      if (id) ids.push(id);
    }
    return ids;
  };

  // --iteration is read and validated before any other value flag.
  const rawIteration = valueOf("iteration");
  let iteration: number | undefined;
  if (rawIteration !== undefined) {
    iteration = Number(rawIteration);
    if (!Number.isInteger(iteration)) {
      die(`--iteration must be an integer, got ${rawIteration}`);
    }
  }

  return {
    command,
    mode: valueOf("mode") as Mode | undefined,
    baseline: valueOf("baseline"),
    label: valueOf("label"),
    iteration,
    only: splitIds(valueOf("only")),
    skip: splitIds(valueOf("skip")),
    dryRun: isSet("dry-run"),
    noStage: isSet("no-stage"),
    guard: isSet("guard"),
    stageName: valueOf("stage-name"),
    planMode: isSet("plan-mode"),
  };
}
|
|
257
|
-
|
|
258
|
-
/**
 * Creates `path` (and any missing parents) unless something already exists
 * at that path, in which case nothing is done.
 */
function ensureDir(path: string): void {
  if (existsSync(path)) return;
  mkdirSync(path, { recursive: true });
}
|
|
261
|
-
|
|
262
|
-
/**
 * Serializes `value` as JSON with two-space indentation, appends a trailing
 * newline, and writes the result to `path`.
 */
function writeJson(path: string, value: unknown): void {
  const serialized = JSON.stringify(value, null, 2);
  writeFileSync(path, serialized + "\n");
}
|
|
265
|
-
|
|
266
|
-
/**
 * Reads the UTF-8 file at `path` and returns its parsed JSON content.
 * The result is typed as `T` without any runtime validation.
 */
function readJson<T>(path: string): T {
  const text = readFileSync(path, { encoding: "utf8" });
  return JSON.parse(text);
}
|
|
269
|
-
|
|
270
|
-
/**
 * Picks the iteration number for a run.
 *
 * @param workspaceSkillDir Directory whose `iteration-<N>` entries are scanned.
 * @param override Explicit iteration number; returned unchanged when given.
 * @returns `override` if defined; otherwise one more than the highest `<N>`
 *   among entries named exactly `iteration-<decimal digits>`, or 1 when the
 *   directory is missing or has no such entry.
 */
function nextIteration(workspaceSkillDir: string, override?: number): number {
  if (override !== undefined) return override;
  if (!existsSync(workspaceSkillDir)) return 1;

  let highest = 0;
  for (const entry of readdirSync(workspaceSkillDir)) {
    // Only plain decimal suffixes count. Converting the raw suffix with
    // Number() would also accept "10.5", "1e3" or "0x10" and could produce a
    // fractional or unrelated iteration number.
    const match = /^iteration-(\d+)$/.exec(entry);
    if (match === null) continue;
    highest = Math.max(highest, Number(match[1]));
  }
  return highest + 1;
}
|
|
282
|
-
|
|
283
|
-
/**
 * Returns the pair of condition names compared in the given mode:
 * `new-skill` → `["with_skill", "without_skill"]`, any other mode →
 * `["old_skill", "new_skill"]`.
 */
function conditionNamesFor(mode: Mode): [string, string] {
  if (mode === "new-skill") {
    return ["with_skill", "without_skill"];
  }
  return ["old_skill", "new_skill"];
}
|
|
288
|
-
|
|
289
|
-
/**
 * `snapshot` command: copies the skill's current files into
 * `<workspaceRoot>/<skillName>/snapshots/<label>`.
 *
 * `SKILL.md` is copied first, then every other entry of the skill directory
 * except `evals`. Aborts via `die` when `--label` is missing, when the skill
 * has no `SKILL.md`, or when a snapshot with that label already exists.
 */
function commandSnapshot(args: Args, ctx: RunContext): void {
  const label = args.label;
  if (!label) die("snapshot requires --label <name>");

  const sourceDir = ctx.skillSubdir;
  const sourceSkillMd = join(sourceDir, "SKILL.md");
  if (!existsSync(sourceSkillMd)) die(`skill not found: ${sourceSkillMd}`);

  const snapshotDir = join(
    ctx.workspaceRoot,
    ctx.skillName,
    "snapshots",
    label,
  );
  if (existsSync(snapshotDir)) {
    die(
      `snapshot already exists: ${snapshotDir}\n` +
        " Use a different --label or delete the existing snapshot first.",
    );
  }
  ensureDir(snapshotDir);

  cpSync(sourceSkillMd, join(snapshotDir, "SKILL.md"));

  // Everything else in the skill directory, minus the evals tree.
  const extras = readdirSync(sourceDir).filter(
    (entry) => entry !== "SKILL.md" && entry !== "evals",
  );
  for (const entry of extras) {
    const from = join(sourceDir, entry);
    const to = join(snapshotDir, entry);
    if (!statSync(from).isDirectory()) {
      cpSync(from, to);
    } else {
      cpSync(from, to, { recursive: true });
    }
  }

  console.log(`Snapshotted ${ctx.skillName} → ${snapshotDir}`);
}
|
|
319
|
-
|
|
320
|
-
function commandRun(args: Args, ctx: RunContext): void {
|
|
321
|
-
if (!args.mode) die("--mode required: new-skill | revision");
|
|
322
|
-
if (args.mode !== "new-skill" && args.mode !== "revision")
|
|
323
|
-
die(`unknown --mode: ${args.mode}`);
|
|
324
|
-
if (args.mode === "revision" && !args.baseline)
|
|
325
|
-
die("revision mode requires --baseline <label>");
|
|
326
|
-
|
|
327
|
-
const skillDir = ctx.skillSubdir;
|
|
328
|
-
const skillMd = join(skillDir, "SKILL.md");
|
|
329
|
-
if (!existsSync(skillMd)) die(`skill not found: ${skillMd}`);
|
|
330
|
-
|
|
331
|
-
const evalsPath = join(skillDir, "evals", "evals.json");
|
|
332
|
-
if (!existsSync(evalsPath)) die(`evals.json not found: ${evalsPath}`);
|
|
333
|
-
|
|
334
|
-
const config: EvalsConfig = validateEvalsConfig(
|
|
335
|
-
readJson(evalsPath),
|
|
336
|
-
evalsPath,
|
|
337
|
-
);
|
|
338
|
-
if (config.skill_name !== ctx.skillName)
|
|
339
|
-
console.warn(
|
|
340
|
-
`warning: evals.json skill_name (${config.skill_name}) does not match the skill folder (${ctx.skillName}). Proceeding with ${ctx.skillName}.`,
|
|
341
|
-
);
|
|
342
|
-
|
|
343
|
-
let selectedEvals: Eval[];
|
|
344
|
-
try {
|
|
345
|
-
selectedEvals = selectEvals(config.evals, {
|
|
346
|
-
only: args.only,
|
|
347
|
-
skip: args.skip,
|
|
348
|
-
});
|
|
349
|
-
} catch (err) {
|
|
350
|
-
die(err instanceof Error ? err.message : String(err));
|
|
351
|
-
}
|
|
352
|
-
|
|
353
|
-
const workspaceSkillDir = join(ctx.workspaceRoot, ctx.skillName);
|
|
354
|
-
const iteration = nextIteration(workspaceSkillDir, args.iteration);
|
|
355
|
-
const iterationDir = join(workspaceSkillDir, `iteration-${iteration}`);
|
|
356
|
-
|
|
357
|
-
// A per-run nonce makes each dispatch description globally unique. The
|
|
358
|
-
// subagents dir is shared across iterations of one parent session, so a bare
|
|
359
|
-
// `<eval>:<condition>` description repeats and fill-transcripts could fill an
|
|
360
|
-
// iteration's run from a colliding agent in another iteration. `i<N>-<nonce>`
|
|
361
|
-
// also disambiguates re-running the same iteration number.
|
|
362
|
-
const runNonce = `${Date.now().toString(36)}-${randomBytes(3).toString("hex")}`;
|
|
363
|
-
const runTag = `i${iteration}-${runNonce}`;
|
|
364
|
-
|
|
365
|
-
if (existsSync(iterationDir) && args.iteration === undefined)
|
|
366
|
-
die(
|
|
367
|
-
`iteration-${iteration} already exists; pass --iteration to overwrite explicitly`,
|
|
368
|
-
);
|
|
369
|
-
|
|
370
|
-
const [conditionA, conditionB] = conditionNamesFor(args.mode);
|
|
371
|
-
|
|
372
|
-
let skillPathForA: string | null;
|
|
373
|
-
let skillPathForB: string | null;
|
|
374
|
-
if (args.mode === "new-skill") {
|
|
375
|
-
skillPathForA = skillMd;
|
|
376
|
-
skillPathForB = null;
|
|
377
|
-
} else {
|
|
378
|
-
const baselineSkill = join(
|
|
379
|
-
workspaceSkillDir,
|
|
380
|
-
"snapshots",
|
|
381
|
-
args.baseline as string,
|
|
382
|
-
"SKILL.md",
|
|
383
|
-
);
|
|
384
|
-
if (!existsSync(baselineSkill))
|
|
385
|
-
die(
|
|
386
|
-
`baseline snapshot not found: ${baselineSkill}\n` +
|
|
387
|
-
` Run: bun run evals:snapshot --skill ${ctx.skillName} --skill-dir ${ctx.skillDir} --label ${args.baseline} (before editing)`,
|
|
388
|
-
);
|
|
389
|
-
skillPathForA = baselineSkill;
|
|
390
|
-
skillPathForB = skillMd;
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
console.log(
|
|
394
|
-
`Preparing ${ctx.skillName} iteration-${iteration} (${args.mode})`,
|
|
395
|
-
);
|
|
396
|
-
console.log(` ${conditionA}: ${skillPathForA ?? "(no skill)"}`);
|
|
397
|
-
console.log(` ${conditionB}: ${skillPathForB ?? "(no skill)"}`);
|
|
398
|
-
if (selectedEvals.length !== config.evals.length) {
|
|
399
|
-
const [flagName, ids] = args.only
|
|
400
|
-
? ["--only", args.only]
|
|
401
|
-
: ["--skip", args.skip ?? []];
|
|
402
|
-
console.log(
|
|
403
|
-
` selection: ${selectedEvals.length} of ${config.evals.length} evals (${flagName} ${ids.join(", ")})`,
|
|
404
|
-
);
|
|
405
|
-
}
|
|
406
|
-
if (args.noStage)
|
|
407
|
-
console.log(
|
|
408
|
-
" staging: disabled (--no-stage) — skills will be inlined into dispatch_prompt for harnesses without project-local skill discovery",
|
|
409
|
-
);
|
|
410
|
-
|
|
411
|
-
ensureDir(iterationDir);
|
|
412
|
-
cpSync(skillMd, join(iterationDir, "skill-snapshot.md"));
|
|
413
|
-
|
|
414
|
-
// Always disarm a prior run's guard before re-staging, so a crashed run can't
|
|
415
|
-
// leave the write-blocking hook armed across runs.
|
|
416
|
-
teardownGuard(ctx.stageRoot);
|
|
417
|
-
|
|
418
|
-
if (!args.noStage) cleanupStagedSkills(ctx.stageRoot);
|
|
419
|
-
|
|
420
|
-
if (!args.noStage) {
|
|
421
|
-
stageSiblingSkills({
|
|
422
|
-
skillUnderTest: ctx.skillName,
|
|
423
|
-
skillsSourceDir: ctx.skillDir,
|
|
424
|
-
repoRoot: ctx.stageRoot,
|
|
425
|
-
});
|
|
426
|
-
}
|
|
427
|
-
|
|
428
|
-
const bootstrapContent =
|
|
429
|
-
ctx.bootstrapPath !== null ? readFileSync(ctx.bootstrapPath, "utf8") : null;
|
|
430
|
-
|
|
431
|
-
// `--plan-mode` (issue #142): inject the harness's verbatim plan-mode
|
|
432
|
-
// procedure as an operating-context layer. The profile is a bundled asset
|
|
433
|
-
// resolved relative to this runner (mirroring the guard-script resolution
|
|
434
|
-
// below) and keyed by harness, so a harness without a profile simply has no
|
|
435
|
-
// `--plan-mode` and the portable dispatch contract is unchanged.
|
|
436
|
-
const planModeContent = args.planMode
|
|
437
|
-
? resolvePlanModeProfile(ctx.harness)
|
|
438
|
-
: null;
|
|
439
|
-
if (args.planMode)
|
|
440
|
-
console.log(
|
|
441
|
-
` plan-mode: injecting ${ctx.harness} plan-mode profile as operating context (issue #142; necessary-not-sufficient fidelity layer)`,
|
|
442
|
-
);
|
|
443
|
-
|
|
444
|
-
// Sibling skill metadata, shared across conditions. Empty when --no-stage
|
|
445
|
-
// (nothing is staged, so nothing is discoverable to list).
|
|
446
|
-
const siblingSkills: AvailableSkill[] = args.noStage
|
|
447
|
-
? []
|
|
448
|
-
: ctx.siblingSkillNames.map((name) => {
|
|
449
|
-
const p = join(ctx.skillDir, name, "SKILL.md");
|
|
450
|
-
return { name, path: p, description: getSkillDescription(p) };
|
|
451
|
-
});
|
|
452
|
-
|
|
453
|
-
// `--stage-name` overrides the conspicuous `slow-powers-eval-…` slug with a
|
|
454
|
-
// verbatim name (issue #144 Step 2: A/B a natural name against the eval slug).
|
|
455
|
-
// It targets the single staging condition, so reject the case where both
|
|
456
|
-
// conditions stage (e.g. revision mode) — one name can't cover two dirs — and
|
|
457
|
-
// refuse to clobber a dir that already exists (a real project skill the user
|
|
458
|
-
// owns; cleanup has already removed our own prior custom dirs by this point).
|
|
459
|
-
if (args.stageName !== undefined && !args.noStage) {
|
|
460
|
-
if (skillPathForA !== null && skillPathForB !== null) {
|
|
461
|
-
die(
|
|
462
|
-
"--stage-name is only supported when exactly one condition stages the skill (e.g. --mode new-skill); both conditions stage here.",
|
|
463
|
-
);
|
|
464
|
-
}
|
|
465
|
-
const target = join(ctx.stageRoot, ".claude", "skills", args.stageName);
|
|
466
|
-
if (existsSync(target)) {
|
|
467
|
-
die(
|
|
468
|
-
`--stage-name "${args.stageName}": ${target} already exists; refusing to clobber it. Remove it or choose a different name.`,
|
|
469
|
-
);
|
|
470
|
-
}
|
|
471
|
-
}
|
|
472
|
-
|
|
473
|
-
const stageFor = (
|
|
474
|
-
condName: string,
|
|
475
|
-
condSkillPath: string | null,
|
|
476
|
-
): string | null => {
|
|
477
|
-
if (!condSkillPath || args.noStage) return null;
|
|
478
|
-
return stageSkillForCC({
|
|
479
|
-
content: readFileSync(condSkillPath, "utf8"),
|
|
480
|
-
iteration,
|
|
481
|
-
condition: condName,
|
|
482
|
-
skillName: ctx.skillName,
|
|
483
|
-
repoRoot: ctx.stageRoot,
|
|
484
|
-
stageNameOverride: args.stageName,
|
|
485
|
-
});
|
|
486
|
-
};
|
|
487
|
-
|
|
488
|
-
const conditionASlug = stageFor(conditionA, skillPathForA);
|
|
489
|
-
const conditionBSlug = stageFor(conditionB, skillPathForB);
|
|
490
|
-
|
|
491
|
-
// A custom-named dir isn't caught by cleanupStagedSkills's prefix scan; record
|
|
492
|
-
// it in the sibling manifest so the next run removes it.
|
|
493
|
-
if (
|
|
494
|
-
args.stageName !== undefined &&
|
|
495
|
-
(conditionASlug === args.stageName || conditionBSlug === args.stageName)
|
|
496
|
-
) {
|
|
497
|
-
registerStagedSkillForCleanup(ctx.stageRoot, args.stageName);
|
|
498
|
-
}
|
|
499
|
-
|
|
500
|
-
const conditions: ConditionsRecord = {
|
|
501
|
-
mode: args.mode,
|
|
502
|
-
baseline: args.baseline,
|
|
503
|
-
conditions: [
|
|
504
|
-
{
|
|
505
|
-
name: conditionA,
|
|
506
|
-
skill_path: skillPathForA,
|
|
507
|
-
staged_skill_slug: conditionASlug,
|
|
508
|
-
},
|
|
509
|
-
{
|
|
510
|
-
name: conditionB,
|
|
511
|
-
skill_path: skillPathForB,
|
|
512
|
-
staged_skill_slug: conditionBSlug,
|
|
513
|
-
},
|
|
514
|
-
],
|
|
515
|
-
timestamp: new Date().toISOString(),
|
|
516
|
-
harness: ctx.harness,
|
|
517
|
-
run_nonce: runNonce,
|
|
518
|
-
};
|
|
519
|
-
writeJson(join(iterationDir, "conditions.json"), conditions);
|
|
520
|
-
|
|
521
|
-
// availableSkills for a condition = siblings + the skill-under-test when
|
|
522
|
-
// that condition loads it. Empty when nothing was staged.
|
|
523
|
-
const availableSkillsFor = (
|
|
524
|
-
condSkillPath: string | null,
|
|
525
|
-
): AvailableSkill[] => {
|
|
526
|
-
if (args.noStage) return [];
|
|
527
|
-
const skills = [...siblingSkills];
|
|
528
|
-
if (condSkillPath) {
|
|
529
|
-
skills.push({
|
|
530
|
-
name: ctx.skillName,
|
|
531
|
-
path: condSkillPath,
|
|
532
|
-
description: getSkillDescription(condSkillPath),
|
|
533
|
-
});
|
|
534
|
-
}
|
|
535
|
-
return skills;
|
|
536
|
-
};
|
|
537
|
-
|
|
538
|
-
const tasks: DispatchTask[] = [];
|
|
539
|
-
for (const ev of selectedEvals) {
|
|
540
|
-
const evalDir = join(iterationDir, `eval-${ev.id}`);
|
|
541
|
-
ensureDir(evalDir);
|
|
542
|
-
|
|
543
|
-
for (const [condName, condSkillPath, condSlug] of [
|
|
544
|
-
[conditionA, skillPathForA, conditionASlug],
|
|
545
|
-
[conditionB, skillPathForB, conditionBSlug],
|
|
546
|
-
] as const) {
|
|
547
|
-
const condDir = join(evalDir, condName);
|
|
548
|
-
const outputsDir = join(condDir, "outputs");
|
|
549
|
-
ensureDir(outputsDir);
|
|
550
|
-
|
|
551
|
-
const fixtures = copyFixtures(ev, skillDir, condDir);
|
|
552
|
-
tasks.push(
|
|
553
|
-
buildDispatchTask({
|
|
554
|
-
evalId: ev.id,
|
|
555
|
-
condition: condName,
|
|
556
|
-
skillPath: condSkillPath,
|
|
557
|
-
stagedSkillSlug: condSlug,
|
|
558
|
-
userPrompt: ev.prompt,
|
|
559
|
-
fixtures,
|
|
560
|
-
outputsDir,
|
|
561
|
-
condDir,
|
|
562
|
-
bootstrapContent,
|
|
563
|
-
planModeContent,
|
|
564
|
-
skillName: ctx.skillName,
|
|
565
|
-
availableSkills: availableSkillsFor(condSkillPath),
|
|
566
|
-
runTag,
|
|
567
|
-
}),
|
|
568
|
-
);
|
|
569
|
-
}
|
|
570
|
-
}
|
|
571
|
-
|
|
572
|
-
const manifestPath = join(iterationDir, "dispatch-manifest.md");
|
|
573
|
-
writeFileSync(
|
|
574
|
-
manifestPath,
|
|
575
|
-
buildManifest({
|
|
576
|
-
skillName: ctx.skillName,
|
|
577
|
-
mode: args.mode,
|
|
578
|
-
baseline: args.baseline,
|
|
579
|
-
iteration,
|
|
580
|
-
tasks,
|
|
581
|
-
}),
|
|
582
|
-
);
|
|
583
|
-
|
|
584
|
-
// Write each prompt to its own file and reference it by path in dispatch.json.
|
|
585
|
-
// The orchestrator then dispatches with a short "read this file" prompt instead
|
|
586
|
-
// of reproducing the full prompt verbatim per Task call.
|
|
587
|
-
for (const task of tasks) {
|
|
588
|
-
writeFileSync(task.dispatch_prompt_path, task.dispatch_prompt);
|
|
589
|
-
}
|
|
590
|
-
|
|
591
|
-
const dispatchJsonPath = join(iterationDir, "dispatch.json");
|
|
592
|
-
writeJson(dispatchJsonPath, {
|
|
593
|
-
skill_name: ctx.skillName,
|
|
594
|
-
iteration,
|
|
595
|
-
run_nonce: runNonce,
|
|
596
|
-
iteration_dir: iterationDir,
|
|
597
|
-
mode: args.mode,
|
|
598
|
-
baseline: args.baseline ?? null,
|
|
599
|
-
plan_mode: args.planMode,
|
|
600
|
-
conditions: conditions.conditions,
|
|
601
|
-
harness: ctx.harness,
|
|
602
|
-
tasks: tasks.map(({ dispatch_prompt: _omit, ...rest }) => rest),
|
|
603
|
-
});
|
|
604
|
-
|
|
605
|
-
// Opt-in hard guard. Stages a PreToolUse hook that blocks subagent
|
|
606
|
-
// writes/installs outside the eval sandbox while dispatches run.
|
|
607
|
-
if (args.guard && !args.dryRun) {
|
|
608
|
-
if (args.noStage) {
|
|
609
|
-
console.warn(
|
|
610
|
-
"\n⚠ --guard requires staging enabled; skipping guard install.",
|
|
611
|
-
);
|
|
612
|
-
} else {
|
|
613
|
-
const guardScriptPath = join(import.meta.dir, "guard", "guard.ts");
|
|
614
|
-
installGuard({
|
|
615
|
-
stageRoot: ctx.stageRoot,
|
|
616
|
-
workspaceRoot: ctx.workspaceRoot,
|
|
617
|
-
guardScriptPath,
|
|
618
|
-
});
|
|
619
|
-
console.log(
|
|
620
|
-
"\n🛡 Write guard armed: a PreToolUse hook is staged in .claude/settings.local.json\n" +
|
|
621
|
-
" and will block writes/installs outside the eval sandbox during dispatches.\n" +
|
|
622
|
-
" It auto-expires in 6h and is removed on the next run; to remove it now:\n" +
|
|
623
|
-
" bun run evals:teardown-guard --skill <name>",
|
|
624
|
-
);
|
|
625
|
-
}
|
|
626
|
-
}
|
|
627
|
-
|
|
628
|
-
// Plugin-shadow preflight (Claude Code): a staged skill name that is also
|
|
629
|
-
// discoverable from an enabled plugin or the global skills dir contaminates the
|
|
630
|
-
// run — subagents inherit this session's plugins, so both copies are reachable.
|
|
631
|
-
// The runner can't unload a plugin from a live session; it only flags it. The
|
|
632
|
-
// report is persisted so the aggregator can surface it in validity_warnings.
|
|
633
|
-
if (ctx.harness === "claude-code") {
|
|
634
|
-
const shadowReport = detectPluginShadows({
|
|
635
|
-
configDir: resolveConfigDir(),
|
|
636
|
-
cwd: ctx.stageRoot,
|
|
637
|
-
stagedSkillNames: [ctx.skillName, ...ctx.siblingSkillNames],
|
|
638
|
-
});
|
|
639
|
-
if (shadowReport.shadowed.length > 0) {
|
|
640
|
-
writeJson(join(iterationDir, "plugin-shadow.json"), shadowReport);
|
|
641
|
-
console.warn(formatShadowBanner(shadowReport));
|
|
642
|
-
}
|
|
643
|
-
}
|
|
644
|
-
|
|
645
|
-
console.log(`\nWorkspace prepared: ${iterationDir}`);
|
|
646
|
-
console.log(`Dispatch manifest: ${manifestPath}`);
|
|
647
|
-
console.log(`Dispatch tasks: ${dispatchJsonPath}`);
|
|
648
|
-
console.log(
|
|
649
|
-
`\n${tasks.length} dispatches required (${selectedEvals.length} evals × 2 conditions).`,
|
|
650
|
-
);
|
|
651
|
-
|
|
652
|
-
if (args.dryRun) console.log("\n--dry-run: stopping after workspace prep.");
|
|
653
|
-
else
|
|
654
|
-
console.log(
|
|
655
|
-
"\nNext: read dispatch.json, dispatch each task as a subagent, write run.json + timing.json to the paths in each task.",
|
|
656
|
-
);
|
|
657
|
-
}
|
|
658
|
-
|
|
659
|
-
/**
 * One subagent dispatch: a single (eval, condition) pair together with the
 * paths the orchestrator needs to run it and record the result.
 */
type DispatchTask = {
  /** Id of the eval this dispatch executes. */
  eval_id: string;
  /** Name of the condition (arm) this dispatch runs under. */
  condition: string;
  /** Path of the skill file for this condition, or `null` when none is loaded. */
  skill_path: string | null;
  /** Identifier the skill is staged under, or `null` when it is not staged. */
  staged_skill_slug: string | null;
  /** The user request placed at the end of the dispatch prompt. */
  user_prompt: string;
  /** Fixture paths listed in the prompt as available to the subagent. */
  fixtures: string[];
  /** Directory the subagent is told to write its outputs into. */
  outputs_dir: string;
  /** Where the orchestrator writes `run.json` for this dispatch. */
  run_record_path: string;
  /** Where the orchestrator writes `timing.json` for this dispatch. */
  timing_path: string;
  /** Description string for the dispatch: `<eval_id>:<condition>` plus an optional run tag. */
  agent_description: string;
  /**
   * Path of the file that stores the complete dispatch prompt. Subagents are
   * launched with a brief "read this file and follow it" instruction instead
   * of the inlined prompt, which spares the orchestrator from re-emitting
   * kilobytes of text on every Task call.
   */
  dispatch_prompt_path: string;
  /**
   * The same prompt text, held in memory for manifest building and unit
   * tests. It is removed before the task is serialized into dispatch.json.
   */
  dispatch_prompt: string;
};
|
|
680
|
-
|
|
681
|
-
export type { AvailableSkill } from "./types";
|
|
682
|
-
|
|
683
|
-
/**
 * Narrow the configured evals to what `--only` / `--skip` asked for.
 *
 * - Passing both flags is an error; they are mutually exclusive.
 * - With neither flag, the input array is returned untouched.
 * - An empty id list is rejected, as is any id that is not present in
 *   `evals`, so a misspelled id fails fast instead of yielding an empty or
 *   unexpected run.
 * - The result always follows the order of `evals`, never the order in which
 *   ids were supplied.
 *
 * @param evals Every eval declared in the config.
 * @param opts  The ids given to `--only` or `--skip`.
 * @returns The selected evals, in config order.
 * @throws Error on invalid input; the caller forwards the message to `die`.
 */
export function selectEvals(
  evals: Eval[],
  opts: { only?: string[]; skip?: string[] },
): Eval[] {
  const { only, skip } = opts;
  if (only && skip) {
    throw new Error("use only one of --only / --skip, not both");
  }

  const ids = only ?? skip;
  if (ids === undefined) {
    return evals;
  }
  if (ids.length === 0) {
    throw new Error("--only/--skip requires at least one eval id");
  }

  const available = new Set<string>();
  for (const ev of evals) {
    available.add(ev.id);
  }

  const missing: string[] = [];
  for (const id of ids) {
    if (!available.has(id)) missing.push(id);
  }
  if (missing.length > 0) {
    const listedMissing = missing.join(", ");
    const listedAvailable = Array.from(available).join(", ");
    throw new Error(
      `unknown eval id(s): ${listedMissing}. Available ids: ${listedAvailable}`,
    );
  }

  // `--only` keeps the matches, `--skip` keeps everything else.
  const keepMatches = Boolean(only);
  const wanted = new Set(ids);
  return evals.filter((ev) => wanted.has(ev.id) === keepMatches);
}
|
|
715
|
-
|
|
716
|
-
/**
 * Copy an eval's fixture files into `<condDir>/inputs`.
 *
 * Each entry of `ev.files` is resolved against `<skillDir>/evals` and copied
 * under its basename; directories are copied recursively. A missing fixture
 * is reported through `die`.
 *
 * @param ev       The eval whose `files` list is copied.
 * @param skillDir Root directory of the skill that owns the eval.
 * @param condDir  Directory of the condition receiving the copies.
 * @returns The destination paths, in the order of `ev.files`; an empty array
 *          (and no `inputs` directory) when the eval has no fixtures.
 */
function copyFixtures(ev: Eval, skillDir: string, condDir: string): string[] {
  const requested = ev.files ?? [];
  if (requested.length === 0) return [];

  const inputsDir = join(condDir, "inputs");
  ensureDir(inputsDir);

  return requested.map((relPath) => {
    const from = join(skillDir, "evals", relPath);
    if (!existsSync(from)) die(`fixture not found: ${from}`);
    const to = join(inputsDir, basename(relPath));
    // Only directories need the recursive flag; plain files copy as-is.
    cpSync(from, to, { recursive: statSync(from).isDirectory() });
    return to;
  });
}
|
|
731
|
-
|
|
732
|
-
/**
 * Load the verbatim plan-mode procedure profile for a harness (issue #142).
 *
 * The profile is looked up at `profiles/<harness>/plan-mode.md` relative to
 * this runner's own directory (`import.meta.dir`), the same way the guard
 * script is located. When no such file exists the run is aborted through
 * `die` with a message naming the expected path, rather than silently
 * continuing without the profile.
 *
 * @param harness Harness whose profile should be read.
 * @returns The profile file's contents as UTF-8 text.
 */
function resolvePlanModeProfile(harness: Harness): string {
  const profilePath = join(import.meta.dir, "profiles", harness, "plan-mode.md");

  const profileMissing = !existsSync(profilePath);
  if (profileMissing) {
    const message = [
      `--plan-mode: no plan-mode profile exists for harness '${harness}' `,
      `(expected ${profilePath}). This is a Claude-tier fidelity layer; a `,
      "harness without a profile leaves the portable dispatch contract unchanged.",
    ].join("");
    die(message);
  }

  return readFileSync(profilePath, "utf8");
}
|
|
757
|
-
|
|
758
|
-
/**
 * Read the single-line `description:` value out of a skill file.
 *
 * The key must start its own line (leading indentation is tolerated) and the
 * value must sit on that same line. An empty `description:` therefore yields
 * the fallback instead of swallowing the following line, and keys that merely
 * end in "description:" (e.g. `long_description:`) are not mistaken for it.
 * One pair of matching surrounding quotes is stripped from the value.
 *
 * Best-effort by design: an unreadable or missing file, or a file without a
 * usable description, produces the fallback text instead of throwing.
 *
 * @param skillPath Path of the skill file to inspect.
 * @returns The trimmed description, or "No description available.".
 */
function getSkillDescription(skillPath: string): string {
  const fallback = "No description available.";

  let content: string;
  try {
    content = readFileSync(skillPath, "utf8");
  } catch {
    // Deliberate best-effort: the description is advisory, never fatal.
    return fallback;
  }

  // `[ \t]*` (not `\s*`) keeps the match on one line; `\S` demands a real value.
  const match = content.match(/^[ \t]*description:[ \t]*(\S[^\n\r]*)/m);
  if (!match) return fallback;

  let desc = match[1].trim();
  const quote = desc.charAt(0);
  // Length guard: a lone quote character is not a quoted pair.
  if (
    desc.length >= 2 &&
    (quote === '"' || quote === "'") &&
    desc.endsWith(quote)
  ) {
    desc = desc.slice(1, -1).trim();
  }
  return desc;
}
|
|
775
|
-
|
|
776
|
-
/**
 * Strip the skill-under-test's list entry from bootstrap text, so that a
 * condition running without the skill (e.g. `without_skill`) never sees it
 * mentioned.
 *
 * An entry is a top-level `*` or `-` bullet containing the skill name in
 * backticks, together with the indented lines directly beneath it (such as a
 * `*Trigger:*` sub-bullet). Everything else, including headings and sibling
 * entries, passes through unchanged.
 *
 * @param content   Bootstrap markdown to scrub.
 * @param skillName Name of the skill whose entry is removed.
 * @returns The content with matching entries dropped, lines rejoined by "\n".
 */
export function redactSkillFromBootstrap(
  content: string,
  skillName: string,
): string {
  const marker = `\`${skillName}\``;
  const startsEntry = (text: string): boolean =>
    /^[*-]\s/.test(text) && text.includes(marker);
  const continuesEntry = (text: string): boolean => /^\s+\S/.test(text);

  // True while we are inside an entry that is being removed.
  let dropping = false;
  const kept = content.split("\n").filter((text) => {
    // Indented follow-up lines share the fate of the dropped bullet.
    if (dropping && continuesEntry(text)) return false;
    dropping = startsEntry(text);
    return !dropping;
  });
  return kept.join("\n");
}
|
|
805
|
-
|
|
806
|
-
/**
 * Assemble one dispatch: the full prompt text for a single (eval, condition)
 * pair plus the paths where the orchestrator records the run.
 *
 * The prompt is the concatenation of up to four parts, in this order:
 *   1. the bootstrap text wrapped in `<session-start-context>` (if supplied),
 *   2. the available-skills block (if it renders to anything),
 *   3. the plan-mode context block (if `planModeContent` is supplied),
 *   4. the task framing: skill note, fixtures, output rules, user request.
 *
 * @returns The task record; `dispatch_prompt` holds the assembled prompt.
 */
export function buildDispatchTask(opts: {
  evalId: string;
  condition: string;
  skillPath: string | null;
  stagedSkillSlug: string | null;
  userPrompt: string;
  fixtures: string[];
  outputsDir: string;
  condDir: string;
  bootstrapContent: string | null;
  /**
   * Verbatim plan-mode procedure profile (read from
   * `profiles/<harness>/plan-mode.md`) injected as an operating-context
   * layer; null/undefined leaves it out. It does not depend on the skill, so
   * both arms receive the same text and no redaction is needed. Driven by the
   * `--plan-mode` flag (issue #142). It is still only text the agent reads:
   * a necessary-not-sufficient stand-in for a real plan mode.
   */
  planModeContent?: string | null;
  skillName: string;
  availableSkills: AvailableSkill[];
  /**
   * Per-run uniqueness suffix (`i<iteration>-<nonce>`) appended to the
   * dispatch description so transcripts cannot collide across iterations or
   * re-runs. Unit tests that exercise prompt assembly directly omit it.
   */
  runTag?: string;
}): DispatchTask {
  const { skillPath, stagedSkillSlug, skillName, outputsDir, condDir } = opts;

  // Sort a copy so the caller's array is left untouched.
  const discoverable = opts.availableSkills
    .slice()
    .sort((x, y) => x.name.localeCompare(y.name));

  const describeSkill = (): string => {
    if (stagedSkillSlug) {
      // Neutral disambiguation only: name the slug the skill is staged under
      // so that a deliberate invocation targets the staged copy and the
      // __skill_invoked meta-check can find it. No instruction to invoke, no
      // claim that a plugin is "loaded", no "prefer the slug over the bare
      // name" framing — that wording sent agents looking for a global copy
      // (issue #144), and the old "invoke if it applies" directive was
      // removed as the issue #119 ceiling fix.
      return `The \`${skillName}\` skill is registered under the identifier \`${stagedSkillSlug}\` and is discoverable via the Skill tool. If you invoke it, use that identifier.`;
    }
    if (skillPath) {
      return [
        "The following skill is loaded into your operating guidelines. Apply it where relevant to the user's request.",
        "",
        `<skill name="${basename(dirname(skillPath))}">`,
        readFileSync(skillPath, "utf8").trim(),
        "</skill>",
      ].join("\n");
    }
    // Skill-absent arm. In a realistic environment (other skills or a
    // bootstrap present) say nothing: any remark would announce the eval and
    // point at the skill that is meant to be missing.
    const realisticEnvironment =
      discoverable.length > 0 || Boolean(opts.bootstrapContent);
    return realisticEnvironment
      ? ""
      : "No skill is loaded. Respond as you naturally would.";
  };
  const skillBlock = describeSkill();

  const fixtureLines = opts.fixtures.map((file) => ` - ${file}`);
  const fixturesBlock =
    fixtureLines.length === 0
      ? "Available fixture files: none"
      : `Available fixture files:\n${fixtureLines.join("\n")}`;

  // A condition that loads neither a skill file nor a staged slug must not
  // mention the skill-under-test at all. The available-skills block is built
  // from a list the caller supplies; a user-supplied bootstrap that names the
  // skill in its own prose is scrubbed here.
  let bootstrap = opts.bootstrapContent;
  if (bootstrap && !skillPath && !stagedSkillSlug) {
    bootstrap = redactSkillFromBootstrap(bootstrap, skillName);
  }

  const parts: string[] = [];

  // Surface 1: the bootstrap file, as the SessionStart-hook equivalent.
  if (bootstrap) {
    const sessionStart = [
      "<session-start-context>",
      "The following guidelines were loaded at session start by the slow-powers plugin",
      "(equivalent to the SessionStart hook firing in a real user's environment):",
      "",
      bootstrap.trim(),
      "</session-start-context>",
      "",
    ];
    parts.push(sessionStart.join("\n"));
  }

  // Surface 2: the discoverable skills, kept as a block of their own.
  const skillsListing = renderAvailableSkillsBlock(discoverable);
  if (skillsListing) {
    parts.push(`${skillsListing}\n\n`);
  }

  // Plan-mode context (issue #142): placed after the session-start surfaces
  // and ahead of the task framing so it reads as a session-level mode.
  const planMode = opts.planModeContent
    ? renderPlanModeContext(opts.planModeContent)
    : "";
  if (planMode) {
    parts.push(`${planMode}\n\n`);
  }

  const framing: string[] = [
    "You are executing a single test case for a skill evaluation framework.",
    "Treat this as a real user request — do NOT optimize behavior for the eval.",
  ];
  if (skillBlock) {
    framing.push("", skillBlock);
  }
  framing.push(
    "",
    fixturesBlock,
    `Output directory: ${outputsDir}`,
    "",
    "Instructions:",
    "- Write any files you produce into the output directory.",
    `- After completing the task, write your final user-facing response to ${outputsDir}/final-message.md.`,
    "- Do not write outside the output directory.",
    "",
    "User request:",
    opts.userPrompt,
  );
  parts.push(framing.join("\n"));

  const descriptionParts = [opts.evalId, opts.condition];
  if (opts.runTag) descriptionParts.push(opts.runTag);

  return {
    eval_id: opts.evalId,
    condition: opts.condition,
    skill_path: skillPath,
    staged_skill_slug: stagedSkillSlug,
    user_prompt: opts.userPrompt,
    fixtures: opts.fixtures,
    outputs_dir: outputsDir,
    run_record_path: join(condDir, "run.json"),
    timing_path: join(condDir, "timing.json"),
    agent_description: descriptionParts.join(":"),
    dispatch_prompt_path: join(condDir, "dispatch-prompt.txt"),
    dispatch_prompt: parts.join(""),
  };
}
|
|
957
|
-
|
|
958
|
-
/**
 * Pick a markdown code fence that `text` cannot terminate early: at least
 * three backticks, and always one more than the longest backtick run in it.
 */
function codeFenceFor(text: string): string {
  const runs = text.match(/`+/g) ?? [];
  const longest = runs.reduce((max, run) => Math.max(max, run.length), 0);
  return "`".repeat(Math.max(3, longest + 1));
}

/**
 * Render the human-readable dispatch manifest (markdown) for one iteration.
 *
 * The document has a fixed header (title, mode, generation timestamp,
 * dispatch count and usage instructions) followed by one section per task
 * that lists its `run.json` / `timing.json` paths and shows the full
 * dispatch prompt inside a fenced code block.
 *
 * Prompts can contain their own ``` fences (for example when a skill file is
 * inlined), so each prompt's fence is sized to be longer than any backtick
 * run inside it. Prompts without such runs still get a plain ``` fence.
 *
 * @param opts.skillName Skill being evaluated, shown in the title.
 * @param opts.mode      Run mode, shown on the `Mode:` line.
 * @param opts.baseline  Optional baseline label appended to the mode line.
 * @param opts.iteration Iteration number, shown in the title.
 * @param opts.tasks     Dispatch tasks to list, in order.
 * @returns The manifest text.
 */
function buildManifest(opts: {
  skillName: string;
  mode: Mode;
  baseline?: string;
  iteration: number;
  tasks: DispatchTask[];
}): string {
  const header = [
    `# Dispatch manifest — ${opts.skillName} iteration-${opts.iteration}`,
    "",
    `Mode: ${opts.mode}${opts.baseline ? ` (baseline: ${opts.baseline})` : ""}`,
    `Generated: ${new Date().toISOString()}`,
    `Total dispatches: ${opts.tasks.length}`,
    "",
    "## How to use this manifest",
    "",
    'In an agent session, read `dispatch.json` (sibling of this file) instead of this manifest. Each task has a `dispatch_prompt_path` field pointing at the file that holds the full prompt — dispatch the subagent with a short "read this file and follow it" instruction rather than inlining the prompt — plus exact paths for `run.json` and `timing.json`.',
    "",
    "**Transcript correlation:** Each task has an `agent_description` field of the form `<eval_id>:<condition>:i<N>-<nonce>`. When dispatching the subagent via the host's primitive (e.g. Claude Code's Agent tool), pass this string verbatim as the dispatch `description` — do not reconstruct it. The per-run nonce keeps descriptions unique across iterations sharing one session's subagents dir, so the transcript adapter correlates each subagent's persisted transcript back to the right `(eval, condition)` slot without collisions.",
    "",
    "After every dispatch:",
    "",
    "1. Write `run.json` matching `skills/evaluating-skills/schema/run-record.schema.json` (enforced at runtime by grade/fill-transcripts/detect-stray-writes). Carry over `eval_id`, `condition`, `skill_path` (`null` on the without_skill arm), `prompt`, and `files` from the task; populate `final_message` from the subagent's reply; leave `tool_invocations` as `[]` for now — `evals:fill-transcripts` will populate it from the persisted transcript in a later step.",
    "2. Capture `total_tokens` and `duration_ms` from the harness's task completion event into `timing.json`. These values may not be persisted anywhere else — save them immediately.",
    "",
    "After all dispatches:",
    "",
    "3. (Claude Code only, optional) Run `bun run evals:fill-transcripts --skill <name> --iteration <N> --subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/` to fill `tool_invocations` from each subagent's persisted transcript. Skipping this step leaves `transcript_check` assertions unverifiable.",
    "4. Run `bun run evals:grade --skill <name> --iteration <N>` to grade.",
    "",
    "## Dispatches",
    "",
  ].join("\n");

  const entries = opts.tasks
    .map((t) => {
      // Sized per prompt so embedded ``` fences cannot close the block early.
      const fence = codeFenceFor(t.dispatch_prompt);
      return [
        `### ${t.eval_id} / ${t.condition}`,
        "",
        `- run.json: ${t.run_record_path}`,
        `- timing.json: ${t.timing_path}`,
        "",
        fence,
        t.dispatch_prompt,
        fence,
        "",
      ].join("\n");
    })
    .join("\n");

  return header + entries;
}
|
|
1010
|
-
|
|
1011
|
-
// CLI entry point: runs only when this file is executed directly.
if (import.meta.main) {
  const cliArgs = Bun.argv.slice(2);
  const parsed = parseArgs(cliArgs);

  // Resolve the run context first; a detection failure ends the run via `die`.
  let runContext: RunContext;
  try {
    runContext = detectRunContext(cliArgs);
  } catch (err) {
    die(err instanceof Error ? err.message : String(err));
  }

  switch (parsed.command) {
    case "snapshot":
      commandSnapshot(parsed, runContext);
      break;
    case "teardown-guard": {
      const removed = teardownGuard(runContext.stageRoot);
      const outcome = removed
        ? "🛡 Write guard removed."
        : "No write guard was installed — nothing to remove.";
      console.log(outcome);
      break;
    }
    default:
      // Every other command is treated as a regular run.
      commandRun(parsed, runContext);
  }
}
|