@slowdini/slow-powers-opencode 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  17. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  18. package/skills/evaluating-skills/harness-parity.md +0 -155
  19. package/skills/evaluating-skills/runner/README.md +0 -163
  20. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  21. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  22. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  23. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  24. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  25. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  26. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  27. package/skills/evaluating-skills/runner/context.ts +0 -90
  28. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  29. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  30. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  31. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  32. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  33. package/skills/evaluating-skills/runner/grade.ts +0 -603
  34. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  35. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  36. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  37. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  38. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  39. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  40. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  41. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  42. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  43. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  44. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  45. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  46. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  47. package/skills/evaluating-skills/runner/run.ts +0 -1388
  48. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  49. package/skills/evaluating-skills/runner/types.ts +0 -121
  50. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  51. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  52. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  53. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  54. package/skills/evaluating-skills/runner/validate.ts +0 -21
  55. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  56. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  57. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  58. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  59. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  60. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  61. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  62. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  63. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  64. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,1388 +0,0 @@
1
- #!/usr/bin/env bun
2
- import { randomBytes } from "node:crypto";
3
- import {
4
- cpSync,
5
- existsSync,
6
- mkdirSync,
7
- mkdtempSync,
8
- readdirSync,
9
- readFileSync,
10
- rmSync,
11
- statSync,
12
- writeFileSync,
13
- } from "node:fs";
14
- import { tmpdir } from "node:os";
15
- import { basename, dirname, join } from "node:path";
16
- import {
17
- renderAvailableSkillsBlock,
18
- renderPlanModeContext,
19
- } from "./adapters/claude-code-session";
20
- import { detectRunContext, type Harness, type RunContext } from "./context";
21
- import { installGuard, teardownGuard } from "./guard/install";
22
- import {
23
- detectPluginShadows,
24
- formatShadowBanner,
25
- resolveConfigDir,
26
- } from "./plugin-shadow";
27
- import type {
28
- AvailableSkill,
29
- ConditionsRecord,
30
- Eval,
31
- EvalsConfig,
32
- } from "./types";
33
- import { validateEvalsConfig } from "./validate";
34
- import { cleanupWorkspace, SNAPSHOT_META } from "./workspace-teardown";
35
-
36
/**
 * Directory-name prefix given to staged copies of the skill under test. The
 * cleanup pass removes every entry of `.claude/skills` starting with it.
 */
export const STAGED_SKILL_PREFIX = "slow-powers-eval-";

/**
 * File name (inside `.claude/skills`) of the manifest that records which
 * entries staging created, so a later cleanup can undo them.
 */
export const STAGED_SIBLING_MANIFEST = ".slow-powers-eval-manifest.json";
38
-
39
/**
 * Stages one skill under `<repoRoot>/.claude/skills/<slug>/`: writes
 * `opts.content` as its SKILL.md, optionally copies the sibling assets of the
 * source skill next to it, and returns the slug (the staged directory name).
 */
export function stageSkillForCC(opts: {
  content: string;
  iteration: number;
  condition: string;
  skillName: string;
  repoRoot: string;
  /**
   * Source skill directory whose sibling assets are copied next to the staged
   * SKILL.md. Everything in it is copied except SKILL.md itself, the `evals/`
   * entry, and the snapshot bookkeeping file. Without this copy, a SKILL.md
   * that links a sibling file (e.g. `[code-review.md](code-review.md)`) would
   * be staged with a dangling link. Omit to stage SKILL.md alone; a directory
   * that does not exist is treated the same as omitted.
   */
  assetsDir?: string;
  /**
   * Verbatim directory name to stage under, replacing the conspicuous
   * `slow-powers-eval-…` slug. Used by `--stage-name` to A/B a natural name
   * against the eval-flagged one (issue #144 Step 2). The prefix scan in
   * `cleanupStagedSkills` does not catch a custom name, so the caller must
   * also call `registerStagedSkillForCleanup` for it to be removed later.
   */
  stageNameOverride?: string;
}): string {
  const { assetsDir, stageNameOverride } = opts;
  const defaultSlug = `${STAGED_SKILL_PREFIX}${opts.iteration}-${opts.condition}__${opts.skillName}`;
  const slug = stageNameOverride ?? defaultSlug;

  const stagedDir = join(opts.repoRoot, ".claude", "skills", slug);
  mkdirSync(stagedDir, { recursive: true });
  writeFileSync(join(stagedDir, "SKILL.md"), opts.content);

  // Nothing else to copy when no (existing) assets directory was supplied.
  if (assetsDir === undefined || !existsSync(assetsDir)) return slug;

  const excluded = new Set<string>(["SKILL.md", "evals", SNAPSHOT_META]);
  const assetNames = readdirSync(assetsDir).filter(
    (name) => !excluded.has(name),
  );
  for (const name of assetNames) {
    const from = join(assetsDir, name);
    const to = join(stagedDir, name);
    if (statSync(from).isDirectory()) {
      cpSync(from, to, { recursive: true });
    } else {
      cpSync(from, to);
    }
  }
  return slug;
}
82
-
83
/**
 * Records a custom-named staged skill directory (one created through
 * `stageNameOverride`) in the sibling manifest's `created_entries`, so that a
 * later `cleanupStagedSkills` removes it; the prefix scan only matches
 * `slow-powers-eval-…` names. If no manifest exists yet, a fresh one is
 * started. Idempotent: a name that is already recorded leaves the manifest
 * file untouched.
 */
export function registerStagedSkillForCleanup(
  repoRoot: string,
  name: string,
): void {
  const manifestPath = join(
    repoRoot,
    ".claude",
    "skills",
    STAGED_SIBLING_MANIFEST,
  );

  const manifest: SiblingManifest = existsSync(manifestPath)
    ? JSON.parse(readFileSync(manifestPath, "utf8"))
    : {
        created_at: new Date().toISOString(),
        staged_under_test: name,
        // Marked preexisting so cleanup takes the per-entry path rather than
        // deleting the whole skills directory.
        skills_dir_preexisting: true,
        created_entries: [],
      };

  const alreadyRecorded = manifest.created_entries.some(
    (entry) => entry.name === name,
  );
  if (alreadyRecorded) return;

  manifest.created_entries.push({ name, preexisting: false });
  writeFileSync(manifestPath, JSON.stringify(manifest, null, 2) + "\n");
}
110
-
111
/**
 * Shape of the manifest file written into `.claude/skills` by staging. It
 * lists what staging put there so cleanup can remove or restore each entry.
 */
type SiblingManifest = {
  /** ISO-8601 timestamp taken when the manifest object was created. */
  created_at: string;
  /** Name recorded for the skill (or custom stage name) this manifest was created for. */
  staged_under_test: string;
  /**
   * Whether `.claude/skills` already existed when staging began. `false`
   * means the runner created the directory, so {@link cleanupStagedSkills} may
   * delete the whole tree (and prune an emptied `.claude`). `true`, or a
   * missing value on older manifests, makes cleanup fall back to the surgical
   * per-entry restore so a user's own project skills stay intact.
   */
  skills_dir_preexisting?: boolean;
  /** One record per directory that staging created or replaced. */
  created_entries: {
    /** Directory name under `.claude/skills`. */
    name: string;
    /** True when a directory of this name was already there and got replaced. */
    preexisting: boolean;
    /** Where the replaced directory was backed up; set only for preexisting entries. */
    backup_path?: string;
  }[];
};
128
-
129
/**
 * Copies every sibling skill (each directory of `skillsSourceDir` that has a
 * SKILL.md, other than the skill under test) into
 * `<repoRoot>/.claude/skills/`, leaving out each skill's `evals/` subtree.
 * A destination directory that already exists is first backed up to a fresh
 * temp directory and removed. The resulting manifest is written next to the
 * staged skills and also returned.
 */
export function stageSiblingSkills(opts: {
  skillUnderTest: string;
  skillsSourceDir: string;
  repoRoot: string;
}): SiblingManifest {
  const { skillUnderTest, skillsSourceDir, repoRoot } = opts;

  const skillsDir = join(repoRoot, ".claude", "skills");
  // Must be sampled before mkdirSync so cleanup knows whether we created it.
  const skillsDirPreexisting = existsSync(skillsDir);
  mkdirSync(skillsDir, { recursive: true });

  const isSiblingSkill = (name: string): boolean => {
    if (name === skillUnderTest) return false;
    const candidate = join(skillsSourceDir, name);
    if (!statSync(candidate).isDirectory()) return false;
    return existsSync(join(candidate, "SKILL.md"));
  };
  const siblingNames = readdirSync(skillsSourceDir).filter(isSiblingSkill);

  const manifest: SiblingManifest = {
    created_at: new Date().toISOString(),
    staged_under_test: skillUnderTest,
    skills_dir_preexisting: skillsDirPreexisting,
    created_entries: [],
  };

  for (const name of siblingNames) {
    const srcDir = join(skillsSourceDir, name);
    const dstDir = join(skillsDir, name);
    const evalsSubdir = join(srcDir, "evals");
    const evalsPrefix = `${evalsSubdir}/`;

    const preexisting = existsSync(dstDir);
    const entry: SiblingManifest["created_entries"][number] = {
      name,
      preexisting,
    };

    if (preexisting) {
      // Keep the user's copy safe in a temp dir so cleanup can restore it.
      const backupRoot = mkdtempSync(
        join(tmpdir(), "slow-powers-eval-backup-"),
      );
      const backupPath = join(backupRoot, name);
      entry.backup_path = backupPath;
      cpSync(dstDir, backupPath, { recursive: true });
      rmSync(dstDir, { recursive: true, force: true });
    }

    const isOutsideEvals = (src: string): boolean =>
      !(src === evalsSubdir || src.startsWith(evalsPrefix));
    cpSync(srcDir, dstDir, { recursive: true, filter: isOutsideEvals });

    manifest.created_entries.push(entry);
  }

  const manifestPath = join(skillsDir, STAGED_SIBLING_MANIFEST);
  writeFileSync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`);
  return manifest;
}
187
-
188
/**
 * Deletes `dir` when it exists and has no entries; otherwise does nothing.
 * Used to prune a `.claude` directory the runner emptied, without touching one
 * that still holds the user's own files (e.g. `settings.json`).
 */
function pruneIfEmpty(dir: string): void {
  if (!existsSync(dir)) return;
  if (readdirSync(dir).length > 0) return;
  rmSync(dir, { recursive: true, force: true });
}
196
-
197
/**
 * Undoes earlier staging under `<repoRoot>/.claude/skills`:
 *
 * 1. removes every entry whose name starts with the staged-skill prefix;
 * 2. if a sibling manifest is present, either removes the whole skills
 *    directory (when the manifest says the runner created it) or removes each
 *    recorded entry and restores its backup when one exists, then deletes the
 *    manifest.
 *
 * A manifest that cannot be parsed is deleted and nothing further is done.
 * Does nothing at all when the skills directory does not exist.
 */
export function cleanupStagedSkills(repoRoot: string): void {
  const claudeDir = join(repoRoot, ".claude");
  const skillsDir = join(claudeDir, "skills");
  if (!existsSync(skillsDir)) return;

  const stagedNames = readdirSync(skillsDir).filter((name) =>
    name.startsWith(STAGED_SKILL_PREFIX),
  );
  for (const name of stagedNames) {
    rmSync(join(skillsDir, name), { recursive: true, force: true });
  }

  const manifestPath = join(skillsDir, STAGED_SIBLING_MANIFEST);
  if (!existsSync(manifestPath)) return;

  let manifest: SiblingManifest;
  try {
    manifest = JSON.parse(readFileSync(manifestPath, "utf8"));
  } catch {
    // Unreadable manifest: drop it rather than guess what it described.
    rmSync(manifestPath, { force: true });
    return;
  }

  // The runner created `.claude/skills` itself, so it cannot contain any of
  // the user's own skills: remove the whole staged tree (including stray,
  // non-prefixed dirs a recursive eval left behind), then prune `.claude` if
  // that emptied it. In a real project `.claude/settings.json` keeps `.claude`
  // non-empty, so only the scaffolding we created goes away.
  if (manifest.skills_dir_preexisting === false) {
    rmSync(skillsDir, { recursive: true, force: true });
    pruneIfEmpty(claudeDir);
    return;
  }

  // Surgical path: remove each recorded entry and put back what it replaced.
  for (const {
    name,
    preexisting,
    backup_path: backupPath,
  } of manifest.created_entries) {
    const target = join(skillsDir, name);
    rmSync(target, { recursive: true, force: true });
    if (!(preexisting && backupPath && existsSync(backupPath))) continue;
    cpSync(backupPath, target, { recursive: true });
    rmSync(dirname(backupPath), { recursive: true, force: true });
  }
  rmSync(manifestPath, { force: true });
}
238
-
239
/**
 * Comparison mode of a run: `new-skill` pairs the conditions `with_skill` /
 * `without_skill`, `revision` pairs `old_skill` / `new_skill`.
 */
type Mode =
  | "new-skill"
  | "revision";
240
-
241
/** Parsed command-line arguments, as produced by `parseArgs`. */
type Args = {
  /** Sub-command to execute; `run` when no known command is named. */
  command:
    | "run"
    | "snapshot"
    | "teardown-guard"
    | "teardown"
    | "ingest"
    | "finalize";
  /** Value of `--mode` (not validated at parse time). */
  mode?: Mode;
  /** Value of `--baseline`. */
  baseline?: string;
  /** Value of `--label`. */
  label?: string;
  /** Value of `--iteration`, parsed as an integer. */
  iteration?: number;
  /** IDs from the comma-separated `--only` list. */
  only?: string[];
  /** IDs from the comma-separated `--skip` list. */
  skip?: string[];
  /** True when `--dry-run` is present. */
  dryRun: boolean;
  /** True when `--no-stage` is present. */
  noStage: boolean;
  /** True when `--guard` is present. */
  guard: boolean;
  /** Value of `--stage-name`. */
  stageName?: string;
  /** True when `--plan-mode` is present. */
  planMode: boolean;
  /** Value of `--ref`. */
  ref?: string;
  /** Value of `--subagents-dir`. */
  subagentsDir?: string;
};
263
-
264
/**
 * Reports a fatal error on stderr (prefixed with `error: `) and terminates the
 * process with exit status 1. Never returns.
 */
function die(msg: string): never {
  const line = "error: " + msg;
  console.error(line);
  process.exit(1);
}
268
-
269
/**
 * Reads the object `<ref>:./<relPath>` from git, with `relPath` resolved
 * relative to `cwd` through the `./` prefix.
 *
 * Git is spawned directly (argument array, no shell), so neither the ref nor
 * the path is interpolated into a shell string.
 *
 * @returns the raw stdout bytes on success (write them as-is rather than
 *   converting to a string, so binary assets round-trip intact), or `null`
 *   when git exits non-zero, e.g. because the object does not exist at that
 *   ref.
 */
function gitShowBytes(
  cwd: string,
  ref: string,
  relPath: string,
): Buffer | null {
  const objectSpec = `${ref}:./${relPath}`;
  const { exitCode, stdout } = Bun.spawnSync(["git", "show", objectSpec], {
    cwd,
    stdout: "pipe",
    stderr: "pipe",
  });
  return exitCode === 0 ? Buffer.from(stdout) : null;
}
290
-
291
/**
 * Lists every file under `cwd` as recorded at `<ref>`, as paths relative to
 * `cwd` (git's default `ls-tree` output is relative to the working directory).
 *
 * Exits via `die` with git's stderr when the command fails, so a bad ref or a
 * `cwd` outside any repository surfaces here.
 *
 * @param cwd directory to run git in; only entries below it are listed
 * @param ref any revision `git ls-tree` accepts
 * @returns the listed paths, verbatim
 */
function gitLsFiles(cwd: string, ref: string): string[] {
  // `-z` NUL-terminates entries and disables path quoting. Without it git
  // C-quotes names containing non-ASCII or special characters (e.g.
  // "caf\303\251.md"); a later `git show` on the quoted form fails, so such
  // assets would silently go missing from the snapshot.
  const res = Bun.spawnSync(
    ["git", "ls-tree", "-r", "--name-only", "-z", ref, "."],
    {
      cwd,
      stdout: "pipe",
      stderr: "pipe",
    },
  );
  if (res.exitCode !== 0)
    die(`git ls-tree failed for ref ${ref}: ${res.stderr.toString().trim()}`);
  // Deliberately no trim(): leading/trailing whitespace is part of a file
  // name. Only the empty piece after the final NUL is dropped.
  return res.stdout
    .toString()
    .split("\0")
    .filter((path) => path.length > 0);
}
310
-
311
/**
 * Parses the runner's command line.
 *
 * The command is the first token that is neither a `--flag` nor the value of
 * one of this parser's value-taking flags; when that token is not a known
 * command (or there is none) the command defaults to `run`. Skipping flag
 * values matters: otherwise `--baseline teardown` would select the `teardown`
 * command, and `--label x snapshot` would fall back to `run`.
 *
 * Value flags are read as `--name value`; a missing value (end of argv or
 * another `--flag`) is fatal, as is a non-integer `--iteration`. `--only` and
 * `--skip` take comma-separated ID lists; blanks are dropped.
 *
 * @param argv arguments after the script name
 * @returns the parsed arguments
 */
function parseArgs(argv: string[]): Args {
  // Flags whose following token is consumed as their value (see `flag` below).
  const VALUE_FLAGS = new Set<string>([
    "--mode",
    "--baseline",
    "--label",
    "--iteration",
    "--only",
    "--skip",
    "--stage-name",
    "--ref",
    "--subagents-dir",
  ]);
  const COMMANDS: Args["command"][] = [
    "snapshot",
    "teardown-guard",
    "teardown",
    "ingest",
    "finalize",
  ];

  const firstPositional = argv.find(
    (token, i) =>
      !token.startsWith("--") && !(i > 0 && VALUE_FLAGS.has(argv[i - 1])),
  );
  const command: Args["command"] =
    COMMANDS.find((c) => c === firstPositional) ?? "run";

  const flag = (name: string): string | undefined => {
    const i = argv.indexOf(`--${name}`);
    if (i === -1) return undefined;
    const v = argv[i + 1];
    if (v === undefined || v.startsWith("--")) {
      die(`flag --${name} requires a value`);
    }
    return v;
  };

  const has = (name: string) => argv.includes(`--${name}`);

  const iterationFlag = flag("iteration");
  const iteration =
    iterationFlag !== undefined ? Number(iterationFlag) : undefined;
  if (iteration !== undefined && !Number.isInteger(iteration))
    die(`--iteration must be an integer, got ${iterationFlag}`);

  const parseIdList = (v: string | undefined): string[] | undefined =>
    v === undefined
      ? undefined
      : v
          .split(",")
          .map((s) => s.trim())
          .filter(Boolean);

  return {
    command,
    // Not validated here; the run command rejects unknown modes itself.
    mode: flag("mode") as Mode | undefined,
    baseline: flag("baseline"),
    label: flag("label"),
    iteration,
    only: parseIdList(flag("only")),
    skip: parseIdList(flag("skip")),
    dryRun: has("dry-run"),
    noStage: has("no-stage"),
    guard: has("guard"),
    stageName: flag("stage-name"),
    planMode: has("plan-mode"),
    ref: flag("ref"),
    subagentsDir: flag("subagents-dir"),
  };
}
366
-
367
/** Creates `path` (with any missing parents) unless something already exists there. */
function ensureDir(path: string): void {
  if (existsSync(path)) return;
  mkdirSync(path, { recursive: true });
}
370
-
371
/** Writes `value` to `path` as 2-space-indented JSON followed by a newline. */
function writeJson(path: string, value: unknown): void {
  const serialized = JSON.stringify(value, null, 2);
  writeFileSync(path, serialized + "\n");
}
374
-
375
/**
 * Reads `path` as UTF-8 text and parses it as JSON. The result is returned as
 * `T` without any runtime validation.
 */
function readJson<T>(path: string): T {
  const text = readFileSync(path, "utf8");
  return JSON.parse(text);
}
378
-
379
/**
 * Picks the iteration number for a run.
 *
 * @param workspaceSkillDir directory holding the `iteration-<N>` folders
 * @param override explicit iteration number; returned as-is when given
 * @returns `override` if provided; otherwise one more than the highest
 *   existing `iteration-<N>`, or 1 when the directory is missing or holds no
 *   such folder
 */
function nextIteration(workspaceSkillDir: string, override?: number): number {
  if (override !== undefined) return override;
  if (!existsSync(workspaceSkillDir)) return 1;

  // Only plain decimal suffixes count. Converting the suffix with Number()
  // would also accept "iteration-2.5", "iteration-1e3", "iteration-0x10" or a
  // bare "iteration-", yielding fractional or bogus next numbers.
  const iterationName = /^iteration-(\d+)$/;
  let highest = 0;
  for (const entry of readdirSync(workspaceSkillDir)) {
    const match = iterationName.exec(entry);
    if (match === null) continue;
    highest = Math.max(highest, Number(match[1]));
  }
  return highest + 1;
}
391
-
392
/**
 * Returns the ordered pair of condition names compared in the given mode:
 * `with_skill` / `without_skill` for `new-skill`, otherwise `old_skill` /
 * `new_skill`.
 */
function conditionNamesFor(mode: Mode): [string, string] {
  if (mode === "new-skill") {
    return ["with_skill", "without_skill"];
  }
  return ["old_skill", "new_skill"];
}
397
-
398
/**
 * `snapshot` command: copies the skill into
 * `<workspaceRoot>/<skillName>/snapshots/<label>/`.
 *
 * Requires `--label` and refuses to overwrite an existing snapshot. With
 * `--ref` the work is delegated to `snapshotFromRef`; otherwise SKILL.md and
 * every sibling entry except `evals` are copied from the working tree and the
 * snapshot is tagged as a working-tree snapshot.
 */
function commandSnapshot(args: Args, ctx: RunContext): void {
  const { label, ref } = args;
  if (!label) die("snapshot requires --label <name>");

  const skillDir = ctx.skillSubdir;
  const destDir = join(ctx.workspaceRoot, ctx.skillName, "snapshots", label);
  if (existsSync(destDir)) {
    die(
      `snapshot already exists: ${destDir}\n` +
        " Use a different --label or delete the existing snapshot first.",
    );
  }

  if (ref !== undefined) {
    snapshotFromRef(ref, skillDir, destDir, ctx.skillName);
    return;
  }

  const skillMd = join(skillDir, "SKILL.md");
  if (!existsSync(skillMd)) die(`skill not found: ${skillMd}`);
  ensureDir(destDir);
  cpSync(skillMd, join(destDir, "SKILL.md"));

  const siblingNames = readdirSync(skillDir).filter(
    (name) => name !== "SKILL.md" && name !== "evals",
  );
  for (const name of siblingNames) {
    const from = join(skillDir, name);
    const to = join(destDir, name);
    if (statSync(from).isDirectory()) {
      cpSync(from, to, { recursive: true });
    } else {
      cpSync(from, to);
    }
  }

  // Record provenance so teardown keeps this (working-tree) snapshot: unlike
  // a ref snapshot, it cannot be regenerated from git.
  writeJson(join(destDir, SNAPSHOT_META), { source: "working-tree" });

  console.log(`Snapshotted ${ctx.skillName} → ${destDir}`);
}
438
-
439
/**
 * Snapshots the skill (SKILL.md plus sibling assets) as it existed at a git
 * ref, reading each file through `git show` instead of the working tree
 * (issue #122). `evals` and anything below `evals/` is left out, matching the
 * working-tree snapshot. Git runs from `skillDir`; when SKILL.md cannot be
 * read at the ref the process exits with an explanatory message.
 *
 * @param ref git revision to read from
 * @param skillDir skill directory, used as git's working directory
 * @param destDir snapshot directory to create and fill
 * @param skillName skill name, used only in the final log line
 */
function snapshotFromRef(
  ref: string,
  skillDir: string,
  destDir: string,
  skillName: string,
): void {
  const skillMd = gitShowBytes(skillDir, ref, "SKILL.md");
  if (skillMd === null) {
    die(
      `skill not found at ${ref}: ${join(skillDir, "SKILL.md")}\n` +
        " Check the ref exists and that the skill was present there (and that this is a git repo).",
    );
  }

  ensureDir(destDir);
  writeFileSync(join(destDir, "SKILL.md"), skillMd);

  const isSiblingAsset = (relPath: string): boolean =>
    relPath !== "SKILL.md" &&
    relPath !== "evals" &&
    !relPath.startsWith("evals/");

  for (const relPath of gitLsFiles(skillDir, ref).filter(isSiblingAsset)) {
    const bytes = gitShowBytes(skillDir, ref, relPath);
    // Listed but unreadable (e.g. submodule/gitlink): skip it.
    if (bytes === null) continue;
    const dst = join(destDir, relPath);
    ensureDir(dirname(dst));
    writeFileSync(dst, bytes);
  }

  // Record provenance so teardown can reclaim this snapshot: it is fully
  // reproducible from the ref.
  writeJson(join(destDir, SNAPSHOT_META), { source: "ref", ref });

  console.log(`Snapshotted ${skillName} at ${ref} → ${destDir}`);
}
478
-
479
- function commandRun(args: Args, ctx: RunContext): void {
480
- if (!args.mode) die("--mode required: new-skill | revision");
481
- if (args.mode !== "new-skill" && args.mode !== "revision")
482
- die(`unknown --mode: ${args.mode}`);
483
- if (args.mode === "revision" && !args.baseline)
484
- die("revision mode requires --baseline <label>");
485
-
486
- const skillDir = ctx.skillSubdir;
487
- const skillMd = join(skillDir, "SKILL.md");
488
- if (!existsSync(skillMd)) die(`skill not found: ${skillMd}`);
489
-
490
- const evalsPath = join(skillDir, "evals", "evals.json");
491
- if (!existsSync(evalsPath)) die(`evals.json not found: ${evalsPath}`);
492
-
493
- const config: EvalsConfig = validateEvalsConfig(
494
- readJson(evalsPath),
495
- evalsPath,
496
- );
497
- if (config.skill_name !== ctx.skillName)
498
- console.warn(
499
- `warning: evals.json skill_name (${config.skill_name}) does not match the skill folder (${ctx.skillName}). Proceeding with ${ctx.skillName}.`,
500
- );
501
-
502
- let selectedEvals: Eval[];
503
- try {
504
- selectedEvals = selectEvals(config.evals, {
505
- only: args.only,
506
- skip: args.skip,
507
- });
508
- } catch (err) {
509
- die(err instanceof Error ? err.message : String(err));
510
- }
511
-
512
- const workspaceSkillDir = join(ctx.workspaceRoot, ctx.skillName);
513
- const iteration = nextIteration(workspaceSkillDir, args.iteration);
514
- const iterationDir = join(workspaceSkillDir, `iteration-${iteration}`);
515
-
516
- // A per-run nonce makes each dispatch description globally unique. The
517
- // subagents dir is shared across iterations of one parent session, so a bare
518
- // `<eval>:<condition>` description repeats and fill-transcripts could fill an
519
- // iteration's run from a colliding agent in another iteration. `i<N>-<nonce>`
520
- // also disambiguates re-running the same iteration number.
521
- const runNonce = `${Date.now().toString(36)}-${randomBytes(3).toString("hex")}`;
522
- const runTag = `i${iteration}-${runNonce}`;
523
-
524
- if (existsSync(iterationDir) && args.iteration === undefined)
525
- die(
526
- `iteration-${iteration} already exists; pass --iteration to overwrite explicitly`,
527
- );
528
-
529
- const [conditionA, conditionB] = conditionNamesFor(args.mode);
530
-
531
- let skillPathForA: string | null;
532
- let skillPathForB: string | null;
533
- if (args.mode === "new-skill") {
534
- skillPathForA = skillMd;
535
- skillPathForB = null;
536
- } else {
537
- const baselineSkill = join(
538
- workspaceSkillDir,
539
- "snapshots",
540
- args.baseline as string,
541
- "SKILL.md",
542
- );
543
- if (!existsSync(baselineSkill))
544
- die(
545
- `baseline snapshot not found: ${baselineSkill}\n` +
546
- ` Run: bun run evals:snapshot --skill ${ctx.skillName} --skill-dir ${ctx.skillDir} --label ${args.baseline} (before editing)`,
547
- );
548
- skillPathForA = baselineSkill;
549
- skillPathForB = skillMd;
550
- }
551
-
552
- console.log(
553
- `Preparing ${ctx.skillName} iteration-${iteration} (${args.mode})`,
554
- );
555
- console.log(` ${conditionA}: ${skillPathForA ?? "(no skill)"}`);
556
- console.log(` ${conditionB}: ${skillPathForB ?? "(no skill)"}`);
557
- if (selectedEvals.length !== config.evals.length) {
558
- const [flagName, ids] = args.only
559
- ? ["--only", args.only]
560
- : ["--skip", args.skip ?? []];
561
- console.log(
562
- ` selection: ${selectedEvals.length} of ${config.evals.length} evals (${flagName} ${ids.join(", ")})`,
563
- );
564
- }
565
- if (args.noStage)
566
- console.log(
567
- " staging: disabled (--no-stage) — skills will be inlined into dispatch_prompt for harnesses without project-local skill discovery",
568
- );
569
-
570
- ensureDir(iterationDir);
571
- cpSync(skillMd, join(iterationDir, "skill-snapshot.md"));
572
-
573
- // Always disarm a prior run's guard before re-staging, so a crashed run can't
574
- // leave the write-blocking hook armed across runs.
575
- teardownGuard(ctx.stageRoot);
576
-
577
- if (!args.noStage) cleanupStagedSkills(ctx.stageRoot);
578
-
579
- if (!args.noStage) {
580
- stageSiblingSkills({
581
- skillUnderTest: ctx.skillName,
582
- skillsSourceDir: ctx.skillDir,
583
- repoRoot: ctx.stageRoot,
584
- });
585
- }
586
-
587
- const bootstrapContent =
588
- ctx.bootstrapPath !== null ? readFileSync(ctx.bootstrapPath, "utf8") : null;
589
-
590
- // `--plan-mode` (issue #142): inject the harness's verbatim plan-mode
591
- // procedure as an operating-context layer. The profile is a bundled asset
592
- // resolved relative to this runner (mirroring the guard-script resolution
593
- // below) and keyed by harness, so a harness without a profile simply has no
594
- // `--plan-mode` and the portable dispatch contract is unchanged.
595
- const planModeContent = args.planMode
596
- ? resolvePlanModeProfile(ctx.harness)
597
- : null;
598
- if (args.planMode)
599
- console.log(
600
- ` plan-mode: injecting ${ctx.harness} plan-mode profile as operating context (issue #142; necessary-not-sufficient fidelity layer)`,
601
- );
602
-
603
- // Sibling skill metadata, shared across conditions. Empty when --no-stage
604
- // (nothing is staged, so nothing is discoverable to list).
605
- const siblingSkills: AvailableSkill[] = args.noStage
606
- ? []
607
- : ctx.siblingSkillNames.map((name) => {
608
- const p = join(ctx.skillDir, name, "SKILL.md");
609
- return { name, path: p, description: getSkillDescription(p) };
610
- });
611
-
612
- // `--stage-name` overrides the conspicuous `slow-powers-eval-…` slug with a
613
- // verbatim name (issue #144 Step 2: A/B a natural name against the eval slug).
614
- // It targets the single staging condition, so reject the case where both
615
- // conditions stage (e.g. revision mode) — one name can't cover two dirs — and
616
- // refuse to clobber a dir that already exists (a real project skill the user
617
- // owns; cleanup has already removed our own prior custom dirs by this point).
618
- if (args.stageName !== undefined && !args.noStage) {
619
- if (skillPathForA !== null && skillPathForB !== null) {
620
- die(
621
- "--stage-name is only supported when exactly one condition stages the skill (e.g. --mode new-skill); both conditions stage here.",
622
- );
623
- }
624
- const target = join(ctx.stageRoot, ".claude", "skills", args.stageName);
625
- if (existsSync(target)) {
626
- die(
627
- `--stage-name "${args.stageName}": ${target} already exists; refusing to clobber it. Remove it or choose a different name.`,
628
- );
629
- }
630
- }
631
-
632
- const stageFor = (
633
- condName: string,
634
- condSkillPath: string | null,
635
- ): string | null => {
636
- if (!condSkillPath || args.noStage) return null;
637
- return stageSkillForCC({
638
- content: readFileSync(condSkillPath, "utf8"),
639
- iteration,
640
- condition: condName,
641
- skillName: ctx.skillName,
642
- repoRoot: ctx.stageRoot,
643
- assetsDir: dirname(condSkillPath),
644
- stageNameOverride: args.stageName,
645
- });
646
- };
647
-
648
- const conditionASlug = stageFor(conditionA, skillPathForA);
649
- const conditionBSlug = stageFor(conditionB, skillPathForB);
650
-
651
- // A custom-named dir isn't caught by cleanupStagedSkills's prefix scan; record
652
- // it in the sibling manifest so the next run removes it.
653
- if (
654
- args.stageName !== undefined &&
655
- (conditionASlug === args.stageName || conditionBSlug === args.stageName)
656
- ) {
657
- registerStagedSkillForCleanup(ctx.stageRoot, args.stageName);
658
- }
659
-
660
- const conditions: ConditionsRecord = {
661
- mode: args.mode,
662
- baseline: args.baseline,
663
- conditions: [
664
- {
665
- name: conditionA,
666
- skill_path: skillPathForA,
667
- staged_skill_slug: conditionASlug,
668
- },
669
- {
670
- name: conditionB,
671
- skill_path: skillPathForB,
672
- staged_skill_slug: conditionBSlug,
673
- },
674
- ],
675
- timestamp: new Date().toISOString(),
676
- harness: ctx.harness,
677
- run_nonce: runNonce,
678
- };
679
- writeJson(join(iterationDir, "conditions.json"), conditions);
680
-
681
- // availableSkills for a condition = siblings + the skill-under-test when
682
- // that condition loads it. Empty when nothing was staged.
683
- const availableSkillsFor = (
684
- condSkillPath: string | null,
685
- ): AvailableSkill[] => {
686
- if (args.noStage) return [];
687
- const skills = [...siblingSkills];
688
- if (condSkillPath) {
689
- skills.push({
690
- name: ctx.skillName,
691
- path: condSkillPath,
692
- description: getSkillDescription(condSkillPath),
693
- });
694
- }
695
- return skills;
696
- };
697
-
698
- const tasks: DispatchTask[] = [];
699
- for (const ev of selectedEvals) {
700
- const evalDir = join(iterationDir, `eval-${ev.id}`);
701
- ensureDir(evalDir);
702
-
703
- for (const [condName, condSkillPath, condSlug] of [
704
- [conditionA, skillPathForA, conditionASlug],
705
- [conditionB, skillPathForB, conditionBSlug],
706
- ] as const) {
707
- const condDir = join(evalDir, condName);
708
- const outputsDir = join(condDir, "outputs");
709
- ensureDir(outputsDir);
710
-
711
- const fixtures = copyFixtures(ev, skillDir, condDir);
712
- tasks.push(
713
- buildDispatchTask({
714
- evalId: ev.id,
715
- condition: condName,
716
- skillPath: condSkillPath,
717
- stagedSkillSlug: condSlug,
718
- userPrompt: ev.prompt,
719
- fixtures,
720
- outputsDir,
721
- condDir,
722
- bootstrapContent,
723
- planModeContent,
724
- skillName: ctx.skillName,
725
- availableSkills: availableSkillsFor(condSkillPath),
726
- runTag,
727
- }),
728
- );
729
- }
730
- }
731
-
732
- const manifestPath = join(iterationDir, "dispatch-manifest.md");
733
- writeFileSync(
734
- manifestPath,
735
- buildManifest({
736
- skillName: ctx.skillName,
737
- mode: args.mode,
738
- baseline: args.baseline,
739
- iteration,
740
- tasks,
741
- }),
742
- );
743
-
744
- // Write each prompt to its own file and reference it by path in dispatch.json.
745
- // The orchestrator then dispatches with a short "read this file" prompt instead
746
- // of reproducing the full prompt verbatim per Task call.
747
- for (const task of tasks) {
748
- writeFileSync(task.dispatch_prompt_path, task.dispatch_prompt);
749
- }
750
-
751
- const dispatchJsonPath = join(iterationDir, "dispatch.json");
752
- writeJson(dispatchJsonPath, {
753
- skill_name: ctx.skillName,
754
- iteration,
755
- run_nonce: runNonce,
756
- iteration_dir: iterationDir,
757
- mode: args.mode,
758
- baseline: args.baseline ?? null,
759
- plan_mode: args.planMode,
760
- conditions: conditions.conditions,
761
- harness: ctx.harness,
762
- tasks: tasks.map(({ dispatch_prompt: _omit, ...rest }) => rest),
763
- });
764
-
765
- // Opt-in hard guard. Stages a PreToolUse hook that blocks subagent
766
- // writes/installs outside the eval sandbox while dispatches run.
767
- if (args.guard && !args.dryRun) {
768
- if (args.noStage) {
769
- console.warn(
770
- "\n⚠ --guard requires staging enabled; skipping guard install.",
771
- );
772
- } else {
773
- const guardScriptPath = join(import.meta.dir, "guard", "guard.ts");
774
- installGuard({
775
- stageRoot: ctx.stageRoot,
776
- workspaceRoot: ctx.workspaceRoot,
777
- guardScriptPath,
778
- });
779
- console.log(
780
- "\n🛡 Write guard armed: a PreToolUse hook is staged in .claude/settings.local.json\n" +
781
- " and will block writes/installs outside the eval sandbox during dispatches.\n" +
782
- " It auto-expires in 6h and is removed on the next run; to remove it now:\n" +
783
- " bun run evals:teardown-guard --skill <name>",
784
- );
785
- }
786
- }
787
-
788
- // Plugin-shadow preflight (Claude Code): a staged skill name that is also
789
- // discoverable from an enabled plugin or the global skills dir contaminates the
790
- // run — subagents inherit this session's plugins, so both copies are reachable.
791
- // The runner can't unload a plugin from a live session; it only flags it. The
792
- // report is persisted so the aggregator can surface it in validity_warnings.
793
- if (ctx.harness === "claude-code") {
794
- const shadowReport = detectPluginShadows({
795
- configDir: resolveConfigDir(),
796
- cwd: ctx.stageRoot,
797
- stagedSkillNames: [ctx.skillName, ...ctx.siblingSkillNames],
798
- });
799
- if (shadowReport.shadowed.length > 0) {
800
- writeJson(join(iterationDir, "plugin-shadow.json"), shadowReport);
801
- console.warn(formatShadowBanner(shadowReport));
802
- }
803
- }
804
-
805
- console.log(`\nWorkspace prepared: ${iterationDir}`);
806
- console.log(`Dispatch manifest: ${manifestPath}`);
807
- console.log(`Dispatch tasks: ${dispatchJsonPath}`);
808
- console.log(
809
- `\n${tasks.length} dispatches required (${selectedEvals.length} evals × 2 conditions).`,
810
- );
811
-
812
- if (args.dryRun) console.log("\n--dry-run: stopping after workspace prep.");
813
- else
814
- console.log(
815
- "\nNext: read dispatch.json and dispatch each task as a subagent. Then run `ingest --iteration <N> --subagents-dir <path>` (Claude Code), or write run.json + timing.json to the paths in each task by hand and run the chained steps individually (transcript-less harnesses).",
816
- );
817
- }
818
-
819
/**
 * One subagent dispatch: a single (eval, condition) pair together with the
 * prompt to send and the paths where its results are expected.
 */
type DispatchTask = {
  /** Id of the eval this dispatch runs. */
  eval_id: string;
  /** Name of the condition (arm) this dispatch belongs to. */
  condition: string;
  /** Path of the skill file loaded for this condition, or null when none is. */
  skill_path: string | null;
  /** Identifier the skill is staged under for this condition, or null. */
  staged_skill_slug: string | null;
  /** The eval's user request, embedded verbatim at the end of the prompt. */
  user_prompt: string;
  /** Paths of the fixture files made available to the subagent. */
  fixtures: string[];
  /** Directory the subagent is instructed to write its outputs into. */
  outputs_dir: string;
  /** Where this dispatch's `run.json` lives. */
  run_record_path: string;
  /** Where this dispatch's `timing.json` lives. */
  timing_path: string;
  /**
   * `<eval_id>:<condition>` label, suffixed with `:<runTag>` when a run tag
   * was supplied. Meant to be passed as the dispatch description.
   */
  agent_description: string;
  /**
   * Absolute path to the file holding the full dispatch prompt. The orchestrator
   * dispatches each subagent with a short "read this file and follow it" prompt
   * rather than inlining the prompt, so it never has to reproduce ~KB of text per
   * Task call. `dispatch_prompt` carries the same text in-memory (for manifest
   * building and unit tests) but is stripped from the serialized dispatch.json.
   */
  dispatch_prompt_path: string;
  /** Full prompt text; kept in memory only (see `dispatch_prompt_path`). */
  dispatch_prompt: string;
};
840
-
841
- export type { AvailableSkill } from "./types";
842
-
843
/**
 * Narrows the configured evals to the ones requested with `--only` or
 * `--skip`.
 *
 * - Neither flag given: the input array is returned untouched.
 * - `only`: keeps the listed evals, in the config's order (not the order the
 *   ids were passed in).
 * - `skip`: keeps everything except the listed evals.
 *
 * @param evals Every eval declared in the config.
 * @param opts  The `only` / `skip` id lists; at most one may be set.
 * @returns The selected evals.
 * @throws Error when both flags are given, when the chosen list is empty, or
 *   when it names an id that is not in `evals` (so a typo fails up front).
 */
export function selectEvals(
  evals: Eval[],
  opts: { only?: string[]; skip?: string[] },
): Eval[] {
  const { only, skip } = opts;
  if (only && skip) {
    throw new Error("use only one of --only / --skip, not both");
  }

  const ids = only ?? skip;
  if (ids === undefined) return evals;
  if (ids.length === 0) {
    throw new Error("--only/--skip requires at least one eval id");
  }

  const availableIds = new Set<string>();
  for (const ev of evals) availableIds.add(ev.id);

  const missing: string[] = [];
  for (const id of ids) {
    if (!availableIds.has(id)) missing.push(id);
  }
  if (missing.length > 0) {
    throw new Error(
      `unknown eval id(s): ${missing.join(", ")}. Available ids: ${[...availableIds].join(", ")}`,
    );
  }

  // One pass for both flags: an eval is kept when "is it listed?" agrees with
  // "are we keeping the listed ones?".
  const listed = new Set(ids);
  const keepListed = Boolean(only);
  return evals.filter((ev) => listed.has(ev.id) === keepListed);
}
875
-
876
/**
 * Copies an eval's fixture files into `<condDir>/inputs`.
 *
 * Each entry of `ev.files` is resolved against `<skillDir>/evals` and copied
 * under its basename; directories are copied recursively. A missing fixture
 * is reported through `die`.
 *
 * @returns The destination paths, in the order the fixtures were listed
 *   (empty when the eval declares no files).
 */
function copyFixtures(ev: Eval, skillDir: string, condDir: string): string[] {
  const declared = ev.files;
  if (!declared || declared.length === 0) return [];

  const inputsDir = join(condDir, "inputs");
  ensureDir(inputsDir);

  return declared.map((relPath) => {
    const source = join(skillDir, "evals", relPath);
    if (!existsSync(source)) die(`fixture not found: ${source}`);

    const target = join(inputsDir, basename(relPath));
    if (statSync(source).isDirectory()) {
      cpSync(source, target, { recursive: true });
    } else {
      cpSync(source, target);
    }
    return target;
  });
}
891
-
892
/**
 * Loads the verbatim plan-mode procedure profile for a harness (issue #142).
 *
 * The profile is read from `profiles/<harness>/plan-mode.md`, resolved
 * relative to this runner's own directory (`import.meta.dir`). When no such
 * file exists the function reports it through `die` instead of silently
 * returning nothing.
 *
 * @param harness Harness whose profile should be loaded.
 * @returns The profile file's contents as UTF-8 text.
 */
function resolvePlanModeProfile(harness: Harness): string {
  const segments = ["profiles", harness, "plan-mode.md"];
  const profilePath = join(import.meta.dir, ...segments);

  if (!existsSync(profilePath)) {
    const message =
      `--plan-mode: no plan-mode profile exists for harness '${harness}' ` +
      `(expected ${profilePath}). This is a Claude-tier fidelity layer; a ` +
      "harness without a profile leaves the portable dispatch contract unchanged.";
    die(message);
  }

  return readFileSync(profilePath, "utf8");
}
917
-
918
/**
 * Extracts the one-line `description:` value from a skill file.
 *
 * The key must start its line (leading spaces/tabs allowed) and the value must
 * sit on the same line, so keys such as `short_description:` or a mid-sentence
 * "description:" are not mistaken for it, and an empty `description:` never
 * picks up the following line. One pair of matching surrounding quotes is
 * stripped.
 *
 * @param skillPath Path of the skill file to read.
 * @returns The description text, or `"No description available."` when the
 *   file cannot be read, has no `description:` line, or the value is empty.
 */
function getSkillDescription(skillPath: string): string {
  const fallback = "No description available.";

  let content: string;
  try {
    content = readFileSync(skillPath, "utf8");
  } catch {
    // Best-effort lookup: an unreadable skill file just gets the fallback.
    return fallback;
  }

  // `m` anchors `^` to each line start; `.` never crosses a line break, so the
  // capture is confined to the key's own line.
  const match = content.match(/^[ \t]*description:[ \t]*(.*)/m);
  if (!match) return fallback;

  let desc = match[1].trim();
  const quote = desc[0];
  if (
    desc.length >= 2 &&
    (quote === '"' || quote === "'") &&
    desc.endsWith(quote)
  ) {
    desc = desc.slice(1, -1).trim();
  }
  return desc === "" ? fallback : desc;
}
935
-
936
/**
 * Strips the skill-under-test's list entry out of bootstrap content, so a
 * condition that does not load the skill (e.g. `without_skill`) carries no
 * mention of it.
 *
 * An entry is a top-level `*` or `-` bullet whose text contains the skill name
 * in backticks, together with the indented lines that directly follow it (for
 * example a `*Trigger:*` sub-bullet). Every other line — headings, sibling
 * entries, blank lines — is passed through unchanged.
 *
 * @param content   Bootstrap text to scrub.
 * @param skillName Name of the skill whose entry should be removed.
 * @returns The content with the matching entry (or entries) removed.
 */
export function redactSkillFromBootstrap(
  content: string,
  skillName: string,
): string {
  const marker = `\`${skillName}\``;
  // True while we are inside an entry that is being dropped.
  let dropping = false;

  return content
    .split("\n")
    .filter((line) => {
      // Indented, non-blank lines continue the entry we are dropping.
      if (dropping && /^\s+\S/.test(line)) return false;
      // Anything else ends the previous entry and may itself start a new one.
      dropping = /^[*-]\s/.test(line) && line.includes(marker);
      return !dropping;
    })
    .join("\n");
}
965
-
966
/**
 * Builds the dispatch record for one (eval, condition) pair: the complete
 * subagent prompt plus the paths where the prompt, run record and timing live.
 *
 * The prompt is the concatenation, in this order, of:
 *   1. the `--bootstrap` content wrapped in `<session-start-context>` (if any),
 *   2. the rendered available-skills block (if it renders to anything),
 *   3. the plan-mode operating context (if `planModeContent` is given),
 *   4. the task framing: optional skill note, fixtures, output directory,
 *      instructions and the user request.
 */
export function buildDispatchTask(opts: {
  evalId: string;
  condition: string;
  skillPath: string | null;
  stagedSkillSlug: string | null;
  userPrompt: string;
  fixtures: string[];
  outputsDir: string;
  condDir: string;
  bootstrapContent: string | null;
  /**
   * Verbatim plan-mode procedure profile (from
   * `profiles/<harness>/plan-mode.md`) to inject as an operating-context
   * layer; null/undefined omits it. It is skill-agnostic, so both arms get the
   * same text and no redaction is needed. Driven by `--plan-mode` (issue #142).
   */
  planModeContent?: string | null;
  skillName: string;
  availableSkills: AvailableSkill[];
  /**
   * Per-run uniqueness suffix (`i<iteration>-<nonce>`) appended to the
   * dispatch description so transcripts cannot collide across iterations or
   * re-runs. Left out by unit tests that exercise prompt assembly directly.
   */
  runTag?: string;
}): DispatchTask {
  const skillsByName = opts.availableSkills
    .slice()
    .sort((left, right) => left.name.localeCompare(right.name));

  // Decides what, if anything, the task framing says about the skill.
  const describeSkill = (): string => {
    if (opts.stagedSkillSlug) {
      // Staged skill: only disambiguate the identifier, never instruct the
      // agent to invoke it, and never suggest another copy exists — that
      // framing invited hunting for a global plugin (issue #144). Whether to
      // invoke is left to the skill's own triggering (issue #119).
      return `The \`${opts.skillName}\` skill is registered under the identifier \`${opts.stagedSkillSlug}\` and is discoverable via the Skill tool. If you invoke it, use that identifier.`;
    }
    if (opts.skillPath) {
      // Unstaged skill: inline the skill file itself.
      const skillText = readFileSync(opts.skillPath, "utf8").trim();
      return [
        "The following skill is loaded into your operating guidelines. Apply it where relevant to the user's request.",
        "",
        `<skill name="${basename(dirname(opts.skillPath))}">`,
        skillText,
        "</skill>",
      ].join("\n");
    }
    // Skill-absent arm. In a realistic environment (other skills or a
    // bootstrap present) stay silent: commenting would only announce the eval
    // and point at the very skill that is supposed to be absent.
    const realisticEnvironment =
      skillsByName.length > 0 || Boolean(opts.bootstrapContent);
    return realisticEnvironment
      ? ""
      : "No skill is loaded. Respond as you naturally would.";
  };
  const skillBlock = describeSkill();

  const fixtureLines = opts.fixtures.map((fixture) => ` - ${fixture}`);
  const fixturesBlock =
    fixtureLines.length > 0
      ? `Available fixture files:\n${fixtureLines.join("\n")}`
      : "Available fixture files: none";

  // A condition that loads no skill must carry zero reference to it. The
  // available-skills list already omits it; redaction covers a user-supplied
  // --bootstrap that names the skill in its own prose.
  const loadsSkill =
    Boolean(opts.skillPath) || Boolean(opts.stagedSkillSlug);
  let bootstrap = opts.bootstrapContent;
  if (bootstrap && !loadsSkill) {
    bootstrap = redactSkillFromBootstrap(bootstrap, opts.skillName);
  }

  const promptParts: string[] = [];

  if (bootstrap) {
    const sessionStart = [
      "<session-start-context>",
      "The following guidelines were loaded at session start by the slow-powers plugin",
      "(equivalent to the SessionStart hook firing in a real user's environment):",
      "",
      bootstrap.trim(),
      "</session-start-context>",
      "",
    ];
    promptParts.push(sessionStart.join("\n"));
  }

  const availableSkillsBlock = renderAvailableSkillsBlock(skillsByName);
  if (availableSkillsBlock) {
    promptParts.push(`${availableSkillsBlock}\n\n`);
  }

  // Plan mode goes after the session-start surfaces and before the task
  // framing, as its own block (issue #142).
  const planModeBlock = opts.planModeContent
    ? renderPlanModeContext(opts.planModeContent)
    : "";
  if (planModeBlock) {
    promptParts.push(`${planModeBlock}\n\n`);
  }

  const framing = [
    "You are executing a single test case for a skill evaluation framework.",
    "Treat this as a real user request — do NOT optimize behavior for the eval.",
  ];
  if (skillBlock) {
    framing.push("", skillBlock);
  }
  framing.push(
    "",
    fixturesBlock,
    `Output directory: ${opts.outputsDir}`,
    "",
    "Instructions:",
    "- Write any files you produce into the output directory.",
    `- After completing the task, write your final user-facing response to ${opts.outputsDir}/final-message.md.`,
    "- Do not write outside the output directory.",
    "",
    "User request:",
    opts.userPrompt,
  );
  promptParts.push(framing.join("\n"));

  const descriptionParts = [opts.evalId, opts.condition];
  if (opts.runTag) descriptionParts.push(opts.runTag);

  return {
    eval_id: opts.evalId,
    condition: opts.condition,
    skill_path: opts.skillPath,
    staged_skill_slug: opts.stagedSkillSlug,
    user_prompt: opts.userPrompt,
    fixtures: opts.fixtures,
    outputs_dir: opts.outputsDir,
    run_record_path: join(opts.condDir, "run.json"),
    timing_path: join(opts.condDir, "timing.json"),
    agent_description: descriptionParts.join(":"),
    dispatch_prompt_path: join(opts.condDir, "dispatch-prompt.txt"),
    dispatch_prompt: promptParts.join(""),
  };
}
1117
-
1118
/**
 * Renders the human-readable dispatch manifest (markdown) for an iteration.
 *
 * The document has a title/summary, a fixed "how to use" section, and one
 * `###` entry per task listing its `run.json` / `timing.json` paths followed
 * by the full dispatch prompt in a fenced block.
 *
 * @returns The manifest text. The "Generated" line carries the current time.
 */
function buildManifest(opts: {
  skillName: string;
  mode: Mode;
  baseline?: string;
  iteration: number;
  tasks: DispatchTask[];
}): string {
  const { skillName, mode, baseline, iteration, tasks } = opts;

  let modeLine = `Mode: ${mode}`;
  if (baseline) modeLine += ` (baseline: ${baseline})`;

  const titleLines = [
    `# Dispatch manifest — ${skillName} iteration-${iteration}`,
    "",
    modeLine,
    `Generated: ${new Date().toISOString()}`,
    `Total dispatches: ${tasks.length}`,
    "",
  ];

  const usageLines = [
    "## How to use this manifest",
    "",
    'In an agent session, read `dispatch.json` (sibling of this file) instead of this manifest. Each task has a `dispatch_prompt_path` field pointing at the file that holds the full prompt — dispatch the subagent with a short "read this file and follow it" instruction rather than inlining the prompt — plus exact paths for `run.json` and `timing.json`.',
    "",
    "**Transcript correlation:** Each task has an `agent_description` field of the form `<eval_id>:<condition>:i<N>-<nonce>`. When dispatching the subagent via the host's primitive (e.g. Claude Code's Agent tool), pass this string verbatim as the dispatch `description` — do not reconstruct it. The per-run nonce keeps descriptions unique across iterations sharing one session's subagents dir, so the transcript adapter correlates each subagent's persisted transcript back to the right `(eval, condition)` slot without collisions.",
    "",
    "After all dispatches (Claude Code):",
    "",
    '1. Run `bun run evals:ingest -- --skill <name> --iteration <N> --subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/` — a fixed-order chain of record-runs (assembles every task\'s `run.json` from `dispatch.json` + the subagent\'s own `outputs/final-message.md` + the persisted transcript, and backfills `timing.json` with transcript-derived tokens/duration; never clobbers an existing record), fill-transcripts, detect-stray-writes, and grade. Optional higher-fidelity timing: write `{ "total_tokens": <n>, "duration_ms": <n>, "source": "completion-event" }` from the task completion event to `timing.json` right after a dispatch — completion-event numbers always win over the backfill.',
    "2. Dispatch the judge tasks ingest lists, then run `bun run evals:finalize -- --skill <name> --iteration <N>` for the benchmark.",
    "",
    "On a harness without persisted transcripts, instead write each task's `run.json` (matching `skills/evaluating-skills/schema/run-record.schema.json`, enforced at runtime by grade/fill-transcripts/detect-stray-writes) and `timing.json` by hand when its subagent returns: carry over `eval_id`, `condition`, `skill_path` (`null` on the without_skill arm), `prompt`, and `files` from the task; populate `final_message` from the subagent's reply; leave `tool_invocations` as `[]`; capture `total_tokens`/`duration_ms` from the task completion event immediately — they may not be persisted anywhere else.",
    "",
    "## Dispatches",
    "",
  ];

  const header = [...titleLines, ...usageLines].join("\n");

  const entries: string[] = [];
  for (const task of tasks) {
    const entryLines = [
      `### ${task.eval_id} / ${task.condition}`,
      "",
      `- run.json: ${task.run_record_path}`,
      `- timing.json: ${task.timing_path}`,
      "",
      "```",
      task.dispatch_prompt,
      "```",
      "",
    ];
    entries.push(entryLines.join("\n"));
  }

  return header + entries.join("\n");
}
1167
-
1168
- // ---------------------------------------------------------------------------
1169
- // ingest / finalize — fixed-order orchestrators over the sibling commands.
1170
- //
1171
- // The eval loop has exactly two points where only the in-harness agent can act
1172
- // (dispatching eval subagents, dispatching judge subagents). Everything between
1173
- // them is mechanical, so each stretch is one command: `ingest` runs the
1174
- // post-dispatch chain and stops at the judge hand-off; `finalize` runs the
1175
- // post-judge chain and prints the benchmark. No workspace-state inference —
1176
- // each always runs the same steps in the same order, and every sub-step keeps
1177
- // its own skip-if-done guard, so re-running after a fix is safe.
1178
- // ---------------------------------------------------------------------------
1179
-
1180
/**
 * One step of an orchestrated chain: `label` names the step in log output and
 * failure reports, `argv` is the full command line to spawn for it.
 */
export type StepCommand = { label: string; argv: string[] };
1181
-
1182
/**
 * Builds the fixed post-dispatch chain, in execution order:
 * record-runs → fill-transcripts → detect-stray-writes → grade.
 *
 * Every step runs `bun run <runnerDir>/<step>.ts` with the shared
 * `--skill-dir` / `--skill` / `--iteration` flags. The two transcript-reading
 * steps additionally receive `--subagents-dir`. fill-transcripts still runs
 * after record-runs because it fills pre-existing (agent-written) `run.json`
 * files whose `tool_invocations` are empty.
 */
export function buildIngestCommands(opts: {
  runnerDir: string;
  skillDir: string;
  skill: string;
  iteration: number;
  subagentsDir: string;
}): StepCommand[] {
  const commonFlags = [
    "--skill-dir",
    opts.skillDir,
    "--skill",
    opts.skill,
    "--iteration",
    String(opts.iteration),
  ];
  const transcriptFlags = ["--subagents-dir", opts.subagentsDir];

  // Each step's label doubles as its script's file name.
  const step = (label: string, extraFlags: string[] = []): StepCommand => ({
    label,
    argv: [
      "bun",
      "run",
      join(opts.runnerDir, `${label}.ts`),
      ...commonFlags,
      ...extraFlags,
    ],
  });

  return [
    step("record-runs", transcriptFlags),
    step("fill-transcripts", transcriptFlags),
    step("detect-stray-writes"),
    step("grade"),
  ];
}
1221
-
1222
/**
 * Builds the fixed post-judge chain, in execution order:
 * `grade --finalize` → `aggregate`.
 *
 * Both steps run `bun run <runnerDir>/<script>.ts` with the shared
 * `--skill-dir` / `--skill` / `--iteration` flags; the grade step appends
 * `--finalize`.
 */
export function buildFinalizeCommands(opts: {
  runnerDir: string;
  skillDir: string;
  skill: string;
  iteration: number;
}): StepCommand[] {
  const invoke = (scriptFile: string, ...trailing: string[]): string[] => [
    "bun",
    "run",
    join(opts.runnerDir, scriptFile),
    "--skill-dir",
    opts.skillDir,
    "--skill",
    opts.skill,
    "--iteration",
    String(opts.iteration),
    ...trailing,
  ];

  return [
    { label: "grade --finalize", argv: invoke("grade.ts", "--finalize") },
    { label: "aggregate", argv: invoke("aggregate.ts") },
  ];
}
1253
-
1254
/**
 * Executes the steps one after another and halts at the first one that exits
 * non-zero; later steps are never started.
 *
 * Halting matters: grade's `__skill_invoked` code-check silently degrades to
 * an LLM judge when `tool_invocations` is missing, so grading after a failed
 * record/fill step would quietly lose the deterministic check.
 *
 * @param steps Steps to run, in order.
 * @param spawn Runs one step and returns its exit code. Defaults to a
 *   synchronous Bun spawn with inherited stdout/stderr, treating a missing
 *   exit code as 1.
 * @returns `failedAt`: the label of the failing step, or null if all passed.
 */
export function runSteps(
  steps: StepCommand[],
  spawn: (step: StepCommand) => number = (step) => {
    const result = Bun.spawnSync(step.argv, {
      stdout: "inherit",
      stderr: "inherit",
    });
    return result.exitCode ?? 1;
  },
): { failedAt: string | null } {
  // `find` stops iterating as soon as a step reports failure.
  const failed = steps.find((step) => {
    console.log(`\n── ${step.label} ──`);
    return spawn(step) !== 0;
  });
  return { failedAt: failed ? failed.label : null };
}
1272
-
1273
/**
 * `ingest` command: runs the post-dispatch chain and stops at the judge
 * hand-off.
 *
 * Requires `--iteration` and `--subagents-dir`. After the chain succeeds it
 * reads `judge-tasks.json` from the iteration's workspace directory to decide
 * which "next step" hint to print; an unreadable file is treated as an
 * unknown task count.
 */
function commandIngest(args: Args, ctx: RunContext): void {
  if (args.iteration === undefined) die("ingest requires --iteration <N>");
  if (!args.subagentsDir) {
    die(
      "ingest requires --subagents-dir <path> (Claude Code persists subagent transcripts under ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/)",
    );
  }

  const steps = buildIngestCommands({
    runnerDir: import.meta.dir,
    skillDir: ctx.skillDir,
    skill: ctx.skillName,
    iteration: args.iteration,
    subagentsDir: args.subagentsDir,
  });
  const outcome = runSteps(steps);
  if (outcome.failedAt) {
    die(
      `ingest stopped at '${outcome.failedAt}'. Fix the failure and re-run ingest — completed steps skip work that's already done.`,
    );
  }

  const judgeTasksPath = join(
    ctx.workspaceRoot,
    ctx.skillName,
    `iteration-${args.iteration}`,
    "judge-tasks.json",
  );

  // null means "count unknown": grade always writes judge-tasks.json, so a
  // read failure is tolerated rather than reported.
  const readTotalTasks = (): number | null => {
    try {
      const parsed = readJson<{ total_tasks?: number }>(judgeTasksPath);
      return parsed.total_tasks ?? null;
    } catch {
      return null;
    }
  };
  const totalTasks = readTotalTasks();

  const finalizeCommand = `bun run evals:finalize -- --skill ${ctx.skillName} --iteration ${args.iteration}`;
  const message =
    totalTasks === 0
      ? `\n✅ Ingest complete — no judge dispatches needed.\nNext: ${finalizeCommand}`
      : `\n✅ Ingest complete. Dispatch the ${totalTasks ?? ""} judge task(s) grade listed above (judge-tasks.json), then:\n ${finalizeCommand}`;
  console.log(message);
}
1316
-
1317
/**
 * `finalize` command: runs the post-judge chain (`grade --finalize`, then
 * `aggregate`) for the given iteration and points the user at teardown.
 *
 * Requires `--iteration`; a failing step is reported through `die`.
 */
function commandFinalize(args: Args, ctx: RunContext): void {
  if (args.iteration === undefined) die("finalize requires --iteration <N>");

  const steps = buildFinalizeCommands({
    runnerDir: import.meta.dir,
    skillDir: ctx.skillDir,
    skill: ctx.skillName,
    iteration: args.iteration,
  });
  const outcome = runSteps(steps);
  if (outcome.failedAt) {
    die(
      `finalize stopped at '${outcome.failedAt}'. Fix the failure and re-run finalize.`,
    );
  }

  console.log(
    `\n✅ Finalize complete. Read the benchmark above, then tear down: bun run evals:teardown --skill ${ctx.skillName}`,
  );
}
1335
-
1336
// CLI entry point: only runs when this file is executed directly. Parses the
// arguments, detects the run context, then dispatches on the sub-command; any
// command not listed below falls through to `commandRun`.
if (import.meta.main) {
  const argv = Bun.argv.slice(2);
  const args = parseArgs(argv);

  let ctx: RunContext;
  try {
    ctx = detectRunContext(argv);
  } catch (err) {
    die(err instanceof Error ? err.message : String(err));
  }

  switch (args.command) {
    case "snapshot":
      commandSnapshot(args, ctx);
      break;

    case "ingest":
      commandIngest(args, ctx);
      break;

    case "finalize":
      commandFinalize(args, ctx);
      break;

    case "teardown-guard": {
      const removed = teardownGuard(ctx.stageRoot);
      const message = removed
        ? "🛡 Write guard removed."
        : "No write guard was installed — nothing to remove.";
      console.log(message);
      break;
    }

    case "teardown": {
      // Full end-of-run teardown: disarm the guard, remove the staged skill
      // set (and prune a `.claude` the runner emptied), then reclaim the
      // workspace — leaving the user's own `.claude/settings.json`,
      // pre-existing project skills, and any uncommitted eval results intact.
      const guardRemoved = teardownGuard(ctx.stageRoot);
      cleanupStagedSkills(ctx.stageRoot);
      const ws = cleanupWorkspace(ctx.workspaceRoot, ctx.skillName);

      const guardNote = guardRemoved ? " and write guard disarmed" : "";
      console.log(
        `🧹 Eval teardown complete: staged skill set removed${guardNote}.`,
      );

      const iterationCount = ws.removedIterations.length;
      const snapshotCount = ws.removedSnapshots.length;
      if (iterationCount + snapshotCount > 0) {
        console.log(
          ` Reclaimed ${iterationCount} workspace iteration(s)` +
            ` and ${snapshotCount} reproducible snapshot(s).`,
        );
      }

      if (ws.keptIterations.length > 0) {
        const keptLines = ws.keptIterations.map(
          (kept) => ` - ${kept.iteration} (${kept.reason})`,
        );
        console.warn(
          `⚠ Kept ${ws.keptIterations.length} workspace iteration(s) with results ` +
            `not yet committed:\n${keptLines.join("\n")}\n` +
            ` Commit them, e.g.:\n` +
            ` bun run evals:promote-baseline --skill ${ctx.skillName} --iteration <N>\n` +
            ` or delete ${join("skills-workspace", ctx.skillName)}/ manually to discard.`,
        );
      }
      break;
    }

    default:
      commandRun(args, ctx);
  }
}