@slowdini/slow-powers-opencode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/LICENSE +22 -0
  2. package/README.md +174 -0
  3. package/bootstrap.md +16 -0
  4. package/opencode/plugins/slow-powers.js +86 -0
  5. package/package.json +66 -0
  6. package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
  7. package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
  8. package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
  9. package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
  10. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
  11. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
  12. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
  13. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
  14. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
  15. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
  16. package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
  17. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
  18. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
  19. package/skills/evaluating-skills/SKILL.md +448 -0
  20. package/skills/evaluating-skills/evals/evals.json +52 -0
  21. package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
  22. package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
  23. package/skills/evaluating-skills/harness-details/claude.md +135 -0
  24. package/skills/evaluating-skills/pressure-scenarios.md +163 -0
  25. package/skills/evaluating-skills/runner/README.md +140 -0
  26. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
  27. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
  28. package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
  29. package/skills/evaluating-skills/runner/aggregate.ts +228 -0
  30. package/skills/evaluating-skills/runner/context.test.ts +181 -0
  31. package/skills/evaluating-skills/runner/context.ts +90 -0
  32. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
  33. package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
  34. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
  35. package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
  36. package/skills/evaluating-skills/runner/grade.test.ts +347 -0
  37. package/skills/evaluating-skills/runner/grade.ts +603 -0
  38. package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
  39. package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
  40. package/skills/evaluating-skills/runner/guard/install.ts +147 -0
  41. package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
  42. package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
  43. package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
  44. package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
  45. package/skills/evaluating-skills/runner/run.test.ts +716 -0
  46. package/skills/evaluating-skills/runner/run.ts +814 -0
  47. package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
  48. package/skills/evaluating-skills/runner/types.ts +104 -0
  49. package/skills/evaluating-skills/runner/validate-all.ts +54 -0
  50. package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
  51. package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
  52. package/skills/evaluating-skills/runner/validate.test.ts +56 -0
  53. package/skills/evaluating-skills/runner/validate.ts +21 -0
  54. package/skills/evaluating-skills/schema/evals.schema.json +105 -0
  55. package/skills/evaluating-skills/schema/grading.schema.json +84 -0
  56. package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
  57. package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
  58. package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
  59. package/skills/evaluating-skills/templates/evals.json.example +17 -0
  60. package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
  61. package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
  62. package/skills/finishing-a-development-branch/SKILL.md +96 -0
  63. package/skills/finishing-a-development-branch/evals/evals.json +41 -0
  64. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
  65. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
  66. package/skills/hardening-plans/SKILL.md +72 -0
  67. package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
  68. package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
  69. package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
  70. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
  71. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
  72. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
  73. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
  74. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
  75. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
  76. package/skills/hardening-plans/evals/evals.json +114 -0
  77. package/skills/systematic-debugging/CREATION-LOG.md +119 -0
  78. package/skills/systematic-debugging/SKILL.md +84 -0
  79. package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
  80. package/skills/systematic-debugging/condition-based-waiting.md +115 -0
  81. package/skills/systematic-debugging/defense-in-depth.md +122 -0
  82. package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
  83. package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
  84. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
  85. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
  86. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
  87. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
  88. package/skills/systematic-debugging/evals/evals.json +45 -0
  89. package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
  90. package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
  91. package/skills/systematic-debugging/find-polluter.sh +63 -0
  92. package/skills/systematic-debugging/root-cause-tracing.md +169 -0
  93. package/skills/systematic-debugging/test-academic.md +14 -0
  94. package/skills/systematic-debugging/test-pressure-1.md +58 -0
  95. package/skills/systematic-debugging/test-pressure-2.md +68 -0
  96. package/skills/systematic-debugging/test-pressure-3.md +69 -0
  97. package/skills/test-driven-development/SKILL.md +93 -0
  98. package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
  99. package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
  100. package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
  101. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
  102. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
  103. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
  104. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
  105. package/skills/test-driven-development/evals/evals.json +77 -0
  106. package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
  107. package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
  108. package/skills/test-driven-development/testing-anti-patterns.md +299 -0
  109. package/skills/using-git-worktrees/SKILL.md +70 -0
  110. package/skills/using-git-worktrees/evals/evals.json +40 -0
  111. package/skills/verification-before-completion/SKILL.md +65 -0
  112. package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
  113. package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
  114. package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
  115. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
  116. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
  117. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
  118. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
  119. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
  120. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
  121. package/skills/verification-before-completion/evals/evals.json +77 -0
  122. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
  123. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
  124. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
  125. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
  126. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
  127. package/skills/writing-skills/SKILL.md +306 -0
  128. package/skills/writing-skills/evals/evals.json +40 -0
  129. package/skills/writing-skills/graphviz-conventions.dot +172 -0
  130. package/skills/writing-skills/persuasion-principles.md +187 -0
  131. package/skills/writing-skills/scripts/render-graphs.js +181 -0
@@ -0,0 +1,814 @@
1
+ #!/usr/bin/env bun
2
+ import { randomBytes } from "node:crypto";
3
+ import {
4
+ cpSync,
5
+ existsSync,
6
+ mkdirSync,
7
+ mkdtempSync,
8
+ readdirSync,
9
+ readFileSync,
10
+ rmSync,
11
+ statSync,
12
+ writeFileSync,
13
+ } from "node:fs";
14
+ import { tmpdir } from "node:os";
15
+ import { basename, dirname, join } from "node:path";
16
+ import { detectRunContext, type RunContext } from "./context";
17
+ import { installGuard, teardownGuard } from "./guard/install";
18
+ import type { ConditionsRecord, Eval, EvalsConfig } from "./types";
19
+ import { validateEvalsConfig } from "./validate";
20
+
21
/**
 * Directory-name prefix of every skill-under-test staged into
 * `.claude/skills`. `cleanupStagedSkills` removes any entry starting with it.
 */
export const STAGED_SKILL_PREFIX = "slow-powers-eval-";
/**
 * File name of the manifest that `stageSiblingSkills` writes into
 * `.claude/skills` and that `cleanupStagedSkills` reads back to undo staging.
 */
export const STAGED_SIBLING_MANIFEST = ".slow-powers-eval-manifest.json";
23
+
24
/**
 * Stages one skill document under `<repoRoot>/.claude/skills/<slug>/SKILL.md`.
 *
 * The slug combines the staging prefix, the iteration number, the condition
 * name and the skill name, so each condition of each iteration gets its own
 * directory.
 *
 * @param opts.content - Full text written to `SKILL.md`.
 * @param opts.iteration - Iteration number embedded in the slug.
 * @param opts.condition - Condition name embedded in the slug.
 * @param opts.skillName - Skill name embedded in the slug.
 * @param opts.repoRoot - Root under which `.claude/skills` lives.
 * @returns The slug (directory name) the skill was staged under.
 */
export function stageSkillForCC(opts: {
  content: string;
  iteration: number;
  condition: string;
  skillName: string;
  repoRoot: string;
}): string {
  const { content, iteration, condition, skillName, repoRoot } = opts;
  const stagedName = `${STAGED_SKILL_PREFIX}${iteration}-${condition}__${skillName}`;
  const targetDir = join(repoRoot, ".claude", "skills", stagedName);

  // `recursive` also creates `.claude/skills` itself when it does not exist yet.
  mkdirSync(targetDir, { recursive: true });
  writeFileSync(join(targetDir, "SKILL.md"), content);

  return stagedName;
}
37
+
38
/**
 * Record of what `stageSiblingSkills` placed into `.claude/skills`. It is
 * persisted as JSON so `cleanupStagedSkills` can later remove the staged
 * copies and restore anything that was there before.
 */
type SiblingManifest = {
  /** ISO-8601 timestamp taken when the manifest object was created. */
  created_at: string;
  /** Name of the skill under test, which is excluded from sibling staging. */
  staged_under_test: string;
  /** One record per sibling skill directory that was staged. */
  created_entries: {
    /** Directory name of the sibling skill. */
    name: string;
    /** True when a directory of that name already existed and was backed up. */
    preexisting: boolean;
    /** Location of the backup copy; only set for pre-existing directories. */
    backup_path?: string;
  }[];
};
47
+
48
/**
 * Copies every sibling skill (each directory in `skillsSourceDir` that has a
 * `SKILL.md`, except the skill under test) into
 * `<repoRoot>/.claude/skills/<name>`, leaving out each skill's `evals`
 * subdirectory.
 *
 * A destination directory that already exists is first copied to a fresh
 * temp directory and then removed, so `cleanupStagedSkills` can restore it.
 *
 * The manifest is written in a `finally` block, and each entry is recorded
 * before its destructive step. If a copy fails halfway, the manifest on disk
 * still lists every directory that was touched, with its backup location,
 * and a later cleanup can undo the partial staging. The original error still
 * propagates to the caller.
 *
 * @param opts.skillUnderTest - Skill directory name to skip.
 * @param opts.skillsSourceDir - Directory holding one subdirectory per skill.
 * @param opts.repoRoot - Root under which `.claude/skills` lives.
 * @returns The manifest describing what was staged.
 */
export function stageSiblingSkills(opts: {
  skillUnderTest: string;
  skillsSourceDir: string;
  repoRoot: string;
}): SiblingManifest {
  const skillsDir = join(opts.repoRoot, ".claude", "skills");
  mkdirSync(skillsDir, { recursive: true });

  const siblings = readdirSync(opts.skillsSourceDir).filter((name) => {
    if (name === opts.skillUnderTest) return false;
    const srcDir = join(opts.skillsSourceDir, name);
    if (!statSync(srcDir).isDirectory()) return false;
    return existsSync(join(srcDir, "SKILL.md"));
  });

  const manifest: SiblingManifest = {
    created_at: new Date().toISOString(),
    staged_under_test: opts.skillUnderTest,
    created_entries: [],
  };

  try {
    for (const name of siblings) {
      const srcDir = join(opts.skillsSourceDir, name);
      const dstDir = join(skillsDir, name);
      const evalsSubdir = join(srcDir, "evals");

      const entry: SiblingManifest["created_entries"][number] = {
        name,
        preexisting: false,
      };

      if (existsSync(dstDir)) {
        const backupRoot = mkdtempSync(
          join(tmpdir(), "slow-powers-eval-backup-"),
        );
        const backupPath = join(backupRoot, name);
        cpSync(dstDir, backupPath, { recursive: true });
        // Mark as pre-existing only once the backup is complete, so the
        // manifest never points at a backup that does not exist.
        entry.preexisting = true;
        entry.backup_path = backupPath;
      }

      // Record the entry before removing or overwriting anything, so a
      // failure below still leaves it in the persisted manifest.
      manifest.created_entries.push(entry);

      if (entry.preexisting) rmSync(dstDir, { recursive: true, force: true });

      cpSync(srcDir, dstDir, {
        recursive: true,
        // Skip the skill's own eval suite and everything beneath it.
        filter: (src) =>
          src !== evalsSubdir && !src.startsWith(`${evalsSubdir}/`),
      });
    }
  } finally {
    writeFileSync(
      join(skillsDir, STAGED_SIBLING_MANIFEST),
      `${JSON.stringify(manifest, null, 2)}\n`,
    );
  }
  return manifest;
}
104
+
105
/**
 * Undoes staging inside `<repoRoot>/.claude/skills`.
 *
 * 1. Removes every entry whose name starts with `STAGED_SKILL_PREFIX`.
 * 2. If a sibling manifest exists, removes each staged sibling directory it
 *    lists, restores the backup of any pre-existing directory, and deletes
 *    the manifest.
 *
 * The manifest is treated as untrusted input. A file that is not valid JSON,
 * or is valid JSON of the wrong shape (`null`, `{}`, a non-array
 * `created_entries`), is deleted and ignored rather than throwing. Entries
 * whose `name` is not a plain directory name (empty, `.`, `..`, or
 * containing a path separator) are skipped, so the recursive removal cannot
 * reach outside the skills directory.
 *
 * @param repoRoot - Root under which `.claude/skills` lives.
 */
export function cleanupStagedSkills(repoRoot: string): void {
  const skillsDir = join(repoRoot, ".claude", "skills");
  if (!existsSync(skillsDir)) return;

  for (const entry of readdirSync(skillsDir)) {
    if (!entry.startsWith(STAGED_SKILL_PREFIX)) continue;
    rmSync(join(skillsDir, entry), { recursive: true, force: true });
  }

  const manifestPath = join(skillsDir, STAGED_SIBLING_MANIFEST);
  if (!existsSync(manifestPath)) return;

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(manifestPath, "utf8"));
  } catch {
    // Unreadable or corrupt manifest: nothing can be restored from it.
    parsed = undefined;
  }

  const entries: unknown =
    typeof parsed === "object" && parsed !== null
      ? (parsed as { created_entries?: unknown }).created_entries
      : undefined;
  if (!Array.isArray(entries)) {
    rmSync(manifestPath, { force: true });
    return;
  }

  for (const raw of entries as unknown[]) {
    if (typeof raw !== "object" || raw === null) continue;
    const e = raw as Partial<SiblingManifest["created_entries"][number]>;
    const name = e.name;
    if (
      typeof name !== "string" ||
      name === "" ||
      name === "." ||
      name === ".." ||
      basename(name) !== name
    ) {
      continue;
    }

    const target = join(skillsDir, name);
    rmSync(target, { recursive: true, force: true });
    if (
      e.preexisting === true &&
      typeof e.backup_path === "string" &&
      existsSync(e.backup_path)
    ) {
      cpSync(e.backup_path, target, { recursive: true });
      // The backup sits in its own temp directory; remove that whole directory.
      rmSync(dirname(e.backup_path), { recursive: true, force: true });
    }
  }
  rmSync(manifestPath, { force: true });
}
133
+
134
/**
 * Evaluation mode selected with `--mode`. `new-skill` compares the conditions
 * `with_skill` / `without_skill`; `revision` compares `old_skill` /
 * `new_skill` (see `conditionNamesFor`).
 */
type Mode = "new-skill" | "revision";
135
+
136
/** Parsed command-line arguments, as produced by `parseArgs`. */
type Args = {
  /** Sub-command; `parseArgs` falls back to "run" when none is recognised. */
  command: "run" | "snapshot" | "teardown-guard";
  /** Value of `--mode`; not validated by `parseArgs` itself. */
  mode?: Mode;
  /** Value of `--baseline`: the snapshot label used by revision mode. */
  baseline?: string;
  /** Value of `--label`: the snapshot label used by the snapshot command. */
  label?: string;
  /** Value of `--iteration`, checked to be an integer by `parseArgs`. */
  iteration?: number;
  /** True when `--dry-run` is present. */
  dryRun: boolean;
  /** True when `--no-stage` is present. */
  noStage: boolean;
  /** True when `--guard` is present. */
  guard: boolean;
};
146
+
147
/**
 * Reports a fatal error and terminates the process.
 *
 * @param msg - Text printed to stderr after the `error: ` prefix.
 * @returns Never returns; the process exits with status 1.
 */
function die(msg: string): never {
  const line = `error: ${msg}`;
  console.error(line);
  return process.exit(1);
}
151
+
152
/**
 * Parses the runner's command-line arguments.
 *
 * The command is the first positional token when it is `snapshot` or
 * `teardown-guard`; anything else selects `run`. A token that directly
 * follows one of this parser's value-taking flags (`--mode`, `--baseline`,
 * `--label`, `--iteration`) is that flag's value and is never treated as a
 * positional. Without this rule, `--baseline snapshot --mode revision` would
 * select the `snapshot` command.
 *
 * @param argv - Argument tokens, without the runtime and script path.
 * @returns The parsed arguments. `mode` is passed through unvalidated.
 * Exits via `die` when a value flag has no value or `--iteration` is not an
 * integer.
 */
function parseArgs(argv: string[]): Args {
  // Flags handled here that consume the following token as their value.
  const valueFlags = new Set([
    "--mode",
    "--baseline",
    "--label",
    "--iteration",
  ]);

  const positionals = argv.filter((token, i) => {
    if (token.startsWith("--")) return false;
    const previous = i > 0 ? argv[i - 1] : undefined;
    return previous === undefined || !valueFlags.has(previous);
  });
  const command: Args["command"] =
    positionals[0] === "snapshot"
      ? "snapshot"
      : positionals[0] === "teardown-guard"
        ? "teardown-guard"
        : "run";

  // Returns the token after `--<name>`, or undefined when the flag is absent.
  const flag = (name: string): string | undefined => {
    const i = argv.indexOf(`--${name}`);
    if (i === -1) return undefined;
    const v = argv[i + 1];
    if (v === undefined || v.startsWith("--")) {
      die(`flag --${name} requires a value`);
    }
    return v;
  };

  const has = (name: string) => argv.includes(`--${name}`);

  const iterationFlag = flag("iteration");
  const iteration =
    iterationFlag !== undefined ? Number(iterationFlag) : undefined;
  if (iteration !== undefined && !Number.isInteger(iteration))
    die(`--iteration must be an integer, got ${iterationFlag}`);

  return {
    command,
    mode: flag("mode") as Mode | undefined,
    baseline: flag("baseline"),
    label: flag("label"),
    iteration,
    dryRun: has("dry-run"),
    noStage: has("no-stage"),
    guard: has("guard"),
  };
}
190
+
191
/**
 * Creates `path` (and any missing parents) unless something already exists
 * at that location.
 */
function ensureDir(path: string): void {
  if (existsSync(path)) return;
  mkdirSync(path, { recursive: true });
}
194
+
195
/**
 * Serializes `value` as JSON with two-space indentation and writes it to
 * `path`, followed by a single trailing newline.
 */
function writeJson(path: string, value: unknown): void {
  const serialized = JSON.stringify(value, null, 2);
  writeFileSync(path, `${serialized}\n`);
}
198
+
199
/**
 * Reads `path` as UTF-8 and parses it as JSON.
 *
 * The result is returned as `T` without any runtime validation; callers are
 * responsible for checking the shape.
 */
function readJson<T>(path: string): T {
  const raw = readFileSync(path, "utf8");
  const parsed: T = JSON.parse(raw);
  return parsed;
}
202
+
203
/**
 * Picks the iteration number for a run.
 *
 * @param workspaceSkillDir - Directory holding the `iteration-<N>` folders.
 * @param override - Explicit iteration number; returned as-is when given.
 * @returns `override` when provided; otherwise one more than the highest
 * existing `iteration-<N>` entry, or 1 when the directory is missing or has
 * no such entry.
 *
 * Only entries whose suffix is a plain run of decimal digits are counted.
 * Names such as `iteration-2.5`, `iteration-1e3`, `iteration-0x10` or
 * `iteration-notes` are ignored, so a stray entry cannot produce a
 * non-integer or inflated iteration number.
 */
function nextIteration(workspaceSkillDir: string, override?: number): number {
  if (override !== undefined) return override;
  if (!existsSync(workspaceSkillDir)) return 1;

  let highest = 0;
  for (const entry of readdirSync(workspaceSkillDir)) {
    const match = /^iteration-(\d+)$/.exec(entry);
    if (!match) continue;
    const n = Number(match[1]);
    // Ignore absurdly long digit runs that do not fit a safe integer.
    if (Number.isSafeInteger(n) && n > highest) highest = n;
  }
  return highest + 1;
}
215
+
216
/**
 * Maps an evaluation mode to the names of its two conditions, in the order
 * they are evaluated.
 *
 * @returns `["with_skill", "without_skill"]` for `new-skill`, otherwise
 * `["old_skill", "new_skill"]`.
 */
function conditionNamesFor(mode: Mode): [string, string] {
  if (mode === "new-skill") {
    return ["with_skill", "without_skill"];
  }
  return ["old_skill", "new_skill"];
}
221
+
222
/**
 * Implements the `snapshot` command: copies the skill's `SKILL.md` and every
 * other entry of the skill directory except `evals` into
 * `<workspaceRoot>/<skillName>/snapshots/<label>`.
 *
 * Exits via `die` when `--label` is missing, the skill has no `SKILL.md`, or
 * a snapshot with that label already exists.
 */
function commandSnapshot(args: Args, ctx: RunContext): void {
  const label = args.label;
  if (!label) die("snapshot requires --label <name>");

  const sourceDir = ctx.skillSubdir;
  const sourceSkillMd = join(sourceDir, "SKILL.md");
  if (!existsSync(sourceSkillMd)) die(`skill not found: ${sourceSkillMd}`);

  const snapshotDir = join(
    ctx.workspaceRoot,
    ctx.skillName,
    "snapshots",
    label,
  );
  if (existsSync(snapshotDir)) {
    die(
      `snapshot already exists: ${snapshotDir}\n` +
        " Use a different --label or delete the existing snapshot first.",
    );
  }
  ensureDir(snapshotDir);

  // SKILL.md first, then everything else next to it apart from the eval suite.
  cpSync(sourceSkillMd, join(snapshotDir, "SKILL.md"));
  const extras = readdirSync(sourceDir).filter(
    (entry) => entry !== "SKILL.md" && entry !== "evals",
  );
  for (const entry of extras) {
    const from = join(sourceDir, entry);
    const to = join(snapshotDir, entry);
    if (statSync(from).isDirectory()) {
      cpSync(from, to, { recursive: true });
    } else {
      cpSync(from, to);
    }
  }

  console.log(`Snapshotted ${ctx.skillName} → ${snapshotDir}`);
}
252
+
253
/**
 * Implements the `run` command. It validates the arguments, prepares the
 * `iteration-<N>` workspace for the skill under test, stages skills (unless
 * `--no-stage`), and writes `conditions.json`, `dispatch-manifest.md`, one
 * prompt file per task, and `dispatch.json`. With `--guard` (and without
 * `--dry-run`) it also installs the write guard.
 *
 * This function only prepares the work. As the closing message says,
 * dispatching the subagents is left to whoever reads `dispatch.json`.
 *
 * Exits via `die` on invalid arguments, a missing `SKILL.md` / `evals.json`,
 * a missing baseline snapshot, or an existing iteration directory when
 * `--iteration` was not passed.
 */
function commandRun(args: Args, ctx: RunContext): void {
  // Argument validation: a mode is mandatory and revision mode needs a baseline.
  if (!args.mode) die("--mode required: new-skill | revision");
  if (args.mode !== "new-skill" && args.mode !== "revision")
    die(`unknown --mode: ${args.mode}`);
  if (args.mode === "revision" && !args.baseline)
    die("revision mode requires --baseline <label>");

  const skillDir = ctx.skillSubdir;
  const skillMd = join(skillDir, "SKILL.md");
  if (!existsSync(skillMd)) die(`skill not found: ${skillMd}`);

  const evalsPath = join(skillDir, "evals", "evals.json");
  if (!existsSync(evalsPath)) die(`evals.json not found: ${evalsPath}`);

  const config: EvalsConfig = validateEvalsConfig(
    readJson(evalsPath),
    evalsPath,
  );
  // A mismatching skill_name is only warned about; the folder name wins.
  if (config.skill_name !== ctx.skillName)
    console.warn(
      `warning: evals.json skill_name (${config.skill_name}) does not match the skill folder (${ctx.skillName}). Proceeding with ${ctx.skillName}.`,
    );

  const workspaceSkillDir = join(ctx.workspaceRoot, ctx.skillName);
  const iteration = nextIteration(workspaceSkillDir, args.iteration);
  const iterationDir = join(workspaceSkillDir, `iteration-${iteration}`);

  // A per-run nonce makes each dispatch description globally unique. The
  // subagents dir is shared across iterations of one parent session, so a bare
  // `<eval>:<condition>` description repeats and fill-transcripts could fill an
  // iteration's run from a colliding agent in another iteration. `i<N>-<nonce>`
  // also disambiguates re-running the same iteration number.
  const runNonce = `${Date.now().toString(36)}-${randomBytes(3).toString("hex")}`;
  const runTag = `i${iteration}-${runNonce}`;

  // An existing iteration directory is only reused when --iteration was given.
  if (existsSync(iterationDir) && args.iteration === undefined)
    die(
      `iteration-${iteration} already exists; pass --iteration to overwrite explicitly`,
    );

  const [conditionA, conditionB] = conditionNamesFor(args.mode);

  // new-skill: current SKILL.md vs. no skill.
  // revision:  baseline snapshot vs. current SKILL.md.
  let skillPathForA: string | null;
  let skillPathForB: string | null;
  if (args.mode === "new-skill") {
    skillPathForA = skillMd;
    skillPathForB = null;
  } else {
    const baselineSkill = join(
      workspaceSkillDir,
      "snapshots",
      args.baseline as string,
      "SKILL.md",
    );
    if (!existsSync(baselineSkill))
      die(
        `baseline snapshot not found: ${baselineSkill}\n` +
          ` Run: bun run evals:snapshot --skill ${ctx.skillName} --skill-dir ${ctx.skillDir} --label ${args.baseline} (before editing)`,
      );
    skillPathForA = baselineSkill;
    skillPathForB = skillMd;
  }

  console.log(
    `Preparing ${ctx.skillName} iteration-${iteration} (${args.mode})`,
  );
  console.log(` ${conditionA}: ${skillPathForA ?? "(no skill)"}`);
  console.log(` ${conditionB}: ${skillPathForB ?? "(no skill)"}`);
  if (args.noStage)
    console.log(
      " staging: disabled (--no-stage) — skills will be inlined into dispatch_prompt for harnesses without project-local skill discovery",
    );

  ensureDir(iterationDir);
  // Keep a copy of the current SKILL.md inside the iteration directory.
  cpSync(skillMd, join(iterationDir, "skill-snapshot.md"));

  // Always disarm a prior run's guard before re-staging, so a crashed run can't
  // leave the write-blocking hook armed across runs.
  teardownGuard(ctx.stageRoot);

  // Remove whatever a previous run staged before staging afresh.
  if (!args.noStage) cleanupStagedSkills(ctx.stageRoot);

  if (!args.noStage) {
    stageSiblingSkills({
      skillUnderTest: ctx.skillName,
      skillsSourceDir: ctx.skillDir,
      repoRoot: ctx.stageRoot,
    });
  }

  const bootstrapContent =
    ctx.bootstrapPath !== null ? readFileSync(ctx.bootstrapPath, "utf8") : null;

  // Sibling skill metadata, shared across conditions. Empty when --no-stage
  // (nothing is staged, so nothing is discoverable to list).
  const siblingSkills: AvailableSkill[] = args.noStage
    ? []
    : ctx.siblingSkillNames.map((name) => {
        const p = join(ctx.skillDir, name, "SKILL.md");
        return { name, path: p, description: getSkillDescription(p) };
      });

  // Stages a condition's skill and returns its slug; null when the condition
  // has no skill or staging is disabled.
  const stageFor = (
    condName: string,
    condSkillPath: string | null,
  ): string | null => {
    if (!condSkillPath || args.noStage) return null;
    return stageSkillForCC({
      content: readFileSync(condSkillPath, "utf8"),
      iteration,
      condition: condName,
      skillName: ctx.skillName,
      repoRoot: ctx.stageRoot,
    });
  };

  const conditionASlug = stageFor(conditionA, skillPathForA);
  const conditionBSlug = stageFor(conditionB, skillPathForB);

  // Persist which skill source and staged slug each condition used.
  const conditions: ConditionsRecord = {
    mode: args.mode,
    baseline: args.baseline,
    conditions: [
      {
        name: conditionA,
        skill_path: skillPathForA,
        staged_skill_slug: conditionASlug,
      },
      {
        name: conditionB,
        skill_path: skillPathForB,
        staged_skill_slug: conditionBSlug,
      },
    ],
    timestamp: new Date().toISOString(),
    harness: ctx.harness,
    run_nonce: runNonce,
  };
  writeJson(join(iterationDir, "conditions.json"), conditions);

  // availableSkills for a condition = siblings + the skill-under-test when
  // that condition loads it. Empty when nothing was staged.
  const availableSkillsFor = (
    condSkillPath: string | null,
  ): AvailableSkill[] => {
    if (args.noStage) return [];
    const skills = [...siblingSkills];
    if (condSkillPath) {
      skills.push({
        name: ctx.skillName,
        path: condSkillPath,
        description: getSkillDescription(condSkillPath),
      });
    }
    return skills;
  };

  // One dispatch task per (eval, condition) pair; each pair gets its own
  // directory with an `outputs` folder and its own copy of the fixtures.
  const tasks: DispatchTask[] = [];
  for (const ev of config.evals) {
    const evalDir = join(iterationDir, `eval-${ev.id}`);
    ensureDir(evalDir);

    for (const [condName, condSkillPath, condSlug] of [
      [conditionA, skillPathForA, conditionASlug],
      [conditionB, skillPathForB, conditionBSlug],
    ] as const) {
      const condDir = join(evalDir, condName);
      const outputsDir = join(condDir, "outputs");
      ensureDir(outputsDir);

      const fixtures = copyFixtures(ev, skillDir, condDir);
      tasks.push(
        buildDispatchTask({
          evalId: ev.id,
          condition: condName,
          skillPath: condSkillPath,
          stagedSkillSlug: condSlug,
          userPrompt: ev.prompt,
          fixtures,
          outputsDir,
          condDir,
          bootstrapContent,
          skillName: ctx.skillName,
          availableSkills: availableSkillsFor(condSkillPath),
          runTag,
        }),
      );
    }
  }

  const manifestPath = join(iterationDir, "dispatch-manifest.md");
  writeFileSync(
    manifestPath,
    buildManifest({
      skillName: ctx.skillName,
      mode: args.mode,
      baseline: args.baseline,
      iteration,
      tasks,
    }),
  );

  // Write each prompt to its own file and reference it by path in dispatch.json.
  // The orchestrator then dispatches with a short "read this file" prompt instead
  // of reproducing the full prompt verbatim per Task call.
  for (const task of tasks) {
    writeFileSync(task.dispatch_prompt_path, task.dispatch_prompt);
  }

  const dispatchJsonPath = join(iterationDir, "dispatch.json");
  writeJson(dispatchJsonPath, {
    skill_name: ctx.skillName,
    iteration,
    run_nonce: runNonce,
    iteration_dir: iterationDir,
    mode: args.mode,
    baseline: args.baseline ?? null,
    conditions: conditions.conditions,
    harness: ctx.harness,
    // Drop the in-memory prompt text; the serialized task keeps only its path.
    tasks: tasks.map(({ dispatch_prompt: _omit, ...rest }) => rest),
  });

  // Opt-in hard guard. Stages a PreToolUse hook that blocks subagent
  // writes/installs outside the eval sandbox while dispatches run.
  if (args.guard && !args.dryRun) {
    if (args.noStage) {
      console.warn(
        "\n⚠ --guard requires staging enabled; skipping guard install.",
      );
    } else {
      const guardScriptPath = join(import.meta.dir, "guard", "guard.ts");
      installGuard({
        stageRoot: ctx.stageRoot,
        workspaceRoot: ctx.workspaceRoot,
        guardScriptPath,
      });
      console.log(
        "\n🛡 Write guard armed: a PreToolUse hook is staged in .claude/settings.local.json\n" +
          " and will block writes/installs outside the eval sandbox during dispatches.\n" +
          " It auto-expires in 6h and is removed on the next run; to remove it now:\n" +
          " bun run evals:teardown-guard --skill <name>",
      );
    }
  }

  console.log(`\nWorkspace prepared: ${iterationDir}`);
  console.log(`Dispatch manifest: ${manifestPath}`);
  console.log(`Dispatch tasks: ${dispatchJsonPath}`);
  console.log(
    `\n${tasks.length} dispatches required (${config.evals.length} evals × 2 conditions).`,
  );

  if (args.dryRun) console.log("\n--dry-run: stopping after workspace prep.");
  else
    console.log(
      "\nNext: read dispatch.json, dispatch each task as a subagent, write run.json + timing.json to the paths in each task.",
    );
}
511
+
512
/**
 * One prepared subagent dispatch: a single eval under a single condition,
 * together with the paths its prompt and results live at. `commandRun`
 * serializes these into `dispatch.json` (without `dispatch_prompt`).
 */
type DispatchTask = {
  eval_id: string;
  condition: string;
  // NOTE(review): presumably the condition's SKILL.md source, null when the
  // condition runs without the skill — confirm against buildDispatchTask.
  skill_path: string | null;
  // NOTE(review): presumably the slug returned by stageSkillForCC, null when
  // nothing was staged for this condition — confirm against buildDispatchTask.
  staged_skill_slug: string | null;
  user_prompt: string;
  fixtures: string[];
  outputs_dir: string;
  // NOTE(review): presumably where run.json and timing.json are expected, per
  // the closing message of commandRun — confirm.
  run_record_path: string;
  timing_path: string;
  agent_description: string;
  /**
   * Absolute path to the file holding the full dispatch prompt. The orchestrator
   * dispatches each subagent with a short "read this file and follow it" prompt
   * rather than inlining the prompt, so it never has to reproduce ~KB of text per
   * Task call. `dispatch_prompt` carries the same text in-memory (for manifest
   * building and unit tests) but is stripped from the serialized dispatch.json.
   */
  dispatch_prompt_path: string;
  dispatch_prompt: string;
};
533
+
534
/** Metadata about one skill that a condition can discover. */
export type AvailableSkill = {
  /** Skill name (the skill's directory name in `commandRun`). */
  name: string;
  /** Path of the skill's `SKILL.md`. */
  path: string;
  /** Description text, as extracted by `getSkillDescription` in `commandRun`. */
  description: string;
};
539
+
540
/**
 * Copies an eval's fixture files into `<condDir>/inputs`.
 *
 * Each entry of `ev.files` is resolved relative to `<skillDir>/evals` and
 * copied under its base name; directories are copied recursively. Exits via
 * `die` when a fixture does not exist.
 *
 * @returns The destination paths in the order of `ev.files`, or an empty
 * array (creating nothing) when the eval lists no files.
 */
function copyFixtures(ev: Eval, skillDir: string, condDir: string): string[] {
  const files = ev.files;
  if (!files || files.length === 0) return [];

  const inputsDir = join(condDir, "inputs");
  ensureDir(inputsDir);

  return files.map((relative) => {
    const from = join(skillDir, "evals", relative);
    if (!existsSync(from)) die(`fixture not found: ${from}`);

    const to = join(inputsDir, basename(relative));
    if (statSync(from).isDirectory()) {
      cpSync(from, to, { recursive: true });
    } else {
      cpSync(from, to);
    }
    return to;
  });
}
555
+
556
/**
 * Extracts the `description:` value from a skill file.
 *
 * The key must start a line (leading spaces or tabs are allowed) and the
 * value must be on that same line. This stops the lookup from matching
 * inside a longer key such as `short_description:`, and from returning the
 * following line when `description:` has an empty value. One pair of
 * matching surrounding quotes is removed.
 *
 * @param skillPath - Path of the skill's `SKILL.md`.
 * @returns The description text, or "No description available." when the
 * file cannot be read or has no such line. The lookup is deliberately
 * best-effort and never throws.
 */
function getSkillDescription(skillPath: string): string {
  const fallback = "No description available.";

  let content: string;
  try {
    content = readFileSync(skillPath, "utf8");
  } catch {
    // Missing or unreadable file: fall back to the placeholder.
    return fallback;
  }

  // `m` anchors `^` to each line; `[ \t]*` keeps the match on one line.
  const match = /^[ \t]*description:[ \t]*(\S[^\n\r]*)/m.exec(content);
  if (!match) return fallback;

  let desc = (match[1] ?? "").trim();
  if (
    (desc.startsWith('"') && desc.endsWith('"')) ||
    (desc.startsWith("'") && desc.endsWith("'"))
  ) {
    desc = desc.slice(1, -1).trim();
  }
  return desc;
}
573
+
574
/**
 * Strips the skill-under-test's list entry from bootstrap content, so that a
 * condition running without the skill sees no mention of it there.
 *
 * An entry is a top-level `*` or `-` bullet whose line contains the skill
 * name in backticks, together with the indented lines that directly follow
 * it (for example a `*Trigger:*` sub-bullet). The first line that is not
 * indented ends the entry. All other lines, including headings and sibling
 * entries, are returned unchanged and in order.
 *
 * @param content - Bootstrap text, split on `\n`.
 * @param skillName - Name of the skill whose entry is removed.
 * @returns The content with the matching entry or entries removed.
 */
export function redactSkillFromBootstrap(
  content: string,
  skillName: string,
): string {
  const backtickedName = `\`${skillName}\``;
  const topLevelBullet = /^[*-]\s/;
  const indentedContinuation = /^\s+\S/;

  let droppingEntry = false;
  const kept = content.split("\n").filter((line) => {
    // Still inside a dropped entry: swallow its indented continuation lines.
    if (droppingEntry && indentedContinuation.test(line)) return false;
    // Otherwise the line either opens a new entry to drop or is kept.
    droppingEntry = topLevelBullet.test(line) && line.includes(backtickedName);
    return !droppingEntry;
  });
  return kept.join("\n");
}
603
+
604
/**
 * Builds the dispatch record for a single (eval, condition) run, including
 * the full prompt text handed to the subagent.
 *
 * The prompt is an optional `<session-start-context>` section (bootstrap text
 * and/or an inventory of staged skills) followed by the task framing, a
 * skill-availability paragraph, the fixture list, the output directory,
 * fixed instructions and the user's request.
 *
 * Reads `opts.skillPath` from disk only when no staged slug is given and a
 * skill path is present.
 */
export function buildDispatchTask(opts: {
  evalId: string;
  condition: string;
  skillPath: string | null;
  stagedSkillSlug: string | null;
  userPrompt: string;
  fixtures: string[];
  outputsDir: string;
  condDir: string;
  bootstrapContent: string | null;
  skillName: string;
  availableSkills: AvailableSkill[];
  /**
   * Per-run uniqueness suffix (`i<iteration>-<nonce>`). When present it is
   * appended to the dispatch description so descriptions stay distinct across
   * iterations and re-runs; callers that only exercise prompt assembly may
   * leave it out.
   */
  runTag?: string;
}): DispatchTask {
  const {
    evalId,
    condition,
    skillPath,
    stagedSkillSlug,
    userPrompt,
    fixtures,
    outputsDir,
    condDir,
    bootstrapContent,
    skillName,
    availableSkills,
    runTag,
  } = opts;

  // Work on a sorted copy so the caller's array is never reordered.
  const inventory = availableSkills
    .slice()
    .sort((left, right) => left.name.localeCompare(right.name));

  // Paragraph telling the subagent how (or whether) the skill is available.
  let skillLines: string[];
  if (stagedSkillSlug) {
    skillLines = [
      "Your environment has the slow-powers plugin loaded. All slow-powers skills are",
      "discoverable via the Skill tool. The skill currently under evaluation is",
      `staged under the unique slug "${stagedSkillSlug}" — invoke that slug rather`,
      "than the natural name if the skill applies to the user's request.",
    ];
  } else if (skillPath) {
    skillLines = [
      "The following skill is loaded into your operating guidelines. Apply it where relevant to the user's request.",
      "",
      `<skill name="${basename(dirname(skillPath))}">`,
      readFileSync(skillPath, "utf8").trim(),
      "</skill>",
    ];
  } else if (inventory.length > 0 || bootstrapContent) {
    skillLines = [
      "The skill currently under evaluation is NOT available in this environment.",
      "Other staged skills remain discoverable via the Skill tool; apply any",
      "that fit the user's request.",
    ];
  } else {
    skillLines = ["No skill is loaded. Respond as you naturally would."];
  }
  const skillBlock = skillLines.join("\n");

  let fixturesBlock = "Available fixture files: none";
  if (fixtures.length) {
    const listing = fixtures.map((f) => ` - ${f}`).join("\n");
    fixturesBlock = `Available fixture files:\n${listing}`;
  }

  // When neither a skill path nor a staged slug is supplied, the run is a
  // skill-absent arm: scrub the skill's entry from the verbatim bootstrap so
  // the control condition carries no reference to it.
  const skillAbsent = !skillPath && !stagedSkillSlug;
  let effectiveBootstrap = bootstrapContent;
  if (bootstrapContent && skillAbsent) {
    effectiveBootstrap = redactSkillFromBootstrap(bootstrapContent, skillName);
  }

  // Session-start context: the (possibly redacted) bootstrap first, then the
  // auto-built inventory of staged skills.
  const startContext: string[] = [];
  if (effectiveBootstrap) {
    const bootstrapIntro =
      "The following guidelines were loaded at session start by the slow-powers plugin\n" +
      "(equivalent to the SessionStart hook firing in a real user's environment):\n" +
      "\n";
    startContext.push(bootstrapIntro + effectiveBootstrap.trim());
  }
  if (inventory.length > 0) {
    const entries = inventory
      .map((s) => `* \`${s.name}\`\n * *Trigger:* ${s.description}`)
      .join("\n");
    startContext.push(
      "The following skills are staged and discoverable in this eval environment:\n" +
        "\n" +
        entries,
    );
  }

  const contextSection =
    startContext.length > 0
      ? `<session-start-context>\n${startContext.join("\n\n")}\n</session-start-context>\n`
      : "";

  const taskSection = [
    "You are executing a single test case for a skill evaluation framework.",
    "Treat this as a real user request — do NOT optimize behavior for the eval.",
    "",
    skillBlock,
    "",
    fixturesBlock,
    `Output directory: ${outputsDir}`,
    "",
    "Instructions:",
    "- Write any files you produce into the output directory.",
    `- After completing the task, write your final user-facing response to ${outputsDir}/final-message.md.`,
    "- Do not write outside the output directory.",
    "",
    "User request:",
    userPrompt,
  ].join("\n");

  // `<evalId>:<condition>` with `:<runTag>` appended only when a tag is given.
  const agentDescription = [evalId, condition, ...(runTag ? [runTag] : [])].join(
    ":",
  );

  return {
    eval_id: evalId,
    condition: condition,
    skill_path: skillPath,
    staged_skill_slug: stagedSkillSlug,
    user_prompt: userPrompt,
    fixtures: fixtures,
    outputs_dir: outputsDir,
    run_record_path: join(condDir, "run.json"),
    timing_path: join(condDir, "timing.json"),
    agent_description: agentDescription,
    dispatch_prompt_path: join(condDir, "dispatch-prompt.txt"),
    dispatch_prompt: contextSection + taskSection,
  };
}
742
+
743
/**
 * Renders the human-readable dispatch manifest (markdown) for one iteration.
 *
 * The output is a fixed header — title, mode (with optional baseline),
 * generation timestamp, dispatch count and usage instructions — followed by
 * one `###` section per task listing its `run.json` / `timing.json` paths and
 * the full dispatch prompt inside a fenced block.
 *
 * @returns The manifest text. The `Generated:` line uses the current time, so
 *   output differs between calls.
 */
function buildManifest(opts: {
  skillName: string;
  mode: Mode;
  baseline?: string;
  iteration: number;
  tasks: DispatchTask[];
}): string {
  const { skillName, mode, baseline, iteration, tasks } = opts;
  const baselineNote = baseline ? ` (baseline: ${baseline})` : "";

  // The trailing empty element makes the header end with a newline, so the
  // first task heading starts on the line right after "## Dispatches".
  const headerLines: string[] = [
    `# Dispatch manifest — ${skillName} iteration-${iteration}`,
    "",
    `Mode: ${mode}${baselineNote}`,
    `Generated: ${new Date().toISOString()}`,
    `Total dispatches: ${tasks.length}`,
    "",
    "## How to use this manifest",
    "",
    'In an agent session, read `dispatch.json` (sibling of this file) instead of this manifest. Each task has a `dispatch_prompt_path` field pointing at the file that holds the full prompt — dispatch the subagent with a short "read this file and follow it" instruction rather than inlining the prompt — plus exact paths for `run.json` and `timing.json`.',
    "",
    "**Transcript correlation:** Each task has an `agent_description` field of the form `<eval_id>:<condition>:i<N>-<nonce>`. When dispatching the subagent via the host's primitive (e.g. Claude Code's Agent tool), pass this string verbatim as the dispatch `description` — do not reconstruct it. The per-run nonce keeps descriptions unique across iterations sharing one session's subagents dir, so the transcript adapter correlates each subagent's persisted transcript back to the right `(eval, condition)` slot without collisions.",
    "",
    "After every dispatch:",
    "",
    "1. Write `run.json` matching `skills/evaluating-skills/schema/run-record.schema.json` (enforced at runtime by grade/fill-transcripts/detect-stray-writes). Carry over `eval_id`, `condition`, `skill_path` (`null` on the without_skill arm), `prompt`, and `files` from the task; populate `final_message` from the subagent's reply; leave `tool_invocations` as `[]` for now — `evals:fill-transcripts` will populate it from the persisted transcript in a later step.",
    "2. Capture `total_tokens` and `duration_ms` from the harness's task completion event into `timing.json`. These values may not be persisted anywhere else — save them immediately.",
    "",
    "After all dispatches:",
    "",
    "3. (Claude Code only, optional) Run `bun run evals:fill-transcripts --skill <name> --iteration <N> --subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/` to fill `tool_invocations` from each subagent's persisted transcript. Skipping this step leaves `transcript_check` assertions unverifiable.",
    "4. Run `bun run evals:grade --skill <name> --iteration <N>` to grade.",
    "",
    "## Dispatches",
    "",
  ];

  const fence = "```";
  // One markdown section per task; each ends with a newline after the fence.
  const renderTask = (task: DispatchTask): string =>
    `### ${task.eval_id} / ${task.condition}\n` +
    "\n" +
    `- run.json: ${task.run_record_path}\n` +
    `- timing.json: ${task.timing_path}\n` +
    "\n" +
    `${fence}\n${task.dispatch_prompt}\n${fence}\n`;

  const rendered: string[] = [];
  for (const task of tasks) {
    rendered.push(renderTask(task));
  }

  return `${headerLines.join("\n")}${rendered.join("\n")}`;
}
795
+
796
// CLI entry point: runs only when this file is executed directly.
if (import.meta.main) {
  const cliArgs = Bun.argv.slice(2);
  const args = parseArgs(cliArgs);

  // Resolve the run context up front; a failure is reported through `die`
  // using the error's message (or its string form for non-Error throws).
  let ctx: RunContext;
  try {
    ctx = detectRunContext(cliArgs);
  } catch (err) {
    die(err instanceof Error ? err.message : String(err));
  }

  // Dispatch on the sub-command; anything other than the two named commands
  // falls through to the regular run.
  switch (args.command) {
    case "snapshot":
      commandSnapshot(args, ctx);
      break;
    case "teardown-guard": {
      const removed = teardownGuard(ctx.stageRoot);
      if (removed) {
        console.log("🛡 Write guard removed.");
      } else {
        console.log("No write guard was installed — nothing to remove.");
      }
      break;
    }
    default:
      commandRun(args, ctx);
  }
}