@slowdini/slow-powers-opencode 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/README.md +37 -65
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -13
  5. package/skills/evaluating-skills/SKILL.md +91 -337
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/verifying-development-work/SKILL.md +17 -6
  17. package/skills/verifying-development-work/code-review.md +68 -0
  18. package/skills/verifying-development-work/comment-review.md +85 -0
  19. package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
  20. package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
  21. package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
  22. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
  23. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
  24. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
  25. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
  26. package/skills/verifying-development-work/evals/evals.json +34 -2
  27. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
  28. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
  29. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  30. package/skills/evaluating-skills/harness-details/claude.md +0 -158
  31. package/skills/evaluating-skills/runner/README.md +0 -154
  32. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  33. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  34. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -263
  35. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -146
  36. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -264
  37. package/skills/evaluating-skills/runner/aggregate.ts +0 -248
  38. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  39. package/skills/evaluating-skills/runner/context.ts +0 -90
  40. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -103
  41. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -192
  42. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  43. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  44. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  45. package/skills/evaluating-skills/runner/grade.ts +0 -603
  46. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  47. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  48. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  49. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -71
  50. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  51. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  52. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  53. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  54. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -230
  55. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -186
  56. package/skills/evaluating-skills/runner/run.test.ts +0 -1180
  57. package/skills/evaluating-skills/runner/run.ts +0 -1029
  58. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -74
  59. package/skills/evaluating-skills/runner/types.ts +0 -112
  60. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  61. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  62. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  63. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  64. package/skills/evaluating-skills/runner/validate.ts +0 -21
  65. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  66. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  67. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  68. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -68
  69. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -67
  70. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  71. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  72. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
  73. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  74. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  75. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
  76. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  77. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  78. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  79. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
  80. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
  81. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
  82. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
@@ -1,1029 +0,0 @@
1
- #!/usr/bin/env bun
2
- import { randomBytes } from "node:crypto";
3
- import {
4
- cpSync,
5
- existsSync,
6
- mkdirSync,
7
- mkdtempSync,
8
- readdirSync,
9
- readFileSync,
10
- rmSync,
11
- statSync,
12
- writeFileSync,
13
- } from "node:fs";
14
- import { tmpdir } from "node:os";
15
- import { basename, dirname, join } from "node:path";
16
- import {
17
- renderAvailableSkillsBlock,
18
- renderPlanModeContext,
19
- } from "./adapters/claude-code-session";
20
- import { detectRunContext, type Harness, type RunContext } from "./context";
21
- import { installGuard, teardownGuard } from "./guard/install";
22
- import {
23
- detectPluginShadows,
24
- formatShadowBanner,
25
- resolveConfigDir,
26
- } from "./plugin-shadow";
27
- import type {
28
- AvailableSkill,
29
- ConditionsRecord,
30
- Eval,
31
- EvalsConfig,
32
- } from "./types";
33
- import { validateEvalsConfig } from "./validate";
34
-
35
/** Directory-name prefix that marks a staged skill-under-test as owned by the eval runner. */
export const STAGED_SKILL_PREFIX = "slow-powers-eval-";
/** File name of the manifest, stored inside `.claude/skills`, that records staged entries for cleanup. */
export const STAGED_SIBLING_MANIFEST = ".slow-powers-eval-manifest.json";

/**
 * Writes `content` to `<repoRoot>/.claude/skills/<slug>/SKILL.md`, creating the
 * directory as needed, and returns the slug (the directory name that was used).
 *
 * The default slug is `slow-powers-eval-<iteration>-<condition>__<skillName>`.
 *
 * @throws Error when `stageNameOverride` is not a single plain directory name
 *   (empty, `.`, `..`, or containing a path separator). Such a name would make
 *   the write land outside its own directory under `.claude/skills`.
 */
export function stageSkillForCC(opts: {
  content: string;
  iteration: number;
  condition: string;
  skillName: string;
  repoRoot: string;
  /**
   * When set, stage under this verbatim identifier instead of the conspicuous
   * `slow-powers-eval-…` slug. Used by `--stage-name` to A/B a natural name
   * against the eval-flagged one (issue #144 Step 2). A custom name is not
   * caught by `cleanupStagedSkills`'s prefix scan, so the caller must also call
   * `registerStagedSkillForCleanup` to have it removed on the next run.
   */
  stageNameOverride?: string;
}): string {
  const override = opts.stageNameOverride;
  if (override !== undefined) {
    // The override is joined into a filesystem path verbatim, so it must be
    // exactly one path segment; anything else escapes the skills directory.
    const isPlainName =
      override !== "" &&
      override !== "." &&
      override !== ".." &&
      !/[\\/\0]/.test(override);
    if (!isPlainName)
      throw new Error(
        `invalid stage name "${override}": must be a single directory name without path separators`,
      );
  }

  const slug =
    override ??
    `${STAGED_SKILL_PREFIX}${opts.iteration}-${opts.condition}__${opts.skillName}`;
  const skillDir = join(opts.repoRoot, ".claude", "skills", slug);
  mkdirSync(skillDir, { recursive: true });
  writeFileSync(join(skillDir, "SKILL.md"), opts.content);
  return slug;
}
61
-
62
/**
 * Adds a custom-named staged skill dir (one created via `stageNameOverride`) to
 * the sibling manifest's `created_entries` so the next run's
 * `cleanupStagedSkills` removes it — the prefix scan only catches
 * `slow-powers-eval-…` names. Idempotent: a name already recorded is left alone.
 *
 * A manifest that is missing, unparseable, or lacks a `created_entries` array is
 * replaced by a fresh one, matching how `cleanupStagedSkills` discards a corrupt
 * manifest instead of crashing. The skills directory is created if absent.
 *
 * @param repoRoot Root under which `.claude/skills` lives.
 * @param name Directory name (inside `.claude/skills`) to record for removal.
 */
export function registerStagedSkillForCleanup(
  repoRoot: string,
  name: string,
): void {
  const skillsDir = join(repoRoot, ".claude", "skills");
  const manifestPath = join(skillsDir, STAGED_SIBLING_MANIFEST);

  let manifest: SiblingManifest | undefined;
  if (existsSync(manifestPath)) {
    try {
      const parsed: unknown = JSON.parse(readFileSync(manifestPath, "utf8"));
      const candidate = parsed as Partial<SiblingManifest> | null;
      // Only trust the file when it has the one field this function relies on.
      if (candidate && Array.isArray(candidate.created_entries))
        manifest = candidate as SiblingManifest;
    } catch {
      // Unreadable manifest: fall through and start a fresh one.
    }
  }
  if (manifest === undefined) {
    manifest = {
      created_at: new Date().toISOString(),
      staged_under_test: name,
      created_entries: [],
    };
  }

  if (manifest.created_entries.some((e) => e.name === name)) return;
  manifest.created_entries.push({ name, preexisting: false });
  mkdirSync(skillsDir, { recursive: true });
  writeFileSync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`);
}

/** On-disk record of what a run staged into `.claude/skills`, read back by cleanup. */
type SiblingManifest = {
  /** ISO-8601 timestamp taken when the manifest object was created. */
  created_at: string;
  /** Name of the skill under test for the run that created the manifest. */
  staged_under_test: string;
  /** One entry per directory the runner created (or replaced) in `.claude/skills`. */
  created_entries: Array<{
    name: string;
    /** True when a directory of this name already existed and was replaced. */
    preexisting: boolean;
    /** Location of the backed-up original directory; set only when `preexisting`. */
    backup_path?: string;
  }>;
};
98
-
99
/**
 * Copies every sibling skill (each directory in `skillsSourceDir` that holds a
 * `SKILL.md`, except the skill under test) into `<repoRoot>/.claude/skills`,
 * leaving out each sibling's `evals` subdirectory.
 *
 * A destination directory that already exists is first copied to a fresh temp
 * directory and then removed, and the backup location is recorded on the entry.
 * The resulting manifest is written next to the staged skills and returned.
 */
export function stageSiblingSkills(opts: {
  skillUnderTest: string;
  skillsSourceDir: string;
  repoRoot: string;
}): SiblingManifest {
  const { skillUnderTest, skillsSourceDir, repoRoot } = opts;
  const stagedRoot = join(repoRoot, ".claude", "skills");
  mkdirSync(stagedRoot, { recursive: true });

  // Discover all siblings before touching the destination.
  const siblingNames: string[] = [];
  for (const candidate of readdirSync(skillsSourceDir)) {
    if (candidate === skillUnderTest) continue;
    const candidateDir = join(skillsSourceDir, candidate);
    if (!statSync(candidateDir).isDirectory()) continue;
    if (existsSync(join(candidateDir, "SKILL.md"))) siblingNames.push(candidate);
  }

  const createdEntries: SiblingManifest["created_entries"] = [];
  const manifest: SiblingManifest = {
    created_at: new Date().toISOString(),
    staged_under_test: skillUnderTest,
    created_entries: createdEntries,
  };

  for (const sibling of siblingNames) {
    const from = join(skillsSourceDir, sibling);
    const to = join(stagedRoot, sibling);
    const excluded = join(from, "evals");
    const excludedPrefix = `${excluded}/`;

    const alreadyThere = existsSync(to);
    const record: SiblingManifest["created_entries"][number] = {
      name: sibling,
      preexisting: alreadyThere,
    };

    if (alreadyThere) {
      // Preserve the existing directory so cleanup can restore it later.
      const backupPath = join(
        mkdtempSync(join(tmpdir(), "slow-powers-eval-backup-")),
        sibling,
      );
      record.backup_path = backupPath;
      cpSync(to, backupPath, { recursive: true });
      rmSync(to, { recursive: true, force: true });
    }

    cpSync(from, to, {
      recursive: true,
      filter: (p) => !(p === excluded || p.startsWith(excludedPrefix)),
    });

    createdEntries.push(record);
  }

  const serialized = `${JSON.stringify(manifest, null, 2)}\n`;
  writeFileSync(join(stagedRoot, STAGED_SIBLING_MANIFEST), serialized);
  return manifest;
}
155
-
156
/**
 * Removes everything a previous run staged into `<repoRoot>/.claude/skills`:
 *
 * 1. every entry whose name starts with `STAGED_SKILL_PREFIX`;
 * 2. every entry listed in the sibling manifest, restoring the backed-up
 *    original when the entry replaced a pre-existing directory and the backup
 *    still exists;
 * 3. the manifest file itself.
 *
 * A manifest that cannot be parsed, or that parses but has no `created_entries`
 * array, is treated as corrupt and deleted so it cannot break later runs.
 * Entries without a string name, or whose name resolves to the skills directory
 * itself (for example `""` or `"."`), are skipped so cleanup can never delete
 * the whole skills directory.
 */
export function cleanupStagedSkills(repoRoot: string): void {
  const skillsDir = join(repoRoot, ".claude", "skills");
  if (!existsSync(skillsDir)) return;

  for (const entry of readdirSync(skillsDir)) {
    if (!entry.startsWith(STAGED_SKILL_PREFIX)) continue;
    rmSync(join(skillsDir, entry), { recursive: true, force: true });
  }

  const manifestPath = join(skillsDir, STAGED_SIBLING_MANIFEST);
  if (!existsSync(manifestPath)) return;

  let entries: unknown;
  try {
    const parsed: unknown = JSON.parse(readFileSync(manifestPath, "utf8"));
    entries = (parsed as { created_entries?: unknown } | null)?.created_entries;
  } catch {
    entries = undefined;
  }
  if (!Array.isArray(entries)) {
    // Corrupt or malformed manifest: nothing reliable to act on, so drop it.
    rmSync(manifestPath, { force: true });
    return;
  }

  for (const e of entries as SiblingManifest["created_entries"]) {
    if (e === null || typeof e !== "object" || typeof e.name !== "string")
      continue;
    const target = join(skillsDir, e.name);
    // `join` normalizes, so this catches "", ".", "x/.." and similar names.
    if (target === skillsDir) continue;
    rmSync(target, { recursive: true, force: true });
    if (e.preexisting && e.backup_path && existsSync(e.backup_path)) {
      cpSync(e.backup_path, target, { recursive: true });
      // The backup lives in its own temp directory; remove that whole directory.
      rmSync(dirname(e.backup_path), { recursive: true, force: true });
    }
  }
  rmSync(manifestPath, { force: true });
}
184
-
185
/**
 * Which pair of conditions a run compares: `new-skill` pits the skill against
 * no skill, `revision` pits a baseline snapshot against the current skill.
 */
type Mode = "new-skill" | "revision";

/** Parsed command-line arguments for the runner. */
type Args = {
  /** Sub-command taken from the first positional argument; defaults to `run`. */
  command: "run" | "snapshot" | "teardown-guard";
  /** Value of `--mode`. */
  mode?: Mode;
  /** Value of `--baseline`: snapshot label used as the baseline in revision mode. */
  baseline?: string;
  /** Value of `--label`: name of the snapshot to create. */
  label?: string;
  /** Value of `--iteration`: explicit iteration number, overriding auto-increment. */
  iteration?: number;
  /** Eval ids from the comma-separated `--only` list. */
  only?: string[];
  /** Eval ids from the comma-separated `--skip` list. */
  skip?: string[];
  /** True when `--dry-run` is present. */
  dryRun: boolean;
  /** True when `--no-stage` is present: nothing is staged into `.claude/skills`. */
  noStage: boolean;
  /** True when `--guard` is present: install the write guard after preparation. */
  guard: boolean;
  /** Value of `--stage-name`: verbatim directory name for the staged skill. */
  stageName?: string;
  /** True when `--plan-mode` is present. */
  planMode: boolean;
};
201
-
202
/**
 * Reports a fatal error on stderr, prefixed with `error: `, and terminates the
 * process with exit code 1. Never returns.
 */
function die(msg: string): never {
  const line = `error: ${msg}`;
  console.error(line);
  return process.exit(1);
}
206
-
207
/**
 * Turns raw command-line tokens into an `Args` record.
 *
 * The command is the first token that does not start with `--`, recognised only
 * when it is `snapshot` or `teardown-guard`; anything else means `run`. Value
 * flags read the token right after their first occurrence and abort via `die`
 * when that token is missing or is itself a `--flag`. `--iteration` must parse
 * to an integer. `--only` / `--skip` are comma-separated lists whose items are
 * trimmed and whose empty items are dropped.
 */
function parseArgs(argv: string[]): Args {
  const firstPositional = argv.find((token) => !token.startsWith("--"));
  let command: Args["command"] = "run";
  if (firstPositional === "snapshot" || firstPositional === "teardown-guard") {
    command = firstPositional;
  }

  const isSet = (name: string): boolean => argv.includes(`--${name}`);

  const valueOf = (name: string): string | undefined => {
    const at = argv.indexOf(`--${name}`);
    if (at === -1) return undefined;
    const value = argv[at + 1];
    if (value === undefined || value.startsWith("--")) {
      die(`flag --${name} requires a value`);
    }
    return value;
  };

  const toIdList = (raw: string | undefined): string[] | undefined => {
    if (raw === undefined) return undefined;
    const ids: string[] = [];
    for (const piece of raw.split(",")) {
      const id = piece.trim();
      if (id) ids.push(id);
    }
    return ids;
  };

  // Flags are read in a fixed order so the first invalid one is the one reported.
  const rawIteration = valueOf("iteration");
  let iteration: number | undefined;
  if (rawIteration !== undefined) {
    iteration = Number(rawIteration);
    if (!Number.isInteger(iteration))
      die(`--iteration must be an integer, got ${rawIteration}`);
  }

  const mode = valueOf("mode") as Mode | undefined;
  const baseline = valueOf("baseline");
  const label = valueOf("label");
  const only = toIdList(valueOf("only"));
  const skip = toIdList(valueOf("skip"));
  const dryRun = isSet("dry-run");
  const noStage = isSet("no-stage");
  const guard = isSet("guard");
  const stageName = valueOf("stage-name");
  const planMode = isSet("plan-mode");

  return {
    command,
    mode,
    baseline,
    label,
    iteration,
    only,
    skip,
    dryRun,
    noStage,
    guard,
    stageName,
    planMode,
  };
}
257
-
258
/** Creates `path` (including missing parents) unless something already exists there. */
function ensureDir(path: string): void {
  if (existsSync(path)) return;
  mkdirSync(path, { recursive: true });
}
261
-
262
/** Writes `value` to `path` as 2-space-indented JSON followed by a newline. */
function writeJson(path: string, value: unknown): void {
  const serialized = JSON.stringify(value, null, 2);
  writeFileSync(path, serialized + "\n");
}
265
-
266
/**
 * Reads `path` as UTF-8 and parses it as JSON. The result is returned as `T`
 * without any runtime validation; callers are responsible for checking it.
 */
function readJson<T>(path: string): T {
  const text = readFileSync(path, "utf8");
  return JSON.parse(text);
}
269
-
270
/**
 * Picks the iteration number for a run.
 *
 * @param workspaceSkillDir Directory that holds the `iteration-<N>` folders.
 * @param override Explicit iteration number; returned unchanged when given.
 * @returns `override` if provided; otherwise one more than the highest existing
 *   `iteration-<N>` folder, or 1 when the directory is missing or has none.
 *
 * Only names of the exact form `iteration-<decimal digits>` count. Stray names
 * such as `iteration-3.5`, `iteration-0x20` or `iteration-2-old` are ignored so
 * they can neither inflate the result nor make it a non-integer.
 */
function nextIteration(workspaceSkillDir: string, override?: number): number {
  if (override !== undefined) return override;
  if (!existsSync(workspaceSkillDir)) return 1;

  let highest = 0;
  for (const entry of readdirSync(workspaceSkillDir)) {
    const match = /^iteration-(\d+)$/.exec(entry);
    if (match === null) continue;
    const n = Number(match[1]);
    // Absurdly long digit strings are not real iteration folders.
    if (Number.isSafeInteger(n) && n > highest) highest = n;
  }
  return highest + 1;
}
282
-
283
/**
 * Returns the two condition names, in dispatch order, for a run mode:
 * `with_skill` / `without_skill` for a new skill, `old_skill` / `new_skill`
 * for a revision.
 */
function conditionNamesFor(mode: Mode): [string, string] {
  if (mode === "new-skill") return ["with_skill", "without_skill"];
  return ["old_skill", "new_skill"];
}
288
-
289
/**
 * Implements the `snapshot` command: copies the skill's `SKILL.md` and every
 * other top-level entry except `evals` into
 * `<workspaceRoot>/<skillName>/snapshots/<label>`.
 *
 * Aborts via `die` when `--label` is missing, the skill has no `SKILL.md`, or a
 * snapshot with that label already exists.
 */
function commandSnapshot(args: Args, ctx: RunContext): void {
  const label = args.label;
  if (!label) die("snapshot requires --label <name>");

  const sourceDir = ctx.skillSubdir;
  const sourceSkillMd = join(sourceDir, "SKILL.md");
  if (!existsSync(sourceSkillMd)) die(`skill not found: ${sourceSkillMd}`);

  const snapshotDir = join(ctx.workspaceRoot, ctx.skillName, "snapshots", label);
  if (existsSync(snapshotDir)) {
    die(
      `snapshot already exists: ${snapshotDir}\n` +
        " Use a different --label or delete the existing snapshot first.",
    );
  }
  ensureDir(snapshotDir);

  cpSync(sourceSkillMd, join(snapshotDir, "SKILL.md"));

  // SKILL.md is already copied above; eval definitions are not part of a snapshot.
  const extras = readdirSync(sourceDir).filter(
    (name) => name !== "SKILL.md" && name !== "evals",
  );
  for (const name of extras) {
    const from = join(sourceDir, name);
    const to = join(snapshotDir, name);
    if (statSync(from).isDirectory()) {
      cpSync(from, to, { recursive: true });
    } else {
      cpSync(from, to);
    }
  }

  console.log(`Snapshotted ${ctx.skillName} → ${snapshotDir}`);
}
319
-
320
/**
 * Implements the `run` command: prepares an iteration workspace for the skill
 * described by `ctx`. It does not dispatch any subagent itself — it writes the
 * files an orchestrator needs and prints where they are.
 *
 * Steps, in order:
 * 1. Validate `--mode` / `--baseline`, locate `SKILL.md` and `evals/evals.json`,
 *    and narrow the eval list via `--only` / `--skip`.
 * 2. Choose the iteration directory and a per-run nonce.
 * 3. Resolve which `SKILL.md` each of the two conditions uses.
 * 4. Tear down any prior guard, then (unless `--no-stage`) clean up and re-stage
 *    skills under `<stageRoot>/.claude/skills`.
 * 5. Write `conditions.json`, the per-task prompt files, `dispatch-manifest.md`
 *    and `dispatch.json` into the iteration directory.
 * 6. Optionally install the write guard and, on Claude Code, record plugin shadows.
 *
 * Note that `--dry-run` only skips the guard install and changes the closing
 * message; staging and all workspace files are still written.
 *
 * Any validation failure aborts the process through `die`.
 */
function commandRun(args: Args, ctx: RunContext): void {
  if (!args.mode) die("--mode required: new-skill | revision");
  if (args.mode !== "new-skill" && args.mode !== "revision")
    die(`unknown --mode: ${args.mode}`);
  if (args.mode === "revision" && !args.baseline)
    die("revision mode requires --baseline <label>");

  const skillDir = ctx.skillSubdir;
  const skillMd = join(skillDir, "SKILL.md");
  if (!existsSync(skillMd)) die(`skill not found: ${skillMd}`);

  const evalsPath = join(skillDir, "evals", "evals.json");
  if (!existsSync(evalsPath)) die(`evals.json not found: ${evalsPath}`);

  const config: EvalsConfig = validateEvalsConfig(
    readJson(evalsPath),
    evalsPath,
  );
  // A mismatched name is only warned about; the folder name wins.
  if (config.skill_name !== ctx.skillName)
    console.warn(
      `warning: evals.json skill_name (${config.skill_name}) does not match the skill folder (${ctx.skillName}). Proceeding with ${ctx.skillName}.`,
    );

  let selectedEvals: Eval[];
  try {
    selectedEvals = selectEvals(config.evals, {
      only: args.only,
      skip: args.skip,
    });
  } catch (err) {
    die(err instanceof Error ? err.message : String(err));
  }

  const workspaceSkillDir = join(ctx.workspaceRoot, ctx.skillName);
  const iteration = nextIteration(workspaceSkillDir, args.iteration);
  const iterationDir = join(workspaceSkillDir, `iteration-${iteration}`);

  // A per-run nonce makes each dispatch description globally unique. The
  // subagents dir is shared across iterations of one parent session, so a bare
  // `<eval>:<condition>` description repeats and fill-transcripts could fill an
  // iteration's run from a colliding agent in another iteration. `i<N>-<nonce>`
  // also disambiguates re-running the same iteration number.
  const runNonce = `${Date.now().toString(36)}-${randomBytes(3).toString("hex")}`;
  const runTag = `i${iteration}-${runNonce}`;

  // An existing iteration directory is reused only when --iteration was passed
  // explicitly; otherwise it is treated as an error.
  if (existsSync(iterationDir) && args.iteration === undefined)
    die(
      `iteration-${iteration} already exists; pass --iteration to overwrite explicitly`,
    );

  const [conditionA, conditionB] = conditionNamesFor(args.mode);

  // `null` means the condition runs without the skill under test.
  let skillPathForA: string | null;
  let skillPathForB: string | null;
  if (args.mode === "new-skill") {
    skillPathForA = skillMd;
    skillPathForB = null;
  } else {
    const baselineSkill = join(
      workspaceSkillDir,
      "snapshots",
      args.baseline as string,
      "SKILL.md",
    );
    if (!existsSync(baselineSkill))
      die(
        `baseline snapshot not found: ${baselineSkill}\n` +
          ` Run: bun run evals:snapshot --skill ${ctx.skillName} --skill-dir ${ctx.skillDir} --label ${args.baseline} (before editing)`,
      );
    skillPathForA = baselineSkill;
    skillPathForB = skillMd;
  }

  console.log(
    `Preparing ${ctx.skillName} iteration-${iteration} (${args.mode})`,
  );
  console.log(` ${conditionA}: ${skillPathForA ?? "(no skill)"}`);
  console.log(` ${conditionB}: ${skillPathForB ?? "(no skill)"}`);
  if (selectedEvals.length !== config.evals.length) {
    const [flagName, ids] = args.only
      ? ["--only", args.only]
      : ["--skip", args.skip ?? []];
    console.log(
      ` selection: ${selectedEvals.length} of ${config.evals.length} evals (${flagName} ${ids.join(", ")})`,
    );
  }
  if (args.noStage)
    console.log(
      " staging: disabled (--no-stage) — skills will be inlined into dispatch_prompt for harnesses without project-local skill discovery",
    );

  ensureDir(iterationDir);
  // Keep a copy of the current SKILL.md alongside the iteration's results.
  cpSync(skillMd, join(iterationDir, "skill-snapshot.md"));

  // Always disarm a prior run's guard before re-staging, so a crashed run can't
  // leave the write-blocking hook armed across runs.
  teardownGuard(ctx.stageRoot);

  if (!args.noStage) cleanupStagedSkills(ctx.stageRoot);

  if (!args.noStage) {
    stageSiblingSkills({
      skillUnderTest: ctx.skillName,
      skillsSourceDir: ctx.skillDir,
      repoRoot: ctx.stageRoot,
    });
  }

  const bootstrapContent =
    ctx.bootstrapPath !== null ? readFileSync(ctx.bootstrapPath, "utf8") : null;

  // `--plan-mode` (issue #142): inject the harness's verbatim plan-mode
  // procedure as an operating-context layer. The profile is a bundled asset
  // resolved relative to this runner (mirroring the guard-script resolution
  // below) and keyed by harness, so a harness without a profile simply has no
  // `--plan-mode` and the portable dispatch contract is unchanged.
  const planModeContent = args.planMode
    ? resolvePlanModeProfile(ctx.harness)
    : null;
  if (args.planMode)
    console.log(
      ` plan-mode: injecting ${ctx.harness} plan-mode profile as operating context (issue #142; necessary-not-sufficient fidelity layer)`,
    );

  // Sibling skill metadata, shared across conditions. Empty when --no-stage
  // (nothing is staged, so nothing is discoverable to list).
  const siblingSkills: AvailableSkill[] = args.noStage
    ? []
    : ctx.siblingSkillNames.map((name) => {
        const p = join(ctx.skillDir, name, "SKILL.md");
        return { name, path: p, description: getSkillDescription(p) };
      });

  // `--stage-name` overrides the conspicuous `slow-powers-eval-…` slug with a
  // verbatim name (issue #144 Step 2: A/B a natural name against the eval slug).
  // It targets the single staging condition, so reject the case where both
  // conditions stage (e.g. revision mode) — one name can't cover two dirs — and
  // refuse to clobber a dir that already exists (a real project skill the user
  // owns; cleanup has already removed our own prior custom dirs by this point).
  if (args.stageName !== undefined && !args.noStage) {
    if (skillPathForA !== null && skillPathForB !== null) {
      die(
        "--stage-name is only supported when exactly one condition stages the skill (e.g. --mode new-skill); both conditions stage here.",
      );
    }
    const target = join(ctx.stageRoot, ".claude", "skills", args.stageName);
    if (existsSync(target)) {
      die(
        `--stage-name "${args.stageName}": ${target} already exists; refusing to clobber it. Remove it or choose a different name.`,
      );
    }
  }

  // Stages one condition's SKILL.md and returns its slug, or null when the
  // condition has no skill or staging is disabled.
  const stageFor = (
    condName: string,
    condSkillPath: string | null,
  ): string | null => {
    if (!condSkillPath || args.noStage) return null;
    return stageSkillForCC({
      content: readFileSync(condSkillPath, "utf8"),
      iteration,
      condition: condName,
      skillName: ctx.skillName,
      repoRoot: ctx.stageRoot,
      stageNameOverride: args.stageName,
    });
  };

  const conditionASlug = stageFor(conditionA, skillPathForA);
  const conditionBSlug = stageFor(conditionB, skillPathForB);

  // A custom-named dir isn't caught by cleanupStagedSkills's prefix scan; record
  // it in the sibling manifest so the next run removes it.
  if (
    args.stageName !== undefined &&
    (conditionASlug === args.stageName || conditionBSlug === args.stageName)
  ) {
    registerStagedSkillForCleanup(ctx.stageRoot, args.stageName);
  }

  const conditions: ConditionsRecord = {
    mode: args.mode,
    baseline: args.baseline,
    conditions: [
      {
        name: conditionA,
        skill_path: skillPathForA,
        staged_skill_slug: conditionASlug,
      },
      {
        name: conditionB,
        skill_path: skillPathForB,
        staged_skill_slug: conditionBSlug,
      },
    ],
    timestamp: new Date().toISOString(),
    harness: ctx.harness,
    run_nonce: runNonce,
  };
  writeJson(join(iterationDir, "conditions.json"), conditions);

  // availableSkills for a condition = siblings + the skill-under-test when
  // that condition loads it. Empty when nothing was staged.
  const availableSkillsFor = (
    condSkillPath: string | null,
  ): AvailableSkill[] => {
    if (args.noStage) return [];
    const skills = [...siblingSkills];
    if (condSkillPath) {
      skills.push({
        name: ctx.skillName,
        path: condSkillPath,
        description: getSkillDescription(condSkillPath),
      });
    }
    return skills;
  };

  // One dispatch task per (eval, condition) pair, each with its own
  // `eval-<id>/<condition>/outputs` directory and copied fixtures.
  const tasks: DispatchTask[] = [];
  for (const ev of selectedEvals) {
    const evalDir = join(iterationDir, `eval-${ev.id}`);
    ensureDir(evalDir);

    for (const [condName, condSkillPath, condSlug] of [
      [conditionA, skillPathForA, conditionASlug],
      [conditionB, skillPathForB, conditionBSlug],
    ] as const) {
      const condDir = join(evalDir, condName);
      const outputsDir = join(condDir, "outputs");
      ensureDir(outputsDir);

      const fixtures = copyFixtures(ev, skillDir, condDir);
      tasks.push(
        buildDispatchTask({
          evalId: ev.id,
          condition: condName,
          skillPath: condSkillPath,
          stagedSkillSlug: condSlug,
          userPrompt: ev.prompt,
          fixtures,
          outputsDir,
          condDir,
          bootstrapContent,
          planModeContent,
          skillName: ctx.skillName,
          availableSkills: availableSkillsFor(condSkillPath),
          runTag,
        }),
      );
    }
  }

  const manifestPath = join(iterationDir, "dispatch-manifest.md");
  writeFileSync(
    manifestPath,
    buildManifest({
      skillName: ctx.skillName,
      mode: args.mode,
      baseline: args.baseline,
      iteration,
      tasks,
    }),
  );

  // Write each prompt to its own file and reference it by path in dispatch.json.
  // The orchestrator then dispatches with a short "read this file" prompt instead
  // of reproducing the full prompt verbatim per Task call.
  for (const task of tasks) {
    writeFileSync(task.dispatch_prompt_path, task.dispatch_prompt);
  }

  const dispatchJsonPath = join(iterationDir, "dispatch.json");
  writeJson(dispatchJsonPath, {
    skill_name: ctx.skillName,
    iteration,
    run_nonce: runNonce,
    iteration_dir: iterationDir,
    mode: args.mode,
    baseline: args.baseline ?? null,
    plan_mode: args.planMode,
    conditions: conditions.conditions,
    harness: ctx.harness,
    // The full prompt text lives in the per-task files, so it is dropped here.
    tasks: tasks.map(({ dispatch_prompt: _omit, ...rest }) => rest),
  });

  // Opt-in hard guard. Stages a PreToolUse hook that blocks subagent
  // writes/installs outside the eval sandbox while dispatches run.
  if (args.guard && !args.dryRun) {
    if (args.noStage) {
      console.warn(
        "\n⚠ --guard requires staging enabled; skipping guard install.",
      );
    } else {
      const guardScriptPath = join(import.meta.dir, "guard", "guard.ts");
      installGuard({
        stageRoot: ctx.stageRoot,
        workspaceRoot: ctx.workspaceRoot,
        guardScriptPath,
      });
      console.log(
        "\n🛡 Write guard armed: a PreToolUse hook is staged in .claude/settings.local.json\n" +
          " and will block writes/installs outside the eval sandbox during dispatches.\n" +
          " It auto-expires in 6h and is removed on the next run; to remove it now:\n" +
          " bun run evals:teardown-guard --skill <name>",
      );
    }
  }

  // Plugin-shadow preflight (Claude Code): a staged skill name that is also
  // discoverable from an enabled plugin or the global skills dir contaminates the
  // run — subagents inherit this session's plugins, so both copies are reachable.
  // The runner can't unload a plugin from a live session; it only flags it. The
  // report is persisted so the aggregator can surface it in validity_warnings.
  if (ctx.harness === "claude-code") {
    const shadowReport = detectPluginShadows({
      configDir: resolveConfigDir(),
      cwd: ctx.stageRoot,
      stagedSkillNames: [ctx.skillName, ...ctx.siblingSkillNames],
    });
    if (shadowReport.shadowed.length > 0) {
      writeJson(join(iterationDir, "plugin-shadow.json"), shadowReport);
      console.warn(formatShadowBanner(shadowReport));
    }
  }

  console.log(`\nWorkspace prepared: ${iterationDir}`);
  console.log(`Dispatch manifest: ${manifestPath}`);
  console.log(`Dispatch tasks: ${dispatchJsonPath}`);
  console.log(
    `\n${tasks.length} dispatches required (${selectedEvals.length} evals × 2 conditions).`,
  );

  if (args.dryRun) console.log("\n--dry-run: stopping after workspace prep.");
  else
    console.log(
      "\nNext: read dispatch.json, dispatch each task as a subagent, write run.json + timing.json to the paths in each task.",
    );
}
658
-
659
/**
 * One unit of work for the orchestrator: a single eval under a single
 * condition. `commandRun` builds one of these per (eval, condition) pair and
 * serializes them into `dispatch.json` with `dispatch_prompt` removed.
 */
type DispatchTask = {
  eval_id: string;
  condition: string;
  /** `SKILL.md` used by this condition, or null when the condition has no skill. */
  skill_path: string | null;
  /** Directory name the skill was staged under, or null when nothing was staged. */
  staged_skill_slug: string | null;
  user_prompt: string;
  fixtures: string[];
  outputs_dir: string;
  run_record_path: string;
  timing_path: string;
  agent_description: string;
  /**
   * Absolute path to the file holding the full dispatch prompt. The orchestrator
   * dispatches each subagent with a short "read this file and follow it" prompt
   * rather than inlining the prompt, so it never has to reproduce ~KB of text per
   * Task call. `dispatch_prompt` carries the same text in-memory (for manifest
   * building and unit tests) but is stripped from the serialized dispatch.json.
   */
  dispatch_prompt_path: string;
  dispatch_prompt: string;
};
680
-
681
- export type { AvailableSkill } from "./types";
682
-
683
/**
 * Narrows `evals` to what `--only` / `--skip` asked for.
 *
 * - Neither option given: the input array is returned as-is.
 * - `only`: keeps the listed evals, in the config's order (not the order the
 *   ids were passed in).
 * - `skip`: keeps everything except the listed evals.
 *
 * Throws an `Error` when both options are given, when the chosen list is empty,
 * or when any requested id is not present in `evals` — so a typo is reported up
 * front instead of quietly producing a surprising run. The caller is expected to
 * route the message to `die`.
 */
export function selectEvals(
  evals: Eval[],
  opts: { only?: string[]; skip?: string[] },
): Eval[] {
  const { only, skip } = opts;
  if (only && skip) {
    throw new Error("use only one of --only / --skip, not both");
  }

  const wanted = only ?? skip;
  if (wanted === undefined) return evals;
  if (wanted.length === 0) {
    throw new Error("--only/--skip requires at least one eval id");
  }

  const availableIds = new Set(evals.map((e) => e.id));
  const missing = wanted.filter((id) => !availableIds.has(id));
  if (missing.length > 0) {
    const available = [...availableIds].join(", ");
    throw new Error(
      `unknown eval id(s): ${missing.join(", ")}. ` +
        `Available ids: ${available}`,
    );
  }

  const wantedIds = new Set(wanted);
  // `only` keeps the matches, `skip` keeps the non-matches.
  const keepMatches = Boolean(only);
  return evals.filter((e) => wantedIds.has(e.id) === keepMatches);
}
715
-
716
- function copyFixtures(ev: Eval, skillDir: string, condDir: string): string[] {
717
- if (!ev.files || ev.files.length === 0) return [];
718
- const inputsDir = join(condDir, "inputs");
719
- ensureDir(inputsDir);
720
- const copied: string[] = [];
721
- for (const f of ev.files) {
722
- const src = join(skillDir, "evals", f);
723
- if (!existsSync(src)) die(`fixture not found: ${src}`);
724
- const dst = join(inputsDir, basename(f));
725
- if (statSync(src).isDirectory()) cpSync(src, dst, { recursive: true });
726
- else cpSync(src, dst);
727
- copied.push(dst);
728
- }
729
- return copied;
730
- }
731
-
732
- /**
733
- * Resolve the verbatim plan-mode procedure profile for a harness (issue #142).
734
- * The profile is a bundled supporting-file asset under
735
- * `profiles/<harness>/plan-mode.md`, resolved relative to this runner exactly
736
- * like the guard script (`join(import.meta.dir, "guard", "guard.ts")`). A
737
- * harness without a profile gets a clear error rather than a silent no-op — the
738
- * profile is Claude-tier fidelity, and a harness lacking one leaves the portable
739
- * dispatch contract unchanged (no `<system-reminder>` plan-mode block emitted).
740
- */
741
- function resolvePlanModeProfile(harness: Harness): string {
742
- const profilePath = join(
743
- import.meta.dir,
744
- "profiles",
745
- harness,
746
- "plan-mode.md",
747
- );
748
- if (!existsSync(profilePath)) {
749
- die(
750
- `--plan-mode: no plan-mode profile exists for harness '${harness}' ` +
751
- `(expected ${profilePath}). This is a Claude-tier fidelity layer; a ` +
752
- "harness without a profile leaves the portable dispatch contract unchanged.",
753
- );
754
- }
755
- return readFileSync(profilePath, "utf8");
756
- }
757
-
758
- function getSkillDescription(skillPath: string): string {
759
- try {
760
- const content = readFileSync(skillPath, "utf8");
761
- const match = content.match(/description:\s*([^\n\r]+)/);
762
- if (match) {
763
- let desc = match[1].trim();
764
- if (
765
- (desc.startsWith('"') && desc.endsWith('"')) ||
766
- (desc.startsWith("'") && desc.endsWith("'"))
767
- ) {
768
- desc = desc.slice(1, -1).trim();
769
- }
770
- return desc;
771
- }
772
- } catch {}
773
- return "No description available.";
774
- }
775
-
776
- /**
777
- * Removes the skill-under-test's "Active Skills Directory" entry from bootstrap
778
- * content so a skill-absent condition (e.g. `without_skill`) carries no
779
- * reference to it. Targets the markdown list-item block: a top-level `*`/`-`
780
- * bullet whose backticked name equals `skillName`, plus its indented
781
- * continuation lines (the `*Trigger:*` sub-bullet). Sibling entries and the
782
- * heading are left intact. The eval bootstrap names skills only in that
783
- * directory, so this is the sole reference vector to scrub.
784
- */
785
- export function redactSkillFromBootstrap(
786
- content: string,
787
- skillName: string,
788
- ): string {
789
- const out: string[] = [];
790
- let skipping = false;
791
- for (const line of content.split("\n")) {
792
- if (skipping) {
793
- // Indented continuation lines belong to the entry being dropped.
794
- if (/^\s+\S/.test(line)) continue;
795
- skipping = false;
796
- }
797
- if (/^[*-]\s/.test(line) && line.includes(`\`${skillName}\``)) {
798
- skipping = true;
799
- continue;
800
- }
801
- out.push(line);
802
- }
803
- return out.join("\n");
804
- }
805
-
806
- export function buildDispatchTask(opts: {
807
- evalId: string;
808
- condition: string;
809
- skillPath: string | null;
810
- stagedSkillSlug: string | null;
811
- userPrompt: string;
812
- fixtures: string[];
813
- outputsDir: string;
814
- condDir: string;
815
- bootstrapContent: string | null;
816
- /**
817
- * Verbatim plan-mode procedure profile (from
818
- * `profiles/<harness>/plan-mode.md`) to inject as an operating-context layer,
819
- * or null/undefined to omit it. Skill-agnostic, so it is identical across the
820
- * with/without-skill arms and needs no redaction. Set by the `--plan-mode`
821
- * flag (issue #142): the highest-fidelity in-runner approximation of a real
822
- * plan mode, still text the agent reads — a necessary-not-sufficient signal.
823
- */
824
- planModeContent?: string | null;
825
- skillName: string;
826
- availableSkills: AvailableSkill[];
827
- /**
828
- * Per-run uniqueness suffix (`i<iteration>-<nonce>`). Appended to the
829
- * dispatch description so transcripts can't collide across iterations or
830
- * re-runs. Omitted in unit tests that exercise prompt assembly directly.
831
- */
832
- runTag?: string;
833
- }): DispatchTask {
834
- const stagedSkills = [...opts.availableSkills].sort((a, b) =>
835
- a.name.localeCompare(b.name),
836
- );
837
-
838
- let skillBlock: string;
839
- if (opts.stagedSkillSlug) {
840
- // Neutral slug disambiguation only — no imperative to invoke. The skill is
841
- // staged under a unique slug; surface that identifier so a deliberate
842
- // invocation targets the staged copy and the __skill_invoked meta-check can
843
- // find it. Do NOT assert a plugin is "loaded" or tell the agent to prefer the
844
- // slug "rather than the bare name": in an isolated run there is no global copy,
845
- // and that framing invited the agent to hunt for one (issue #144 global-plugin
846
- // leakage). Whether to invoke is left to the skill's own triggering (dropping
847
- // the old "invoke if it applies" directive was the issue #119 ceiling fix).
848
- skillBlock = [
849
- `The \`${opts.skillName}\` skill is registered under the identifier \`${opts.stagedSkillSlug}\` and is discoverable via the Skill tool. If you invoke it, use that identifier.`,
850
- ].join("\n");
851
- } else if (opts.skillPath) {
852
- skillBlock = [
853
- "The following skill is loaded into your operating guidelines. Apply it where relevant to the user's request.",
854
- "",
855
- `<skill name="${basename(dirname(opts.skillPath))}">`,
856
- readFileSync(opts.skillPath, "utf8").trim(),
857
- "</skill>",
858
- ].join("\n");
859
- } else if (stagedSkills.length > 0 || opts.bootstrapContent) {
860
- // Skill-absent arm in a realistic environment: stay silent. The
861
- // available-skills block already omits the skill-under-test, so any
862
- // commentary here would only announce the eval (and, in the control arm,
863
- // draw attention to the very skill that is supposed to be absent).
864
- skillBlock = "";
865
- } else {
866
- skillBlock = "No skill is loaded. Respond as you naturally would.";
867
- }
868
-
869
- const fixturesBlock = opts.fixtures.length
870
- ? `Available fixture files:\n${opts.fixtures.map((f) => ` - ${f}`).join("\n")}`
871
- : "Available fixture files: none";
872
-
873
- // A dispatch mirrors a real session by carrying two *separate* surfaces, the
874
- // way the harness actually delivers them:
875
- // 1. The verbatim --bootstrap file (the SessionStart-hook equivalent),
876
- // wrapped in <session-start-context>, if supplied.
877
- // 2. The list of discoverable skills, rendered in the harness's native
878
- // presentation as its own block (see adapters/claude-code-session.ts).
879
- // A condition that does not load the skill-under-test (the new-skill
880
- // `without_skill` arm, under staging or --no-stage) must carry zero reference
881
- // to it. The skill-under-test is auto-omitted from the available-skills block
882
- // (see `availableSkillsFor`). redactSkillFromBootstrap covers the other path:
883
- // a *user-supplied* --bootstrap that names the skill in its own prose would
884
- // otherwise leak it into the control arm. (The shipped bootstrap.md no longer
885
- // enumerates skills, so that redaction is a no-op against it.)
886
- const skillAbsent = !opts.skillPath && !opts.stagedSkillSlug;
887
- const effectiveBootstrap =
888
- opts.bootstrapContent && skillAbsent
889
- ? redactSkillFromBootstrap(opts.bootstrapContent, opts.skillName)
890
- : opts.bootstrapContent;
891
-
892
- const sections: string[] = [];
893
- if (effectiveBootstrap) {
894
- sections.push(
895
- [
896
- "<session-start-context>",
897
- "The following guidelines were loaded at session start by the slow-powers plugin",
898
- "(equivalent to the SessionStart hook firing in a real user's environment):",
899
- "",
900
- effectiveBootstrap.trim(),
901
- "</session-start-context>",
902
- "",
903
- ].join("\n"),
904
- );
905
- }
906
- const availableSkillsBlock = renderAvailableSkillsBlock(stagedSkills);
907
- if (availableSkillsBlock) {
908
- sections.push(`${availableSkillsBlock}\n\n`);
909
- }
910
- // Plan-mode operating context (issue #142). Injected as its own block after
911
- // the session-start surfaces and before the eval task framing, so it reads as
912
- // a session-level mode active for this turn — layered the way the real harness
913
- // delivers it, not as seed prose. Skill-agnostic: identical in both arms.
914
- const planModeBlock = opts.planModeContent
915
- ? renderPlanModeContext(opts.planModeContent)
916
- : "";
917
- if (planModeBlock) {
918
- sections.push(`${planModeBlock}\n\n`);
919
- }
920
- const taskLines = [
921
- "You are executing a single test case for a skill evaluation framework.",
922
- "Treat this as a real user request — do NOT optimize behavior for the eval.",
923
- ];
924
- if (skillBlock) taskLines.push("", skillBlock);
925
- taskLines.push(
926
- "",
927
- fixturesBlock,
928
- `Output directory: ${opts.outputsDir}`,
929
- "",
930
- "Instructions:",
931
- "- Write any files you produce into the output directory.",
932
- `- After completing the task, write your final user-facing response to ${opts.outputsDir}/final-message.md.`,
933
- "- Do not write outside the output directory.",
934
- "",
935
- "User request:",
936
- opts.userPrompt,
937
- );
938
- sections.push(taskLines.join("\n"));
939
-
940
- return {
941
- eval_id: opts.evalId,
942
- condition: opts.condition,
943
- skill_path: opts.skillPath,
944
- staged_skill_slug: opts.stagedSkillSlug,
945
- user_prompt: opts.userPrompt,
946
- fixtures: opts.fixtures,
947
- outputs_dir: opts.outputsDir,
948
- run_record_path: join(opts.condDir, "run.json"),
949
- timing_path: join(opts.condDir, "timing.json"),
950
- agent_description: opts.runTag
951
- ? `${opts.evalId}:${opts.condition}:${opts.runTag}`
952
- : `${opts.evalId}:${opts.condition}`,
953
- dispatch_prompt_path: join(opts.condDir, "dispatch-prompt.txt"),
954
- dispatch_prompt: sections.join(""),
955
- };
956
- }
957
-
958
- function buildManifest(opts: {
959
- skillName: string;
960
- mode: Mode;
961
- baseline?: string;
962
- iteration: number;
963
- tasks: DispatchTask[];
964
- }): string {
965
- const header = [
966
- `# Dispatch manifest — ${opts.skillName} iteration-${opts.iteration}`,
967
- "",
968
- `Mode: ${opts.mode}${opts.baseline ? ` (baseline: ${opts.baseline})` : ""}`,
969
- `Generated: ${new Date().toISOString()}`,
970
- `Total dispatches: ${opts.tasks.length}`,
971
- "",
972
- "## How to use this manifest",
973
- "",
974
- 'In an agent session, read `dispatch.json` (sibling of this file) instead of this manifest. Each task has a `dispatch_prompt_path` field pointing at the file that holds the full prompt — dispatch the subagent with a short "read this file and follow it" instruction rather than inlining the prompt — plus exact paths for `run.json` and `timing.json`.',
975
- "",
976
- "**Transcript correlation:** Each task has an `agent_description` field of the form `<eval_id>:<condition>:i<N>-<nonce>`. When dispatching the subagent via the host's primitive (e.g. Claude Code's Agent tool), pass this string verbatim as the dispatch `description` — do not reconstruct it. The per-run nonce keeps descriptions unique across iterations sharing one session's subagents dir, so the transcript adapter correlates each subagent's persisted transcript back to the right `(eval, condition)` slot without collisions.",
977
- "",
978
- "After every dispatch:",
979
- "",
980
- "1. Write `run.json` matching `skills/evaluating-skills/schema/run-record.schema.json` (enforced at runtime by grade/fill-transcripts/detect-stray-writes). Carry over `eval_id`, `condition`, `skill_path` (`null` on the without_skill arm), `prompt`, and `files` from the task; populate `final_message` from the subagent's reply; leave `tool_invocations` as `[]` for now — `evals:fill-transcripts` will populate it from the persisted transcript in a later step.",
981
- "2. Capture `total_tokens` and `duration_ms` from the harness's task completion event into `timing.json`. These values may not be persisted anywhere else — save them immediately.",
982
- "",
983
- "After all dispatches:",
984
- "",
985
- "3. (Claude Code only, optional) Run `bun run evals:fill-transcripts --skill <name> --iteration <N> --subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/` to fill `tool_invocations` from each subagent's persisted transcript. Skipping this step leaves `transcript_check` assertions unverifiable.",
986
- "4. Run `bun run evals:grade --skill <name> --iteration <N>` to grade.",
987
- "",
988
- "## Dispatches",
989
- "",
990
- ].join("\n");
991
-
992
- const entries = opts.tasks
993
- .map((t) =>
994
- [
995
- `### ${t.eval_id} / ${t.condition}`,
996
- "",
997
- `- run.json: ${t.run_record_path}`,
998
- `- timing.json: ${t.timing_path}`,
999
- "",
1000
- "```",
1001
- t.dispatch_prompt,
1002
- "```",
1003
- "",
1004
- ].join("\n"),
1005
- )
1006
- .join("\n");
1007
-
1008
- return header + entries;
1009
- }
1010
-
1011
- if (import.meta.main) {
1012
- const argv = Bun.argv.slice(2);
1013
- const args = parseArgs(argv);
1014
- let ctx: RunContext;
1015
- try {
1016
- ctx = detectRunContext(argv);
1017
- } catch (err) {
1018
- die(err instanceof Error ? err.message : String(err));
1019
- }
1020
- if (args.command === "snapshot") commandSnapshot(args, ctx);
1021
- else if (args.command === "teardown-guard") {
1022
- const torn = teardownGuard(ctx.stageRoot);
1023
- console.log(
1024
- torn
1025
- ? "🛡 Write guard removed."
1026
- : "No write guard was installed — nothing to remove.",
1027
- );
1028
- } else commandRun(args, ctx);
1029
- }