@slowdini/slow-powers-opencode 0.1.5 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/README.md +32 -13
  2. package/package.json +5 -1
  3. package/skills/auditing-slow-powers-usage/evals/evals.json +3 -3
  4. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
  5. package/skills/evaluating-skills/SKILL.md +22 -20
  6. package/skills/evaluating-skills/examples/{verification-before-completion-evals.json → verifying-development-work-evals.json} +2 -2
  7. package/skills/evaluating-skills/harness-details/claude.md +51 -15
  8. package/skills/evaluating-skills/harness-parity.md +155 -0
  9. package/skills/evaluating-skills/pressure-scenarios.md +1 -1
  10. package/skills/evaluating-skills/runner/README.md +28 -19
  11. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
  12. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
  13. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
  14. package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
  15. package/skills/evaluating-skills/runner/aggregate.ts +21 -0
  16. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
  17. package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
  18. package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
  19. package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
  20. package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
  21. package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
  22. package/skills/evaluating-skills/runner/record-runs.ts +209 -0
  23. package/skills/evaluating-skills/runner/run.test.ts +523 -0
  24. package/skills/evaluating-skills/runner/run.ts +376 -17
  25. package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
  26. package/skills/evaluating-skills/runner/types.ts +9 -0
  27. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
  28. package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
  29. package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
  30. package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
  31. package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
  32. package/skills/hardening-plans/SKILL.md +1 -1
  33. package/skills/systematic-debugging/SKILL.md +4 -0
  34. package/skills/test-driven-development/SKILL.md +2 -0
  35. package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
  36. package/skills/verifying-development-work/SKILL.md +99 -0
  37. package/skills/verifying-development-work/code-review.md +68 -0
  38. package/skills/verifying-development-work/comment-review.md +85 -0
  39. package/skills/verifying-development-work/evals/baseline/BASELINE.md +23 -0
  40. package/skills/verifying-development-work/evals/baseline/NOTES.md +87 -0
  41. package/skills/verifying-development-work/evals/baseline/benchmark.json +54 -0
  42. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
  43. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
  44. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
  45. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
  46. package/skills/verifying-development-work/evals/evals.json +178 -0
  47. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
  48. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
  49. package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.test.ts +14 -0
  50. package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.ts +24 -0
  51. package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.test.ts +25 -0
  52. package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.ts +18 -0
  53. package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.test.ts +19 -0
  54. package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.ts +24 -0
  55. package/skills/working-in-isolation/SKILL.md +2 -2
  56. package/skills/writing-skills/SKILL.md +2 -3
  57. package/skills/finishing-a-development-branch/SKILL.md +0 -96
  58. package/skills/finishing-a-development-branch/evals/evals.json +0 -41
  59. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +0 -4
  60. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +0 -5
  61. package/skills/verification-before-completion/SKILL.md +0 -65
  62. package/skills/verification-before-completion/evals/baseline/BASELINE.md +0 -22
  63. package/skills/verification-before-completion/evals/baseline/NOTES.md +0 -75
  64. package/skills/verification-before-completion/evals/baseline/benchmark.json +0 -51
  65. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  66. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  67. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
  68. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  69. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  70. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  71. package/skills/verification-before-completion/evals/evals.json +0 -77
  72. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/api.ts +0 -0
  73. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/consumer.ts +0 -0
  74. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/tsconfig.json +0 -0
  75. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.test.ts +0 -0
  76. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.ts +0 -0
@@ -31,6 +31,7 @@ import type {
31
31
  EvalsConfig,
32
32
  } from "./types";
33
33
  import { validateEvalsConfig } from "./validate";
34
+ import { cleanupWorkspace, SNAPSHOT_META } from "./workspace-teardown";
34
35
 
35
36
  export const STAGED_SKILL_PREFIX = "slow-powers-eval-";
36
37
  export const STAGED_SIBLING_MANIFEST = ".slow-powers-eval-manifest.json";
@@ -41,6 +42,16 @@ export function stageSkillForCC(opts: {
41
42
  condition: string;
42
43
  skillName: string;
43
44
  repoRoot: string;
45
+ /**
46
+ * Source skill directory whose sibling assets are copied alongside the staged
47
+ * SKILL.md — everything next to SKILL.md except SKILL.md itself, the `evals/`
48
+ * dir, and the snapshot bookkeeping file. A multi-file skill whose SKILL.md
49
+ * links a sibling (e.g. `[code-review.md](code-review.md)`) would otherwise be
50
+ * staged with a dangling link: the agent can't resolve the reference relative
51
+ * to the staged dir, so the linked guidance is silently unreachable. Mirrors
52
+ * the sibling-asset copy in `snapshot`. Omit to stage SKILL.md alone.
53
+ */
54
+ assetsDir?: string;
44
55
  /**
45
56
  * When set, stage under this verbatim identifier instead of the conspicuous
46
57
  * `slow-powers-eval-…` slug. Used by `--stage-name` to A/B a natural name
@@ -56,6 +67,16 @@ export function stageSkillForCC(opts: {
56
67
  const skillDir = join(opts.repoRoot, ".claude", "skills", slug);
57
68
  mkdirSync(skillDir, { recursive: true });
58
69
  writeFileSync(join(skillDir, "SKILL.md"), opts.content);
70
+ if (opts.assetsDir !== undefined && existsSync(opts.assetsDir)) {
71
+ for (const entry of readdirSync(opts.assetsDir)) {
72
+ if (entry === "SKILL.md" || entry === "evals" || entry === SNAPSHOT_META)
73
+ continue;
74
+ const src = join(opts.assetsDir, entry);
75
+ const dst = join(skillDir, entry);
76
+ if (statSync(src).isDirectory()) cpSync(src, dst, { recursive: true });
77
+ else cpSync(src, dst);
78
+ }
79
+ }
59
80
  return slug;
60
81
  }
61
82
 
@@ -78,6 +99,7 @@ export function registerStagedSkillForCleanup(
78
99
  manifest = {
79
100
  created_at: new Date().toISOString(),
80
101
  staged_under_test: name,
102
+ skills_dir_preexisting: true,
81
103
  created_entries: [],
82
104
  };
83
105
  }
@@ -89,6 +111,14 @@ export function registerStagedSkillForCleanup(
89
111
  type SiblingManifest = {
90
112
  created_at: string;
91
113
  staged_under_test: string;
114
+ /**
115
+ * Whether `.claude/skills` already existed when staging began. When false the
116
+ * runner created it, so {@link cleanupStagedSkills} may remove the whole tree
117
+ * (and prune an emptied `.claude`); when true (or absent, on older manifests)
118
+ * cleanup falls back to the surgical per-entry restore so a user's own
119
+ * project skills are left intact.
120
+ */
121
+ skills_dir_preexisting?: boolean;
92
122
  created_entries: Array<{
93
123
  name: string;
94
124
  preexisting: boolean;
@@ -102,6 +132,7 @@ export function stageSiblingSkills(opts: {
102
132
  repoRoot: string;
103
133
  }): SiblingManifest {
104
134
  const skillsDir = join(opts.repoRoot, ".claude", "skills");
135
+ const skillsDirPreexisting = existsSync(skillsDir);
105
136
  mkdirSync(skillsDir, { recursive: true });
106
137
 
107
138
  const siblings = readdirSync(opts.skillsSourceDir).filter((name) => {
@@ -114,6 +145,7 @@ export function stageSiblingSkills(opts: {
114
145
  const manifest: SiblingManifest = {
115
146
  created_at: new Date().toISOString(),
116
147
  staged_under_test: opts.skillUnderTest,
148
+ skills_dir_preexisting: skillsDirPreexisting,
117
149
  created_entries: [],
118
150
  };
119
151
 
@@ -153,8 +185,18 @@ export function stageSiblingSkills(opts: {
153
185
  return manifest;
154
186
  }
155
187
 
/**
 * Deletes `dir` when — and only when — it is present and has no entries.
 *
 * Lets cleanup drop a `.claude` directory the runner emptied, while never
 * touching one that still contains the user's own files (for example
 * `settings.json`).
 */
function pruneIfEmpty(dir: string): void {
  if (!existsSync(dir)) return;
  const remaining = readdirSync(dir);
  if (remaining.length > 0) return;
  rmSync(dir, { recursive: true, force: true });
}
196
+
156
197
  export function cleanupStagedSkills(repoRoot: string): void {
157
- const skillsDir = join(repoRoot, ".claude", "skills");
198
+ const claudeDir = join(repoRoot, ".claude");
199
+ const skillsDir = join(claudeDir, "skills");
158
200
  if (!existsSync(skillsDir)) return;
159
201
 
160
202
  for (const entry of readdirSync(skillsDir)) {
@@ -171,6 +213,18 @@ export function cleanupStagedSkills(repoRoot: string): void {
171
213
  rmSync(manifestPath, { force: true });
172
214
  return;
173
215
  }
216
+
217
+ // The runner created `.claude/skills` this run, so it can't be holding any of
218
+ // the user's own skills — remove the whole staged tree (including any stray,
219
+ // non-prefixed dirs a recursive eval left behind), then prune an emptied
220
+ // `.claude`. In a real project `.claude/settings.json` keeps `.claude`
221
+ // non-empty, so only the scaffolding we created is removed.
222
+ if (manifest.skills_dir_preexisting === false) {
223
+ rmSync(skillsDir, { recursive: true, force: true });
224
+ pruneIfEmpty(claudeDir);
225
+ return;
226
+ }
227
+
174
228
  for (const e of manifest.created_entries) {
175
229
  const target = join(skillsDir, e.name);
176
230
  rmSync(target, { recursive: true, force: true });
@@ -185,7 +239,13 @@ export function cleanupStagedSkills(repoRoot: string): void {
185
239
  type Mode = "new-skill" | "revision";
186
240
 
187
241
  type Args = {
188
- command: "run" | "snapshot" | "teardown-guard";
242
+ command:
243
+ | "run"
244
+ | "snapshot"
245
+ | "teardown-guard"
246
+ | "teardown"
247
+ | "ingest"
248
+ | "finalize";
189
249
  mode?: Mode;
190
250
  baseline?: string;
191
251
  label?: string;
@@ -197,6 +257,8 @@ type Args = {
197
257
  guard: boolean;
198
258
  stageName?: string;
199
259
  planMode: boolean;
260
+ ref?: string;
261
+ subagentsDir?: string;
200
262
  };
201
263
 
202
264
  function die(msg: string): never {
@@ -204,14 +266,59 @@ function die(msg: string): never {
204
266
  process.exit(1);
205
267
  }
206
268
 
/**
 * Fetches the raw contents of `relPath` as recorded at `ref` by running
 * `git show <ref>:./<relPath>` from `cwd`; the `./` prefix makes git resolve
 * the path relative to `cwd`.
 *
 * Git is spawned directly with an argv array (no shell), so neither the ref
 * nor the path is interpolated into a shell string.
 *
 * @returns The captured stdout as a Buffer — write it as-is rather than
 *   decoding it to a string, so binary assets survive unchanged — or `null`
 *   when git exits non-zero (e.g. the object doesn't exist at that ref).
 */
function gitShowBytes(
  cwd: string,
  ref: string,
  relPath: string,
): Buffer | null {
  const objectSpec = `${ref}:./${relPath}`;
  const { exitCode, stdout } = Bun.spawnSync(["git", "show", objectSpec], {
    cwd,
    stdout: "pipe",
    stderr: "pipe",
  });
  return exitCode === 0 ? Buffer.from(stdout) : null;
}
290
+
/**
 * Lists every file under `cwd` as it existed at `ref`, as paths relative to
 * `cwd` (ls-tree reports names relative to the working directory unless
 * `--full-name` is given).
 *
 * `-z` makes git emit each path verbatim and NUL-terminated. Without it git
 * C-quotes "unusual" names (non-ASCII bytes under the default
 * `core.quotePath`, double quotes, control characters); the quoted form does
 * not resolve when handed back to `git show`, so such a file would be
 * silently dropped by the caller. Paths are deliberately not trimmed, so a
 * name with leading or trailing whitespace is returned intact.
 *
 * `die`s with git's stderr when ls-tree fails — a bad ref or a `cwd` outside
 * any repository surfaces here.
 *
 * @param cwd Directory to run git from; listed paths are relative to it.
 * @param ref Any tree-ish git accepts (branch, tag, commit).
 * @returns The listed paths, empty entries removed.
 */
function gitLsFiles(cwd: string, ref: string): string[] {
  const res = Bun.spawnSync(
    ["git", "ls-tree", "-r", "-z", "--name-only", ref, "."],
    {
      cwd,
      stdout: "pipe",
      stderr: "pipe",
    },
  );
  if (res.exitCode !== 0)
    die(`git ls-tree failed for ref ${ref}: ${res.stderr.toString().trim()}`);
  return res.stdout
    .toString()
    .split("\0")
    .filter((path) => path.length > 0);
}
310
+
207
311
  function parseArgs(argv: string[]): Args {
208
312
  const positionals = argv.filter((a) => !a.startsWith("--"));
313
+ const COMMANDS: Args["command"][] = [
314
+ "snapshot",
315
+ "teardown-guard",
316
+ "teardown",
317
+ "ingest",
318
+ "finalize",
319
+ ];
209
320
  const command: Args["command"] =
210
- positionals[0] === "snapshot"
211
- ? "snapshot"
212
- : positionals[0] === "teardown-guard"
213
- ? "teardown-guard"
214
- : "run";
321
+ COMMANDS.find((c) => c === positionals[0]) ?? "run";
215
322
 
216
323
  const flag = (name: string): string | undefined => {
217
324
  const i = argv.indexOf(`--${name}`);
@@ -252,6 +359,8 @@ function parseArgs(argv: string[]): Args {
252
359
  guard: has("guard"),
253
360
  stageName: flag("stage-name"),
254
361
  planMode: has("plan-mode"),
362
+ ref: flag("ref"),
363
+ subagentsDir: flag("subagents-dir"),
255
364
  };
256
365
  }
257
366
 
@@ -289,8 +398,6 @@ function conditionNamesFor(mode: Mode): [string, string] {
289
398
  function commandSnapshot(args: Args, ctx: RunContext): void {
290
399
  if (!args.label) die("snapshot requires --label <name>");
291
400
  const skillDir = ctx.skillSubdir;
292
- const skillMd = join(skillDir, "SKILL.md");
293
- if (!existsSync(skillMd)) die(`skill not found: ${skillMd}`);
294
401
 
295
402
  const destDir = join(
296
403
  ctx.workspaceRoot,
@@ -303,6 +410,14 @@ function commandSnapshot(args: Args, ctx: RunContext): void {
303
410
  `snapshot already exists: ${destDir}\n` +
304
411
  " Use a different --label or delete the existing snapshot first.",
305
412
  );
413
+
414
+ if (args.ref !== undefined) {
415
+ snapshotFromRef(args.ref, skillDir, destDir, ctx.skillName);
416
+ return;
417
+ }
418
+
419
+ const skillMd = join(skillDir, "SKILL.md");
420
+ if (!existsSync(skillMd)) die(`skill not found: ${skillMd}`);
306
421
  ensureDir(destDir);
307
422
 
308
423
  cpSync(skillMd, join(destDir, "SKILL.md"));
@@ -314,9 +429,53 @@ function commandSnapshot(args: Args, ctx: RunContext): void {
314
429
  else cpSync(src, dst);
315
430
  }
316
431
 
432
+ // Record provenance so teardown keeps this (working-tree) snapshot — unlike a
433
+ // ref snapshot, it can't be regenerated from git.
434
+ writeJson(join(destDir, SNAPSHOT_META), { source: "working-tree" });
435
+
317
436
  console.log(`Snapshotted ${ctx.skillName} → ${destDir}`);
318
437
  }
319
438
 
/**
 * Writes a snapshot of the skill — SKILL.md plus its sibling assets — exactly
 * as it was committed at `ref` (issue #122). Contents come from git via
 * `gitShowBytes`/`gitLsFiles`, run from `skillDir`, so the working tree is
 * never read or modified.
 *
 * Anything under `evals/` is left out, as is done for the working-tree
 * snapshot. `die`s when SKILL.md can't be read at `ref` (bad ref, skill absent
 * at that ref, or `skillDir` not inside a git repo).
 *
 * @param ref       Git ref to read the skill from.
 * @param skillDir  Skill directory; git runs from here and paths are relative to it.
 * @param destDir   Snapshot directory to create and populate.
 * @param skillName Skill name, used only in the confirmation message.
 */
function snapshotFromRef(
  ref: string,
  skillDir: string,
  destDir: string,
  skillName: string,
): void {
  const skillMdBytes = gitShowBytes(skillDir, ref, "SKILL.md");
  if (skillMdBytes === null) {
    die(
      `skill not found at ${ref}: ${join(skillDir, "SKILL.md")}\n` +
        " Check the ref exists and that the skill was present there (and that this is a git repo).",
    );
  }

  ensureDir(destDir);
  writeFileSync(join(destDir, "SKILL.md"), skillMdBytes);

  // SKILL.md is already written above; the evals tree is never snapshotted.
  const isSiblingAsset = (p: string): boolean =>
    p !== "SKILL.md" && p !== "evals" && !p.startsWith("evals/");

  for (const assetPath of gitLsFiles(skillDir, ref).filter(isSiblingAsset)) {
    const contents = gitShowBytes(skillDir, ref, assetPath);
    // Listed but unreadable (e.g. submodule/gitlink) — nothing to copy.
    if (contents === null) continue;
    const target = join(destDir, assetPath);
    ensureDir(dirname(target));
    writeFileSync(target, contents);
  }

  // Provenance marker: a ref snapshot is fully reproducible from git, so
  // teardown is allowed to reclaim it.
  writeJson(join(destDir, SNAPSHOT_META), { source: "ref", ref });

  console.log(`Snapshotted ${skillName} at ${ref} → ${destDir}`);
}
478
+
320
479
  function commandRun(args: Args, ctx: RunContext): void {
321
480
  if (!args.mode) die("--mode required: new-skill | revision");
322
481
  if (args.mode !== "new-skill" && args.mode !== "revision")
@@ -481,6 +640,7 @@ function commandRun(args: Args, ctx: RunContext): void {
481
640
  condition: condName,
482
641
  skillName: ctx.skillName,
483
642
  repoRoot: ctx.stageRoot,
643
+ assetsDir: dirname(condSkillPath),
484
644
  stageNameOverride: args.stageName,
485
645
  });
486
646
  };
@@ -652,7 +812,7 @@ function commandRun(args: Args, ctx: RunContext): void {
652
812
  if (args.dryRun) console.log("\n--dry-run: stopping after workspace prep.");
653
813
  else
654
814
  console.log(
655
- "\nNext: read dispatch.json, dispatch each task as a subagent, write run.json + timing.json to the paths in each task.",
815
+ "\nNext: read dispatch.json and dispatch each task as a subagent. Then run `ingest --iteration <N> --subagents-dir <path>` (Claude Code), or write run.json + timing.json to the paths in each task by hand and run the chained steps individually (transcript-less harnesses).",
656
816
  );
657
817
  }
658
818
 
@@ -975,15 +1135,12 @@ function buildManifest(opts: {
975
1135
  "",
976
1136
  "**Transcript correlation:** Each task has an `agent_description` field of the form `<eval_id>:<condition>:i<N>-<nonce>`. When dispatching the subagent via the host's primitive (e.g. Claude Code's Agent tool), pass this string verbatim as the dispatch `description` — do not reconstruct it. The per-run nonce keeps descriptions unique across iterations sharing one session's subagents dir, so the transcript adapter correlates each subagent's persisted transcript back to the right `(eval, condition)` slot without collisions.",
977
1137
  "",
978
- "After every dispatch:",
979
- "",
980
- "1. Write `run.json` matching `skills/evaluating-skills/schema/run-record.schema.json` (enforced at runtime by grade/fill-transcripts/detect-stray-writes). Carry over `eval_id`, `condition`, `skill_path` (`null` on the without_skill arm), `prompt`, and `files` from the task; populate `final_message` from the subagent's reply; leave `tool_invocations` as `[]` for now — `evals:fill-transcripts` will populate it from the persisted transcript in a later step.",
981
- "2. Capture `total_tokens` and `duration_ms` from the harness's task completion event into `timing.json`. These values may not be persisted anywhere else — save them immediately.",
1138
+ "After all dispatches (Claude Code):",
982
1139
  "",
983
- "After all dispatches:",
1140
+ '1. Run `bun run evals:ingest -- --skill <name> --iteration <N> --subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/` — a fixed-order chain of record-runs (assembles every task\'s `run.json` from `dispatch.json` + the subagent\'s own `outputs/final-message.md` + the persisted transcript, and backfills `timing.json` with transcript-derived tokens/duration; never clobbers an existing record), fill-transcripts, detect-stray-writes, and grade. Optional higher-fidelity timing: write `{ "total_tokens": <n>, "duration_ms": <n>, "source": "completion-event" }` from the task completion event to `timing.json` right after a dispatch — completion-event numbers always win over the backfill.',
1141
+ "2. Dispatch the judge tasks ingest lists, then run `bun run evals:finalize -- --skill <name> --iteration <N>` for the benchmark.",
984
1142
  "",
985
- "3. (Claude Code only, optional) Run `bun run evals:fill-transcripts --skill <name> --iteration <N> --subagents-dir ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/` to fill `tool_invocations` from each subagent's persisted transcript. Skipping this step leaves `transcript_check` assertions unverifiable.",
986
- "4. Run `bun run evals:grade --skill <name> --iteration <N>` to grade.",
1143
+ "On a harness without persisted transcripts, instead write each task's `run.json` (matching `skills/evaluating-skills/schema/run-record.schema.json`, enforced at runtime by grade/fill-transcripts/detect-stray-writes) and `timing.json` by hand when its subagent returns: carry over `eval_id`, `condition`, `skill_path` (`null` on the without_skill arm), `prompt`, and `files` from the task; populate `final_message` from the subagent's reply; leave `tool_invocations` as `[]`; capture `total_tokens`/`duration_ms` from the task completion event immediately — they may not be persisted anywhere else.",
987
1144
  "",
988
1145
  "## Dispatches",
989
1146
  "",
@@ -1008,6 +1165,174 @@ function buildManifest(opts: {
1008
1165
  return header + entries;
1009
1166
  }
1010
1167
 
1168
+ // ---------------------------------------------------------------------------
1169
+ // ingest / finalize — fixed-order orchestrators over the sibling commands.
1170
+ //
1171
+ // The eval loop has exactly two points where only the in-harness agent can act
1172
+ // (dispatching eval subagents, dispatching judge subagents). Everything between
1173
+ // them is mechanical, so each stretch is one command: `ingest` runs the
1174
+ // post-dispatch chain and stops at the judge hand-off; `finalize` runs the
1175
+ // post-judge chain and prints the benchmark. No workspace-state inference —
1176
+ // each always runs the same steps in the same order, and every sub-step keeps
1177
+ // its own skip-if-done guard, so re-running after a fix is safe.
1178
+ // ---------------------------------------------------------------------------
1179
+
/** One orchestrated sub-step: a display label plus the argv to spawn. */
export type StepCommand = { label: string; argv: string[] };

/**
 * Builds the post-dispatch chain that `ingest` runs, in execution order:
 * record-runs → fill-transcripts → detect-stray-writes → grade.
 *
 * Every step is `bun run <runnerDir>/<label>.ts` followed by the shared
 * `--skill-dir` / `--skill` / `--iteration` flags; the two transcript-reading
 * steps additionally receive `--subagents-dir`.
 *
 * @returns The four steps, ready to hand to `runSteps`.
 */
export function buildIngestCommands(opts: {
  runnerDir: string;
  skillDir: string;
  skill: string;
  iteration: number;
  subagentsDir: string;
}): StepCommand[] {
  const { runnerDir, skillDir, skill, iteration, subagentsDir } = opts;
  const commonFlags = [
    "--skill-dir",
    skillDir,
    "--skill",
    skill,
    "--iteration",
    String(iteration),
  ];
  const transcriptFlags = ["--subagents-dir", subagentsDir];

  // Each step's label doubles as the basename of the script it runs.
  const step = (label: string, extraFlags: string[] = []): StepCommand => ({
    label,
    argv: [
      "bun",
      "run",
      join(runnerDir, `${label}.ts`),
      ...commonFlags,
      ...extraFlags,
    ],
  });

  return [
    step("record-runs", transcriptFlags),
    // record-runs already covers the records it wrote itself; this pass still
    // fills any pre-existing (agent-written) run.json whose tool_invocations
    // is empty.
    step("fill-transcripts", transcriptFlags),
    step("detect-stray-writes"),
    step("grade"),
  ];
}
1221
+
/**
 * Builds the post-judge chain that `finalize` runs, in execution order:
 * `grade.ts --finalize`, then `aggregate.ts`. Both receive the shared
 * `--skill-dir` / `--skill` / `--iteration` flags.
 *
 * @returns The two steps, ready to hand to `runSteps`.
 */
export function buildFinalizeCommands(opts: {
  runnerDir: string;
  skillDir: string;
  skill: string;
  iteration: number;
}): StepCommand[] {
  const gradeScript = join(opts.runnerDir, "grade.ts");
  const aggregateScript = join(opts.runnerDir, "aggregate.ts");
  const commonFlags = [
    "--skill-dir",
    opts.skillDir,
    "--skill",
    opts.skill,
    "--iteration",
    String(opts.iteration),
  ];

  const gradeFinalize: StepCommand = {
    label: "grade --finalize",
    argv: ["bun", "run", gradeScript, ...commonFlags, "--finalize"],
  };
  const aggregate: StepCommand = {
    label: "aggregate",
    argv: ["bun", "run", aggregateScript, ...commonFlags],
  };
  return [gradeFinalize, aggregate];
}
1253
+
/**
 * Executes `steps` one after another and halts at the first step whose exit
 * code is non-zero. Each step's label is printed as a banner before it runs.
 *
 * Halting matters: grade's `__skill_invoked` code-check silently degrades to
 * an LLM judge when `tool_invocations` is missing, so grading after a failed
 * record/fill step would quietly lose the deterministic check.
 *
 * @param steps Steps to run, in order.
 * @param spawn Runs one step and returns its exit code. Defaults to
 *   `Bun.spawnSync` with inherited stdout/stderr, treating a missing exit
 *   code as failure (1).
 * @returns `{ failedAt }` — the label of the step that failed, or `null` when
 *   every step exited 0.
 */
export function runSteps(
  steps: StepCommand[],
  spawn: (step: StepCommand) => number = (step) =>
    Bun.spawnSync(step.argv, { stdout: "inherit", stderr: "inherit" })
      .exitCode ?? 1,
): { failedAt: string | null } {
  // `find` stops at the first failing step, so later steps are never spawned.
  const firstFailure = steps.find((step) => {
    console.log(`\n── ${step.label} ──`);
    return spawn(step) !== 0;
  });
  return { failedAt: firstFailure ? firstFailure.label : null };
}
1272
+
/**
 * `ingest` command: runs the fixed post-dispatch chain (see
 * `buildIngestCommands`) and then tells the operator what to do next.
 *
 * Requires `--iteration` and `--subagents-dir`; `die`s if either is missing or
 * if any step in the chain exits non-zero. On success it reads
 * `total_tasks` from the iteration's `judge-tasks.json` to decide whether
 * judge dispatches are needed before `finalize`.
 */
function commandIngest(args: Args, ctx: RunContext): void {
  if (args.iteration === undefined) die("ingest requires --iteration <N>");
  if (!args.subagentsDir)
    die(
      "ingest requires --subagents-dir <path> (Claude Code persists subagent transcripts under ~/.claude/projects/<project-slug>/<parent-session-id>/subagents/)",
    );
  const { failedAt } = runSteps(
    buildIngestCommands({
      runnerDir: import.meta.dir,
      skillDir: ctx.skillDir,
      skill: ctx.skillName,
      iteration: args.iteration,
      subagentsDir: args.subagentsDir,
    }),
  );
  if (failedAt)
    die(
      `ingest stopped at '${failedAt}'. Fix the failure and re-run ingest — completed steps skip work that's already done.`,
    );

  const judgeTasksPath = join(
    ctx.workspaceRoot,
    ctx.skillName,
    `iteration-${args.iteration}`,
    "judge-tasks.json",
  );
  let totalTasks: number | null = null;
  try {
    totalTasks =
      readJson<{ total_tasks?: number }>(judgeTasksPath).total_tasks ?? null;
  } catch {
    // grade always writes judge-tasks.json; treat a read failure as unknown.
  }

  const finalizeCommand = `bun run evals:finalize -- --skill ${ctx.skillName} --iteration ${args.iteration}`;
  if (totalTasks === 0) {
    console.log(
      `\n✅ Ingest complete — no judge dispatches needed.\nNext: ${finalizeCommand}`,
    );
  } else {
    // Only mention a count when it is known; an unknown count previously left
    // a doubled space in the message ("Dispatch the  judge task(s)").
    const countPrefix = totalTasks === null ? "" : `${totalTasks} `;
    console.log(
      `\n✅ Ingest complete. Dispatch the ${countPrefix}judge task(s) grade listed above (judge-tasks.json), then:\n ${finalizeCommand}`,
    );
  }
}
1316
+
/**
 * `finalize` command: runs the fixed post-judge chain (see
 * `buildFinalizeCommands`) and points the operator at teardown.
 *
 * Requires `--iteration`; `die`s if it is missing or if any step in the chain
 * exits non-zero.
 */
function commandFinalize(args: Args, ctx: RunContext): void {
  const { iteration } = args;
  if (iteration === undefined) die("finalize requires --iteration <N>");

  const steps = buildFinalizeCommands({
    runnerDir: import.meta.dir,
    skillDir: ctx.skillDir,
    skill: ctx.skillName,
    iteration,
  });
  const { failedAt } = runSteps(steps);
  if (failedAt) {
    die(
      `finalize stopped at '${failedAt}'. Fix the failure and re-run finalize.`,
    );
  }

  console.log(
    `\n✅ Finalize complete. Read the benchmark above, then tear down: bun run evals:teardown --skill ${ctx.skillName}`,
  );
}
1335
+
1011
1336
  if (import.meta.main) {
1012
1337
  const argv = Bun.argv.slice(2);
1013
1338
  const args = parseArgs(argv);
@@ -1018,6 +1343,8 @@ if (import.meta.main) {
1018
1343
  die(err instanceof Error ? err.message : String(err));
1019
1344
  }
1020
1345
  if (args.command === "snapshot") commandSnapshot(args, ctx);
1346
+ else if (args.command === "ingest") commandIngest(args, ctx);
1347
+ else if (args.command === "finalize") commandFinalize(args, ctx);
1021
1348
  else if (args.command === "teardown-guard") {
1022
1349
  const torn = teardownGuard(ctx.stageRoot);
1023
1350
  console.log(
@@ -1025,5 +1352,37 @@ if (import.meta.main) {
1025
1352
  ? "🛡 Write guard removed."
1026
1353
  : "No write guard was installed — nothing to remove.",
1027
1354
  );
1355
+ } else if (args.command === "teardown") {
1356
+ // Full end-of-run teardown: disarm the guard, remove the staged skill set
1357
+ // (and prune a `.claude` the runner emptied), then reclaim the workspace —
1358
+ // leaving the user's own `.claude/settings.json`, pre-existing project
1359
+ // skills, and any uncommitted eval results intact.
1360
+ const torn = teardownGuard(ctx.stageRoot);
1361
+ cleanupStagedSkills(ctx.stageRoot);
1362
+ const ws = cleanupWorkspace(ctx.workspaceRoot, ctx.skillName);
1363
+ console.log(
1364
+ `🧹 Eval teardown complete: staged skill set removed${
1365
+ torn ? " and write guard disarmed" : ""
1366
+ }.`,
1367
+ );
1368
+ const reclaimed = ws.removedIterations.length + ws.removedSnapshots.length;
1369
+ if (reclaimed > 0) {
1370
+ console.log(
1371
+ ` Reclaimed ${ws.removedIterations.length} workspace iteration(s)` +
1372
+ ` and ${ws.removedSnapshots.length} reproducible snapshot(s).`,
1373
+ );
1374
+ }
1375
+ if (ws.keptIterations.length > 0) {
1376
+ const lines = ws.keptIterations.map(
1377
+ (k) => ` - ${k.iteration} (${k.reason})`,
1378
+ );
1379
+ console.warn(
1380
+ `⚠ Kept ${ws.keptIterations.length} workspace iteration(s) with results ` +
1381
+ `not yet committed:\n${lines.join("\n")}\n` +
1382
+ ` Commit them, e.g.:\n` +
1383
+ ` bun run evals:promote-baseline --skill ${ctx.skillName} --iteration <N>\n` +
1384
+ ` or delete ${join("skills-workspace", ctx.skillName)}/ manually to discard.`,
1385
+ );
1386
+ }
1028
1387
  } else commandRun(args, ctx);
1029
1388
  }
@@ -25,6 +25,26 @@ export const BASH_MUTATION_PATTERNS: Array<{ re: RegExp; reason: string }> = [
25
25
  re: /\bgit\s+(commit|add|push|checkout|reset|restore|merge|rebase)\b/,
26
26
  reason: "git mutation",
27
27
  },
28
+ {
29
+ re: /\bgit\s+worktree\s+add\b/,
30
+ reason: "git worktree add (working tree outside the sandbox)",
31
+ },
32
+ // A create/copy/move/link verb whose operand is a path under `.claude` —
33
+ // catches stray writes to the harness config dir that aren't a `>` redirect
34
+ // (those are caught below). Read-only verbs (`cat`, `ls`) aren't listed, so
35
+ // inspecting `.claude` stays allowed.
36
+ {
37
+ re: /\b(cp|mv|mkdir|touch|ln|rsync|install)\b[^|;&\n]*\.claude(\/|\b)/,
38
+ reason: "path under .claude",
39
+ },
40
+ // The same create verbs whose operand is a top-level `skills/` directory —
41
+ // catches a bare `skills/` left in the cwd. `skills-workspace` and other
42
+ // `skills`-prefixed names are excluded by the trailing `/`, whitespace, or
43
+ // end-of-string boundary.
44
+ {
45
+ re: /\b(cp|mv|mkdir|touch|ln|rsync)\b[^|;&\n]*[\s'"=/]\.{0,2}\/?skills(\/|\s|$)/,
46
+ reason: "creates a bare skills/ dir",
47
+ },
28
48
  { re: /(^|\s)(>>?|tee)\s/, reason: "output redirection to a file" },
29
49
  ];
30
50
 
@@ -109,4 +109,13 @@ export const SKILL_INVOKED_META_ID = "__skill_invoked";
/**
 * Token and wall-clock measurements recorded for one eval task. Every field
 * is optional, and the two measurements may also be `null`.
 */
export type TimingRecord = {
  /** Total tokens attributed to the task. */
  total_tokens?: number | null;
  /** Task duration in milliseconds. */
  duration_ms?: number | null;
  /**
   * Provenance of the numbers above.
   *
   * - `"completion-event"`: captured by the dispatching agent from the
   *   harness's task completion event.
   * - `"transcript"`: derived by record-runs from the persisted transcript.
   *   This includes cache accounting, so it is a different metric and is not
   *   comparable 1:1 with completion-event numbers.
   *
   * Absent on records written before provenance was tracked (completion-event
   * in practice).
   */
  source?: "completion-event" | "transcript";
};