@slowdini/slow-powers-opencode 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/README.md +18 -8
  2. package/package.json +5 -1
  3. package/skills/evaluating-skills/SKILL.md +19 -17
  4. package/skills/evaluating-skills/harness-details/claude.md +51 -15
  5. package/skills/evaluating-skills/harness-parity.md +155 -0
  6. package/skills/evaluating-skills/runner/README.md +28 -19
  7. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
  8. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
  9. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
  10. package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
  11. package/skills/evaluating-skills/runner/aggregate.ts +21 -0
  12. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
  13. package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
  14. package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
  15. package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
  16. package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
  17. package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
  18. package/skills/evaluating-skills/runner/record-runs.ts +209 -0
  19. package/skills/evaluating-skills/runner/run.test.ts +523 -0
  20. package/skills/evaluating-skills/runner/run.ts +376 -17
  21. package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
  22. package/skills/evaluating-skills/runner/types.ts +9 -0
  23. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
  24. package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
  25. package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
  26. package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
  27. package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
  28. package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
  29. package/skills/verifying-development-work/SKILL.md +17 -6
  30. package/skills/verifying-development-work/code-review.md +68 -0
  31. package/skills/verifying-development-work/comment-review.md +85 -0
  32. package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
  33. package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
  34. package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
  35. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
  36. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
  37. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
  38. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
  39. package/skills/verifying-development-work/evals/evals.json +34 -2
  40. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
  41. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
  42. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  43. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  44. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
  45. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  46. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  47. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  48. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
  49. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
  50. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
  51. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
@@ -11,9 +11,12 @@ import { tmpdir } from "node:os";
11
11
  import { join } from "node:path";
12
12
  import {
13
13
  buildDispatchTask,
14
+ buildFinalizeCommands,
15
+ buildIngestCommands,
14
16
  cleanupStagedSkills,
15
17
  redactSkillFromBootstrap,
16
18
  registerStagedSkillForCleanup,
19
+ runSteps,
17
20
  STAGED_SIBLING_MANIFEST,
18
21
  STAGED_SKILL_PREFIX,
19
22
  selectEvals,
@@ -21,6 +24,7 @@ import {
21
24
  stageSkillForCC,
22
25
  } from "./run";
23
26
  import type { Eval } from "./types";
27
+ import { SNAPSHOT_META } from "./workspace-teardown";
24
28
 
25
29
  const FIXTURE_ROOT = join(tmpdir(), `slow-powers-run-test-${process.pid}`);
26
30
 
@@ -121,6 +125,83 @@ describe("stageSkillForCC", () => {
121
125
  expect(readFileSync(stagedPath, "utf8")).toBe("second");
122
126
  });
123
127
 
128
+ test("copies sibling assets from assetsDir alongside the staged SKILL.md", () => {
129
+ const repoRoot = join(FIXTURE_ROOT, "stage-assets");
130
+ const assetsDir = join(FIXTURE_ROOT, "stage-assets-src");
131
+ mkdirSync(repoRoot, { recursive: true });
132
+ mkdirSync(join(assetsDir, "scripts"), { recursive: true });
133
+ writeFileSync(join(assetsDir, "SKILL.md"), "the source skill md");
134
+ writeFileSync(join(assetsDir, "code-review.md"), "review guidance");
135
+ writeFileSync(
136
+ join(assetsDir, "scripts", "helper.ts"),
137
+ "export const x = 1",
138
+ );
139
+
140
+ const slug = stageSkillForCC({
141
+ content: "staged content",
142
+ iteration: 1,
143
+ condition: "new_skill",
144
+ skillName: "s",
145
+ repoRoot,
146
+ assetsDir,
147
+ });
148
+
149
+ const stagedDir = join(repoRoot, ".claude", "skills", slug);
150
+ // SKILL.md comes from `content`, not the assetsDir copy.
151
+ expect(readFileSync(join(stagedDir, "SKILL.md"), "utf8")).toBe(
152
+ "staged content",
153
+ );
154
+ expect(readFileSync(join(stagedDir, "code-review.md"), "utf8")).toBe(
155
+ "review guidance",
156
+ );
157
+ expect(readFileSync(join(stagedDir, "scripts", "helper.ts"), "utf8")).toBe(
158
+ "export const x = 1",
159
+ );
160
+ });
161
+
162
+ test("excludes SKILL.md, evals/, and the snapshot meta file from the asset copy", () => {
163
+ const repoRoot = join(FIXTURE_ROOT, "stage-assets-excludes");
164
+ const assetsDir = join(FIXTURE_ROOT, "stage-assets-excludes-src");
165
+ mkdirSync(join(assetsDir, "evals", "fixtures"), { recursive: true });
166
+ mkdirSync(repoRoot, { recursive: true });
167
+ writeFileSync(join(assetsDir, "SKILL.md"), "src skill md");
168
+ writeFileSync(join(assetsDir, "code-review.md"), "keep me");
169
+ writeFileSync(join(assetsDir, SNAPSHOT_META), '{"source":"ref"}');
170
+ writeFileSync(join(assetsDir, "evals", "evals.json"), "{}");
171
+
172
+ const slug = stageSkillForCC({
173
+ content: "staged",
174
+ iteration: 1,
175
+ condition: "old_skill",
176
+ skillName: "s",
177
+ repoRoot,
178
+ assetsDir,
179
+ });
180
+
181
+ const stagedDir = join(repoRoot, ".claude", "skills", slug);
182
+ expect(existsSync(join(stagedDir, "code-review.md"))).toBe(true);
183
+ expect(existsSync(join(stagedDir, "evals"))).toBe(false);
184
+ expect(existsSync(join(stagedDir, SNAPSHOT_META))).toBe(false);
185
+ // SKILL.md exists (from content) but the assetsDir SKILL.md didn't overwrite it.
186
+ expect(readFileSync(join(stagedDir, "SKILL.md"), "utf8")).toBe("staged");
187
+ });
188
+
189
+ test("stages SKILL.md alone when assetsDir is omitted", () => {
190
+ const repoRoot = join(FIXTURE_ROOT, "stage-no-assets");
191
+ mkdirSync(repoRoot, { recursive: true });
192
+
193
+ const slug = stageSkillForCC({
194
+ content: "solo",
195
+ iteration: 1,
196
+ condition: "with_skill",
197
+ skillName: "s",
198
+ repoRoot,
199
+ });
200
+
201
+ const stagedDir = join(repoRoot, ".claude", "skills", slug);
202
+ expect(readdirSync(stagedDir)).toEqual(["SKILL.md"]);
203
+ });
204
+
124
205
  test("stageNameOverride stages under the verbatim name instead of the eval slug", () => {
125
206
  const repoRoot = join(FIXTURE_ROOT, "stage-override");
126
207
  mkdirSync(repoRoot, { recursive: true });
@@ -396,6 +477,81 @@ describe("cleanupStagedSkills (manifest-aware)", () => {
396
477
  });
397
478
  });
398
479
 
480
+ describe("cleanupStagedSkills (runner-created .claude/skills)", () => {
481
+ test("removes the whole .claude/skills tree when the runner created it, and prunes an empty .claude", () => {
482
+ const root = join(FIXTURE_ROOT, "cleanup-created");
483
+ mkdirSync(root, { recursive: true });
484
+ const src = join(root, "src-skills");
485
+ mkdirSync(join(src, "alpha"), { recursive: true });
486
+ writeFileSync(join(src, "alpha", "SKILL.md"), "alpha");
487
+
488
+ // .claude/skills did NOT pre-exist — stageSiblingSkills creates it.
489
+ stageSiblingSkills({
490
+ skillUnderTest: "x",
491
+ skillsSourceDir: src,
492
+ repoRoot: root,
493
+ });
494
+ // A stray, non-prefixed dir a recursive eval might have left behind.
495
+ mkdirSync(join(root, ".claude", "skills", "stray-leftover"), {
496
+ recursive: true,
497
+ });
498
+
499
+ cleanupStagedSkills(root);
500
+
501
+ expect(existsSync(join(root, ".claude", "skills"))).toBe(false);
502
+ // .claude held nothing else, so it is pruned too.
503
+ expect(existsSync(join(root, ".claude"))).toBe(false);
504
+ });
505
+
506
+ test("keeps .claude (and settings.json) when the runner created only skills/", () => {
507
+ const root = join(FIXTURE_ROOT, "cleanup-keep-settings");
508
+ const claudeDir = join(root, ".claude");
509
+ mkdirSync(claudeDir, { recursive: true });
510
+ writeFileSync(join(claudeDir, "settings.json"), "{}");
511
+ const src = join(root, "src-skills");
512
+ mkdirSync(join(src, "alpha"), { recursive: true });
513
+ writeFileSync(join(src, "alpha", "SKILL.md"), "alpha");
514
+
515
+ // .claude exists but .claude/skills does not — runner creates skills/.
516
+ stageSiblingSkills({
517
+ skillUnderTest: "x",
518
+ skillsSourceDir: src,
519
+ repoRoot: root,
520
+ });
521
+
522
+ cleanupStagedSkills(root);
523
+
524
+ expect(existsSync(join(claudeDir, "skills"))).toBe(false);
525
+ expect(existsSync(claudeDir)).toBe(true);
526
+ expect(existsSync(join(claudeDir, "settings.json"))).toBe(true);
527
+ });
528
+
529
+ test("leaves a pre-existing .claude/skills dir in place (surgical restore only)", () => {
530
+ const root = join(FIXTURE_ROOT, "cleanup-preexisting-skillsdir");
531
+ const skillsDir = join(root, ".claude", "skills");
532
+ // The user already had a .claude/skills with their own skill.
533
+ mkdirSync(join(skillsDir, "user-owned"), { recursive: true });
534
+ writeFileSync(join(skillsDir, "user-owned", "SKILL.md"), "USER");
535
+ const src = join(root, "src-skills");
536
+ mkdirSync(join(src, "alpha"), { recursive: true });
537
+ writeFileSync(join(src, "alpha", "SKILL.md"), "alpha");
538
+
539
+ stageSiblingSkills({
540
+ skillUnderTest: "x",
541
+ skillsSourceDir: src,
542
+ repoRoot: root,
543
+ });
544
+
545
+ cleanupStagedSkills(root);
546
+
547
+ expect(existsSync(skillsDir)).toBe(true);
548
+ expect(
549
+ readFileSync(join(skillsDir, "user-owned", "SKILL.md"), "utf8"),
550
+ ).toBe("USER");
551
+ expect(existsSync(join(skillsDir, "alpha"))).toBe(false);
552
+ });
553
+ });
554
+
399
555
  describe("buildDispatchTask bootstrap injection", () => {
400
556
  const baseOpts = {
401
557
  evalId: "e1",
@@ -1004,6 +1160,79 @@ describe("run.ts user-mode end-to-end (--skill-dir, isolated CWD)", () => {
1004
1160
  expect(existsSync(settingsPath)).toBe(false);
1005
1161
  });
1006
1162
 
1163
+ test("teardown removes the guard AND the staged skill set the runner created", () => {
1164
+ const { skillDir, cwd } = setup("usermode-teardown");
1165
+ const settingsPath = join(cwd, ".claude", "settings.local.json");
1166
+ const stagedSkillsDir = join(cwd, ".claude", "skills");
1167
+
1168
+ const res = runCli(
1169
+ [
1170
+ "--skill-dir",
1171
+ skillDir,
1172
+ "--skill",
1173
+ "mr-review",
1174
+ "--mode",
1175
+ "new-skill",
1176
+ "--guard",
1177
+ ],
1178
+ cwd,
1179
+ );
1180
+ expect(res.exitCode).toBe(0);
1181
+ expect(existsSync(settingsPath)).toBe(true);
1182
+ expect(existsSync(stagedSkillsDir)).toBe(true);
1183
+
1184
+ const down = runCli(
1185
+ ["teardown", "--skill-dir", skillDir, "--skill", "mr-review"],
1186
+ cwd,
1187
+ );
1188
+ expect(down.exitCode).toBe(0);
1189
+ // Guard gone, staged skills gone, and the .claude scaffolding the runner
1190
+ // created in this throwaway cwd (no settings.json) is pruned entirely.
1191
+ expect(existsSync(settingsPath)).toBe(false);
1192
+ expect(existsSync(stagedSkillsDir)).toBe(false);
1193
+ expect(existsSync(join(cwd, ".claude"))).toBe(false);
1194
+ // The run only produced scaffolding (no results), so teardown reclaims the
1195
+ // workspace too — a completed run leaves nothing uncommitted behind.
1196
+ expect(existsSync(join(cwd, "skills-workspace"))).toBe(false);
1197
+ });
1198
+
1199
+ test("teardown preserves an iteration with uncommitted results and warns", () => {
1200
+ const { skillDir, cwd } = setup("usermode-teardown-keep");
1201
+
1202
+ const res = runCli(
1203
+ ["--skill-dir", skillDir, "--skill", "mr-review", "--mode", "new-skill"],
1204
+ cwd,
1205
+ );
1206
+ expect(res.exitCode).toBe(0);
1207
+
1208
+ // Simulate a graded-but-not-promoted run: drop an aggregate into the
1209
+ // iteration the runner just created.
1210
+ const iterationDir = join(
1211
+ cwd,
1212
+ "skills-workspace",
1213
+ "mr-review",
1214
+ "iteration-1",
1215
+ );
1216
+ writeFileSync(
1217
+ join(iterationDir, "benchmark.json"),
1218
+ `${JSON.stringify({ delta: { pass_rate: 0.4 } })}\n`,
1219
+ );
1220
+
1221
+ const down = runCli(
1222
+ ["teardown", "--skill-dir", skillDir, "--skill", "mr-review"],
1223
+ cwd,
1224
+ );
1225
+ expect(down.exitCode).toBe(0);
1226
+
1227
+ // Uncommitted results are preserved, and the user is told how to commit.
1228
+ expect(existsSync(iterationDir)).toBe(true);
1229
+ const out =
1230
+ new TextDecoder().decode(down.stdout) +
1231
+ new TextDecoder().decode(down.stderr);
1232
+ expect(out).toContain("iteration-1");
1233
+ expect(out).toContain("promote-baseline");
1234
+ });
1235
+
1007
1236
  test("a normal run does not install a guard", () => {
1008
1237
  const { skillDir, cwd } = setup("usermode-noguard");
1009
1238
  const res = runCli(
@@ -1178,3 +1407,297 @@ describe("run.ts user-mode end-to-end (--skill-dir, isolated CWD)", () => {
1178
1407
  );
1179
1408
  });
1180
1409
  });
1410
+
1411
+ describe("snapshot --ref (read baseline from a git ref, issue #122)", () => {
1412
+ const RUN_TS = join(import.meta.dir, "run.ts");
1413
+
1414
+ function git(args: string[], cwd: string) {
1415
+ const res = Bun.spawnSync(
1416
+ [
1417
+ "git",
1418
+ "-c",
1419
+ "user.email=eval@test",
1420
+ "-c",
1421
+ "user.name=eval",
1422
+ "-c",
1423
+ "commit.gpgsign=false",
1424
+ ...args,
1425
+ ],
1426
+ { cwd, stdout: "pipe", stderr: "pipe" },
1427
+ );
1428
+ if (res.exitCode !== 0)
1429
+ throw new Error(`git ${args.join(" ")} failed: ${res.stderr.toString()}`);
1430
+ return res;
1431
+ }
1432
+
1433
+ function runCli(args: string[], cwd: string) {
1434
+ return Bun.spawnSync(["bun", "run", RUN_TS, ...args], {
1435
+ cwd,
1436
+ stdout: "pipe",
1437
+ stderr: "pipe",
1438
+ });
1439
+ }
1440
+
1441
+ /**
1442
+ * Builds a git repo at <root> containing a `mr-review` skill committed as v1,
1443
+ * then overwrites the working-tree SKILL.md with v2 (uncommitted). Returns the
1444
+ * paths a snapshot needs, so a test can assert `--ref HEAD` reads v1 while the
1445
+ * working tree keeps v2.
1446
+ */
1447
+ function setupRepo(
1448
+ name: string,
1449
+ opts: { extraCommitted?: Record<string, string> } = {},
1450
+ ): { root: string; skillDir: string; skillSub: string; cwd: string } {
1451
+ const root = join(FIXTURE_ROOT, name);
1452
+ const skillDir = join(root, "skill-dir");
1453
+ const skillSub = join(skillDir, "mr-review");
1454
+ mkdirSync(skillSub, { recursive: true });
1455
+ writeFileSync(join(skillSub, "SKILL.md"), "v1 baseline\n");
1456
+ for (const [rel, content] of Object.entries(opts.extraCommitted ?? {})) {
1457
+ const p = join(skillSub, rel);
1458
+ mkdirSync(join(p, ".."), { recursive: true });
1459
+ writeFileSync(p, content);
1460
+ }
1461
+
1462
+ git(["init", "-q"], root);
1463
+ git(["add", "-A"], root);
1464
+ git(["commit", "-q", "-m", "v1"], root);
1465
+
1466
+ // Working tree diverges to v2; the commit still holds v1.
1467
+ writeFileSync(join(skillSub, "SKILL.md"), "v2 working tree\n");
1468
+
1469
+ const cwd = join(root, "work");
1470
+ mkdirSync(cwd, { recursive: true });
1471
+ return { root, skillDir, skillSub, cwd };
1472
+ }
1473
+
1474
+ function snapshotPath(cwd: string, label: string, rel: string): string {
1475
+ return join(cwd, "skills-workspace", "mr-review", "snapshots", label, rel);
1476
+ }
1477
+
1478
+ test("snapshots the SKILL.md committed at the ref, leaving the working tree untouched", () => {
1479
+ const { skillDir, skillSub, cwd } = setupRepo("ref-old-content");
1480
+ const res = runCli(
1481
+ [
1482
+ "snapshot",
1483
+ "--skill-dir",
1484
+ skillDir,
1485
+ "--skill",
1486
+ "mr-review",
1487
+ "--label",
1488
+ "old",
1489
+ "--ref",
1490
+ "HEAD",
1491
+ ],
1492
+ cwd,
1493
+ );
1494
+ expect(res.exitCode).toBe(0);
1495
+
1496
+ // Snapshot holds the committed v1...
1497
+ expect(readFileSync(snapshotPath(cwd, "old", "SKILL.md"), "utf8")).toBe(
1498
+ "v1 baseline\n",
1499
+ );
1500
+ // ...and the working tree still holds the edited v2 (no clobber).
1501
+ expect(readFileSync(join(skillSub, "SKILL.md"), "utf8")).toBe(
1502
+ "v2 working tree\n",
1503
+ );
1504
+ });
1505
+
1506
+ test("captures sibling assets at the ref but excludes evals/", () => {
1507
+ const { skillDir, cwd } = setupRepo("ref-assets", {
1508
+ extraCommitted: {
1509
+ "assets/notes.md": "asset body\n",
1510
+ "evals/evals.json": '{"skill_name":"mr-review","evals":[]}',
1511
+ },
1512
+ });
1513
+ const res = runCli(
1514
+ [
1515
+ "snapshot",
1516
+ "--skill-dir",
1517
+ skillDir,
1518
+ "--skill",
1519
+ "mr-review",
1520
+ "--label",
1521
+ "old",
1522
+ "--ref",
1523
+ "HEAD",
1524
+ ],
1525
+ cwd,
1526
+ );
1527
+ expect(res.exitCode).toBe(0);
1528
+
1529
+ expect(existsSync(snapshotPath(cwd, "old", "assets/notes.md"))).toBe(true);
1530
+ expect(
1531
+ readFileSync(snapshotPath(cwd, "old", "assets/notes.md"), "utf8"),
1532
+ ).toBe("asset body\n");
1533
+ expect(existsSync(snapshotPath(cwd, "old", "evals"))).toBe(false);
1534
+ });
1535
+
1536
+ test("records ref provenance so teardown can reclaim the snapshot", () => {
1537
+ const { skillDir, cwd } = setupRepo("ref-meta");
1538
+ const res = runCli(
1539
+ [
1540
+ "snapshot",
1541
+ "--skill-dir",
1542
+ skillDir,
1543
+ "--skill",
1544
+ "mr-review",
1545
+ "--label",
1546
+ "old",
1547
+ "--ref",
1548
+ "HEAD",
1549
+ ],
1550
+ cwd,
1551
+ );
1552
+ expect(res.exitCode).toBe(0);
1553
+
1554
+ const meta = JSON.parse(
1555
+ readFileSync(snapshotPath(cwd, "old", SNAPSHOT_META), "utf8"),
1556
+ ) as { source: string; ref: string };
1557
+ expect(meta.source).toBe("ref");
1558
+ expect(meta.ref).toBe("HEAD");
1559
+ });
1560
+
1561
+ test("a ref that does not exist fails with a clear message", () => {
1562
+ const { skillDir, cwd } = setupRepo("ref-bad");
1563
+ const res = runCli(
1564
+ [
1565
+ "snapshot",
1566
+ "--skill-dir",
1567
+ skillDir,
1568
+ "--skill",
1569
+ "mr-review",
1570
+ "--label",
1571
+ "old",
1572
+ "--ref",
1573
+ "does-not-exist",
1574
+ ],
1575
+ cwd,
1576
+ );
1577
+ expect(res.exitCode).not.toBe(0);
1578
+ expect(new TextDecoder().decode(res.stderr)).toContain("does-not-exist");
1579
+ });
1580
+
1581
+ test("without --ref, snapshot still reads the working tree (v2)", () => {
1582
+ const { skillDir, cwd } = setupRepo("ref-default-path");
1583
+ const res = runCli(
1584
+ [
1585
+ "snapshot",
1586
+ "--skill-dir",
1587
+ skillDir,
1588
+ "--skill",
1589
+ "mr-review",
1590
+ "--label",
1591
+ "wt",
1592
+ ],
1593
+ cwd,
1594
+ );
1595
+ expect(res.exitCode).toBe(0);
1596
+ expect(readFileSync(snapshotPath(cwd, "wt", "SKILL.md"), "utf8")).toBe(
1597
+ "v2 working tree\n",
1598
+ );
1599
+ });
1600
+
1601
+ test("records working-tree provenance so teardown preserves the snapshot", () => {
1602
+ const { skillDir, cwd } = setupRepo("wt-meta");
1603
+ const res = runCli(
1604
+ [
1605
+ "snapshot",
1606
+ "--skill-dir",
1607
+ skillDir,
1608
+ "--skill",
1609
+ "mr-review",
1610
+ "--label",
1611
+ "wt",
1612
+ ],
1613
+ cwd,
1614
+ );
1615
+ expect(res.exitCode).toBe(0);
1616
+
1617
+ const meta = JSON.parse(
1618
+ readFileSync(snapshotPath(cwd, "wt", SNAPSHOT_META), "utf8"),
1619
+ ) as { source: string };
1620
+ expect(meta.source).toBe("working-tree");
1621
+ });
1622
+ });
1623
+
1624
+ describe("ingest / finalize step plans", () => {
1625
+ const opts = {
1626
+ runnerDir: "/runner",
1627
+ skillDir: "/skills",
1628
+ skill: "mr-review",
1629
+ iteration: 2,
1630
+ subagentsDir: "/subagents",
1631
+ };
1632
+
1633
+ test("buildIngestCommands runs record → fill → stray-writes → grade, in order", () => {
1634
+ const steps = buildIngestCommands(opts);
1635
+ expect(steps.map((s) => s.label)).toEqual([
1636
+ "record-runs",
1637
+ "fill-transcripts",
1638
+ "detect-stray-writes",
1639
+ "grade",
1640
+ ]);
1641
+ // Every step is a bun invocation of the sibling script with the shared flags.
1642
+ for (const step of steps) {
1643
+ expect(step.argv.slice(0, 2)).toEqual(["bun", "run"]);
1644
+ expect(step.argv[2]).toBe(`/runner/${step.label}.ts`);
1645
+ expect(step.argv).toContain("--skill-dir");
1646
+ expect(step.argv).toContain("/skills");
1647
+ expect(step.argv).toContain("--skill");
1648
+ expect(step.argv).toContain("mr-review");
1649
+ expect(step.argv).toContain("--iteration");
1650
+ expect(step.argv).toContain("2");
1651
+ }
1652
+ // The transcript-reading steps get --subagents-dir; the others must not.
1653
+ const byLabel = Object.fromEntries(steps.map((s) => [s.label, s.argv]));
1654
+ expect(byLabel["record-runs"]).toContain("--subagents-dir");
1655
+ expect(byLabel["fill-transcripts"]).toContain("--subagents-dir");
1656
+ expect(byLabel["detect-stray-writes"]).not.toContain("--subagents-dir");
1657
+ expect(byLabel.grade).not.toContain("--subagents-dir");
1658
+ });
1659
+
1660
+ test("buildFinalizeCommands runs grade --finalize then aggregate", () => {
1661
+ const steps = buildFinalizeCommands({
1662
+ runnerDir: "/runner",
1663
+ skillDir: "/skills",
1664
+ skill: "mr-review",
1665
+ iteration: 2,
1666
+ });
1667
+ expect(steps.map((s) => s.label)).toEqual([
1668
+ "grade --finalize",
1669
+ "aggregate",
1670
+ ]);
1671
+ expect(steps[0].argv[2]).toBe("/runner/grade.ts");
1672
+ expect(steps[0].argv).toContain("--finalize");
1673
+ expect(steps[1].argv[2]).toBe("/runner/aggregate.ts");
1674
+ });
1675
+
1676
+ test("runSteps stops at the first failing step and reports it", () => {
1677
+ const ran: string[] = [];
1678
+ const result = runSteps(
1679
+ [
1680
+ { label: "a", argv: ["x"] },
1681
+ { label: "b", argv: ["y"] },
1682
+ { label: "c", argv: ["z"] },
1683
+ ],
1684
+ (step) => {
1685
+ ran.push(step.label);
1686
+ return step.label === "b" ? 1 : 0;
1687
+ },
1688
+ );
1689
+ expect(ran).toEqual(["a", "b"]); // c never runs after b fails
1690
+ expect(result.failedAt).toBe("b");
1691
+ });
1692
+
1693
+ test("runSteps runs everything and reports no failure on success", () => {
1694
+ const result = runSteps(
1695
+ [
1696
+ { label: "a", argv: ["x"] },
1697
+ { label: "b", argv: ["y"] },
1698
+ ],
1699
+ () => 0,
1700
+ );
1701
+ expect(result.failedAt).toBeNull();
1702
+ });
1703
+ });