@slowdini/slow-powers-opencode 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/README.md +18 -8
  2. package/package.json +5 -1
  3. package/skills/evaluating-skills/SKILL.md +19 -17
  4. package/skills/evaluating-skills/harness-details/claude.md +51 -15
  5. package/skills/evaluating-skills/harness-parity.md +155 -0
  6. package/skills/evaluating-skills/runner/README.md +28 -19
  7. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
  8. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
  9. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
  10. package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
  11. package/skills/evaluating-skills/runner/aggregate.ts +21 -0
  12. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
  13. package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
  14. package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
  15. package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
  16. package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
  17. package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
  18. package/skills/evaluating-skills/runner/record-runs.ts +209 -0
  19. package/skills/evaluating-skills/runner/run.test.ts +523 -0
  20. package/skills/evaluating-skills/runner/run.ts +376 -17
  21. package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
  22. package/skills/evaluating-skills/runner/types.ts +9 -0
  23. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
  24. package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
  25. package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
  26. package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
  27. package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
  28. package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
  29. package/skills/verifying-development-work/SKILL.md +17 -6
  30. package/skills/verifying-development-work/code-review.md +68 -0
  31. package/skills/verifying-development-work/comment-review.md +85 -0
  32. package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
  33. package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
  34. package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
  35. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
  36. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
  37. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
  38. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
  39. package/skills/verifying-development-work/evals/evals.json +34 -2
  40. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
  41. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
  42. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  43. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  44. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
  45. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  46. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  47. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  48. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
  49. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
  50. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
  51. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
@@ -1,9 +1,21 @@
1
- import { describe, expect, test } from "bun:test";
1
+ import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
+ import {
3
+ mkdirSync,
4
+ readFileSync,
5
+ realpathSync,
6
+ rmSync,
7
+ writeFileSync,
8
+ } from "node:fs";
9
+ import { tmpdir } from "node:os";
2
10
  import { join } from "node:path";
3
- import { detectStrayWrites } from "./detect-stray-writes";
11
+ import {
12
+ detectLiveSourceReads,
13
+ detectStrayWrites,
14
+ } from "./detect-stray-writes";
4
15
 
5
16
  const OUTPUTS = "/work/iteration-1/eval-x/with_skill/outputs";
6
17
  const REPO = "/work/repo";
18
+ const LIVE_SKILL = join(REPO, "skills", "mr-review");
7
19
 
8
20
  describe("detectStrayWrites", () => {
9
21
  test("a Write inside the outputs dir is clean", () => {
@@ -87,6 +99,32 @@ describe("detectStrayWrites", () => {
87
99
  expect(findings.warnings).toHaveLength(0);
88
100
  });
89
101
 
102
+ test("git worktree add is a warning (working tree outside the sandbox)", () => {
103
+ const findings = detectStrayWrites(
104
+ [
105
+ {
106
+ name: "Bash",
107
+ args: { command: "git worktree add ../wt -b scratch" },
108
+ ordinal: 0,
109
+ },
110
+ ],
111
+ OUTPUTS,
112
+ REPO,
113
+ );
114
+ expect(findings.warnings).toHaveLength(1);
115
+ expect(findings.warnings[0].reason).toMatch(/worktree/i);
116
+ });
117
+
118
+ test("creating a path under .claude is a warning", () => {
119
+ const findings = detectStrayWrites(
120
+ [{ name: "Bash", args: { command: "mkdir -p .claude/foo" }, ordinal: 0 }],
121
+ OUTPUTS,
122
+ REPO,
123
+ );
124
+ expect(findings.warnings).toHaveLength(1);
125
+ expect(findings.warnings[0].reason).toMatch(/\.claude/i);
126
+ });
127
+
90
128
  test("read-only tools are never flagged", () => {
91
129
  const findings = detectStrayWrites(
92
130
  [
@@ -101,3 +139,258 @@ describe("detectStrayWrites", () => {
101
139
  expect(findings.warnings).toHaveLength(0);
102
140
  });
103
141
  });
142
+
143
+ describe("detectLiveSourceReads", () => {
144
+ test("a Read of the live SKILL.md is flagged", () => {
145
+ const findings = detectLiveSourceReads(
146
+ [
147
+ {
148
+ name: "Read",
149
+ args: { file_path: join(LIVE_SKILL, "SKILL.md") },
150
+ ordinal: 1,
151
+ },
152
+ ],
153
+ LIVE_SKILL,
154
+ REPO,
155
+ );
156
+ expect(findings).toHaveLength(1);
157
+ expect(findings[0]).toMatchObject({
158
+ tool: "Read",
159
+ path: join(LIVE_SKILL, "SKILL.md"),
160
+ ordinal: 1,
161
+ });
162
+ expect(findings[0].reason).toMatch(/live skill source/i);
163
+ });
164
+
165
+ test("a Read of a staged eval copy is not flagged", () => {
166
+ const findings = detectLiveSourceReads(
167
+ [
168
+ {
169
+ name: "Read",
170
+ args: {
171
+ file_path: join(
172
+ REPO,
173
+ ".claude/skills/slow-powers-eval-1-old_skill__mr-review/SKILL.md",
174
+ ),
175
+ },
176
+ ordinal: 0,
177
+ },
178
+ ],
179
+ LIVE_SKILL,
180
+ REPO,
181
+ );
182
+ expect(findings).toHaveLength(0);
183
+ });
184
+
185
+ test("a relative Read path resolving under the live dir is flagged", () => {
186
+ const findings = detectLiveSourceReads(
187
+ [
188
+ {
189
+ name: "Read",
190
+ args: { file_path: "skills/mr-review/SKILL.md" },
191
+ ordinal: 0,
192
+ },
193
+ ],
194
+ LIVE_SKILL,
195
+ REPO,
196
+ );
197
+ expect(findings).toHaveLength(1);
198
+ });
199
+
200
+ test("a Grep scoped to the live dir is flagged", () => {
201
+ const findings = detectLiveSourceReads(
202
+ [{ name: "Grep", args: { pattern: "x", path: LIVE_SKILL }, ordinal: 2 }],
203
+ LIVE_SKILL,
204
+ REPO,
205
+ );
206
+ expect(findings).toHaveLength(1);
207
+ expect(findings[0].tool).toBe("Grep");
208
+ });
209
+
210
+ test("a Bash command referencing the live dir relatively is flagged", () => {
211
+ const findings = detectLiveSourceReads(
212
+ [
213
+ {
214
+ name: "Bash",
215
+ args: { command: "cat skills/mr-review/SKILL.md" },
216
+ ordinal: 3,
217
+ },
218
+ ],
219
+ LIVE_SKILL,
220
+ REPO,
221
+ );
222
+ expect(findings).toHaveLength(1);
223
+ expect(findings[0].tool).toBe("Bash");
224
+ expect(findings[0].command).toBe("cat skills/mr-review/SKILL.md");
225
+ });
226
+
227
+ test("a Bash command referencing the live dir absolutely is flagged", () => {
228
+ const findings = detectLiveSourceReads(
229
+ [
230
+ {
231
+ name: "Bash",
232
+ args: { command: `grep -r trigger ${LIVE_SKILL}/` },
233
+ ordinal: 0,
234
+ },
235
+ ],
236
+ LIVE_SKILL,
237
+ REPO,
238
+ );
239
+ expect(findings).toHaveLength(1);
240
+ });
241
+
242
+ test("a Bash command referencing a staged copy under .claude/skills is not flagged", () => {
243
+ // --stage-name can stage under the skill's natural name; that path contains
244
+ // `skills/<name>` but lives under `.claude/`, so it must not match.
245
+ const findings = detectLiveSourceReads(
246
+ [
247
+ {
248
+ name: "Bash",
249
+ args: { command: "cat .claude/skills/mr-review/SKILL.md" },
250
+ ordinal: 0,
251
+ },
252
+ ],
253
+ LIVE_SKILL,
254
+ REPO,
255
+ );
256
+ expect(findings).toHaveLength(0);
257
+ });
258
+
259
+ test("unrelated reads and commands are not flagged", () => {
260
+ const findings = detectLiveSourceReads(
261
+ [
262
+ {
263
+ name: "Read",
264
+ args: { file_path: join(OUTPUTS, "x.md") },
265
+ ordinal: 0,
266
+ },
267
+ { name: "Bash", args: { command: "ls skills-workspace" }, ordinal: 1 },
268
+ {
269
+ name: "Write",
270
+ args: { file_path: join(LIVE_SKILL, "SKILL.md") },
271
+ ordinal: 2,
272
+ },
273
+ ],
274
+ LIVE_SKILL,
275
+ REPO,
276
+ );
277
+ // Write tools are detectStrayWrites' jurisdiction — this check is reads only.
278
+ expect(findings).toHaveLength(0);
279
+ });
280
+ });
281
+
282
+ describe("detect-stray-writes CLI", () => {
283
+ // realpath: the spawned CLI sees its cwd resolved (macOS /var → /private/var),
284
+ // so fixture paths must match that form for prefix checks to line up.
285
+ const FIXTURE_ROOT = join(
286
+ realpathSync(tmpdir()),
287
+ `slow-powers-detect-stray-test-${process.pid}`,
288
+ );
289
+ const SCRIPT = join(import.meta.dir, "detect-stray-writes.ts");
290
+
291
+ beforeAll(() => {
292
+ mkdirSync(FIXTURE_ROOT, { recursive: true });
293
+ });
294
+
295
+ afterAll(() => {
296
+ rmSync(FIXTURE_ROOT, { recursive: true, force: true });
297
+ });
298
+
299
+ test("reports live-source reads per run in stray-writes.json", () => {
300
+ const root = join(FIXTURE_ROOT, "cli-live-reads");
301
+ const skillDir = join(root, "skill-dir");
302
+ const skillSub = join(skillDir, "mr-review");
303
+ mkdirSync(skillSub, { recursive: true });
304
+ writeFileSync(
305
+ join(skillSub, "SKILL.md"),
306
+ "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
307
+ );
308
+
309
+ const cwd = join(root, "work");
310
+ const iterationDir = join(
311
+ cwd,
312
+ "skills-workspace",
313
+ "mr-review",
314
+ "iteration-1",
315
+ );
316
+ const condDir = join(iterationDir, "eval-e1", "old_skill");
317
+ mkdirSync(condDir, { recursive: true });
318
+ writeFileSync(
319
+ join(iterationDir, "conditions.json"),
320
+ `${JSON.stringify({
321
+ mode: "revision",
322
+ conditions: [
323
+ { name: "old_skill", skill_path: join(skillSub, "SKILL.md") },
324
+ { name: "new_skill", skill_path: join(skillSub, "SKILL.md") },
325
+ ],
326
+ timestamp: new Date().toISOString(),
327
+ harness: "claude-code",
328
+ })}\n`,
329
+ );
330
+ writeFileSync(
331
+ join(condDir, "run.json"),
332
+ `${JSON.stringify({
333
+ eval_id: "e1",
334
+ condition: "old_skill",
335
+ skill_path: join(skillSub, "SKILL.md"),
336
+ prompt: "do the task",
337
+ files: [],
338
+ final_message: "done",
339
+ tool_invocations: [
340
+ {
341
+ name: "Read",
342
+ args: { file_path: join(skillSub, "SKILL.md") },
343
+ ordinal: 0,
344
+ },
345
+ {
346
+ name: "Write",
347
+ args: { file_path: join(condDir, "outputs", "answer.md") },
348
+ ordinal: 1,
349
+ },
350
+ ],
351
+ })}\n`,
352
+ );
353
+
354
+ const res = Bun.spawnSync(
355
+ [
356
+ "bun",
357
+ "run",
358
+ SCRIPT,
359
+ "--skill-dir",
360
+ skillDir,
361
+ "--skill",
362
+ "mr-review",
363
+ "--iteration",
364
+ "1",
365
+ ],
366
+ { cwd, stdout: "pipe", stderr: "pipe" },
367
+ );
368
+ expect(res.exitCode).toBe(0);
369
+
370
+ const report = JSON.parse(
371
+ readFileSync(join(iterationDir, "stray-writes.json"), "utf8"),
372
+ ) as {
373
+ totals: {
374
+ violations: number;
375
+ warnings: number;
376
+ live_source_reads: number;
377
+ };
378
+ runs: Array<{
379
+ eval_id: string;
380
+ condition: string;
381
+ live_source_reads: Array<{ tool: string; path?: string }>;
382
+ }>;
383
+ };
384
+ expect(report.totals.live_source_reads).toBe(1);
385
+ expect(report.totals.violations).toBe(0);
386
+ expect(report.runs).toHaveLength(1);
387
+ expect(report.runs[0]).toMatchObject({
388
+ eval_id: "e1",
389
+ condition: "old_skill",
390
+ });
391
+ expect(report.runs[0].live_source_reads[0]).toMatchObject({
392
+ tool: "Read",
393
+ path: join(skillSub, "SKILL.md"),
394
+ });
395
+ });
396
+ });
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env bun
2
2
  import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
3
- import { join } from "node:path";
3
+ import { join, relative, resolve } from "node:path";
4
4
  import { detectRunContext } from "./context";
5
5
  import { classifyBash, isUnder, pathArg, WRITE_TOOLS } from "./sandbox-policy";
6
6
  import type { ConditionsRecord, RunRecord, ToolInvocation } from "./types";
@@ -71,6 +71,81 @@ export function detectStrayWrites(
71
71
  return { violations, warnings };
72
72
  }
73
73
 
74
/** Read-only tools that carry a target path argument (see `pathArg`). */
const READ_TOOLS = new Set(["Read", "Glob", "Grep"]);

/** Reason text attached to every live-source-read finding. */
const LIVE_SOURCE_REASON =
  "reads the live skill source instead of its staged copy — the arm may be contaminated";

/** Escape every regex metacharacter in `s` so it matches literally inside a `RegExp`. */
function escapeRegExp(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Flag tool invocations that read the **live** skill-under-test directory.
 *
 * Eval subagents are only ever meant to see the *staged* copy of the skill
 * (`.claude/skills/<slug>/`, or the inlined SKILL.md under `--no-stage`). A
 * read of the live source typically means the Skill tool couldn't resolve the
 * staged slug yet (mid-session registry refresh race) and the agent improvised
 * — fatal in revision mode, where the old_skill arm then reads new-skill
 * content. Reads are detected, not blocked: the guard stays read-permissive,
 * so this surfaces post-hoc as a validity warning.
 *
 * - Read-tool calls (Read/Glob/Grep) whose path arg resolves under the live
 *   dir are flagged; relative paths resolve against `repoRoot`.
 * - Bash commands that reference the live dir (absolute, or repo-relative
 *   text) are flagged. A staged copy under `.claude/skills/` can carry the
 *   same `skills/<name>` relative text (e.g. via `--stage-name`), so that
 *   prefix is excluded.
 * - The repo-relative text check is skipped when the live dir lies outside
 *   `repoRoot`, or *is* `repoRoot` (there is no relative text to look for);
 *   the absolute-path check still applies in both cases.
 *
 * @param invocations Tool calls of one run; only `name`, `args` and `ordinal` are read.
 * @param liveSkillDir Directory of the live skill under test.
 * @param repoRoot Root against which relative paths are interpreted.
 * @returns One finding per offending invocation, in input order. Read-tool
 *   findings carry `path`; Bash findings carry `command`.
 */
export function detectLiveSourceReads(
  invocations: Array<Pick<ToolInvocation, "name" | "args" | "ordinal">>,
  liveSkillDir: string,
  repoRoot: string,
): StrayFinding[] {
  const findings: StrayFinding[] = [];
  const liveDir = resolve(liveSkillDir);
  const rel = relative(repoRoot, liveDir);

  // `relative` returns "" when both paths resolve to the same directory. An
  // empty pattern would collapse the regex below to "two adjacent boundary
  // characters" (or the empty string), flagging unrelated commands — so no
  // relative matcher is built in that case, nor when the live dir sits
  // outside the repo (`..` prefix).
  const relRe =
    rel === "" || rel.startsWith("..")
      ? null
      : new RegExp(
          // The lookbehind fires at the boundary char itself, so it checks for a
          // bare `.claude` — the `/` is consumed by the boundary group.
          `(?<!\\.claude)(^|[\\s'"=:(/])${escapeRegExp(rel)}(/|[\\s'")]|$)`,
        );

  for (const inv of invocations) {
    if (READ_TOOLS.has(inv.name)) {
      const p = pathArg(inv.args);
      if (p && isUnder(p, liveDir, repoRoot)) {
        findings.push({
          tool: inv.name,
          path: p,
          ordinal: inv.ordinal,
          reason: LIVE_SOURCE_REASON,
        });
      }
      continue;
    }

    if (inv.name === "Bash") {
      const args = inv.args as { command?: unknown } | undefined;
      const command = typeof args?.command === "string" ? args.command : "";
      // Absolute reference, or repo-relative text when a matcher exists.
      const referencesLiveDir =
        command.includes(liveDir) || (relRe !== null && relRe.test(command));
      if (referencesLiveDir) {
        findings.push({
          tool: "Bash",
          command,
          ordinal: inv.ordinal,
          reason: LIVE_SOURCE_REASON,
        });
      }
    }
  }

  return findings;
}
148
+
74
149
  if (import.meta.main) {
75
150
  const argv = Bun.argv.slice(2);
76
151
  const flag = (name: string): string | undefined => {
@@ -127,10 +202,12 @@ if (import.meta.main) {
127
202
  condition: string;
128
203
  violations: StrayFinding[];
129
204
  warnings: StrayFinding[];
205
+ live_source_reads: StrayFinding[];
130
206
  };
131
207
  const runs: RunReport[] = [];
132
208
  let totalViolations = 0;
133
209
  let totalWarnings = 0;
210
+ let totalLiveReads = 0;
134
211
 
135
212
  for (const evalDir of evalDirs) {
136
213
  const evalId = evalDir.replace(/^eval-/, "");
@@ -149,23 +226,38 @@ if (import.meta.main) {
149
226
  const outputsDir =
150
227
  outputsByKey.get(`${evalId}:${cond}`) ?? join(condDir, "outputs");
151
228
  const findings = detectStrayWrites(invocations, outputsDir, repoRoot);
152
- if (findings.violations.length || findings.warnings.length) {
229
+ const liveReads = detectLiveSourceReads(
230
+ invocations,
231
+ ctx.skillSubdir,
232
+ repoRoot,
233
+ );
234
+ if (
235
+ findings.violations.length ||
236
+ findings.warnings.length ||
237
+ liveReads.length
238
+ ) {
153
239
  runs.push({
154
240
  eval_id: evalId,
155
241
  condition: cond,
156
242
  violations: findings.violations,
157
243
  warnings: findings.warnings,
244
+ live_source_reads: liveReads,
158
245
  });
159
246
  }
160
247
  totalViolations += findings.violations.length;
161
248
  totalWarnings += findings.warnings.length;
249
+ totalLiveReads += liveReads.length;
162
250
  }
163
251
  }
164
252
 
165
253
  const report = {
166
254
  generated: new Date().toISOString(),
167
255
  iteration: Number(iteration),
168
- totals: { violations: totalViolations, warnings: totalWarnings },
256
+ totals: {
257
+ violations: totalViolations,
258
+ warnings: totalWarnings,
259
+ live_source_reads: totalLiveReads,
260
+ },
169
261
  runs,
170
262
  };
171
263
  const outPath = join(iterationDir, "stray-writes.json");
@@ -182,11 +274,15 @@ if (import.meta.main) {
182
274
  console.warn(
183
275
  `⚠ ${r.eval_id}/${r.condition}: Bash ${w.reason} (ordinal ${w.ordinal}): ${w.command}`,
184
276
  );
277
+ for (const l of r.live_source_reads)
278
+ console.warn(
279
+ `⚠ ${r.eval_id}/${r.condition}: ${l.tool} read the live skill source (ordinal ${l.ordinal}): ${l.path ?? l.command}`,
280
+ );
185
281
  }
186
- if (totalViolations === 0 && totalWarnings === 0)
187
- console.log("✓ No out-of-bounds writes detected.");
282
+ if (totalViolations === 0 && totalWarnings === 0 && totalLiveReads === 0)
283
+ console.log("✓ No out-of-bounds writes or live-source reads detected.");
188
284
  else
189
285
  console.warn(
190
- `\n${totalViolations} violation(s), ${totalWarnings} warning(s). Runs with violations edited files outside their sandbox — treat those data points as tainted.`,
286
+ `\n${totalViolations} violation(s), ${totalWarnings} warning(s), ${totalLiveReads} live-source read(s). Runs with violations edited files outside their sandbox; runs with live-source reads saw the live skill instead of their staged copy — treat those data points as tainted.`,
191
287
  );
192
288
  }
@@ -68,4 +68,61 @@ describe("guard decide", () => {
68
68
  true,
69
69
  );
70
70
  });
71
+
72
+ test("denies git worktree add (working tree outside the sandbox)", () => {
73
+ const d = decide(
74
+ "Bash",
75
+ { command: "git worktree add ../wt -b scratch" },
76
+ marker(),
77
+ );
78
+ expect(d.allow).toBe(false);
79
+ expect(d.reason).toMatch(/worktree/i);
80
+ });
81
+
82
+ test("denies Bash that creates a path under .claude via a non-redirect verb", () => {
83
+ expect(
84
+ decide("Bash", { command: "mkdir -p .claude/foo" }, marker()).allow,
85
+ ).toBe(false);
86
+ expect(
87
+ decide("Bash", { command: "cp out.txt .claude/bar" }, marker()).allow,
88
+ ).toBe(false);
89
+ });
90
+
91
+ test("denies Bash that creates a bare skills/ dir", () => {
92
+ expect(decide("Bash", { command: "mkdir skills" }, marker()).allow).toBe(
93
+ false,
94
+ );
95
+ expect(
96
+ decide("Bash", { command: "cp -r src ./skills" }, marker()).allow,
97
+ ).toBe(false);
98
+ });
99
+
100
+ test("still allows reads of .claude (no create verb)", () => {
101
+ expect(
102
+ decide("Bash", { command: "cat .claude/settings.json" }, marker()).allow,
103
+ ).toBe(true);
104
+ expect(decide("Bash", { command: "ls .claude" }, marker()).allow).toBe(
105
+ true,
106
+ );
107
+ });
108
+
109
+ test("allows a create scoped to the .claude/skills staging root (allowed-root escape)", () => {
110
+ expect(
111
+ decide(
112
+ "Bash",
113
+ { command: "mkdir -p /work/.claude/skills/staged-x" },
114
+ marker(),
115
+ ).allow,
116
+ ).toBe(true);
117
+ });
118
+
119
+ test("does not flag skills-workspace as a bare skills/ write", () => {
120
+ expect(
121
+ decide(
122
+ "Bash",
123
+ { command: "mkdir -p /work/skills-workspace/x/outputs" },
124
+ marker(),
125
+ ).allow,
126
+ ).toBe(true);
127
+ });
71
128
  });
@@ -8,6 +8,7 @@ import {
8
8
  } from "node:fs";
9
9
  import { tmpdir } from "node:os";
10
10
  import { join } from "node:path";
11
+ import { PROMOTED_MARKER } from "./workspace-teardown";
11
12
 
12
13
  const FIXTURE_ROOT = join(tmpdir(), `slow-powers-promote-test-${process.pid}`);
13
14
  const PROMOTE_TS = join(import.meta.dir, "promote-baseline.ts");
@@ -137,6 +138,56 @@ describe("promote-baseline.ts (--skill-dir, isolated CWD)", () => {
137
138
  expect(provenance).toContain("Judge model | unspecified");
138
139
  });
139
140
 
141
+ test("drops a .promoted.json marker into the iteration dir for teardown", () => {
142
+ const root = join(FIXTURE_ROOT, "promote-marker");
143
+
144
+ const skillDir = join(root, "skill-dir");
145
+ const skillSub = join(skillDir, "mr-review");
146
+ mkdirSync(skillSub, { recursive: true });
147
+ writeFileSync(
148
+ join(skillSub, "SKILL.md"),
149
+ "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
150
+ );
151
+
152
+ const cwd = join(root, "work");
153
+ const iterationDir = join(
154
+ cwd,
155
+ "skills-workspace",
156
+ "mr-review",
157
+ "iteration-3",
158
+ );
159
+ mkdirSync(iterationDir, { recursive: true });
160
+ writeJson(join(iterationDir, "benchmark.json"), {
161
+ delta: { pass_rate: 0 },
162
+ });
163
+
164
+ const res = Bun.spawnSync(
165
+ [
166
+ "bun",
167
+ "run",
168
+ PROMOTE_TS,
169
+ "--skill-dir",
170
+ skillDir,
171
+ "--skill",
172
+ "mr-review",
173
+ "--iteration",
174
+ "3",
175
+ ],
176
+ { cwd, stdout: "pipe", stderr: "pipe" },
177
+ );
178
+ expect(res.stderr.toString()).toBe("");
179
+ expect(res.exitCode).toBe(0);
180
+
181
+ const markerPath = join(iterationDir, PROMOTED_MARKER);
182
+ expect(existsSync(markerPath)).toBe(true);
183
+ const marker = JSON.parse(readFileSync(markerPath, "utf8")) as {
184
+ promoted_at: string;
185
+ baseline_dir: string;
186
+ };
187
+ expect(marker.promoted_at).toBeTruthy();
188
+ expect(marker.baseline_dir).toBe(join(skillSub, "evals", "baseline"));
189
+ });
190
+
140
191
  test("records agent and judge models in provenance when flags are passed", () => {
141
192
  const root = join(FIXTURE_ROOT, "promote-models");
142
193
 
@@ -10,6 +10,7 @@ import {
10
10
  import { join } from "node:path";
11
11
  import { detectRunContext } from "./context";
12
12
  import type { ConditionsRecord } from "./types";
13
+ import { PROMOTED_MARKER } from "./workspace-teardown";
13
14
 
14
15
  function die(msg: string): never {
15
16
  console.error(`error: ${msg}`);
@@ -120,7 +121,8 @@ export function promoteBaseline(opts: PromoteOptions): {
120
121
  "`bun run evals:promote-baseline -- --skill " +
121
122
  `${opts.skillName} --iteration <N>` +
122
123
  "` after aggregating. The ephemeral workspace (run records, timing,",
123
- "dispatch files, produced outputs) stays gitignored under `skills-workspace/`.",
124
+ "dispatch files, produced outputs) stays gitignored under `skills-workspace/`",
125
+ "and is reclaimable by `evals:teardown` once promoted (this commit's marker).",
124
126
  "",
125
127
  "| Field | Value |",
126
128
  "|-------|-------|",
@@ -141,6 +143,22 @@ export function promoteBaseline(opts: PromoteOptions): {
141
143
  ].join("\n");
142
144
  writeFileSync(join(baselineDir, "BASELINE.md"), `${provenance}\n`);
143
145
 
146
+ // Mark the iteration as committed so `teardown` can safely reclaim its
147
+ // workspace — without this marker teardown preserves the iteration as
148
+ // uncommitted results.
149
+ writeFileSync(
150
+ join(iterationDir, PROMOTED_MARKER),
151
+ `${JSON.stringify(
152
+ {
153
+ promoted_at: new Date().toISOString(),
154
+ baseline_dir: baselineDir,
155
+ commit: head,
156
+ },
157
+ null,
158
+ 2,
159
+ )}\n`,
160
+ );
161
+
144
162
  return { baselineDir, gradingsCopied };
145
163
  }
146
164