@slowdini/slow-powers-opencode 0.1.5 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/README.md +32 -13
  2. package/package.json +5 -1
  3. package/skills/auditing-slow-powers-usage/evals/evals.json +3 -3
  4. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
  5. package/skills/evaluating-skills/SKILL.md +22 -20
  6. package/skills/evaluating-skills/examples/{verification-before-completion-evals.json → verifying-development-work-evals.json} +2 -2
  7. package/skills/evaluating-skills/harness-details/claude.md +51 -15
  8. package/skills/evaluating-skills/harness-parity.md +155 -0
  9. package/skills/evaluating-skills/pressure-scenarios.md +1 -1
  10. package/skills/evaluating-skills/runner/README.md +28 -19
  11. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
  12. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
  13. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
  14. package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
  15. package/skills/evaluating-skills/runner/aggregate.ts +21 -0
  16. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
  17. package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
  18. package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
  19. package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
  20. package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
  21. package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
  22. package/skills/evaluating-skills/runner/record-runs.ts +209 -0
  23. package/skills/evaluating-skills/runner/run.test.ts +523 -0
  24. package/skills/evaluating-skills/runner/run.ts +376 -17
  25. package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
  26. package/skills/evaluating-skills/runner/types.ts +9 -0
  27. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
  28. package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
  29. package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
  30. package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
  31. package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
  32. package/skills/hardening-plans/SKILL.md +1 -1
  33. package/skills/systematic-debugging/SKILL.md +4 -0
  34. package/skills/test-driven-development/SKILL.md +2 -0
  35. package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
  36. package/skills/verifying-development-work/SKILL.md +99 -0
  37. package/skills/verifying-development-work/code-review.md +68 -0
  38. package/skills/verifying-development-work/comment-review.md +85 -0
  39. package/skills/verifying-development-work/evals/baseline/BASELINE.md +23 -0
  40. package/skills/verifying-development-work/evals/baseline/NOTES.md +87 -0
  41. package/skills/verifying-development-work/evals/baseline/benchmark.json +54 -0
  42. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
  43. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
  44. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
  45. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
  46. package/skills/verifying-development-work/evals/evals.json +178 -0
  47. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
  48. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
  49. package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.test.ts +14 -0
  50. package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.ts +24 -0
  51. package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.test.ts +25 -0
  52. package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.ts +18 -0
  53. package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.test.ts +19 -0
  54. package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.ts +24 -0
  55. package/skills/working-in-isolation/SKILL.md +2 -2
  56. package/skills/writing-skills/SKILL.md +2 -3
  57. package/skills/finishing-a-development-branch/SKILL.md +0 -96
  58. package/skills/finishing-a-development-branch/evals/evals.json +0 -41
  59. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +0 -4
  60. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +0 -5
  61. package/skills/verification-before-completion/SKILL.md +0 -65
  62. package/skills/verification-before-completion/evals/baseline/BASELINE.md +0 -22
  63. package/skills/verification-before-completion/evals/baseline/NOTES.md +0 -75
  64. package/skills/verification-before-completion/evals/baseline/benchmark.json +0 -51
  65. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  66. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  67. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
  68. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  69. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  70. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  71. package/skills/verification-before-completion/evals/evals.json +0 -77
  72. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/api.ts +0 -0
  73. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/consumer.ts +0 -0
  74. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/tsconfig.json +0 -0
  75. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.test.ts +0 -0
  76. /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.ts +0 -0
@@ -186,6 +186,226 @@ describe("aggregate.ts user-mode (--skill-dir, isolated CWD)", () => {
186
186
  ).toBe(true);
187
187
  });
188
188
 
189
+ test("surfaces live-source reads as validity_warnings", () => {
190
+ const root = join(FIXTURE_ROOT, "agg-live-reads");
191
+ const skillDir = join(root, "skill-dir");
192
+ const skillSub = join(skillDir, "mr-review");
193
+ mkdirSync(skillSub, { recursive: true });
194
+ writeFileSync(
195
+ join(skillSub, "SKILL.md"),
196
+ "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
197
+ );
198
+
199
+ const cwd = join(root, "work");
200
+ const iterationDir = join(
201
+ cwd,
202
+ "skills-workspace",
203
+ "mr-review",
204
+ "iteration-1",
205
+ );
206
+ mkdirSync(iterationDir, { recursive: true });
207
+ writeJson(join(iterationDir, "conditions.json"), {
208
+ mode: "revision",
209
+ conditions: [
210
+ { name: "old_skill", skill_path: join(skillSub, "SKILL.md") },
211
+ { name: "new_skill", skill_path: join(skillSub, "SKILL.md") },
212
+ ],
213
+ timestamp: new Date().toISOString(),
214
+ harness: "claude-code",
215
+ });
216
+ for (const cond of ["old_skill", "new_skill"]) {
217
+ const condDir = join(iterationDir, "eval-e1", cond);
218
+ mkdirSync(condDir, { recursive: true });
219
+ writeJson(join(condDir, "grading.json"), {
220
+ assertion_results: [],
221
+ summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
222
+ });
223
+ writeJson(join(condDir, "timing.json"), {
224
+ total_tokens: 100,
225
+ duration_ms: 1,
226
+ });
227
+ }
228
+ writeJson(join(iterationDir, "stray-writes.json"), {
229
+ generated: new Date().toISOString(),
230
+ iteration: 1,
231
+ totals: { violations: 0, warnings: 0, live_source_reads: 1 },
232
+ runs: [
233
+ {
234
+ eval_id: "e1",
235
+ condition: "old_skill",
236
+ violations: [],
237
+ warnings: [],
238
+ live_source_reads: [
239
+ {
240
+ tool: "Read",
241
+ path: join(skillSub, "SKILL.md"),
242
+ ordinal: 0,
243
+ reason: "x",
244
+ },
245
+ ],
246
+ },
247
+ ],
248
+ });
249
+
250
+ const res = Bun.spawnSync(
251
+ [
252
+ "bun",
253
+ "run",
254
+ AGGREGATE_TS,
255
+ "--skill-dir",
256
+ skillDir,
257
+ "--skill",
258
+ "mr-review",
259
+ "--iteration",
260
+ "1",
261
+ ],
262
+ { cwd, stdout: "pipe", stderr: "pipe" },
263
+ );
264
+ expect(res.exitCode).toBe(0);
265
+ const benchmark = JSON.parse(
266
+ readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
267
+ ) as { validity_warnings: string[] };
268
+ expect(
269
+ benchmark.validity_warnings.some(
270
+ (w) => w.includes("e1/old_skill") && /live skill source/i.test(w),
271
+ ),
272
+ ).toBe(true);
273
+ });
274
+
275
+ test("warns when timing sources are mixed across the compared runs", () => {
276
+ const root = join(FIXTURE_ROOT, "agg-mixed-timing");
277
+ const skillDir = join(root, "skill-dir");
278
+ const skillSub = join(skillDir, "mr-review");
279
+ mkdirSync(skillSub, { recursive: true });
280
+ writeFileSync(
281
+ join(skillSub, "SKILL.md"),
282
+ "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
283
+ );
284
+
285
+ const cwd = join(root, "work");
286
+ const iterationDir = join(
287
+ cwd,
288
+ "skills-workspace",
289
+ "mr-review",
290
+ "iteration-1",
291
+ );
292
+ mkdirSync(iterationDir, { recursive: true });
293
+ writeJson(join(iterationDir, "conditions.json"), {
294
+ mode: "new-skill",
295
+ conditions: [
296
+ { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
297
+ { name: "without_skill", skill_path: null },
298
+ ],
299
+ timestamp: new Date().toISOString(),
300
+ harness: "claude-code",
301
+ });
302
+ // One arm has agent-captured completion-event timing (no source field, the
303
+ // pre-provenance shape); the other was backfilled from the transcript.
304
+ const mkCond = (cond: string, timing: unknown) => {
305
+ const condDir = join(iterationDir, "eval-e1", cond);
306
+ mkdirSync(condDir, { recursive: true });
307
+ writeJson(join(condDir, "grading.json"), {
308
+ assertion_results: [],
309
+ summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
310
+ });
311
+ writeJson(join(condDir, "timing.json"), timing);
312
+ };
313
+ mkCond("with_skill", { total_tokens: 5000, duration_ms: 1000 });
314
+ mkCond("without_skill", {
315
+ total_tokens: 90000,
316
+ duration_ms: 1200,
317
+ source: "transcript",
318
+ });
319
+
320
+ const res = Bun.spawnSync(
321
+ [
322
+ "bun",
323
+ "run",
324
+ AGGREGATE_TS,
325
+ "--skill-dir",
326
+ skillDir,
327
+ "--skill",
328
+ "mr-review",
329
+ "--iteration",
330
+ "1",
331
+ ],
332
+ { cwd, stdout: "pipe", stderr: "pipe" },
333
+ );
334
+ expect(res.exitCode).toBe(0);
335
+ const benchmark = JSON.parse(
336
+ readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
337
+ ) as { validity_warnings: string[] };
338
+ expect(
339
+ benchmark.validity_warnings.some(
340
+ (w) => w.includes("timing source") && w.includes("transcript"),
341
+ ),
342
+ ).toBe(true);
343
+ });
344
+
345
+ test("does not warn when all timing comes from one source", () => {
346
+ const root = join(FIXTURE_ROOT, "agg-same-timing");
347
+ const skillDir = join(root, "skill-dir");
348
+ const skillSub = join(skillDir, "mr-review");
349
+ mkdirSync(skillSub, { recursive: true });
350
+ writeFileSync(
351
+ join(skillSub, "SKILL.md"),
352
+ "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
353
+ );
354
+
355
+ const cwd = join(root, "work");
356
+ const iterationDir = join(
357
+ cwd,
358
+ "skills-workspace",
359
+ "mr-review",
360
+ "iteration-1",
361
+ );
362
+ mkdirSync(iterationDir, { recursive: true });
363
+ writeJson(join(iterationDir, "conditions.json"), {
364
+ mode: "new-skill",
365
+ conditions: [
366
+ { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
367
+ { name: "without_skill", skill_path: null },
368
+ ],
369
+ timestamp: new Date().toISOString(),
370
+ harness: "claude-code",
371
+ });
372
+ for (const cond of ["with_skill", "without_skill"]) {
373
+ const condDir = join(iterationDir, "eval-e1", cond);
374
+ mkdirSync(condDir, { recursive: true });
375
+ writeJson(join(condDir, "grading.json"), {
376
+ assertion_results: [],
377
+ summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
378
+ });
379
+ writeJson(join(condDir, "timing.json"), {
380
+ total_tokens: 100,
381
+ duration_ms: 1,
382
+ source: "transcript",
383
+ });
384
+ }
385
+
386
+ const res = Bun.spawnSync(
387
+ [
388
+ "bun",
389
+ "run",
390
+ AGGREGATE_TS,
391
+ "--skill-dir",
392
+ skillDir,
393
+ "--skill",
394
+ "mr-review",
395
+ "--iteration",
396
+ "1",
397
+ ],
398
+ { cwd, stdout: "pipe", stderr: "pipe" },
399
+ );
400
+ expect(res.exitCode).toBe(0);
401
+ const benchmark = JSON.parse(
402
+ readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
403
+ ) as { validity_warnings: string[] };
404
+ expect(
405
+ benchmark.validity_warnings.some((w) => w.includes("timing source")),
406
+ ).toBe(false);
407
+ });
408
+
189
409
  test("surfaces plugin-shadow findings as validity_warnings", () => {
190
410
  const root = join(FIXTURE_ROOT, "agg-shadow");
191
411
  const skillDir = join(root, "skill-dir");
@@ -94,6 +94,11 @@ for (const c of conditions.conditions) {
94
94
  }
95
95
 
96
96
  let missingGradings = 0;
97
+ // Timing provenance across all runs in the comparison. "completion-event"
98
+ // (the agent-captured default, also assumed when `source` is absent) and
99
+ // "transcript" (record-runs backfill, includes cache accounting) measure
100
+ // different things — a delta mixing them is comparing two metrics.
101
+ const timingSources = new Set<string>();
97
102
  for (const evalDir of evalDirs) {
98
103
  for (const cond of conditionNames) {
99
104
  const condDir = join(iterationDir, evalDir, cond);
@@ -116,6 +121,11 @@ for (const evalDir of evalDirs) {
116
121
  byCondition[cond].tokens.push(timing.total_tokens);
117
122
  if (typeof timing.duration_ms === "number")
118
123
  byCondition[cond].durations.push(timing.duration_ms);
124
+ if (
125
+ typeof timing.total_tokens === "number" ||
126
+ typeof timing.duration_ms === "number"
127
+ )
128
+ timingSources.add(timing.source ?? "completion-event");
119
129
  }
120
130
  }
121
131
  }
@@ -168,6 +178,11 @@ const delta = {
168
178
  };
169
179
 
170
180
  const validityWarnings: string[] = [];
181
+ if (timingSources.size > 1) {
182
+ validityWarnings.push(
183
+ `runs mix timing sources (${[...timingSources].sort().join(", ")}) — transcript-derived totals include cache accounting, so the token/duration delta compares two different metrics. Re-record one side or read the delta as a rough signal only.`,
184
+ );
185
+ }
171
186
  for (const cond of conditionNames) {
172
187
  const s = runSummary[cond];
173
188
  if (s.skill_invocation_rate != null && s.skill_invocation_rate < 1) {
@@ -188,6 +203,7 @@ if (existsSync(strayPath)) {
188
203
  eval_id: string;
189
204
  condition: string;
190
205
  violations?: unknown[];
206
+ live_source_reads?: unknown[];
191
207
  }>;
192
208
  };
193
209
  for (const r of stray.runs ?? []) {
@@ -196,6 +212,11 @@ if (existsSync(strayPath)) {
196
212
  validityWarnings.push(
197
213
  `${r.eval_id}/${r.condition} wrote ${n} file(s) outside its outputs dir — data point may be tainted (see stray-writes.json).`,
198
214
  );
215
+ const reads = r.live_source_reads?.length ?? 0;
216
+ if (reads > 0)
217
+ validityWarnings.push(
218
+ `${r.eval_id}/${r.condition} read the live skill source ${reads} time(s) instead of its staged copy — the arm may be contaminated (staged-slug resolution race; see stray-writes.json).`,
219
+ );
199
220
  }
200
221
  } catch {
201
222
  // ignore a malformed report rather than failing aggregation
@@ -1,9 +1,21 @@
1
- import { describe, expect, test } from "bun:test";
1
+ import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
+ import {
3
+ mkdirSync,
4
+ readFileSync,
5
+ realpathSync,
6
+ rmSync,
7
+ writeFileSync,
8
+ } from "node:fs";
9
+ import { tmpdir } from "node:os";
2
10
  import { join } from "node:path";
3
- import { detectStrayWrites } from "./detect-stray-writes";
11
+ import {
12
+ detectLiveSourceReads,
13
+ detectStrayWrites,
14
+ } from "./detect-stray-writes";
4
15
 
5
16
  const OUTPUTS = "/work/iteration-1/eval-x/with_skill/outputs";
6
17
  const REPO = "/work/repo";
18
+ const LIVE_SKILL = join(REPO, "skills", "mr-review");
7
19
 
8
20
  describe("detectStrayWrites", () => {
9
21
  test("a Write inside the outputs dir is clean", () => {
@@ -87,6 +99,32 @@ describe("detectStrayWrites", () => {
87
99
  expect(findings.warnings).toHaveLength(0);
88
100
  });
89
101
 
102
+ test("git worktree add is a warning (working tree outside the sandbox)", () => {
103
+ const findings = detectStrayWrites(
104
+ [
105
+ {
106
+ name: "Bash",
107
+ args: { command: "git worktree add ../wt -b scratch" },
108
+ ordinal: 0,
109
+ },
110
+ ],
111
+ OUTPUTS,
112
+ REPO,
113
+ );
114
+ expect(findings.warnings).toHaveLength(1);
115
+ expect(findings.warnings[0].reason).toMatch(/worktree/i);
116
+ });
117
+
118
+ test("creating a path under .claude is a warning", () => {
119
+ const findings = detectStrayWrites(
120
+ [{ name: "Bash", args: { command: "mkdir -p .claude/foo" }, ordinal: 0 }],
121
+ OUTPUTS,
122
+ REPO,
123
+ );
124
+ expect(findings.warnings).toHaveLength(1);
125
+ expect(findings.warnings[0].reason).toMatch(/\.claude/i);
126
+ });
127
+
90
128
  test("read-only tools are never flagged", () => {
91
129
  const findings = detectStrayWrites(
92
130
  [
@@ -101,3 +139,258 @@ describe("detectStrayWrites", () => {
101
139
  expect(findings.warnings).toHaveLength(0);
102
140
  });
103
141
  });
142
+
143
+ describe("detectLiveSourceReads", () => {
144
+ test("a Read of the live SKILL.md is flagged", () => {
145
+ const findings = detectLiveSourceReads(
146
+ [
147
+ {
148
+ name: "Read",
149
+ args: { file_path: join(LIVE_SKILL, "SKILL.md") },
150
+ ordinal: 1,
151
+ },
152
+ ],
153
+ LIVE_SKILL,
154
+ REPO,
155
+ );
156
+ expect(findings).toHaveLength(1);
157
+ expect(findings[0]).toMatchObject({
158
+ tool: "Read",
159
+ path: join(LIVE_SKILL, "SKILL.md"),
160
+ ordinal: 1,
161
+ });
162
+ expect(findings[0].reason).toMatch(/live skill source/i);
163
+ });
164
+
165
+ test("a Read of a staged eval copy is not flagged", () => {
166
+ const findings = detectLiveSourceReads(
167
+ [
168
+ {
169
+ name: "Read",
170
+ args: {
171
+ file_path: join(
172
+ REPO,
173
+ ".claude/skills/slow-powers-eval-1-old_skill__mr-review/SKILL.md",
174
+ ),
175
+ },
176
+ ordinal: 0,
177
+ },
178
+ ],
179
+ LIVE_SKILL,
180
+ REPO,
181
+ );
182
+ expect(findings).toHaveLength(0);
183
+ });
184
+
185
+ test("a relative Read path resolving under the live dir is flagged", () => {
186
+ const findings = detectLiveSourceReads(
187
+ [
188
+ {
189
+ name: "Read",
190
+ args: { file_path: "skills/mr-review/SKILL.md" },
191
+ ordinal: 0,
192
+ },
193
+ ],
194
+ LIVE_SKILL,
195
+ REPO,
196
+ );
197
+ expect(findings).toHaveLength(1);
198
+ });
199
+
200
+ test("a Grep scoped to the live dir is flagged", () => {
201
+ const findings = detectLiveSourceReads(
202
+ [{ name: "Grep", args: { pattern: "x", path: LIVE_SKILL }, ordinal: 2 }],
203
+ LIVE_SKILL,
204
+ REPO,
205
+ );
206
+ expect(findings).toHaveLength(1);
207
+ expect(findings[0].tool).toBe("Grep");
208
+ });
209
+
210
+ test("a Bash command referencing the live dir relatively is flagged", () => {
211
+ const findings = detectLiveSourceReads(
212
+ [
213
+ {
214
+ name: "Bash",
215
+ args: { command: "cat skills/mr-review/SKILL.md" },
216
+ ordinal: 3,
217
+ },
218
+ ],
219
+ LIVE_SKILL,
220
+ REPO,
221
+ );
222
+ expect(findings).toHaveLength(1);
223
+ expect(findings[0].tool).toBe("Bash");
224
+ expect(findings[0].command).toBe("cat skills/mr-review/SKILL.md");
225
+ });
226
+
227
+ test("a Bash command referencing the live dir absolutely is flagged", () => {
228
+ const findings = detectLiveSourceReads(
229
+ [
230
+ {
231
+ name: "Bash",
232
+ args: { command: `grep -r trigger ${LIVE_SKILL}/` },
233
+ ordinal: 0,
234
+ },
235
+ ],
236
+ LIVE_SKILL,
237
+ REPO,
238
+ );
239
+ expect(findings).toHaveLength(1);
240
+ });
241
+
242
+ test("a Bash command referencing a staged copy under .claude/skills is not flagged", () => {
243
+ // --stage-name can stage under the skill's natural name; that path contains
244
+ // `skills/<name>` but lives under `.claude/`, so it must not match.
245
+ const findings = detectLiveSourceReads(
246
+ [
247
+ {
248
+ name: "Bash",
249
+ args: { command: "cat .claude/skills/mr-review/SKILL.md" },
250
+ ordinal: 0,
251
+ },
252
+ ],
253
+ LIVE_SKILL,
254
+ REPO,
255
+ );
256
+ expect(findings).toHaveLength(0);
257
+ });
258
+
259
+ test("unrelated reads and commands are not flagged", () => {
260
+ const findings = detectLiveSourceReads(
261
+ [
262
+ {
263
+ name: "Read",
264
+ args: { file_path: join(OUTPUTS, "x.md") },
265
+ ordinal: 0,
266
+ },
267
+ { name: "Bash", args: { command: "ls skills-workspace" }, ordinal: 1 },
268
+ {
269
+ name: "Write",
270
+ args: { file_path: join(LIVE_SKILL, "SKILL.md") },
271
+ ordinal: 2,
272
+ },
273
+ ],
274
+ LIVE_SKILL,
275
+ REPO,
276
+ );
277
+ // Write tools are detectStrayWrites' jurisdiction — this check is reads only.
278
+ expect(findings).toHaveLength(0);
279
+ });
280
+ });
281
+
282
+ describe("detect-stray-writes CLI", () => {
283
+ // realpath: the spawned CLI sees its cwd resolved (macOS /var → /private/var),
284
+ // so fixture paths must match that form for prefix checks to line up.
285
+ const FIXTURE_ROOT = join(
286
+ realpathSync(tmpdir()),
287
+ `slow-powers-detect-stray-test-${process.pid}`,
288
+ );
289
+ const SCRIPT = join(import.meta.dir, "detect-stray-writes.ts");
290
+
291
+ beforeAll(() => {
292
+ mkdirSync(FIXTURE_ROOT, { recursive: true });
293
+ });
294
+
295
+ afterAll(() => {
296
+ rmSync(FIXTURE_ROOT, { recursive: true, force: true });
297
+ });
298
+
299
+ test("reports live-source reads per run in stray-writes.json", () => {
300
+ const root = join(FIXTURE_ROOT, "cli-live-reads");
301
+ const skillDir = join(root, "skill-dir");
302
+ const skillSub = join(skillDir, "mr-review");
303
+ mkdirSync(skillSub, { recursive: true });
304
+ writeFileSync(
305
+ join(skillSub, "SKILL.md"),
306
+ "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
307
+ );
308
+
309
+ const cwd = join(root, "work");
310
+ const iterationDir = join(
311
+ cwd,
312
+ "skills-workspace",
313
+ "mr-review",
314
+ "iteration-1",
315
+ );
316
+ const condDir = join(iterationDir, "eval-e1", "old_skill");
317
+ mkdirSync(condDir, { recursive: true });
318
+ writeFileSync(
319
+ join(iterationDir, "conditions.json"),
320
+ `${JSON.stringify({
321
+ mode: "revision",
322
+ conditions: [
323
+ { name: "old_skill", skill_path: join(skillSub, "SKILL.md") },
324
+ { name: "new_skill", skill_path: join(skillSub, "SKILL.md") },
325
+ ],
326
+ timestamp: new Date().toISOString(),
327
+ harness: "claude-code",
328
+ })}\n`,
329
+ );
330
+ writeFileSync(
331
+ join(condDir, "run.json"),
332
+ `${JSON.stringify({
333
+ eval_id: "e1",
334
+ condition: "old_skill",
335
+ skill_path: join(skillSub, "SKILL.md"),
336
+ prompt: "do the task",
337
+ files: [],
338
+ final_message: "done",
339
+ tool_invocations: [
340
+ {
341
+ name: "Read",
342
+ args: { file_path: join(skillSub, "SKILL.md") },
343
+ ordinal: 0,
344
+ },
345
+ {
346
+ name: "Write",
347
+ args: { file_path: join(condDir, "outputs", "answer.md") },
348
+ ordinal: 1,
349
+ },
350
+ ],
351
+ })}\n`,
352
+ );
353
+
354
+ const res = Bun.spawnSync(
355
+ [
356
+ "bun",
357
+ "run",
358
+ SCRIPT,
359
+ "--skill-dir",
360
+ skillDir,
361
+ "--skill",
362
+ "mr-review",
363
+ "--iteration",
364
+ "1",
365
+ ],
366
+ { cwd, stdout: "pipe", stderr: "pipe" },
367
+ );
368
+ expect(res.exitCode).toBe(0);
369
+
370
+ const report = JSON.parse(
371
+ readFileSync(join(iterationDir, "stray-writes.json"), "utf8"),
372
+ ) as {
373
+ totals: {
374
+ violations: number;
375
+ warnings: number;
376
+ live_source_reads: number;
377
+ };
378
+ runs: Array<{
379
+ eval_id: string;
380
+ condition: string;
381
+ live_source_reads: Array<{ tool: string; path?: string }>;
382
+ }>;
383
+ };
384
+ expect(report.totals.live_source_reads).toBe(1);
385
+ expect(report.totals.violations).toBe(0);
386
+ expect(report.runs).toHaveLength(1);
387
+ expect(report.runs[0]).toMatchObject({
388
+ eval_id: "e1",
389
+ condition: "old_skill",
390
+ });
391
+ expect(report.runs[0].live_source_reads[0]).toMatchObject({
392
+ tool: "Read",
393
+ path: join(skillSub, "SKILL.md"),
394
+ });
395
+ });
396
+ });