@slowdini/slow-powers-opencode 0.1.5 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -13
- package/package.json +5 -1
- package/skills/auditing-slow-powers-usage/evals/evals.json +3 -3
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
- package/skills/evaluating-skills/SKILL.md +22 -20
- package/skills/evaluating-skills/examples/{verification-before-completion-evals.json → verifying-development-work-evals.json} +2 -2
- package/skills/evaluating-skills/harness-details/claude.md +51 -15
- package/skills/evaluating-skills/harness-parity.md +155 -0
- package/skills/evaluating-skills/pressure-scenarios.md +1 -1
- package/skills/evaluating-skills/runner/README.md +28 -19
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
- package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
- package/skills/evaluating-skills/runner/aggregate.ts +21 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
- package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
- package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
- package/skills/evaluating-skills/runner/record-runs.ts +209 -0
- package/skills/evaluating-skills/runner/run.test.ts +523 -0
- package/skills/evaluating-skills/runner/run.ts +376 -17
- package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
- package/skills/evaluating-skills/runner/types.ts +9 -0
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
- package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
- package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
- package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
- package/skills/hardening-plans/SKILL.md +1 -1
- package/skills/systematic-debugging/SKILL.md +4 -0
- package/skills/test-driven-development/SKILL.md +2 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
- package/skills/verifying-development-work/SKILL.md +99 -0
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +23 -0
- package/skills/verifying-development-work/evals/baseline/NOTES.md +87 -0
- package/skills/verifying-development-work/evals/baseline/benchmark.json +54 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +178 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.ts +24 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.test.ts +25 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.ts +18 -0
- package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.test.ts +19 -0
- package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.ts +24 -0
- package/skills/working-in-isolation/SKILL.md +2 -2
- package/skills/writing-skills/SKILL.md +2 -3
- package/skills/finishing-a-development-branch/SKILL.md +0 -96
- package/skills/finishing-a-development-branch/evals/evals.json +0 -41
- package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +0 -4
- package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +0 -5
- package/skills/verification-before-completion/SKILL.md +0 -65
- package/skills/verification-before-completion/evals/baseline/BASELINE.md +0 -22
- package/skills/verification-before-completion/evals/baseline/NOTES.md +0 -75
- package/skills/verification-before-completion/evals/baseline/benchmark.json +0 -51
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verification-before-completion/evals/evals.json +0 -77
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/api.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/consumer.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/tsconfig.json +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.test.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.ts +0 -0
|
@@ -186,6 +186,226 @@ describe("aggregate.ts user-mode (--skill-dir, isolated CWD)", () => {
|
|
|
186
186
|
).toBe(true);
|
|
187
187
|
});
|
|
188
188
|
|
|
189
|
+
// Builds a revision-mode iteration whose stray-writes.json records one
// live-source read for e1/old_skill, runs aggregate.ts as a subprocess, and
// expects benchmark.json to carry a matching validity warning.
test("surfaces live-source reads as validity_warnings", () => {
  // Skill source tree handed to the CLI through --skill-dir.
  const fixtureBase = join(FIXTURE_ROOT, "agg-live-reads");
  const skillDir = join(fixtureBase, "skill-dir");
  const liveSkill = join(skillDir, "mr-review");
  const liveSkillMd = join(liveSkill, "SKILL.md");
  mkdirSync(liveSkill, { recursive: true });
  writeFileSync(
    liveSkillMd,
    "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
  );

  // Isolated working directory holding the iteration under test.
  const workDir = join(fixtureBase, "work");
  const iterDir = join(workDir, "skills-workspace", "mr-review", "iteration-1");
  mkdirSync(iterDir, { recursive: true });
  writeJson(join(iterDir, "conditions.json"), {
    mode: "revision",
    conditions: [
      { name: "old_skill", skill_path: liveSkillMd },
      { name: "new_skill", skill_path: liveSkillMd },
    ],
    timestamp: new Date().toISOString(),
    harness: "claude-code",
  });

  // Both arms get identical grading and timing; only the stray-writes report
  // below distinguishes them.
  ["old_skill", "new_skill"].forEach((arm) => {
    const armDir = join(iterDir, "eval-e1", arm);
    mkdirSync(armDir, { recursive: true });
    writeJson(join(armDir, "grading.json"), {
      assertion_results: [],
      summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
    });
    writeJson(join(armDir, "timing.json"), {
      total_tokens: 100,
      duration_ms: 1,
    });
  });

  // A single recorded read of the live SKILL.md by the old_skill arm.
  const liveRead = {
    tool: "Read",
    path: liveSkillMd,
    ordinal: 0,
    reason: "x",
  };
  writeJson(join(iterDir, "stray-writes.json"), {
    generated: new Date().toISOString(),
    iteration: 1,
    totals: { violations: 0, warnings: 0, live_source_reads: 1 },
    runs: [
      {
        eval_id: "e1",
        condition: "old_skill",
        violations: [],
        warnings: [],
        live_source_reads: [liveRead],
      },
    ],
  });

  const argv = [
    "bun",
    "run",
    AGGREGATE_TS,
    "--skill-dir",
    skillDir,
    "--skill",
    "mr-review",
    "--iteration",
    "1",
  ];
  const proc = Bun.spawnSync(argv, {
    cwd: workDir,
    stdout: "pipe",
    stderr: "pipe",
  });
  expect(proc.exitCode).toBe(0);

  const rawBenchmark = readFileSync(join(iterDir, "benchmark.json"), "utf8");
  const { validity_warnings: warnings } = JSON.parse(rawBenchmark) as {
    validity_warnings: string[];
  };
  const flagged = warnings.some(
    (msg) => msg.includes("e1/old_skill") && /live skill source/i.test(msg),
  );
  expect(flagged).toBe(true);
});
|
|
274
|
+
|
|
275
|
+
// Gives the two arms timing.json files of different provenance and expects
// aggregate.ts to emit a validity warning naming the mixed timing sources.
test("warns when timing sources are mixed across the compared runs", () => {
  const fixtureBase = join(FIXTURE_ROOT, "agg-mixed-timing");
  const skillDir = join(fixtureBase, "skill-dir");
  const liveSkill = join(skillDir, "mr-review");
  const liveSkillMd = join(liveSkill, "SKILL.md");
  mkdirSync(liveSkill, { recursive: true });
  writeFileSync(
    liveSkillMd,
    "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
  );

  const workDir = join(fixtureBase, "work");
  const iterDir = join(workDir, "skills-workspace", "mr-review", "iteration-1");
  mkdirSync(iterDir, { recursive: true });
  writeJson(join(iterDir, "conditions.json"), {
    mode: "new-skill",
    conditions: [
      { name: "with_skill", skill_path: liveSkillMd },
      { name: "without_skill", skill_path: null },
    ],
    timestamp: new Date().toISOString(),
    harness: "claude-code",
  });

  // with_skill carries agent-captured completion-event timing (the
  // pre-provenance shape, with no `source` field); without_skill was
  // backfilled from the transcript.
  const timingByArm: Record<string, unknown> = {
    with_skill: { total_tokens: 5000, duration_ms: 1000 },
    without_skill: {
      total_tokens: 90000,
      duration_ms: 1200,
      source: "transcript",
    },
  };
  for (const [arm, timing] of Object.entries(timingByArm)) {
    const armDir = join(iterDir, "eval-e1", arm);
    mkdirSync(armDir, { recursive: true });
    writeJson(join(armDir, "grading.json"), {
      assertion_results: [],
      summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
    });
    writeJson(join(armDir, "timing.json"), timing);
  }

  const argv = [
    "bun",
    "run",
    AGGREGATE_TS,
    "--skill-dir",
    skillDir,
    "--skill",
    "mr-review",
    "--iteration",
    "1",
  ];
  const proc = Bun.spawnSync(argv, {
    cwd: workDir,
    stdout: "pipe",
    stderr: "pipe",
  });
  expect(proc.exitCode).toBe(0);

  const rawBenchmark = readFileSync(join(iterDir, "benchmark.json"), "utf8");
  const { validity_warnings: warnings } = JSON.parse(rawBenchmark) as {
    validity_warnings: string[];
  };
  const mentionsMix = warnings.some(
    (msg) => msg.includes("timing source") && msg.includes("transcript"),
  );
  expect(mentionsMix).toBe(true);
});
|
|
344
|
+
|
|
345
|
+
// Control case for the mixed-timing warning: both arms declare
// source: "transcript", so no "timing source" warning may appear.
test("does not warn when all timing comes from one source", () => {
  const fixtureBase = join(FIXTURE_ROOT, "agg-same-timing");
  const skillDir = join(fixtureBase, "skill-dir");
  const liveSkill = join(skillDir, "mr-review");
  const liveSkillMd = join(liveSkill, "SKILL.md");
  mkdirSync(liveSkill, { recursive: true });
  writeFileSync(
    liveSkillMd,
    "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
  );

  const workDir = join(fixtureBase, "work");
  const iterDir = join(workDir, "skills-workspace", "mr-review", "iteration-1");
  mkdirSync(iterDir, { recursive: true });
  writeJson(join(iterDir, "conditions.json"), {
    mode: "new-skill",
    conditions: [
      { name: "with_skill", skill_path: liveSkillMd },
      { name: "without_skill", skill_path: null },
    ],
    timestamp: new Date().toISOString(),
    harness: "claude-code",
  });

  // Identical grading and transcript-sourced timing for both arms.
  ["with_skill", "without_skill"].forEach((arm) => {
    const armDir = join(iterDir, "eval-e1", arm);
    mkdirSync(armDir, { recursive: true });
    writeJson(join(armDir, "grading.json"), {
      assertion_results: [],
      summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
    });
    writeJson(join(armDir, "timing.json"), {
      total_tokens: 100,
      duration_ms: 1,
      source: "transcript",
    });
  });

  const argv = [
    "bun",
    "run",
    AGGREGATE_TS,
    "--skill-dir",
    skillDir,
    "--skill",
    "mr-review",
    "--iteration",
    "1",
  ];
  const proc = Bun.spawnSync(argv, {
    cwd: workDir,
    stdout: "pipe",
    stderr: "pipe",
  });
  expect(proc.exitCode).toBe(0);

  const rawBenchmark = readFileSync(join(iterDir, "benchmark.json"), "utf8");
  const { validity_warnings: warnings } = JSON.parse(rawBenchmark) as {
    validity_warnings: string[];
  };
  const mentionsTimingSource = warnings.some((msg) =>
    msg.includes("timing source"),
  );
  expect(mentionsTimingSource).toBe(false);
});
|
|
408
|
+
|
|
189
409
|
test("surfaces plugin-shadow findings as validity_warnings", () => {
|
|
190
410
|
const root = join(FIXTURE_ROOT, "agg-shadow");
|
|
191
411
|
const skillDir = join(root, "skill-dir");
|
|
@@ -94,6 +94,11 @@ for (const c of conditions.conditions) {
|
|
|
94
94
|
}
|
|
95
95
|
|
|
96
96
|
let missingGradings = 0;
|
|
97
|
+
// Distinct timing provenances seen across every run in this comparison.
// A run is "completion-event" (agent-captured; also the assumed value when
// `source` is missing) or "transcript" (backfilled by record-runs, with cache
// accounting included). The two measure different things, so a delta computed
// across both is comparing two metrics.
const timingSources: Set<string> = new Set();
|
|
97
102
|
for (const evalDir of evalDirs) {
|
|
98
103
|
for (const cond of conditionNames) {
|
|
99
104
|
const condDir = join(iterationDir, evalDir, cond);
|
|
@@ -116,6 +121,11 @@ for (const evalDir of evalDirs) {
|
|
|
116
121
|
byCondition[cond].tokens.push(timing.total_tokens);
|
|
117
122
|
if (typeof timing.duration_ms === "number")
|
|
118
123
|
byCondition[cond].durations.push(timing.duration_ms);
|
|
124
|
+
if (
|
|
125
|
+
typeof timing.total_tokens === "number" ||
|
|
126
|
+
typeof timing.duration_ms === "number"
|
|
127
|
+
)
|
|
128
|
+
timingSources.add(timing.source ?? "completion-event");
|
|
119
129
|
}
|
|
120
130
|
}
|
|
121
131
|
}
|
|
@@ -168,6 +178,11 @@ const delta = {
|
|
|
168
178
|
};
|
|
169
179
|
|
|
170
180
|
const validityWarnings: string[] = [];
|
|
181
|
+
// More than one provenance means the token/duration delta is not
// like-for-like; record that as a validity warning listing the sources in
// sorted order.
if (timingSources.size > 1) {
  const sourceList = Array.from(timingSources).sort().join(", ");
  validityWarnings.push(
    `runs mix timing sources (${sourceList}) — transcript-derived totals include cache accounting, so the token/duration delta compares two different metrics. Re-record one side or read the delta as a rough signal only.`,
  );
}
|
|
171
186
|
for (const cond of conditionNames) {
|
|
172
187
|
const s = runSummary[cond];
|
|
173
188
|
if (s.skill_invocation_rate != null && s.skill_invocation_rate < 1) {
|
|
@@ -188,6 +203,7 @@ if (existsSync(strayPath)) {
|
|
|
188
203
|
eval_id: string;
|
|
189
204
|
condition: string;
|
|
190
205
|
violations?: unknown[];
|
|
206
|
+
live_source_reads?: unknown[];
|
|
191
207
|
}>;
|
|
192
208
|
};
|
|
193
209
|
for (const r of stray.runs ?? []) {
|
|
@@ -196,6 +212,11 @@ if (existsSync(strayPath)) {
|
|
|
196
212
|
validityWarnings.push(
|
|
197
213
|
`${r.eval_id}/${r.condition} wrote ${n} file(s) outside its outputs dir — data point may be tainted (see stray-writes.json).`,
|
|
198
214
|
);
|
|
215
|
+
const reads = r.live_source_reads?.length ?? 0;
|
|
216
|
+
if (reads > 0)
|
|
217
|
+
validityWarnings.push(
|
|
218
|
+
`${r.eval_id}/${r.condition} read the live skill source ${reads} time(s) instead of its staged copy — the arm may be contaminated (staged-slug resolution race; see stray-writes.json).`,
|
|
219
|
+
);
|
|
199
220
|
}
|
|
200
221
|
} catch {
|
|
201
222
|
// ignore a malformed report rather than failing aggregation
|
|
@@ -1,9 +1,21 @@
|
|
|
1
|
-
import { describe, expect, test } from "bun:test";
|
|
1
|
+
import { afterAll, beforeAll, describe, expect, test } from "bun:test";
|
|
2
|
+
import {
|
|
3
|
+
mkdirSync,
|
|
4
|
+
readFileSync,
|
|
5
|
+
realpathSync,
|
|
6
|
+
rmSync,
|
|
7
|
+
writeFileSync,
|
|
8
|
+
} from "node:fs";
|
|
9
|
+
import { tmpdir } from "node:os";
|
|
2
10
|
import { join } from "node:path";
|
|
3
|
-
import {
|
|
11
|
+
import {
|
|
12
|
+
detectLiveSourceReads,
|
|
13
|
+
detectStrayWrites,
|
|
14
|
+
} from "./detect-stray-writes";
|
|
4
15
|
|
|
5
16
|
const OUTPUTS = "/work/iteration-1/eval-x/with_skill/outputs";
|
|
6
17
|
const REPO = "/work/repo";
|
|
18
|
+
const LIVE_SKILL = join(REPO, "skills", "mr-review");
|
|
7
19
|
|
|
8
20
|
describe("detectStrayWrites", () => {
|
|
9
21
|
test("a Write inside the outputs dir is clean", () => {
|
|
@@ -87,6 +99,32 @@ describe("detectStrayWrites", () => {
|
|
|
87
99
|
expect(findings.warnings).toHaveLength(0);
|
|
88
100
|
});
|
|
89
101
|
|
|
102
|
+
// A Bash `git worktree add` invocation must yield exactly one warning whose
// reason mentions the worktree.
test("git worktree add is a warning (working tree outside the sandbox)", () => {
  const invocation = {
    name: "Bash",
    args: { command: "git worktree add ../wt -b scratch" },
    ordinal: 0,
  };
  const { warnings } = detectStrayWrites([invocation], OUTPUTS, REPO);
  expect(warnings).toHaveLength(1);
  expect(warnings[0].reason).toMatch(/worktree/i);
});
|
|
117
|
+
|
|
118
|
+
// A Bash `mkdir` targeting `.claude/` must yield exactly one warning whose
// reason mentions `.claude`.
test("creating a path under .claude is a warning", () => {
  const invocation = {
    name: "Bash",
    args: { command: "mkdir -p .claude/foo" },
    ordinal: 0,
  };
  const { warnings } = detectStrayWrites([invocation], OUTPUTS, REPO);
  expect(warnings).toHaveLength(1);
  expect(warnings[0].reason).toMatch(/\.claude/i);
});
|
|
127
|
+
|
|
90
128
|
test("read-only tools are never flagged", () => {
|
|
91
129
|
const findings = detectStrayWrites(
|
|
92
130
|
[
|
|
@@ -101,3 +139,258 @@ describe("detectStrayWrites", () => {
|
|
|
101
139
|
expect(findings.warnings).toHaveLength(0);
|
|
102
140
|
});
|
|
103
141
|
});
|
|
142
|
+
|
|
143
|
+
// Unit tests for detectLiveSourceReads: each case feeds a list of tool
// invocations plus the live skill directory and repo root, then checks which
// invocations come back as findings.
describe("detectLiveSourceReads", () => {
  type Invocations = Parameters<typeof detectLiveSourceReads>[0];

  // Every case scans against the same live skill dir and repo root.
  const scan = (invocations: Invocations) =>
    detectLiveSourceReads(invocations, LIVE_SKILL, REPO);

  const liveSkillMd = join(LIVE_SKILL, "SKILL.md");

  test("a Read of the live SKILL.md is flagged", () => {
    const hits = scan([
      { name: "Read", args: { file_path: liveSkillMd }, ordinal: 1 },
    ]);
    expect(hits).toHaveLength(1);
    expect(hits[0]).toMatchObject({
      tool: "Read",
      path: liveSkillMd,
      ordinal: 1,
    });
    expect(hits[0].reason).toMatch(/live skill source/i);
  });

  test("a Read of a staged eval copy is not flagged", () => {
    const stagedCopy = join(
      REPO,
      ".claude/skills/slow-powers-eval-1-old_skill__mr-review/SKILL.md",
    );
    const hits = scan([
      { name: "Read", args: { file_path: stagedCopy }, ordinal: 0 },
    ]);
    expect(hits).toHaveLength(0);
  });

  test("a relative Read path resolving under the live dir is flagged", () => {
    const hits = scan([
      {
        name: "Read",
        args: { file_path: "skills/mr-review/SKILL.md" },
        ordinal: 0,
      },
    ]);
    expect(hits).toHaveLength(1);
  });

  test("a Grep scoped to the live dir is flagged", () => {
    const hits = scan([
      { name: "Grep", args: { pattern: "x", path: LIVE_SKILL }, ordinal: 2 },
    ]);
    expect(hits).toHaveLength(1);
    expect(hits[0].tool).toBe("Grep");
  });

  test("a Bash command referencing the live dir relatively is flagged", () => {
    const hits = scan([
      {
        name: "Bash",
        args: { command: "cat skills/mr-review/SKILL.md" },
        ordinal: 3,
      },
    ]);
    expect(hits).toHaveLength(1);
    expect(hits[0].tool).toBe("Bash");
    expect(hits[0].command).toBe("cat skills/mr-review/SKILL.md");
  });

  test("a Bash command referencing the live dir absolutely is flagged", () => {
    const hits = scan([
      {
        name: "Bash",
        args: { command: `grep -r trigger ${LIVE_SKILL}/` },
        ordinal: 0,
      },
    ]);
    expect(hits).toHaveLength(1);
  });

  test("a Bash command referencing a staged copy under .claude/skills is not flagged", () => {
    // With --stage-name a copy can be staged under the skill's natural name.
    // That path contains `skills/<name>` but sits under `.claude/`, so it
    // must not be treated as the live source.
    const hits = scan([
      {
        name: "Bash",
        args: { command: "cat .claude/skills/mr-review/SKILL.md" },
        ordinal: 0,
      },
    ]);
    expect(hits).toHaveLength(0);
  });

  test("unrelated reads and commands are not flagged", () => {
    const hits = scan([
      {
        name: "Read",
        args: { file_path: join(OUTPUTS, "x.md") },
        ordinal: 0,
      },
      { name: "Bash", args: { command: "ls skills-workspace" }, ordinal: 1 },
      { name: "Write", args: { file_path: liveSkillMd }, ordinal: 2 },
    ]);
    // Writes belong to detectStrayWrites; this check covers reads only.
    expect(hits).toHaveLength(0);
  });
});
|
|
281
|
+
|
|
282
|
+
// End-to-end test of the detect-stray-writes.ts script: it is spawned as a
// subprocess against an on-disk fixture iteration and the stray-writes.json
// it produces is inspected.
describe("detect-stray-writes CLI", () => {
  const scriptPath = join(import.meta.dir, "detect-stray-writes.ts");

  // The temp dir is realpath-resolved because the spawned CLI sees its cwd in
  // resolved form (macOS /var → /private/var); fixture paths have to use the
  // same form for prefix checks to line up.
  const tmpRoot = join(
    realpathSync(tmpdir()),
    `slow-powers-detect-stray-test-${process.pid}`,
  );

  beforeAll(() => {
    mkdirSync(tmpRoot, { recursive: true });
  });

  afterAll(() => {
    rmSync(tmpRoot, { recursive: true, force: true });
  });

  test("reports live-source reads per run in stray-writes.json", () => {
    const fixtureBase = join(tmpRoot, "cli-live-reads");
    const skillDir = join(fixtureBase, "skill-dir");
    const liveSkill = join(skillDir, "mr-review");
    const liveSkillMd = join(liveSkill, "SKILL.md");
    mkdirSync(liveSkill, { recursive: true });
    writeFileSync(
      liveSkillMd,
      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
    );

    const workDir = join(fixtureBase, "work");
    const iterDir = join(
      workDir,
      "skills-workspace",
      "mr-review",
      "iteration-1",
    );
    const armDir = join(iterDir, "eval-e1", "old_skill");
    mkdirSync(armDir, { recursive: true });

    const conditionsDoc = {
      mode: "revision",
      conditions: [
        { name: "old_skill", skill_path: liveSkillMd },
        { name: "new_skill", skill_path: liveSkillMd },
      ],
      timestamp: new Date().toISOString(),
      harness: "claude-code",
    };
    writeFileSync(
      join(iterDir, "conditions.json"),
      `${JSON.stringify(conditionsDoc)}\n`,
    );

    // The recorded run reads the live SKILL.md once and writes once inside
    // its own outputs directory.
    const runRecord = {
      eval_id: "e1",
      condition: "old_skill",
      skill_path: liveSkillMd,
      prompt: "do the task",
      files: [],
      final_message: "done",
      tool_invocations: [
        { name: "Read", args: { file_path: liveSkillMd }, ordinal: 0 },
        {
          name: "Write",
          args: { file_path: join(armDir, "outputs", "answer.md") },
          ordinal: 1,
        },
      ],
    };
    writeFileSync(join(armDir, "run.json"), `${JSON.stringify(runRecord)}\n`);

    const argv = [
      "bun",
      "run",
      scriptPath,
      "--skill-dir",
      skillDir,
      "--skill",
      "mr-review",
      "--iteration",
      "1",
    ];
    const proc = Bun.spawnSync(argv, {
      cwd: workDir,
      stdout: "pipe",
      stderr: "pipe",
    });
    expect(proc.exitCode).toBe(0);

    const rawReport = readFileSync(join(iterDir, "stray-writes.json"), "utf8");
    const report = JSON.parse(rawReport) as {
      totals: {
        violations: number;
        warnings: number;
        live_source_reads: number;
      };
      runs: Array<{
        eval_id: string;
        condition: string;
        live_source_reads: Array<{ tool: string; path?: string }>;
      }>;
    };
    const { totals, runs } = report;
    expect(totals.live_source_reads).toBe(1);
    expect(totals.violations).toBe(0);
    expect(runs).toHaveLength(1);
    expect(runs[0]).toMatchObject({
      eval_id: "e1",
      condition: "old_skill",
    });
    expect(runs[0].live_source_reads[0]).toMatchObject({
      tool: "Read",
      path: liveSkillMd,
    });
  });
});
|