@slowdini/slow-powers-opencode 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -72
- package/bootstrap.md +1 -7
- package/opencode/plugins/slow-powers.js +1 -1
- package/package.json +14 -17
- package/skills/evaluating-skills/SKILL.md +90 -338
- package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
- package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
- package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
- package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
- package/skills/evaluating-skills/harness-details/claude.md +0 -194
- package/skills/evaluating-skills/harness-parity.md +0 -155
- package/skills/evaluating-skills/runner/README.md +0 -163
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
- package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
- package/skills/evaluating-skills/runner/aggregate.ts +0 -269
- package/skills/evaluating-skills/runner/context.test.ts +0 -181
- package/skills/evaluating-skills/runner/context.ts +0 -90
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
- package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
- package/skills/evaluating-skills/runner/grade.test.ts +0 -347
- package/skills/evaluating-skills/runner/grade.ts +0 -603
- package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
- package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
- package/skills/evaluating-skills/runner/guard/install.ts +0 -147
- package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
- package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
- package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
- package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
- package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
- package/skills/evaluating-skills/runner/record-runs.ts +0 -209
- package/skills/evaluating-skills/runner/run.test.ts +0 -1703
- package/skills/evaluating-skills/runner/run.ts +0 -1388
- package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
- package/skills/evaluating-skills/runner/types.ts +0 -121
- package/skills/evaluating-skills/runner/validate-all.ts +0 -54
- package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
- package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
- package/skills/evaluating-skills/runner/validate.test.ts +0 -56
- package/skills/evaluating-skills/runner/validate.ts +0 -21
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
- package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
- package/skills/evaluating-skills/schema/evals.schema.json +0 -105
- package/skills/evaluating-skills/schema/grading.schema.json +0 -84
- package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
- package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
- package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
- package/skills/evaluating-skills/templates/evals.json.example +0 -17
- package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
|
@@ -1,396 +0,0 @@
|
|
|
1
|
-
import { afterAll, beforeAll, describe, expect, test } from "bun:test";
|
|
2
|
-
import {
|
|
3
|
-
mkdirSync,
|
|
4
|
-
readFileSync,
|
|
5
|
-
realpathSync,
|
|
6
|
-
rmSync,
|
|
7
|
-
writeFileSync,
|
|
8
|
-
} from "node:fs";
|
|
9
|
-
import { tmpdir } from "node:os";
|
|
10
|
-
import { join } from "node:path";
|
|
11
|
-
import {
|
|
12
|
-
detectLiveSourceReads,
|
|
13
|
-
detectStrayWrites,
|
|
14
|
-
} from "./detect-stray-writes";
|
|
15
|
-
|
|
16
|
-
// Synthetic fixture paths shared by the pure-function suites below.
// REPO plays the role of the subagent's working directory, OUTPUTS is the
// single directory a run is allowed to write to, and LIVE_SKILL is the
// skill-under-test's real (unstaged) source directory inside REPO.
const REPO = "/work/repo";
const OUTPUTS = "/work/iteration-1/eval-x/with_skill/outputs";
const LIVE_SKILL = join(REPO, "skills", "mr-review");
|
|
19
|
-
|
|
20
|
-
describe("detectStrayWrites", () => {
  type Invocations = Parameters<typeof detectStrayWrites>[0];

  // Every case classifies against the same outputs dir / repo root pair.
  const classify = (invocations: Invocations) =>
    detectStrayWrites(invocations, OUTPUTS, REPO);

  // Shorthand for a Bash tool invocation carrying `command`.
  const bash = (command: string, ordinal = 0) => ({
    name: "Bash",
    args: { command },
    ordinal,
  });

  test("a Write inside the outputs dir is clean", () => {
    const { violations, warnings } = classify([
      {
        name: "Write",
        args: { file_path: join(OUTPUTS, "answer.md") },
        ordinal: 0,
      },
    ]);
    expect(violations).toHaveLength(0);
    expect(warnings).toHaveLength(0);
  });

  test("a Write outside the outputs dir is a violation", () => {
    const target = join(REPO, "runner/run.ts");
    const { violations } = classify([
      { name: "Write", args: { file_path: target }, ordinal: 2 },
    ]);
    expect(violations).toHaveLength(1);
    expect(violations[0]).toMatchObject({
      tool: "Write",
      path: target,
      ordinal: 2,
    });
  });

  test("an Edit/MultiEdit/NotebookEdit outside outputs is a violation", () => {
    const { violations } = classify([
      { name: "Edit", args: { file_path: "/etc/hosts" }, ordinal: 0 },
      {
        name: "NotebookEdit",
        args: { notebook_path: "/tmp/x.ipynb" },
        ordinal: 1,
      },
    ]);
    const flaggedTools = violations.map((v) => v.tool).sort();
    expect(flaggedTools).toEqual(["Edit", "NotebookEdit"]);
  });

  test("an install command is a warning", () => {
    const { warnings } = classify([bash("npm install left-pad")]);
    expect(warnings).toHaveLength(1);
    const [first] = warnings;
    expect(first.tool).toBe("Bash");
    expect(first.reason).toMatch(/install/i);
  });

  test("a mutating Bash command scoped to the outputs dir is not flagged", () => {
    const logFile = join(OUTPUTS, "log.txt");
    const { warnings } = classify([bash(`echo hi > ${logFile}`)]);
    expect(warnings).toHaveLength(0);
  });

  test("git worktree add is a warning (working tree outside the sandbox)", () => {
    const { warnings } = classify([bash("git worktree add ../wt -b scratch")]);
    expect(warnings).toHaveLength(1);
    expect(warnings[0].reason).toMatch(/worktree/i);
  });

  test("creating a path under .claude is a warning", () => {
    const { warnings } = classify([bash("mkdir -p .claude/foo")]);
    expect(warnings).toHaveLength(1);
    expect(warnings[0].reason).toMatch(/\.claude/i);
  });

  test("read-only tools are never flagged", () => {
    const { violations, warnings } = classify([
      { name: "Read", args: { file_path: "/anywhere" }, ordinal: 0 },
      { name: "Grep", args: { pattern: "x" }, ordinal: 1 },
      bash("ls -la /", 2),
    ]);
    expect(violations).toHaveLength(0);
    expect(warnings).toHaveLength(0);
  });
});
|
|
142
|
-
|
|
143
|
-
describe("detectLiveSourceReads", () => {
  type Invocations = Parameters<typeof detectLiveSourceReads>[0];

  // Every case scans against the same live skill dir / repo root pair.
  const scan = (invocations: Invocations) =>
    detectLiveSourceReads(invocations, LIVE_SKILL, REPO);

  // Shorthand for a Bash tool invocation carrying `command`.
  const bash = (command: string, ordinal = 0) => ({
    name: "Bash",
    args: { command },
    ordinal,
  });

  test("a Read of the live SKILL.md is flagged", () => {
    const liveFile = join(LIVE_SKILL, "SKILL.md");
    const findings = scan([
      { name: "Read", args: { file_path: liveFile }, ordinal: 1 },
    ]);
    expect(findings).toHaveLength(1);
    const [only] = findings;
    expect(only).toMatchObject({ tool: "Read", path: liveFile, ordinal: 1 });
    expect(only.reason).toMatch(/live skill source/i);
  });

  test("a Read of a staged eval copy is not flagged", () => {
    const stagedFile = join(
      REPO,
      ".claude/skills/slow-powers-eval-1-old_skill__mr-review/SKILL.md",
    );
    const findings = scan([
      { name: "Read", args: { file_path: stagedFile }, ordinal: 0 },
    ]);
    expect(findings).toHaveLength(0);
  });

  test("a relative Read path resolving under the live dir is flagged", () => {
    const findings = scan([
      {
        name: "Read",
        args: { file_path: "skills/mr-review/SKILL.md" },
        ordinal: 0,
      },
    ]);
    expect(findings).toHaveLength(1);
  });

  test("a Grep scoped to the live dir is flagged", () => {
    const findings = scan([
      { name: "Grep", args: { pattern: "x", path: LIVE_SKILL }, ordinal: 2 },
    ]);
    expect(findings).toHaveLength(1);
    expect(findings[0].tool).toBe("Grep");
  });

  test("a Bash command referencing the live dir relatively is flagged", () => {
    const command = "cat skills/mr-review/SKILL.md";
    const findings = scan([bash(command, 3)]);
    expect(findings).toHaveLength(1);
    expect(findings[0].tool).toBe("Bash");
    expect(findings[0].command).toBe("cat skills/mr-review/SKILL.md");
  });

  test("a Bash command referencing the live dir absolutely is flagged", () => {
    const findings = scan([bash(`grep -r trigger ${LIVE_SKILL}/`)]);
    expect(findings).toHaveLength(1);
  });

  test("a Bash command referencing a staged copy under .claude/skills is not flagged", () => {
    // With --stage-name the staged copy may keep the skill's natural name, so
    // its path still contains `skills/<name>`; because it sits under
    // `.claude/`, the detector has to leave it alone.
    const findings = scan([bash("cat .claude/skills/mr-review/SKILL.md")]);
    expect(findings).toHaveLength(0);
  });

  test("unrelated reads and commands are not flagged", () => {
    const findings = scan([
      { name: "Read", args: { file_path: join(OUTPUTS, "x.md") }, ordinal: 0 },
      bash("ls skills-workspace", 1),
      {
        name: "Write",
        args: { file_path: join(LIVE_SKILL, "SKILL.md") },
        ordinal: 2,
      },
    ]);
    // Writes belong to detectStrayWrites; this detector only looks at reads.
    expect(findings).toHaveLength(0);
  });
});
|
|
281
|
-
|
|
282
|
-
describe("detect-stray-writes CLI", () => {
  // tmpdir() is passed through realpath because the spawned CLI sees its cwd
  // in resolved form (macOS /var → /private/var); fixture paths have to use
  // the same form or the CLI's prefix checks would not line up.
  const FIXTURE_ROOT = join(
    realpathSync(tmpdir()),
    `slow-powers-detect-stray-test-${process.pid}`,
  );
  const SCRIPT = join(import.meta.dir, "detect-stray-writes.ts");

  // Serialise `value` as single-line JSON followed by a newline.
  const writeJson = (file: string, value: unknown) =>
    writeFileSync(file, `${JSON.stringify(value)}\n`);

  beforeAll(() => {
    mkdirSync(FIXTURE_ROOT, { recursive: true });
  });

  afterAll(() => {
    rmSync(FIXTURE_ROOT, { recursive: true, force: true });
  });

  test("reports live-source reads per run in stray-writes.json", () => {
    // --- live skill source ---------------------------------------------
    const root = join(FIXTURE_ROOT, "cli-live-reads");
    const skillDir = join(root, "skill-dir");
    const skillSub = join(skillDir, "mr-review");
    const skillFile = join(skillSub, "SKILL.md");
    mkdirSync(skillSub, { recursive: true });
    writeFileSync(
      skillFile,
      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
    );

    // --- one recorded run: reads the live SKILL.md, writes inside outputs --
    const cwd = join(root, "work");
    const iterationDir = join(
      cwd,
      "skills-workspace",
      "mr-review",
      "iteration-1",
    );
    const condDir = join(iterationDir, "eval-e1", "old_skill");
    mkdirSync(condDir, { recursive: true });

    writeJson(join(iterationDir, "conditions.json"), {
      mode: "revision",
      conditions: [
        { name: "old_skill", skill_path: skillFile },
        { name: "new_skill", skill_path: skillFile },
      ],
      timestamp: new Date().toISOString(),
      harness: "claude-code",
    });
    writeJson(join(condDir, "run.json"), {
      eval_id: "e1",
      condition: "old_skill",
      skill_path: skillFile,
      prompt: "do the task",
      files: [],
      final_message: "done",
      tool_invocations: [
        { name: "Read", args: { file_path: skillFile }, ordinal: 0 },
        {
          name: "Write",
          args: { file_path: join(condDir, "outputs", "answer.md") },
          ordinal: 1,
        },
      ],
    });

    // --- run the CLI from the fixture's working directory ---------------
    const cliArgs = [
      "bun",
      "run",
      SCRIPT,
      "--skill-dir",
      skillDir,
      "--skill",
      "mr-review",
      "--iteration",
      "1",
    ];
    const res = Bun.spawnSync(cliArgs, { cwd, stdout: "pipe", stderr: "pipe" });
    expect(res.exitCode).toBe(0);

    // --- inspect the report it wrote ------------------------------------
    type Report = {
      totals: {
        violations: number;
        warnings: number;
        live_source_reads: number;
      };
      runs: Array<{
        eval_id: string;
        condition: string;
        live_source_reads: Array<{ tool: string; path?: string }>;
      }>;
    };
    const reportText = readFileSync(
      join(iterationDir, "stray-writes.json"),
      "utf8",
    );
    const report = JSON.parse(reportText) as Report;

    expect(report.totals.live_source_reads).toBe(1);
    expect(report.totals.violations).toBe(0);
    expect(report.runs).toHaveLength(1);
    const [run] = report.runs;
    expect(run).toMatchObject({
      eval_id: "e1",
      condition: "old_skill",
    });
    expect(run.live_source_reads[0]).toMatchObject({
      tool: "Read",
      path: skillFile,
    });
  });
});
|
|
@@ -1,288 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bun
|
|
2
|
-
import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
3
|
-
import { join, relative, resolve } from "node:path";
|
|
4
|
-
import { detectRunContext } from "./context";
|
|
5
|
-
import { classifyBash, isUnder, pathArg, WRITE_TOOLS } from "./sandbox-policy";
|
|
6
|
-
import type { ConditionsRecord, RunRecord, ToolInvocation } from "./types";
|
|
7
|
-
import { validateAgainstSchema } from "./validate-schema";
|
|
8
|
-
|
|
9
|
-
/**
 * Abort the CLI: print `msg` to stderr behind an `error: ` prefix, then exit
 * the process with status 1. Never returns.
 */
function die(msg: string): never {
  const line = `error: ${msg}`;
  console.error(line);
  return process.exit(1);
}
|
|
13
|
-
|
|
14
|
-
/** One flagged tool invocation, as produced by the detectors in this file. */
export type StrayFinding = {
  /** Name of the tool that was invoked (e.g. "Write", "Read", "Bash"). */
  tool: string;
  /** Path argument of the invocation; set for file-tool findings. */
  path?: string;
  /** Full command text; set for Bash findings. */
  command?: string;
  /** The invocation's `ordinal`, copied through unchanged. */
  ordinal: number;
  /** Human-readable explanation of why the invocation was flagged. */
  reason: string;
};

/** Result of `detectStrayWrites` for a single run. */
export type RunFindings = {
  /** File-write tool calls whose target lies outside the run's outputs dir. */
  violations: StrayFinding[];
  /** Bash commands that `classifyBash` reported a reason for. */
  warnings: StrayFinding[];
};
|
|
26
|
-
|
|
27
|
-
/**
 * Sort a run's tool invocations into sandbox findings, measured against the
 * one directory the run is allowed to write to.
 *
 * - `violations`: a file-write tool (any name in `WRITE_TOOLS`) whose path
 *   argument is not under `outputsDir`. High confidence — a run that edits
 *   the real repo is a tainted data point.
 * - `warnings`: a Bash command for which `classifyBash` returns a reason
 *   (a mutating pattern that does not reference `outputsDir`). Heuristic —
 *   review before trusting.
 *
 * Relative paths are resolved against `repoRoot` (the subagent's working
 * dir). Claude Code's write tools use absolute paths, so that resolution is a
 * best-effort fallback only.
 *
 * @param invocations tool calls recorded for the run
 * @param outputsDir  the run's allowed outputs directory
 * @param repoRoot    base directory for resolving relative paths
 * @returns the violations and warnings, each in invocation order
 */
export function detectStrayWrites(
  invocations: Array<Pick<ToolInvocation, "name" | "args" | "ordinal">>,
  outputsDir: string,
  repoRoot: string,
): RunFindings {
  const result: RunFindings = { violations: [], warnings: [] };

  for (const { name, args, ordinal } of invocations) {
    if (WRITE_TOOLS.has(name)) {
      // A write tool without a path argument has nothing to check.
      const target = pathArg(args);
      if (!target || isUnder(target, outputsDir, repoRoot)) continue;
      result.violations.push({
        tool: name,
        path: target,
        ordinal,
        reason: "writes outside the run's outputs dir",
      });
      continue;
    }

    if (name !== "Bash") continue;

    // A missing or non-string command is classified as the empty string.
    const rawCommand = (args as { command?: unknown } | undefined)?.command;
    const command = typeof rawCommand === "string" ? rawCommand : "";
    const reason = classifyBash(command, [outputsDir]);
    if (reason) {
      result.warnings.push({ tool: "Bash", command, ordinal, reason });
    }
  }

  return result;
}
|
|
73
|
-
|
|
74
|
-
/** Read-only tools that carry a target path argument (see `pathArg`). */
const READ_TOOLS = new Set(["Read", "Glob", "Grep"]);

const LIVE_SOURCE_REASON =
  "reads the live skill source instead of its staged copy — the arm may be contaminated";

/** Escape every RegExp metacharacter in `s` so it matches literally. */
function escapeRegExp(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Flag tool invocations that read the **live** skill-under-test directory.
 *
 * Eval subagents are only ever meant to see the *staged* copy of the skill
 * (`.claude/skills/<slug>/`, or the inlined SKILL.md under `--no-stage`). A
 * read of the live source typically means the Skill tool couldn't resolve the
 * staged slug yet (mid-session registry refresh race) and the agent improvised
 * — fatal in revision mode, where the old_skill arm then reads new-skill
 * content. Reads are detected, not blocked: the guard stays read-permissive,
 * so this surfaces post-hoc as a validity warning.
 *
 * - Read-tool calls (Read/Glob/Grep) whose path arg resolves under the live
 *   dir are flagged; relative paths resolve against `repoRoot`.
 * - Bash commands that reference the live dir (absolute, or repo-relative
 *   text) are flagged. A staged copy under `.claude/skills/` can carry the
 *   same `skills/<name>` relative text (e.g. via `--stage-name`), so that
 *   prefix is excluded.
 * - An absolute reference only counts when the live dir is not immediately
 *   followed by another file-name character, so a sibling that merely shares
 *   the prefix (`<liveDir>-v2`, `<liveDir>.bak`) is not reported.
 * - No repo-relative pattern is built when the live dir lies outside
 *   `repoRoot` or *is* `repoRoot`: in the latter case the relative text is
 *   empty and would match almost any command.
 *
 * @param invocations  tool calls recorded for the run
 * @param liveSkillDir directory of the live (unstaged) skill source
 * @param repoRoot     base directory for resolving relative paths
 * @returns one finding per offending invocation, in invocation order
 */
export function detectLiveSourceReads(
  invocations: Array<Pick<ToolInvocation, "name" | "args" | "ordinal">>,
  liveSkillDir: string,
  repoRoot: string,
): StrayFinding[] {
  const findings: StrayFinding[] = [];
  const liveDir = resolve(liveSkillDir);

  // Absolute form. The negative lookahead rejects `<liveDir>` followed by a
  // character that would continue the last path segment; `/`, whitespace,
  // quotes, shell operators and end-of-string all still match.
  const absRe = new RegExp(`${escapeRegExp(liveDir)}(?![\\w.-])`);

  const rel = relative(repoRoot, liveDir);
  // Only a leading `..` *segment* means "outside the repo"; a directory that
  // is literally named `..foo` is still inside it.
  const outsideRepo =
    rel === ".." || rel.startsWith("../") || rel.startsWith("..\\");
  const relRe =
    rel === "" || outsideRepo
      ? null
      : new RegExp(
          // The lookbehind fires at the boundary char itself, so it checks for a
          // bare `.claude` — the `/` is consumed by the boundary group.
          `(?<!\\.claude)(^|[\\s'"=:(/])${escapeRegExp(rel)}(/|[\\s'")]|$)`,
        );

  for (const inv of invocations) {
    if (READ_TOOLS.has(inv.name)) {
      const p = pathArg(inv.args);
      if (p && isUnder(p, liveDir, repoRoot)) {
        findings.push({
          tool: inv.name,
          path: p,
          ordinal: inv.ordinal,
          reason: LIVE_SOURCE_REASON,
        });
      }
      continue;
    }

    if (inv.name === "Bash") {
      const args = inv.args as { command?: unknown } | undefined;
      const command = typeof args?.command === "string" ? args.command : "";
      // Neither pattern carries the `g` flag, so `.test` keeps no state
      // between invocations.
      if (absRe.test(command) || relRe?.test(command)) {
        findings.push({
          tool: "Bash",
          command,
          ordinal: inv.ordinal,
          reason: LIVE_SOURCE_REASON,
        });
      }
    }
  }

  return findings;
}
|
|
148
|
-
|
|
149
|
-
// CLI entry point. Scans every recorded run of one iteration, writes
// <iterationDir>/stray-writes.json, and reports each finding via console.warn.
if (import.meta.main) {
  const argv = Bun.argv.slice(2);

  /** Value that follows `--<name>` in argv, or undefined when the flag is absent. */
  const flag = (name: string): string | undefined => {
    const i = argv.indexOf(`--${name}`);
    return i === -1 ? undefined : argv[i + 1];
  };

  const iteration = flag("iteration");
  if (!iteration) die("missing --iteration");
  const ctx = detectRunContext(argv);

  const iterationDir = join(
    ctx.workspaceRoot,
    ctx.skillName,
    `iteration-${iteration}`,
  );
  if (!existsSync(iterationDir)) die(`not found: ${iterationDir}`);

  // conditions.json names the arms (conditions) that were run per eval.
  const conditionsPath = join(iterationDir, "conditions.json");
  if (!existsSync(conditionsPath)) die(`missing: ${conditionsPath}`);
  const conditions: ConditionsRecord = JSON.parse(
    readFileSync(conditionsPath, "utf8"),
  );
  const conditionNames = conditions.conditions.map((c) => c.name);

  // dispatch.json carries the authoritative outputs_dir per task; fall back to
  // the conventional <condDir>/outputs when it's absent (hand-authored runs).
  const dispatchPath = join(iterationDir, "dispatch.json");
  const outputsByKey = new Map<string, string>();
  if (existsSync(dispatchPath)) {
    try {
      const dispatch = JSON.parse(readFileSync(dispatchPath, "utf8")) as {
        tasks?: Array<{
          eval_id: string;
          condition: string;
          outputs_dir?: string;
        }>;
      };
      for (const t of dispatch.tasks ?? []) {
        if (t.outputs_dir)
          outputsByKey.set(`${t.eval_id}:${t.condition}`, t.outputs_dir);
      }
    } catch (err) {
      // Still best-effort, but no longer silent: judging runs against the
      // wrong outputs dir would produce misleading violations.
      const detail = err instanceof Error ? err.message : String(err);
      console.warn(
        `warning: could not read ${dispatchPath} (${detail}); falling back to the conventional <condDir>/outputs layout`,
      );
    }
  }

  // Relative tool paths are resolved against the directory the CLI runs in.
  const repoRoot = process.cwd();
  const evalDirs = readdirSync(iterationDir).filter((d) =>
    d.startsWith("eval-"),
  );

  type RunReport = {
    eval_id: string;
    condition: string;
    violations: StrayFinding[];
    warnings: StrayFinding[];
    live_source_reads: StrayFinding[];
  };
  const runs: RunReport[] = [];
  let totalViolations = 0;
  let totalWarnings = 0;
  let totalLiveReads = 0;

  for (const evalDir of evalDirs) {
    const evalId = evalDir.replace(/^eval-/, "");
    for (const cond of conditionNames) {
      const condDir = join(iterationDir, evalDir, cond);
      const runPath = join(condDir, "run.json");
      // An eval/condition pair without a run record is skipped.
      if (!existsSync(runPath)) continue;
      const run = validateAgainstSchema<RunRecord>(
        "run-record",
        JSON.parse(readFileSync(runPath, "utf8")),
        runPath,
      );
      const invocations = Array.isArray(run.tool_invocations)
        ? run.tool_invocations
        : [];
      const outputsDir =
        outputsByKey.get(`${evalId}:${cond}`) ?? join(condDir, "outputs");
      const findings = detectStrayWrites(invocations, outputsDir, repoRoot);
      const liveReads = detectLiveSourceReads(
        invocations,
        ctx.skillSubdir,
        repoRoot,
      );
      // Only runs with at least one finding appear in the report.
      if (
        findings.violations.length ||
        findings.warnings.length ||
        liveReads.length
      ) {
        runs.push({
          eval_id: evalId,
          condition: cond,
          violations: findings.violations,
          warnings: findings.warnings,
          live_source_reads: liveReads,
        });
      }
      totalViolations += findings.violations.length;
      totalWarnings += findings.warnings.length;
      totalLiveReads += liveReads.length;
    }
  }

  const report = {
    generated: new Date().toISOString(),
    iteration: Number(iteration),
    totals: {
      violations: totalViolations,
      warnings: totalWarnings,
      live_source_reads: totalLiveReads,
    },
    runs,
  };
  const outPath = join(iterationDir, "stray-writes.json");
  // The report is checked against its schema before anything is written.
  validateAgainstSchema("stray-writes", report, outPath);
  writeFileSync(outPath, `${JSON.stringify(report, null, 2)}\n`);
  console.log(`Wrote ${outPath}`);

  for (const r of runs) {
    for (const v of r.violations)
      console.warn(
        `✗ ${r.eval_id}/${r.condition}: ${v.tool} wrote outside outputs dir → ${v.path} (ordinal ${v.ordinal})`,
      );
    for (const w of r.warnings)
      console.warn(
        `⚠ ${r.eval_id}/${r.condition}: Bash ${w.reason} (ordinal ${w.ordinal}): ${w.command}`,
      );
    for (const l of r.live_source_reads)
      console.warn(
        `⚠ ${r.eval_id}/${r.condition}: ${l.tool} read the live skill source (ordinal ${l.ordinal}): ${l.path ?? l.command}`,
      );
  }
  if (totalViolations === 0 && totalWarnings === 0 && totalLiveReads === 0)
    console.log("✓ No out-of-bounds writes or live-source reads detected.");
  else
    console.warn(
      `\n${totalViolations} violation(s), ${totalWarnings} warning(s), ${totalLiveReads} live-source read(s). Runs with violations edited files outside their sandbox; runs with live-source reads saw the live skill instead of their staged copy — treat those data points as tainted.`,
    );
}
|