@slowdini/slow-powers-opencode 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85):
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +69 -5
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/hardening-plans/SKILL.md +29 -7
  16. package/skills/hardening-plans/evals/baseline/BASELINE.md +11 -6
  17. package/skills/hardening-plans/evals/baseline/NOTES.md +72 -58
  18. package/skills/hardening-plans/evals/baseline/benchmark.json +25 -25
  19. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +2 -2
  20. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +2 -2
  21. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__new_skill.json +39 -0
  22. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__old_skill.json +39 -0
  23. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json +39 -0
  24. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json +39 -0
  25. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__new_skill.json +32 -0
  26. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__old_skill.json +32 -0
  27. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__new_skill.json +39 -0
  28. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__old_skill.json +39 -0
  29. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json +39 -0
  30. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json +39 -0
  31. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +3 -3
  32. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +8 -8
  33. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json +39 -0
  34. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json +39 -0
  35. package/skills/hardening-plans/evals/evals.json +46 -0
  36. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  37. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  38. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  39. package/skills/evaluating-skills/harness-parity.md +0 -155
  40. package/skills/evaluating-skills/runner/README.md +0 -163
  41. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  42. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  43. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  44. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  45. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  46. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  47. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  48. package/skills/evaluating-skills/runner/context.ts +0 -90
  49. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  50. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  51. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  52. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  53. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  54. package/skills/evaluating-skills/runner/grade.ts +0 -603
  55. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  56. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  57. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  58. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  59. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  60. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  61. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  62. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  63. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  64. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  65. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  66. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  67. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  68. package/skills/evaluating-skills/runner/run.ts +0 -1388
  69. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  70. package/skills/evaluating-skills/runner/types.ts +0 -121
  71. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  72. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  73. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  74. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  75. package/skills/evaluating-skills/runner/validate.ts +0 -21
  76. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  77. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  78. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  79. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  80. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  81. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  82. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  83. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  84. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  85. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,396 +0,0 @@
1
- import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
- import {
3
- mkdirSync,
4
- readFileSync,
5
- realpathSync,
6
- rmSync,
7
- writeFileSync,
8
- } from "node:fs";
9
- import { tmpdir } from "node:os";
10
- import { join } from "node:path";
11
- import {
12
- detectLiveSourceReads,
13
- detectStrayWrites,
14
- } from "./detect-stray-writes";
15
-
16
// Fixture paths handed to detectStrayWrites / detectLiveSourceReads in the
// suites that follow.
const REPO = "/work/repo";
const OUTPUTS = "/work/iteration-1/eval-x/with_skill/outputs";
const LIVE_SKILL = join(REPO, "skills", "mr-review");
19
-
20
describe("detectStrayWrites", () => {
  type Invocations = Parameters<typeof detectStrayWrites>[0];

  // Every case runs against the same fixture outputs dir and repo root.
  const scan = (invocations: Invocations) =>
    detectStrayWrites(invocations, OUTPUTS, REPO);

  // Shorthand for a Bash invocation carrying a single command string.
  const bash = (command: string, ordinal = 0) => ({
    name: "Bash",
    args: { command },
    ordinal,
  });

  test("a Write inside the outputs dir is clean", () => {
    const { violations, warnings } = scan([
      {
        name: "Write",
        args: { file_path: join(OUTPUTS, "answer.md") },
        ordinal: 0,
      },
    ]);
    expect(violations).toHaveLength(0);
    expect(warnings).toHaveLength(0);
  });

  test("a Write outside the outputs dir is a violation", () => {
    const target = join(REPO, "runner/run.ts");
    const { violations } = scan([
      { name: "Write", args: { file_path: target }, ordinal: 2 },
    ]);
    expect(violations).toHaveLength(1);
    expect(violations[0]).toMatchObject({
      tool: "Write",
      path: target,
      ordinal: 2,
    });
  });

  test("an Edit/MultiEdit/NotebookEdit outside outputs is a violation", () => {
    const { violations } = scan([
      { name: "Edit", args: { file_path: "/etc/hosts" }, ordinal: 0 },
      {
        name: "NotebookEdit",
        args: { notebook_path: "/tmp/x.ipynb" },
        ordinal: 1,
      },
    ]);
    const flaggedTools = violations.map((v) => v.tool).sort();
    expect(flaggedTools).toEqual(["Edit", "NotebookEdit"]);
  });

  test("an install command is a warning", () => {
    const { warnings } = scan([bash("npm install left-pad")]);
    expect(warnings).toHaveLength(1);
    expect(warnings[0].tool).toBe("Bash");
    expect(warnings[0].reason).toMatch(/install/i);
  });

  test("a mutating Bash command scoped to the outputs dir is not flagged", () => {
    const { warnings } = scan([
      bash(`echo hi > ${join(OUTPUTS, "log.txt")}`),
    ]);
    expect(warnings).toHaveLength(0);
  });

  test("git worktree add is a warning (working tree outside the sandbox)", () => {
    const { warnings } = scan([bash("git worktree add ../wt -b scratch")]);
    expect(warnings).toHaveLength(1);
    expect(warnings[0].reason).toMatch(/worktree/i);
  });

  test("creating a path under .claude is a warning", () => {
    const { warnings } = scan([bash("mkdir -p .claude/foo")]);
    expect(warnings).toHaveLength(1);
    expect(warnings[0].reason).toMatch(/\.claude/i);
  });

  test("read-only tools are never flagged", () => {
    const { violations, warnings } = scan([
      { name: "Read", args: { file_path: "/anywhere" }, ordinal: 0 },
      { name: "Grep", args: { pattern: "x" }, ordinal: 1 },
      bash("ls -la /", 2),
    ]);
    expect(violations).toHaveLength(0);
    expect(warnings).toHaveLength(0);
  });
});
142
-
143
describe("detectLiveSourceReads", () => {
  type Invocations = Parameters<typeof detectLiveSourceReads>[0];

  // Every case checks against the same live skill dir and repo root.
  const scan = (invocations: Invocations) =>
    detectLiveSourceReads(invocations, LIVE_SKILL, REPO);

  // Shorthand builders for the two invocation shapes used most often here.
  const read = (file_path: string, ordinal = 0) => ({
    name: "Read",
    args: { file_path },
    ordinal,
  });
  const bash = (command: string, ordinal = 0) => ({
    name: "Bash",
    args: { command },
    ordinal,
  });

  test("a Read of the live SKILL.md is flagged", () => {
    const liveSkillMd = join(LIVE_SKILL, "SKILL.md");
    const findings = scan([read(liveSkillMd, 1)]);
    expect(findings).toHaveLength(1);
    expect(findings[0]).toMatchObject({
      tool: "Read",
      path: liveSkillMd,
      ordinal: 1,
    });
    expect(findings[0].reason).toMatch(/live skill source/i);
  });

  test("a Read of a staged eval copy is not flagged", () => {
    const stagedCopy = join(
      REPO,
      ".claude/skills/slow-powers-eval-1-old_skill__mr-review/SKILL.md",
    );
    expect(scan([read(stagedCopy)])).toHaveLength(0);
  });

  test("a relative Read path resolving under the live dir is flagged", () => {
    expect(scan([read("skills/mr-review/SKILL.md")])).toHaveLength(1);
  });

  test("a Grep scoped to the live dir is flagged", () => {
    const findings = scan([
      { name: "Grep", args: { pattern: "x", path: LIVE_SKILL }, ordinal: 2 },
    ]);
    expect(findings).toHaveLength(1);
    expect(findings[0].tool).toBe("Grep");
  });

  test("a Bash command referencing the live dir relatively is flagged", () => {
    const command = "cat skills/mr-review/SKILL.md";
    const findings = scan([bash(command, 3)]);
    expect(findings).toHaveLength(1);
    expect(findings[0].tool).toBe("Bash");
    expect(findings[0].command).toBe("cat skills/mr-review/SKILL.md");
  });

  test("a Bash command referencing the live dir absolutely is flagged", () => {
    const findings = scan([bash(`grep -r trigger ${LIVE_SKILL}/`)]);
    expect(findings).toHaveLength(1);
  });

  test("a Bash command referencing a staged copy under .claude/skills is not flagged", () => {
    // With --stage-name the staged copy can sit under the skill's natural name.
    // That path still contains `skills/<name>`, but it lives below `.claude/`,
    // so it must not be treated as the live dir.
    const findings = scan([bash("cat .claude/skills/mr-review/SKILL.md")]);
    expect(findings).toHaveLength(0);
  });

  test("unrelated reads and commands are not flagged", () => {
    const findings = scan([
      read(join(OUTPUTS, "x.md"), 0),
      bash("ls skills-workspace", 1),
      {
        name: "Write",
        args: { file_path: join(LIVE_SKILL, "SKILL.md") },
        ordinal: 2,
      },
    ]);
    // Writes belong to detectStrayWrites; this detector only looks at reads.
    expect(findings).toHaveLength(0);
  });
});
281
-
282
describe("detect-stray-writes CLI", () => {
  // The spawned CLI sees its cwd in resolved form (on macOS /var becomes
  // /private/var), so the fixture root is realpath'd here to keep the
  // path-prefix checks comparing like with like.
  const FIXTURE_ROOT = join(
    realpathSync(tmpdir()),
    `slow-powers-detect-stray-test-${process.pid}`,
  );
  const SCRIPT = join(import.meta.dir, "detect-stray-writes.ts");

  // Serialise a fixture as single-line JSON followed by a newline.
  const writeJson = (file: string, value: unknown) => {
    writeFileSync(file, `${JSON.stringify(value)}\n`);
  };

  beforeAll(() => {
    mkdirSync(FIXTURE_ROOT, { recursive: true });
  });

  afterAll(() => {
    rmSync(FIXTURE_ROOT, { recursive: true, force: true });
  });

  test("reports live-source reads per run in stray-writes.json", () => {
    const caseRoot = join(FIXTURE_ROOT, "cli-live-reads");

    // Live skill source: <skill-dir>/mr-review/SKILL.md
    const skillDir = join(caseRoot, "skill-dir");
    const liveSkill = join(skillDir, "mr-review");
    const liveSkillMd = join(liveSkill, "SKILL.md");
    mkdirSync(liveSkill, { recursive: true });
    writeFileSync(
      liveSkillMd,
      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
    );

    // Workspace layout the CLI walks, rooted at the child process cwd.
    const cwd = join(caseRoot, "work");
    const iterationDir = join(
      cwd,
      "skills-workspace",
      "mr-review",
      "iteration-1",
    );
    const condDir = join(iterationDir, "eval-e1", "old_skill");
    mkdirSync(condDir, { recursive: true });

    writeJson(join(iterationDir, "conditions.json"), {
      mode: "revision",
      conditions: [
        { name: "old_skill", skill_path: liveSkillMd },
        { name: "new_skill", skill_path: liveSkillMd },
      ],
      timestamp: new Date().toISOString(),
      harness: "claude-code",
    });

    // One run: a read of the live SKILL.md plus a write inside outputs/.
    writeJson(join(condDir, "run.json"), {
      eval_id: "e1",
      condition: "old_skill",
      skill_path: liveSkillMd,
      prompt: "do the task",
      files: [],
      final_message: "done",
      tool_invocations: [
        { name: "Read", args: { file_path: liveSkillMd }, ordinal: 0 },
        {
          name: "Write",
          args: { file_path: join(condDir, "outputs", "answer.md") },
          ordinal: 1,
        },
      ],
    });

    const cliCommand = [
      "bun",
      "run",
      SCRIPT,
      "--skill-dir",
      skillDir,
      "--skill",
      "mr-review",
      "--iteration",
      "1",
    ];
    const res = Bun.spawnSync(cliCommand, {
      cwd,
      stdout: "pipe",
      stderr: "pipe",
    });
    expect(res.exitCode).toBe(0);

    type Report = {
      totals: {
        violations: number;
        warnings: number;
        live_source_reads: number;
      };
      runs: Array<{
        eval_id: string;
        condition: string;
        live_source_reads: Array<{ tool: string; path?: string }>;
      }>;
    };
    const reportText = readFileSync(
      join(iterationDir, "stray-writes.json"),
      "utf8",
    );
    const report = JSON.parse(reportText) as Report;

    expect(report.totals.live_source_reads).toBe(1);
    expect(report.totals.violations).toBe(0);
    expect(report.runs).toHaveLength(1);

    const [onlyRun] = report.runs;
    expect(onlyRun).toMatchObject({
      eval_id: "e1",
      condition: "old_skill",
    });
    expect(onlyRun.live_source_reads[0]).toMatchObject({
      tool: "Read",
      path: liveSkillMd,
    });
  });
});
@@ -1,288 +0,0 @@
1
- #!/usr/bin/env bun
2
- import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
3
- import { join, relative, resolve } from "node:path";
4
- import { detectRunContext } from "./context";
5
- import { classifyBash, isUnder, pathArg, WRITE_TOOLS } from "./sandbox-policy";
6
- import type { ConditionsRecord, RunRecord, ToolInvocation } from "./types";
7
- import { validateAgainstSchema } from "./validate-schema";
8
-
9
/**
 * Report a fatal problem and stop the process.
 *
 * Prints `error: <msg>` via `console.error`, then exits with status 1.
 * Never returns.
 */
function die(msg: string): never {
  const line = `error: ${msg}`;
  console.error(line);
  return process.exit(1);
}
13
-
14
/** One flagged tool invocation, as produced by the detectors in this module. */
export type StrayFinding = {
  /** Name of the invoked tool (the invocation's `name`, or "Bash"). */
  tool: string;
  /** Target path; set on findings that come from a path-carrying tool. */
  path?: string;
  /** Raw command text; set on findings that come from a Bash invocation. */
  command?: string;
  /** The invocation's `ordinal`, copied through unchanged. */
  ordinal: number;
  /** Human-readable explanation of why the invocation was flagged. */
  reason: string;
};

/** Per-run result of `detectStrayWrites`. */
export type RunFindings = {
  /** Write-tool calls whose target path falls outside the run's outputs dir. */
  violations: StrayFinding[];
  /** Bash commands for which `classifyBash` returned a reason. */
  warnings: StrayFinding[];
};
26
-
27
/**
 * Sort a run's tool invocations into sandbox violations and warnings,
 * measured against the run's allowed outputs dir.
 *
 * - `violations`: a file-write tool (any name in `WRITE_TOOLS`) whose path
 *   argument is not under `outputsDir`. High confidence: a run that edits the
 *   real repo is a tainted data point.
 * - `warnings`: a Bash command for which `classifyBash` returns a reason when
 *   given `outputsDir` as the only allowed directory. Heuristic: review
 *   before trusting.
 *
 * Relative paths are resolved against `repoRoot` (the subagent's working
 * dir). Claude Code's write tools use absolute paths, so that resolution is a
 * best-effort fallback only.
 *
 * @param invocations Tool calls of one run (`name`, `args`, `ordinal`).
 * @param outputsDir  Directory the run is allowed to write into.
 * @param repoRoot    Base directory for resolving relative paths.
 * @returns The violations and warnings found, in invocation order.
 */
export function detectStrayWrites(
  invocations: Array<Pick<ToolInvocation, "name" | "args" | "ordinal">>,
  outputsDir: string,
  repoRoot: string,
): RunFindings {
  const result: RunFindings = { violations: [], warnings: [] };

  for (const { name, args, ordinal } of invocations) {
    if (WRITE_TOOLS.has(name)) {
      // A write tool with no recognisable path argument is left unflagged.
      const target = pathArg(args);
      if (target && !isUnder(target, outputsDir, repoRoot)) {
        result.violations.push({
          tool: name,
          path: target,
          ordinal,
          reason: "writes outside the run's outputs dir",
        });
      }
      continue;
    }

    if (name !== "Bash") continue;

    // A missing or non-string command is classified as the empty string.
    const rawCommand = (args as { command?: unknown } | undefined)?.command;
    const command = typeof rawCommand === "string" ? rawCommand : "";
    const reason = classifyBash(command, [outputsDir]);
    if (reason) {
      result.warnings.push({ tool: "Bash", command, ordinal, reason });
    }
  }

  return result;
}
73
-
74
/** Names of the read-only tools whose target path is extracted with `pathArg`. */
const READ_TOOLS = new Set(["Read", "Glob", "Grep"]);

/** Reason text attached to every live-source-read finding. */
const LIVE_SOURCE_REASON =
  "reads the live skill source instead of its staged copy — the arm may be contaminated";
79
-
80
/**
 * Backslash-escape every regular-expression metacharacter in `s` so the
 * result can be embedded in a `RegExp` source and match `s` literally.
 */
function escapeRegExp(s: string): string {
  const prefixWithBackslash = (metachar: string): string => `\\${metachar}`;
  return s.replace(/[.*+?^${}()|[\]\\]/g, prefixWithBackslash);
}
83
-
84
/**
 * Flag tool invocations that read the **live** skill-under-test directory.
 *
 * Eval subagents are only ever meant to see the *staged* copy of the skill
 * (`.claude/skills/<slug>/`, or the inlined SKILL.md under `--no-stage`). A
 * read of the live source typically means the Skill tool couldn't resolve the
 * staged slug yet (mid-session registry refresh race) and the agent improvised
 * — fatal in revision mode, where the old_skill arm then reads new-skill
 * content. Reads are detected, not blocked: the guard stays read-permissive,
 * so this surfaces post-hoc as a validity warning.
 *
 * - Read-tool calls (Read/Glob/Grep) whose path arg resolves under the live
 *   dir are flagged; relative paths resolve against `repoRoot`.
 * - Bash commands that reference the live dir are flagged, either by its
 *   absolute path or by its repo-relative text. An absolute reference must
 *   end at a path-segment boundary, so a sibling such as `<liveDir>-v2` is
 *   not reported. A staged copy under `.claude/skills/` can carry the same
 *   `skills/<name>` relative text (e.g. via `--stage-name`), so that prefix
 *   is excluded.
 * - Repo-relative matching is skipped when the live dir is the repo root
 *   itself or lies outside the repo; only absolute references are checked
 *   in those cases.
 *
 * @param invocations  Tool calls of one run (`name`, `args`, `ordinal`).
 * @param liveSkillDir Directory of the live skill source (resolved to absolute).
 * @param repoRoot     Base directory for relative paths and relative text.
 * @returns One finding per offending invocation, in invocation order.
 */
export function detectLiveSourceReads(
  invocations: Array<Pick<ToolInvocation, "name" | "args" | "ordinal">>,
  liveSkillDir: string,
  repoRoot: string,
): StrayFinding[] {
  const findings: StrayFinding[] = [];
  const liveDir = resolve(liveSkillDir);
  const rel = relative(repoRoot, liveDir);

  // "" means liveDir === repoRoot: the relative pattern would collapse to bare
  // boundary characters and match almost any command. ".." or "../…" means the
  // live dir is outside the repo. Testing for the path segment (rather than a
  // plain "starts with two dots") keeps an in-repo directory named e.g.
  // `..cache` eligible for relative matching.
  const outsideRepo =
    rel === ".." || rel.startsWith("../") || rel.startsWith("..\\");
  const relRe =
    rel === "" || outsideRepo
      ? null
      : new RegExp(
          // The lookbehind fires at the boundary char itself, so it checks for a
          // bare `.claude` — the `/` is consumed by the boundary group.
          `(?<!\\.claude)(^|[\\s'"=:(/])${escapeRegExp(rel)}(/|[\\s'")]|$)`,
        );

  // Absolute form: the live dir must not be followed by another filename
  // character, otherwise `<liveDir>-v2/...` or `<liveDir>.bak` would be
  // reported as a read of the live dir. Separators, quotes, shell operators
  // and glob characters still count as a match.
  const absRe = new RegExp(`${escapeRegExp(liveDir)}(?![\\w.-])`);

  for (const inv of invocations) {
    if (READ_TOOLS.has(inv.name)) {
      const p = pathArg(inv.args);
      if (p && isUnder(p, liveDir, repoRoot)) {
        findings.push({
          tool: inv.name,
          path: p,
          ordinal: inv.ordinal,
          reason: LIVE_SOURCE_REASON,
        });
      }
      continue;
    }

    if (inv.name === "Bash") {
      const args = inv.args as { command?: unknown } | undefined;
      const command = typeof args?.command === "string" ? args.command : "";
      if (absRe.test(command) || relRe?.test(command)) {
        findings.push({
          tool: "Bash",
          command,
          ordinal: inv.ordinal,
          reason: LIVE_SOURCE_REASON,
        });
      }
    }
  }

  return findings;
}
148
-
149
// CLI entry point: scan every recorded run of one iteration and write the
// combined findings to <iterationDir>/stray-writes.json.
if (import.meta.main) {
  const cliArgs = Bun.argv.slice(2);

  // Value following `--<name>` on the command line; undefined when absent.
  const readFlag = (name: string): string | undefined => {
    const at = cliArgs.indexOf(`--${name}`);
    if (at === -1) return undefined;
    return cliArgs[at + 1];
  };

  const iteration = readFlag("iteration");
  if (!iteration) die("missing --iteration");
  const ctx = detectRunContext(cliArgs);

  const iterationDir = join(
    ctx.workspaceRoot,
    ctx.skillName,
    `iteration-${iteration}`,
  );
  if (!existsSync(iterationDir)) die(`not found: ${iterationDir}`);

  const conditionsPath = join(iterationDir, "conditions.json");
  if (!existsSync(conditionsPath)) die(`missing: ${conditionsPath}`);
  const conditionsRecord: ConditionsRecord = JSON.parse(
    readFileSync(conditionsPath, "utf8"),
  );
  const conditionNames = conditionsRecord.conditions.map(({ name }) => name);

  // Map key identifying one (eval, condition) task.
  const taskKey = (evalId: string, condition: string): string =>
    `${evalId}:${condition}`;

  // When dispatch.json exists, its per-task outputs_dir is authoritative.
  // Without it (hand-authored runs), or when it cannot be read or parsed,
  // the conventional <condDir>/outputs location is used instead.
  const outputsByKey = new Map<string, string>();
  const dispatchPath = join(iterationDir, "dispatch.json");
  if (existsSync(dispatchPath)) {
    try {
      const dispatch = JSON.parse(readFileSync(dispatchPath, "utf8")) as {
        tasks?: Array<{
          eval_id: string;
          condition: string;
          outputs_dir?: string;
        }>;
      };
      for (const task of dispatch.tasks ?? []) {
        if (task.outputs_dir) {
          outputsByKey.set(
            taskKey(task.eval_id, task.condition),
            task.outputs_dir,
          );
        }
      }
    } catch {
      // Deliberately ignored: fall back to the conventional outputs dir.
    }
  }

  const repoRoot = process.cwd();
  const evalDirs = readdirSync(iterationDir).filter((entry) =>
    entry.startsWith("eval-"),
  );

  type RunReport = {
    eval_id: string;
    condition: string;
    violations: StrayFinding[];
    warnings: StrayFinding[];
    live_source_reads: StrayFinding[];
  };
  // Only runs with at least one finding are recorded.
  const runs: RunReport[] = [];

  for (const evalDir of evalDirs) {
    const evalId = evalDir.replace(/^eval-/, "");
    for (const condition of conditionNames) {
      const condDir = join(iterationDir, evalDir, condition);
      const runPath = join(condDir, "run.json");
      if (!existsSync(runPath)) continue;

      const run = validateAgainstSchema<RunRecord>(
        "run-record",
        JSON.parse(readFileSync(runPath, "utf8")),
        runPath,
      );
      const invocations = Array.isArray(run.tool_invocations)
        ? run.tool_invocations
        : [];
      const outputsDir =
        outputsByKey.get(taskKey(evalId, condition)) ??
        join(condDir, "outputs");

      const { violations, warnings } = detectStrayWrites(
        invocations,
        outputsDir,
        repoRoot,
      );
      const liveReads = detectLiveSourceReads(
        invocations,
        ctx.skillSubdir,
        repoRoot,
      );

      const isClean =
        violations.length === 0 &&
        warnings.length === 0 &&
        liveReads.length === 0;
      if (isClean) continue;

      runs.push({
        eval_id: evalId,
        condition,
        violations,
        warnings,
        live_source_reads: liveReads,
      });
    }
  }

  // Clean runs contribute nothing, so totals over the recorded runs equal
  // totals over every run that was scanned.
  const totalViolations = runs.reduce((n, r) => n + r.violations.length, 0);
  const totalWarnings = runs.reduce((n, r) => n + r.warnings.length, 0);
  const totalLiveReads = runs.reduce(
    (n, r) => n + r.live_source_reads.length,
    0,
  );

  const report = {
    generated: new Date().toISOString(),
    iteration: Number(iteration),
    totals: {
      violations: totalViolations,
      warnings: totalWarnings,
      live_source_reads: totalLiveReads,
    },
    runs,
  };
  const outPath = join(iterationDir, "stray-writes.json");
  validateAgainstSchema("stray-writes", report, outPath);
  writeFileSync(outPath, `${JSON.stringify(report, null, 2)}\n`);
  console.log(`Wrote ${outPath}`);

  // Human-readable summary of the same findings.
  for (const { eval_id, condition, violations, warnings, live_source_reads } of runs) {
    const where = `${eval_id}/${condition}`;
    for (const v of violations) {
      console.warn(
        `✗ ${where}: ${v.tool} wrote outside outputs dir → ${v.path} (ordinal ${v.ordinal})`,
      );
    }
    for (const w of warnings) {
      console.warn(
        `⚠ ${where}: Bash ${w.reason} (ordinal ${w.ordinal}): ${w.command}`,
      );
    }
    for (const l of live_source_reads) {
      console.warn(
        `⚠ ${where}: ${l.tool} read the live skill source (ordinal ${l.ordinal}): ${l.path ?? l.command}`,
      );
    }
  }

  const nothingFound =
    totalViolations === 0 && totalWarnings === 0 && totalLiveReads === 0;
  if (nothingFound) {
    console.log("✓ No out-of-bounds writes or live-source reads detected.");
  } else {
    console.warn(
      `\n${totalViolations} violation(s), ${totalWarnings} warning(s), ${totalLiveReads} live-source read(s). Runs with violations edited files outside their sandbox; runs with live-source reads saw the live skill instead of their staged copy — treat those data points as tainted.`,
    );
  }
}