@slowdini/slow-powers-opencode 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  17. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  18. package/skills/evaluating-skills/harness-parity.md +0 -155
  19. package/skills/evaluating-skills/runner/README.md +0 -163
  20. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  21. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  22. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  23. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  24. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  25. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  26. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  27. package/skills/evaluating-skills/runner/context.ts +0 -90
  28. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  29. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  30. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  31. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  32. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  33. package/skills/evaluating-skills/runner/grade.ts +0 -603
  34. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  35. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  36. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  37. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  38. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  39. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  40. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  41. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  42. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  43. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  44. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  45. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  46. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  47. package/skills/evaluating-skills/runner/run.ts +0 -1388
  48. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  49. package/skills/evaluating-skills/runner/types.ts +0 -121
  50. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  51. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  52. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  53. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  54. package/skills/evaluating-skills/runner/validate.ts +0 -21
  55. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  56. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  57. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  58. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  59. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  60. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  61. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  62. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  63. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  64. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,396 +0,0 @@
1
- import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
- import {
3
- mkdirSync,
4
- readFileSync,
5
- realpathSync,
6
- rmSync,
7
- writeFileSync,
8
- } from "node:fs";
9
- import { tmpdir } from "node:os";
10
- import { join } from "node:path";
11
- import {
12
- detectLiveSourceReads,
13
- detectStrayWrites,
14
- } from "./detect-stray-writes";
15
-
16
// Shared fixture paths for the detector unit tests below. They are plain
// strings handed to the detectors as arguments.

// Outputs dir of the run under test, passed as `outputsDir`.
const OUTPUTS = "/work/iteration-1/eval-x/with_skill/outputs";
// Repo root, passed as `repoRoot` to both detectors.
const REPO = "/work/repo";
// Live skill directory, passed as `liveSkillDir` to detectLiveSourceReads.
const LIVE_SKILL = join(REPO, "skills", "mr-review");
19
-
20
/**
 * Unit tests for `detectStrayWrites`: file-write tools outside the outputs dir
 * are violations, mutating Bash commands are warnings, and read-only activity
 * produces no findings.
 */
describe("detectStrayWrites", () => {
  type Calls = Parameters<typeof detectStrayWrites>[0];

  // Every case classifies against the same outputs dir and repo root.
  const scan = (calls: Calls) => detectStrayWrites(calls, OUTPUTS, REPO);

  test("a Write inside the outputs dir is clean", () => {
    const { violations, warnings } = scan([
      {
        name: "Write",
        args: { file_path: join(OUTPUTS, "answer.md") },
        ordinal: 0,
      },
    ]);
    expect(violations).toHaveLength(0);
    expect(warnings).toHaveLength(0);
  });

  test("a Write outside the outputs dir is a violation", () => {
    const target = join(REPO, "runner/run.ts");
    const { violations } = scan([
      { name: "Write", args: { file_path: target }, ordinal: 2 },
    ]);
    expect(violations).toHaveLength(1);
    expect(violations[0]).toMatchObject({
      tool: "Write",
      path: target,
      ordinal: 2,
    });
  });

  test("an Edit/MultiEdit/NotebookEdit outside outputs is a violation", () => {
    const { violations } = scan([
      { name: "Edit", args: { file_path: "/etc/hosts" }, ordinal: 0 },
      {
        name: "NotebookEdit",
        args: { notebook_path: "/tmp/x.ipynb" },
        ordinal: 1,
      },
    ]);
    const flaggedTools = violations.map((v) => v.tool).sort();
    expect(flaggedTools).toEqual(["Edit", "NotebookEdit"]);
  });

  test("an install command is a warning", () => {
    const { warnings } = scan([
      { name: "Bash", args: { command: "npm install left-pad" }, ordinal: 0 },
    ]);
    expect(warnings).toHaveLength(1);
    const [first] = warnings;
    expect(first.tool).toBe("Bash");
    expect(first.reason).toMatch(/install/i);
  });

  test("a mutating Bash command scoped to the outputs dir is not flagged", () => {
    const logPath = join(OUTPUTS, "log.txt");
    const { warnings } = scan([
      { name: "Bash", args: { command: `echo hi > ${logPath}` }, ordinal: 0 },
    ]);
    expect(warnings).toHaveLength(0);
  });

  test("git worktree add is a warning (working tree outside the sandbox)", () => {
    const { warnings } = scan([
      {
        name: "Bash",
        args: { command: "git worktree add ../wt -b scratch" },
        ordinal: 0,
      },
    ]);
    expect(warnings).toHaveLength(1);
    expect(warnings[0].reason).toMatch(/worktree/i);
  });

  test("creating a path under .claude is a warning", () => {
    const { warnings } = scan([
      { name: "Bash", args: { command: "mkdir -p .claude/foo" }, ordinal: 0 },
    ]);
    expect(warnings).toHaveLength(1);
    expect(warnings[0].reason).toMatch(/\.claude/i);
  });

  test("read-only tools are never flagged", () => {
    const { violations, warnings } = scan([
      { name: "Read", args: { file_path: "/anywhere" }, ordinal: 0 },
      { name: "Grep", args: { pattern: "x" }, ordinal: 1 },
      { name: "Bash", args: { command: "ls -la /" }, ordinal: 2 },
    ]);
    expect(violations).toHaveLength(0);
    expect(warnings).toHaveLength(0);
  });
});
142
-
143
/**
 * Unit tests for `detectLiveSourceReads`: reads of the live skill directory
 * (via read tools or Bash text) are flagged, while staged copies under
 * `.claude/skills/` and write-tool calls are not.
 */
describe("detectLiveSourceReads", () => {
  type Calls = Parameters<typeof detectLiveSourceReads>[0];

  // Every case checks against the same live skill dir and repo root.
  const scan = (calls: Calls) =>
    detectLiveSourceReads(calls, LIVE_SKILL, REPO);

  const LIVE_SKILL_MD = join(LIVE_SKILL, "SKILL.md");

  test("a Read of the live SKILL.md is flagged", () => {
    const hits = scan([
      { name: "Read", args: { file_path: LIVE_SKILL_MD }, ordinal: 1 },
    ]);
    expect(hits).toHaveLength(1);
    const [hit] = hits;
    expect(hit).toMatchObject({
      tool: "Read",
      path: LIVE_SKILL_MD,
      ordinal: 1,
    });
    expect(hit.reason).toMatch(/live skill source/i);
  });

  test("a Read of a staged eval copy is not flagged", () => {
    const stagedCopy = join(
      REPO,
      ".claude/skills/slow-powers-eval-1-old_skill__mr-review/SKILL.md",
    );
    const hits = scan([
      { name: "Read", args: { file_path: stagedCopy }, ordinal: 0 },
    ]);
    expect(hits).toHaveLength(0);
  });

  test("a relative Read path resolving under the live dir is flagged", () => {
    const hits = scan([
      {
        name: "Read",
        args: { file_path: "skills/mr-review/SKILL.md" },
        ordinal: 0,
      },
    ]);
    expect(hits).toHaveLength(1);
  });

  test("a Grep scoped to the live dir is flagged", () => {
    const hits = scan([
      { name: "Grep", args: { pattern: "x", path: LIVE_SKILL }, ordinal: 2 },
    ]);
    expect(hits).toHaveLength(1);
    expect(hits[0].tool).toBe("Grep");
  });

  test("a Bash command referencing the live dir relatively is flagged", () => {
    const hits = scan([
      {
        name: "Bash",
        args: { command: "cat skills/mr-review/SKILL.md" },
        ordinal: 3,
      },
    ]);
    expect(hits).toHaveLength(1);
    const [hit] = hits;
    expect(hit.tool).toBe("Bash");
    expect(hit.command).toBe("cat skills/mr-review/SKILL.md");
  });

  test("a Bash command referencing the live dir absolutely is flagged", () => {
    const hits = scan([
      {
        name: "Bash",
        args: { command: `grep -r trigger ${LIVE_SKILL}/` },
        ordinal: 0,
      },
    ]);
    expect(hits).toHaveLength(1);
  });

  test("a Bash command referencing a staged copy under .claude/skills is not flagged", () => {
    // --stage-name can stage under the skill's natural name; that path contains
    // `skills/<name>` but lives under `.claude/`, so it must not match.
    const hits = scan([
      {
        name: "Bash",
        args: { command: "cat .claude/skills/mr-review/SKILL.md" },
        ordinal: 0,
      },
    ]);
    expect(hits).toHaveLength(0);
  });

  test("unrelated reads and commands are not flagged", () => {
    const hits = scan([
      {
        name: "Read",
        args: { file_path: join(OUTPUTS, "x.md") },
        ordinal: 0,
      },
      { name: "Bash", args: { command: "ls skills-workspace" }, ordinal: 1 },
      { name: "Write", args: { file_path: LIVE_SKILL_MD }, ordinal: 2 },
    ]);
    // Write tools are detectStrayWrites' jurisdiction — this check is reads only.
    expect(hits).toHaveLength(0);
  });
});
281
-
282
/**
 * End-to-end test of the detect-stray-writes CLI: builds a one-run iteration
 * on disk, spawns the script with Bun, and checks the `stray-writes.json`
 * report it writes.
 */
describe("detect-stray-writes CLI", () => {
  const SCRIPT = join(import.meta.dir, "detect-stray-writes.ts");
  // realpath: the spawned CLI sees its cwd resolved (macOS /var → /private/var),
  // so fixture paths must match that form for prefix checks to line up.
  const FIXTURE_ROOT = join(
    realpathSync(tmpdir()),
    `slow-powers-detect-stray-test-${process.pid}`,
  );

  /** Serialize `value` as single-line JSON plus a trailing newline. */
  const writeJson = (path: string, value: unknown): void => {
    writeFileSync(path, `${JSON.stringify(value)}\n`);
  };

  beforeAll(() => {
    mkdirSync(FIXTURE_ROOT, { recursive: true });
  });

  afterAll(() => {
    rmSync(FIXTURE_ROOT, { recursive: true, force: true });
  });

  test("reports live-source reads per run in stray-writes.json", () => {
    // Live skill fixture: <root>/skill-dir/mr-review/SKILL.md
    const root = join(FIXTURE_ROOT, "cli-live-reads");
    const skillDir = join(root, "skill-dir");
    const skillSub = join(skillDir, "mr-review");
    const skillMd = join(skillSub, "SKILL.md");
    mkdirSync(skillSub, { recursive: true });
    writeFileSync(
      skillMd,
      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
    );

    // Workspace fixture: one eval (e1) with a single recorded old_skill run.
    const cwd = join(root, "work");
    const iterationDir = join(
      cwd,
      "skills-workspace",
      "mr-review",
      "iteration-1",
    );
    const condDir = join(iterationDir, "eval-e1", "old_skill");
    mkdirSync(condDir, { recursive: true });

    writeJson(join(iterationDir, "conditions.json"), {
      mode: "revision",
      conditions: [
        { name: "old_skill", skill_path: skillMd },
        { name: "new_skill", skill_path: skillMd },
      ],
      timestamp: new Date().toISOString(),
      harness: "claude-code",
    });

    // The run reads the live SKILL.md (expected finding) and writes inside its
    // own outputs dir (expected clean).
    writeJson(join(condDir, "run.json"), {
      eval_id: "e1",
      condition: "old_skill",
      skill_path: skillMd,
      prompt: "do the task",
      files: [],
      final_message: "done",
      tool_invocations: [
        { name: "Read", args: { file_path: skillMd }, ordinal: 0 },
        {
          name: "Write",
          args: { file_path: join(condDir, "outputs", "answer.md") },
          ordinal: 1,
        },
      ],
    });

    const cliCommand = [
      "bun",
      "run",
      SCRIPT,
      "--skill-dir",
      skillDir,
      "--skill",
      "mr-review",
      "--iteration",
      "1",
    ];
    const res = Bun.spawnSync(cliCommand, {
      cwd,
      stdout: "pipe",
      stderr: "pipe",
    });
    expect(res.exitCode).toBe(0);

    type Report = {
      totals: {
        violations: number;
        warnings: number;
        live_source_reads: number;
      };
      runs: Array<{
        eval_id: string;
        condition: string;
        live_source_reads: Array<{ tool: string; path?: string }>;
      }>;
    };
    const reportPath = join(iterationDir, "stray-writes.json");
    const report = JSON.parse(readFileSync(reportPath, "utf8")) as Report;

    expect(report.totals.live_source_reads).toBe(1);
    expect(report.totals.violations).toBe(0);
    expect(report.runs).toHaveLength(1);
    const [onlyRun] = report.runs;
    expect(onlyRun).toMatchObject({
      eval_id: "e1",
      condition: "old_skill",
    });
    expect(onlyRun.live_source_reads[0]).toMatchObject({
      tool: "Read",
      path: skillMd,
    });
  });
});
@@ -1,288 +0,0 @@
1
- #!/usr/bin/env bun
2
- import { existsSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
3
- import { join, relative, resolve } from "node:path";
4
- import { detectRunContext } from "./context";
5
- import { classifyBash, isUnder, pathArg, WRITE_TOOLS } from "./sandbox-policy";
6
- import type { ConditionsRecord, RunRecord, ToolInvocation } from "./types";
7
- import { validateAgainstSchema } from "./validate-schema";
8
-
9
/**
 * Report a fatal CLI error and stop.
 *
 * Writes `error: <msg>` to stderr, then exits the process with status 1.
 * Typed `never` so callers can use it as a narrowing guard.
 */
function die(msg: string): never {
  const line = `error: ${msg}`;
  console.error(line);
  return process.exit(1);
}
13
-
14
/** One flagged tool invocation, as reported by the detectors in this module. */
export type StrayFinding = {
  /** Name of the tool that was invoked (e.g. "Write", "Read", "Bash"). */
  tool: string;
  /** Target path argument; set on findings produced from file tools. */
  path?: string;
  /** Full command text; set on findings produced from Bash invocations. */
  command?: string;
  /** The invocation's `ordinal`, copied through unchanged. */
  ordinal: number;
  /** Human-readable explanation of why the invocation was flagged. */
  reason: string;
};
21
-
22
/** Result of `detectStrayWrites` for a single run. */
export type RunFindings = {
  /** File-write tool calls whose target is outside the run's outputs dir. */
  violations: StrayFinding[];
  /** Bash commands for which `classifyBash` returned a reason. */
  warnings: StrayFinding[];
};
26
-
27
/**
 * Sort a run's tool invocations into sandbox violations and warnings.
 *
 * - `violations`: calls to a tool in `WRITE_TOOLS` (Write/Edit/MultiEdit/
 *   NotebookEdit) whose path argument is not under `outputsDir`. High
 *   confidence — a run that edits the real repo is a tainted data point.
 * - `warnings`: Bash commands for which `classifyBash` reports a reason when
 *   given `outputsDir` as the only allowed location. Heuristic — review
 *   before trusting.
 *
 * Relative paths resolve against `repoRoot` (the subagent's working dir);
 * Claude Code's write tools use absolute paths, so this is a best-effort
 * fallback only.
 *
 * @param invocations Tool calls recorded for the run, in any order.
 * @param outputsDir  The only directory the run is allowed to write to.
 * @param repoRoot    Base directory for resolving relative path arguments.
 * @returns The violations and warnings, each in invocation order.
 */
export function detectStrayWrites(
  invocations: Array<Pick<ToolInvocation, "name" | "args" | "ordinal">>,
  outputsDir: string,
  repoRoot: string,
): RunFindings {
  const result: RunFindings = { violations: [], warnings: [] };

  for (const { name, args, ordinal } of invocations) {
    if (WRITE_TOOLS.has(name)) {
      const target = pathArg(args);
      // A write tool with no path argument, or one inside the sandbox, is clean.
      if (!target || isUnder(target, outputsDir, repoRoot)) continue;
      result.violations.push({
        tool: name,
        path: target,
        ordinal,
        reason: "writes outside the run's outputs dir",
      });
      continue;
    }

    if (name !== "Bash") continue;

    const bashArgs = args as { command?: unknown } | undefined;
    // A missing or non-string command is classified as the empty string.
    const command =
      typeof bashArgs?.command === "string" ? bashArgs.command : "";
    const reason = classifyBash(command, [outputsDir]);
    if (reason) {
      result.warnings.push({ tool: "Bash", command, ordinal, reason });
    }
  }

  return result;
}
73
-
74
/**
 * Read-only tools that carry a target path argument (see `pathArg`).
 * detectLiveSourceReads checks only these tools (plus Bash) for live reads.
 */
const READ_TOOLS = new Set(["Read", "Glob", "Grep"]);

/** `reason` text attached to every finding from detectLiveSourceReads. */
const LIVE_SOURCE_REASON =
  "reads the live skill source instead of its staged copy — the arm may be contaminated";
79
-
80
/**
 * Escape every regular-expression metacharacter in `s` so the result can be
 * interpolated into a `RegExp` source and match `s` literally.
 *
 * @param s Arbitrary text.
 * @returns `s` with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed by a backslash.
 */
function escapeRegExp(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, (metaChar) => `\\${metaChar}`);
}
83
-
84
/**
 * Flag tool invocations that read the **live** skill-under-test directory.
 *
 * Eval subagents are only ever meant to see the *staged* copy of the skill
 * (`.claude/skills/<slug>/`, or the inlined SKILL.md under `--no-stage`). A
 * read of the live source typically means the Skill tool couldn't resolve the
 * staged slug yet (mid-session registry refresh race) and the agent improvised
 * — fatal in revision mode, where the old_skill arm then reads new-skill
 * content. Reads are detected, not blocked: the guard stays read-permissive,
 * so this surfaces post-hoc as a validity warning.
 *
 * - Read-tool calls (Read/Glob/Grep) whose path arg resolves under the live
 *   dir are flagged; relative paths resolve against `repoRoot`.
 * - Bash commands that reference the live dir (absolute, or repo-relative
 *   text) are flagged. A staged copy under `.claude/skills/` can carry the
 *   same `skills/<name>` relative text (e.g. via `--stage-name`), so that
 *   prefix is excluded.
 *
 * The repo-relative text match is skipped when it cannot be meaningful: when
 * the live dir lies outside `repoRoot`, and when it *is* `repoRoot` (the
 * relative path is then empty and would match almost any command).
 *
 * @param invocations  Tool calls recorded for the run.
 * @param liveSkillDir Directory of the live skill source; resolved to absolute.
 * @param repoRoot     Base for relative path args and the repo-relative match.
 * @returns One finding per offending invocation, in invocation order.
 */
export function detectLiveSourceReads(
  invocations: Array<Pick<ToolInvocation, "name" | "args" | "ordinal">>,
  liveSkillDir: string,
  repoRoot: string,
): StrayFinding[] {
  const findings: StrayFinding[] = [];
  const liveDir = resolve(liveSkillDir);
  const rel = relative(repoRoot, liveDir);

  // Only a `..` path *segment* means "outside the repo"; a directory whose
  // name merely begins with two dots (e.g. `..cache`) is still inside it.
  const outsideRepo = /^\.\.(?:[\\/]|$)/.test(rel);
  // `relative` yields "" when liveDir === repoRoot. Interpolating that would
  // leave a pattern of just the two boundary groups, which matches nearly
  // every command — so no repo-relative pattern is built in that case.
  const relRe =
    rel === "" || outsideRepo
      ? null
      : new RegExp(
          // The lookbehind fires at the boundary char itself, so it checks for a
          // bare `.claude` — the `/` is consumed by the boundary group.
          `(?<!\\.claude)(^|[\\s'"=:(/])${escapeRegExp(rel)}(/|[\\s'")]|$)`,
        );

  for (const inv of invocations) {
    if (READ_TOOLS.has(inv.name)) {
      const p = pathArg(inv.args);
      if (p && isUnder(p, liveDir, repoRoot)) {
        findings.push({
          tool: inv.name,
          path: p,
          ordinal: inv.ordinal,
          reason: LIVE_SOURCE_REASON,
        });
      }
      continue;
    }

    if (inv.name === "Bash") {
      const args = inv.args as { command?: unknown } | undefined;
      // A missing or non-string command is treated as the empty string.
      const command = typeof args?.command === "string" ? args.command : "";
      // The absolute check is a plain substring match on purpose: for a
      // contamination detector, over-flagging is the safer failure mode.
      if (command.includes(liveDir) || relRe?.test(command)) {
        findings.push({
          tool: "Bash",
          command,
          ordinal: inv.ordinal,
          reason: LIVE_SOURCE_REASON,
        });
      }
    }
  }

  return findings;
}
148
-
149
// CLI entry point. Scans every recorded run of one iteration, writes
// `<iterationDir>/stray-writes.json`, and prints a per-finding summary.
// Requires `--iteration <n>`; the remaining arguments are interpreted by
// `detectRunContext`.
if (import.meta.main) {
  const cliArgs = Bun.argv.slice(2);

  // The iteration value is whatever token follows `--iteration`.
  const iterationFlagAt = cliArgs.indexOf("--iteration");
  const iteration =
    iterationFlagAt === -1 ? undefined : cliArgs[iterationFlagAt + 1];
  if (!iteration) die("missing --iteration");
  const ctx = detectRunContext(cliArgs);

  const iterationDir = join(
    ctx.workspaceRoot,
    ctx.skillName,
    `iteration-${iteration}`,
  );
  if (!existsSync(iterationDir)) die(`not found: ${iterationDir}`);

  const conditionsPath = join(iterationDir, "conditions.json");
  if (!existsSync(conditionsPath)) die(`missing: ${conditionsPath}`);
  const conditions: ConditionsRecord = JSON.parse(
    readFileSync(conditionsPath, "utf8"),
  );
  const conditionNames = conditions.conditions.map(({ name }) => name);

  type DispatchTask = {
    eval_id: string;
    condition: string;
    outputs_dir?: string;
  };
  const runKey = (evalId: string, condition: string): string =>
    `${evalId}:${condition}`;

  // dispatch.json carries the authoritative outputs_dir per task; fall back to
  // the conventional <condDir>/outputs when it's absent (hand-authored runs).
  const outputsByKey = new Map<string, string>();
  const dispatchPath = join(iterationDir, "dispatch.json");
  if (existsSync(dispatchPath)) {
    try {
      const parsed = JSON.parse(readFileSync(dispatchPath, "utf8")) as {
        tasks?: DispatchTask[];
      };
      for (const task of parsed.tasks ?? []) {
        if (!task.outputs_dir) continue;
        outputsByKey.set(runKey(task.eval_id, task.condition), task.outputs_dir);
      }
    } catch {
      // fall through to convention
    }
  }

  const repoRoot = process.cwd();
  const evalDirs = readdirSync(iterationDir).filter((entry) =>
    entry.startsWith("eval-"),
  );

  type RunReport = {
    eval_id: string;
    condition: string;
    violations: StrayFinding[];
    warnings: StrayFinding[];
    live_source_reads: StrayFinding[];
  };
  // Only runs with at least one finding are listed; totals count every run.
  const runs: RunReport[] = [];
  const totals = { violations: 0, warnings: 0, live_source_reads: 0 };

  for (const evalDir of evalDirs) {
    const evalId = evalDir.replace(/^eval-/, "");
    for (const condition of conditionNames) {
      const condDir = join(iterationDir, evalDir, condition);
      const runPath = join(condDir, "run.json");
      // A condition with no recorded run for this eval is skipped.
      if (!existsSync(runPath)) continue;

      const run = validateAgainstSchema<RunRecord>(
        "run-record",
        JSON.parse(readFileSync(runPath, "utf8")),
        runPath,
      );
      const invocations = Array.isArray(run.tool_invocations)
        ? run.tool_invocations
        : [];
      const outputsDir =
        outputsByKey.get(runKey(evalId, condition)) ?? join(condDir, "outputs");

      const { violations, warnings } = detectStrayWrites(
        invocations,
        outputsDir,
        repoRoot,
      );
      const liveReads = detectLiveSourceReads(
        invocations,
        ctx.skillSubdir,
        repoRoot,
      );

      const hasFindings =
        violations.length > 0 || warnings.length > 0 || liveReads.length > 0;
      if (hasFindings) {
        runs.push({
          eval_id: evalId,
          condition,
          violations,
          warnings,
          live_source_reads: liveReads,
        });
      }
      totals.violations += violations.length;
      totals.warnings += warnings.length;
      totals.live_source_reads += liveReads.length;
    }
  }

  const report = {
    generated: new Date().toISOString(),
    iteration: Number(iteration),
    totals,
    runs,
  };
  const outPath = join(iterationDir, "stray-writes.json");
  // Validate before writing, so validation runs before the report file is created.
  validateAgainstSchema("stray-writes", report, outPath);
  writeFileSync(outPath, `${JSON.stringify(report, null, 2)}\n`);
  console.log(`Wrote ${outPath}`);

  for (const entry of runs) {
    const where = `${entry.eval_id}/${entry.condition}`;
    for (const v of entry.violations) {
      console.warn(
        `✗ ${where}: ${v.tool} wrote outside outputs dir → ${v.path} (ordinal ${v.ordinal})`,
      );
    }
    for (const w of entry.warnings) {
      console.warn(
        `⚠ ${where}: Bash ${w.reason} (ordinal ${w.ordinal}): ${w.command}`,
      );
    }
    for (const l of entry.live_source_reads) {
      console.warn(
        `⚠ ${where}: ${l.tool} read the live skill source (ordinal ${l.ordinal}): ${l.path ?? l.command}`,
      );
    }
  }

  const allClean =
    totals.violations === 0 &&
    totals.warnings === 0 &&
    totals.live_source_reads === 0;
  if (allClean) {
    console.log("✓ No out-of-bounds writes or live-source reads detected.");
  } else {
    console.warn(
      `\n${totals.violations} violation(s), ${totals.warnings} warning(s), ${totals.live_source_reads} live-source read(s). Runs with violations edited files outside their sandbox; runs with live-source reads saw the live skill instead of their staged copy — treat those data points as tainted.`,
    );
  }
}