@slowdini/slow-powers-opencode 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/README.md +37 -65
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -13
  5. package/skills/evaluating-skills/SKILL.md +91 -337
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/verifying-development-work/SKILL.md +17 -6
  17. package/skills/verifying-development-work/code-review.md +68 -0
  18. package/skills/verifying-development-work/comment-review.md +85 -0
  19. package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
  20. package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
  21. package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
  22. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
  23. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
  24. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
  25. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
  26. package/skills/verifying-development-work/evals/evals.json +34 -2
  27. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
  28. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
  29. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  30. package/skills/evaluating-skills/harness-details/claude.md +0 -158
  31. package/skills/evaluating-skills/runner/README.md +0 -154
  32. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  33. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  34. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -263
  35. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -146
  36. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -264
  37. package/skills/evaluating-skills/runner/aggregate.ts +0 -248
  38. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  39. package/skills/evaluating-skills/runner/context.ts +0 -90
  40. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -103
  41. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -192
  42. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  43. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  44. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  45. package/skills/evaluating-skills/runner/grade.ts +0 -603
  46. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  47. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  48. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  49. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -71
  50. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  51. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  52. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  53. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  54. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -230
  55. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -186
  56. package/skills/evaluating-skills/runner/run.test.ts +0 -1180
  57. package/skills/evaluating-skills/runner/run.ts +0 -1029
  58. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -74
  59. package/skills/evaluating-skills/runner/types.ts +0 -112
  60. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  61. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  62. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  63. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  64. package/skills/evaluating-skills/runner/validate.ts +0 -21
  65. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  66. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  67. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  68. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -68
  69. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -67
  70. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  71. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  72. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
  73. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  74. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  75. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
  76. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  77. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  78. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  79. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
  80. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
  81. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
  82. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
@@ -1,263 +0,0 @@ package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts
1
- import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
- import { mkdirSync, rmSync, utimesSync, writeFileSync } from "node:fs";
3
- import { tmpdir } from "node:os";
4
- import { join } from "node:path";
5
- import {
6
- findByDescription,
7
- listSubagents,
8
- parseTranscript,
9
- } from "./claude-code-transcript";
10
-
11
- const FIXTURE_ROOT = join(tmpdir(), `claude-code-adapter-test-${process.pid}`);
12
-
13
- function jsonl(lines: object[]): string {
14
- return `${lines.map((l) => JSON.stringify(l)).join("\n")}\n`;
15
- }
16
-
17
- beforeAll(() => {
18
- mkdirSync(FIXTURE_ROOT, { recursive: true });
19
- });
20
-
21
- afterAll(() => {
22
- rmSync(FIXTURE_ROOT, { recursive: true, force: true });
23
- });
24
-
25
- describe("parseTranscript", () => {
26
- test("extracts tool_use blocks from assistant messages with ordinal and args", () => {
27
- const path = join(FIXTURE_ROOT, "simple.jsonl");
28
- writeFileSync(
29
- path,
30
- jsonl([
31
- {
32
- type: "user",
33
- message: { role: "user", content: "Run the tests" },
34
- },
35
- {
36
- type: "assistant",
37
- message: {
38
- role: "assistant",
39
- content: [
40
- { type: "text", text: "Running tests now." },
41
- {
42
- type: "tool_use",
43
- id: "toolu_001",
44
- name: "Bash",
45
- input: { command: "bun test" },
46
- },
47
- ],
48
- },
49
- },
50
- {
51
- type: "user",
52
- message: {
53
- role: "user",
54
- content: [
55
- {
56
- type: "tool_result",
57
- tool_use_id: "toolu_001",
58
- content: "2 pass\n0 fail",
59
- },
60
- ],
61
- },
62
- },
63
- {
64
- type: "assistant",
65
- message: {
66
- role: "assistant",
67
- content: [
68
- {
69
- type: "tool_use",
70
- id: "toolu_002",
71
- name: "Read",
72
- input: { file_path: "/tmp/x.txt" },
73
- },
74
- ],
75
- },
76
- },
77
- ]),
78
- );
79
-
80
- const result = parseTranscript(path);
81
- expect(result).toHaveLength(2);
82
- expect(result[0]).toMatchObject({
83
- name: "Bash",
84
- ordinal: 0,
85
- args: { command: "bun test" },
86
- result: "2 pass\n0 fail",
87
- });
88
- expect(result[1]).toMatchObject({
89
- name: "Read",
90
- ordinal: 1,
91
- args: { file_path: "/tmp/x.txt" },
92
- });
93
- expect(result[1].result).toBeUndefined();
94
- });
95
-
96
- test("returns empty array when no tool_use blocks present", () => {
97
- const path = join(FIXTURE_ROOT, "no-tools.jsonl");
98
- writeFileSync(
99
- path,
100
- jsonl([
101
- { type: "user", message: { role: "user", content: "hi" } },
102
- {
103
- type: "assistant",
104
- message: {
105
- role: "assistant",
106
- content: [{ type: "text", text: "hello" }],
107
- },
108
- },
109
- ]),
110
- );
111
- expect(parseTranscript(path)).toEqual([]);
112
- });
113
-
114
- test("skips malformed JSONL lines without throwing", () => {
115
- const path = join(FIXTURE_ROOT, "malformed.jsonl");
116
- writeFileSync(
117
- path,
118
- [
119
- JSON.stringify({
120
- type: "assistant",
121
- message: {
122
- role: "assistant",
123
- content: [
124
- {
125
- type: "tool_use",
126
- id: "toolu_a",
127
- name: "Bash",
128
- input: { command: "ls" },
129
- },
130
- ],
131
- },
132
- }),
133
- "not valid json",
134
- JSON.stringify({
135
- type: "assistant",
136
- message: {
137
- role: "assistant",
138
- content: [
139
- {
140
- type: "tool_use",
141
- id: "toolu_b",
142
- name: "Read",
143
- input: { file_path: "/tmp" },
144
- },
145
- ],
146
- },
147
- }),
148
- "",
149
- ].join("\n"),
150
- );
151
- const result = parseTranscript(path);
152
- expect(result).toHaveLength(2);
153
- expect(result.map((r) => r.name)).toEqual(["Bash", "Read"]);
154
- });
155
-
156
- test("handles tool_result with array content", () => {
157
- const path = join(FIXTURE_ROOT, "array-result.jsonl");
158
- writeFileSync(
159
- path,
160
- jsonl([
161
- {
162
- type: "assistant",
163
- message: {
164
- role: "assistant",
165
- content: [
166
- {
167
- type: "tool_use",
168
- id: "toolu_x",
169
- name: "Bash",
170
- input: { command: "echo hi" },
171
- },
172
- ],
173
- },
174
- },
175
- {
176
- type: "user",
177
- message: {
178
- role: "user",
179
- content: [
180
- {
181
- type: "tool_result",
182
- tool_use_id: "toolu_x",
183
- content: [{ type: "text", text: "hi" }],
184
- },
185
- ],
186
- },
187
- },
188
- ]),
189
- );
190
- const result = parseTranscript(path);
191
- expect(result).toHaveLength(1);
192
- expect(result[0].result).toBe("hi");
193
- });
194
- });
195
-
196
- describe("listSubagents / findByDescription", () => {
197
- test("matches subagents by meta description", () => {
198
- const dir = join(FIXTURE_ROOT, "subagents");
199
- mkdirSync(dir, { recursive: true });
200
-
201
- writeFileSync(
202
- join(dir, "agent-aaa111.meta.json"),
203
- JSON.stringify({
204
- agentType: "general-purpose",
205
- description: "claim-without-running:with_skill",
206
- toolUseId: "toolu_p1",
207
- }),
208
- );
209
- writeFileSync(join(dir, "agent-aaa111.jsonl"), "");
210
-
211
- writeFileSync(
212
- join(dir, "agent-bbb222.meta.json"),
213
- JSON.stringify({
214
- agentType: "general-purpose",
215
- description: "claim-without-running:without_skill",
216
- toolUseId: "toolu_p2",
217
- }),
218
- );
219
- writeFileSync(join(dir, "agent-bbb222.jsonl"), "");
220
-
221
- expect(listSubagents(dir)).toHaveLength(2);
222
-
223
- const match = findByDescription(dir, "claim-without-running:with_skill");
224
- expect(match).not.toBeNull();
225
- expect(match?.meta.toolUseId).toBe("toolu_p1");
226
-
227
- const miss = findByDescription(dir, "no-such-eval:with_skill");
228
- expect(miss).toBeNull();
229
- });
230
-
231
- test("returns null when subagents dir does not exist", () => {
232
- expect(listSubagents(join(FIXTURE_ROOT, "does-not-exist"))).toEqual([]);
233
- expect(
234
- findByDescription(join(FIXTURE_ROOT, "does-not-exist"), "x"),
235
- ).toBeNull();
236
- });
237
-
238
- test("on duplicate descriptions, returns the most-recently-written transcript", () => {
239
- const dir = join(FIXTURE_ROOT, "dup-subagents");
240
- mkdirSync(dir, { recursive: true });
241
-
242
- // Older agent for this description.
243
- writeFileSync(
244
- join(dir, "agent-old.meta.json"),
245
- JSON.stringify({ description: "dup:with_skill", toolUseId: "toolu_old" }),
246
- );
247
- writeFileSync(join(dir, "agent-old.jsonl"), "");
248
- const old = new Date(Date.now() - 60_000);
249
- utimesSync(join(dir, "agent-old.jsonl"), old, old);
250
-
251
- // Newer agent with the same description (e.g. a retry within the same run).
252
- writeFileSync(
253
- join(dir, "agent-new.meta.json"),
254
- JSON.stringify({ description: "dup:with_skill", toolUseId: "toolu_new" }),
255
- );
256
- writeFileSync(join(dir, "agent-new.jsonl"), "");
257
- const recent = new Date();
258
- utimesSync(join(dir, "agent-new.jsonl"), recent, recent);
259
-
260
- const match = findByDescription(dir, "dup:with_skill");
261
- expect(match?.meta.toolUseId).toBe("toolu_new");
262
- });
263
- });
@@ -1,146 +0,0 @@ package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts
1
- import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
2
- import { join } from "node:path";
3
- import type { ToolInvocation } from "../types";
4
-
5
- type ToolUseBlock = {
6
- type: "tool_use";
7
- id: string;
8
- name: string;
9
- input: unknown;
10
- };
11
-
12
- type ToolResultBlock = {
13
- type: "tool_result";
14
- tool_use_id: string;
15
- content: string | unknown[];
16
- };
17
-
18
- type ContentBlock = ToolUseBlock | ToolResultBlock | { type: string };
19
-
20
- type TranscriptRecord = {
21
- type: "user" | "assistant" | string;
22
- message?: {
23
- role?: string;
24
- content?: string | ContentBlock[];
25
- };
26
- };
27
-
28
- function flattenContent(
29
- content: string | ContentBlock[] | undefined,
30
- ): ContentBlock[] {
31
- if (!content) return [];
32
- if (typeof content === "string") return [];
33
- return content;
34
- }
35
-
36
- function stringifyResult(content: ToolResultBlock["content"]): string {
37
- if (typeof content === "string") return content;
38
- if (Array.isArray(content))
39
- return content
40
- .map((c) => {
41
- if (typeof c === "string") return c;
42
- if (c && typeof c === "object" && "text" in c)
43
- return String((c as { text: unknown }).text);
44
- return JSON.stringify(c);
45
- })
46
- .join("\n");
47
- return JSON.stringify(content);
48
- }
49
-
50
- export function parseTranscript(jsonlPath: string): ToolInvocation[] {
51
- const raw = readFileSync(jsonlPath, "utf8");
52
- const lines = raw.split("\n").filter((l) => l.length > 0);
53
-
54
- const invocations: ToolInvocation[] = [];
55
- const indexById = new Map<string, number>();
56
-
57
- for (const line of lines) {
58
- let record: TranscriptRecord;
59
- try {
60
- record = JSON.parse(line) as TranscriptRecord;
61
- } catch {
62
- continue;
63
- }
64
-
65
- const blocks = flattenContent(record.message?.content);
66
-
67
- if (record.type === "assistant") {
68
- for (const block of blocks) {
69
- if (block.type !== "tool_use") continue;
70
- const tu = block as ToolUseBlock;
71
- const ordinal = invocations.length;
72
- indexById.set(tu.id, ordinal);
73
- invocations.push({
74
- name: tu.name,
75
- args: tu.input,
76
- ordinal,
77
- });
78
- }
79
- continue;
80
- }
81
-
82
- if (record.type === "user") {
83
- for (const block of blocks) {
84
- if (block.type !== "tool_result") continue;
85
- const tr = block as ToolResultBlock;
86
- const idx = indexById.get(tr.tool_use_id);
87
- if (idx === undefined) continue;
88
- invocations[idx].result = stringifyResult(tr.content);
89
- }
90
- }
91
- }
92
-
93
- return invocations;
94
- }
95
-
96
- export type SubagentMeta = {
97
- agentType?: string;
98
- description?: string;
99
- toolUseId?: string;
100
- };
101
-
102
- export type SubagentEntry = {
103
- jsonlPath: string;
104
- metaPath: string;
105
- meta: SubagentMeta;
106
- };
107
-
108
- export function listSubagents(subagentsDir: string): SubagentEntry[] {
109
- if (!existsSync(subagentsDir)) return [];
110
- const files = readdirSync(subagentsDir);
111
- const out: SubagentEntry[] = [];
112
- for (const f of files) {
113
- if (!f.endsWith(".meta.json")) continue;
114
- const base = f.slice(0, -".meta.json".length);
115
- const metaPath = join(subagentsDir, f);
116
- const jsonlPath = join(subagentsDir, `${base}.jsonl`);
117
- if (!existsSync(jsonlPath)) continue;
118
- try {
119
- const meta = JSON.parse(readFileSync(metaPath, "utf8")) as SubagentMeta;
120
- out.push({ jsonlPath, metaPath, meta });
121
- } catch {}
122
- }
123
- return out;
124
- }
125
-
126
- export function findByDescription(
127
- subagentsDir: string,
128
- description: string,
129
- ): SubagentEntry | null {
130
- const entries = listSubagents(subagentsDir);
131
- const matches = entries.filter((e) => e.meta.description === description);
132
- if (matches.length === 0) return null;
133
- if (matches.length === 1) return matches[0];
134
-
135
- // Descriptions are namespaced per iteration+run (see run.ts), so duplicates
136
- // here mean a retry within the same run. Prefer the most-recently-written
137
- // transcript; readdir order is not chronological.
138
- matches.sort((a, b) => {
139
- try {
140
- return statSync(b.jsonlPath).mtimeMs - statSync(a.jsonlPath).mtimeMs;
141
- } catch {
142
- return 0;
143
- }
144
- });
145
- return matches[0];
146
- }
@@ -1,264 +0,0 @@ package/skills/evaluating-skills/runner/aggregate.test.ts
1
- import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
- import {
3
- existsSync,
4
- mkdirSync,
5
- readFileSync,
6
- rmSync,
7
- writeFileSync,
8
- } from "node:fs";
9
- import { tmpdir } from "node:os";
10
- import { join } from "node:path";
11
-
12
- const FIXTURE_ROOT = join(
13
- tmpdir(),
14
- `slow-powers-aggregate-test-${process.pid}`,
15
- );
16
- const AGGREGATE_TS = join(import.meta.dir, "aggregate.ts");
17
-
18
- beforeAll(() => {
19
- mkdirSync(FIXTURE_ROOT, { recursive: true });
20
- });
21
-
22
- afterAll(() => {
23
- rmSync(FIXTURE_ROOT, { recursive: true, force: true });
24
- });
25
-
26
- function writeJson(path: string, value: unknown) {
27
- writeFileSync(path, `${JSON.stringify(value, null, 2)}\n`);
28
- }
29
-
30
- describe("aggregate.ts user-mode (--skill-dir, isolated CWD)", () => {
31
- test("computes benchmark.json from a hand-built graded workspace under CWD", () => {
32
- const root = join(FIXTURE_ROOT, "agg-basic");
33
- // Skill dir + skill-under-test (detectRunContext validates SKILL.md exists)
34
- const skillDir = join(root, "skill-dir");
35
- const skillSub = join(skillDir, "mr-review");
36
- mkdirSync(skillSub, { recursive: true });
37
- writeFileSync(
38
- join(skillSub, "SKILL.md"),
39
- "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
40
- );
41
-
42
- // Working dir that holds the workspace (mirrors stageRoot/workspaceRoot = CWD)
43
- const cwd = join(root, "work");
44
- const iterationDir = join(
45
- cwd,
46
- "skills-workspace",
47
- "mr-review",
48
- "iteration-1",
49
- );
50
- mkdirSync(iterationDir, { recursive: true });
51
- writeJson(join(iterationDir, "conditions.json"), {
52
- mode: "new-skill",
53
- conditions: [
54
- { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
55
- { name: "without_skill", skill_path: null },
56
- ],
57
- timestamp: new Date().toISOString(),
58
- harness: "claude-code",
59
- });
60
-
61
- const mkCond = (cond: string, passRate: number, tokens: number) => {
62
- const condDir = join(iterationDir, "eval-e1", cond);
63
- mkdirSync(condDir, { recursive: true });
64
- writeJson(join(condDir, "grading.json"), {
65
- assertion_results: [],
66
- summary: { passed: 1, failed: 0, total: 1, pass_rate: passRate },
67
- });
68
- writeJson(join(condDir, "timing.json"), {
69
- total_tokens: tokens,
70
- duration_ms: 1000,
71
- });
72
- };
73
- mkCond("with_skill", 1, 5000);
74
- mkCond("without_skill", 0, 3000);
75
-
76
- const res = Bun.spawnSync(
77
- [
78
- "bun",
79
- "run",
80
- AGGREGATE_TS,
81
- "--skill-dir",
82
- skillDir,
83
- "--skill",
84
- "mr-review",
85
- "--iteration",
86
- "1",
87
- ],
88
- { cwd, stdout: "pipe", stderr: "pipe" },
89
- );
90
- expect(res.exitCode).toBe(0);
91
-
92
- const benchmarkPath = join(iterationDir, "benchmark.json");
93
- expect(existsSync(benchmarkPath)).toBe(true);
94
- const benchmark = JSON.parse(readFileSync(benchmarkPath, "utf8")) as {
95
- delta: { pass_rate: number; total_tokens: number };
96
- run_summary: Record<string, { pass_rate: { mean: number } }>;
97
- };
98
- expect(benchmark.run_summary.with_skill.pass_rate.mean).toBe(1);
99
- expect(benchmark.run_summary.without_skill.pass_rate.mean).toBe(0);
100
- expect(benchmark.delta.pass_rate).toBe(1);
101
- expect(benchmark.delta.total_tokens).toBe(2000);
102
- });
103
-
104
- test("surfaces stray-writes violations as validity_warnings", () => {
105
- const root = join(FIXTURE_ROOT, "agg-stray");
106
- const skillDir = join(root, "skill-dir");
107
- const skillSub = join(skillDir, "mr-review");
108
- mkdirSync(skillSub, { recursive: true });
109
- writeFileSync(
110
- join(skillSub, "SKILL.md"),
111
- "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
112
- );
113
-
114
- const cwd = join(root, "work");
115
- const iterationDir = join(
116
- cwd,
117
- "skills-workspace",
118
- "mr-review",
119
- "iteration-1",
120
- );
121
- mkdirSync(iterationDir, { recursive: true });
122
- writeJson(join(iterationDir, "conditions.json"), {
123
- mode: "new-skill",
124
- conditions: [
125
- { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
126
- { name: "without_skill", skill_path: null },
127
- ],
128
- timestamp: new Date().toISOString(),
129
- harness: "claude-code",
130
- });
131
- for (const cond of ["with_skill", "without_skill"]) {
132
- const condDir = join(iterationDir, "eval-e1", cond);
133
- mkdirSync(condDir, { recursive: true });
134
- writeJson(join(condDir, "grading.json"), {
135
- assertion_results: [],
136
- summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
137
- });
138
- writeJson(join(condDir, "timing.json"), {
139
- total_tokens: 100,
140
- duration_ms: 1,
141
- });
142
- }
143
- writeJson(join(iterationDir, "stray-writes.json"), {
144
- generated: new Date().toISOString(),
145
- iteration: 1,
146
- totals: { violations: 1, warnings: 0 },
147
- runs: [
148
- {
149
- eval_id: "e1",
150
- condition: "with_skill",
151
- violations: [
152
- {
153
- tool: "Write",
154
- path: "/repo/runner/run.ts",
155
- ordinal: 3,
156
- reason: "x",
157
- },
158
- ],
159
- warnings: [],
160
- },
161
- ],
162
- });
163
-
164
- const res = Bun.spawnSync(
165
- [
166
- "bun",
167
- "run",
168
- AGGREGATE_TS,
169
- "--skill-dir",
170
- skillDir,
171
- "--skill",
172
- "mr-review",
173
- "--iteration",
174
- "1",
175
- ],
176
- { cwd, stdout: "pipe", stderr: "pipe" },
177
- );
178
- expect(res.exitCode).toBe(0);
179
- const benchmark = JSON.parse(
180
- readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
181
- ) as { validity_warnings: string[] };
182
- expect(
183
- benchmark.validity_warnings.some(
184
- (w) => w.includes("e1/with_skill") && w.includes("outside"),
185
- ),
186
- ).toBe(true);
187
- });
188
-
189
- test("surfaces plugin-shadow findings as validity_warnings", () => {
190
- const root = join(FIXTURE_ROOT, "agg-shadow");
191
- const skillDir = join(root, "skill-dir");
192
- const skillSub = join(skillDir, "mr-review");
193
- mkdirSync(skillSub, { recursive: true });
194
- writeFileSync(
195
- join(skillSub, "SKILL.md"),
196
- "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
197
- );
198
-
199
- const cwd = join(root, "work");
200
- const iterationDir = join(
201
- cwd,
202
- "skills-workspace",
203
- "mr-review",
204
- "iteration-1",
205
- );
206
- mkdirSync(iterationDir, { recursive: true });
207
- writeJson(join(iterationDir, "conditions.json"), {
208
- mode: "new-skill",
209
- conditions: [
210
- { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
211
- { name: "without_skill", skill_path: null },
212
- ],
213
- timestamp: new Date().toISOString(),
214
- harness: "claude-code",
215
- });
216
- for (const cond of ["with_skill", "without_skill"]) {
217
- const condDir = join(iterationDir, "eval-e1", cond);
218
- mkdirSync(condDir, { recursive: true });
219
- writeJson(join(condDir, "grading.json"), {
220
- assertion_results: [],
221
- summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
222
- });
223
- writeJson(join(condDir, "timing.json"), {
224
- total_tokens: 100,
225
- duration_ms: 1,
226
- });
227
- }
228
- writeJson(join(iterationDir, "plugin-shadow.json"), {
229
- config_dir: "/home/u/.claude",
230
- shadowed: [
231
- {
232
- kind: "plugin",
233
- plugin: "slow-powers@slowdini",
234
- skill_name: "mr-review",
235
- path: "/home/u/.claude/plugins/cache/slowdini/slow-powers/skills/mr-review",
236
- },
237
- ],
238
- });
239
-
240
- const res = Bun.spawnSync(
241
- [
242
- "bun",
243
- "run",
244
- AGGREGATE_TS,
245
- "--skill-dir",
246
- skillDir,
247
- "--skill",
248
- "mr-review",
249
- "--iteration",
250
- "1",
251
- ],
252
- { cwd, stdout: "pipe", stderr: "pipe" },
253
- );
254
- expect(res.exitCode).toBe(0);
255
- const benchmark = JSON.parse(
256
- readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
257
- ) as { validity_warnings: string[] };
258
- expect(
259
- benchmark.validity_warnings.some(
260
- (w) => w.includes("mr-review") && /contaminat/i.test(w),
261
- ),
262
- ).toBe(true);
263
- });
264
- });