@slowdini/slow-powers-opencode 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +69 -5
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/hardening-plans/SKILL.md +29 -7
  16. package/skills/hardening-plans/evals/baseline/BASELINE.md +11 -6
  17. package/skills/hardening-plans/evals/baseline/NOTES.md +72 -58
  18. package/skills/hardening-plans/evals/baseline/benchmark.json +25 -25
  19. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +2 -2
  20. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +2 -2
  21. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__new_skill.json +39 -0
  22. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__old_skill.json +39 -0
  23. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json +39 -0
  24. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json +39 -0
  25. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__new_skill.json +32 -0
  26. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__old_skill.json +32 -0
  27. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__new_skill.json +39 -0
  28. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__old_skill.json +39 -0
  29. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json +39 -0
  30. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json +39 -0
  31. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +3 -3
  32. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +8 -8
  33. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json +39 -0
  34. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json +39 -0
  35. package/skills/hardening-plans/evals/evals.json +46 -0
  36. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  37. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  38. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  39. package/skills/evaluating-skills/harness-parity.md +0 -155
  40. package/skills/evaluating-skills/runner/README.md +0 -163
  41. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  42. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  43. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  44. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  45. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  46. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  47. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  48. package/skills/evaluating-skills/runner/context.ts +0 -90
  49. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  50. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  51. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  52. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  53. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  54. package/skills/evaluating-skills/runner/grade.ts +0 -603
  55. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  56. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  57. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  58. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  59. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  60. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  61. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  62. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  63. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  64. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  65. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  66. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  67. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  68. package/skills/evaluating-skills/runner/run.ts +0 -1388
  69. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  70. package/skills/evaluating-skills/runner/types.ts +0 -121
  71. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  72. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  73. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  74. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  75. package/skills/evaluating-skills/runner/validate.ts +0 -21
  76. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  77. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  78. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  79. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  80. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  81. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  82. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  83. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  84. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  85. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,484 +0,0 @@
1
- import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
- import {
3
- existsSync,
4
- mkdirSync,
5
- readFileSync,
6
- rmSync,
7
- writeFileSync,
8
- } from "node:fs";
9
- import { tmpdir } from "node:os";
10
- import { join } from "node:path";
11
-
12
- const FIXTURE_ROOT = join(
13
- tmpdir(),
14
- `slow-powers-aggregate-test-${process.pid}`,
15
- );
16
- const AGGREGATE_TS = join(import.meta.dir, "aggregate.ts");
17
-
18
- beforeAll(() => {
19
- mkdirSync(FIXTURE_ROOT, { recursive: true });
20
- });
21
-
22
- afterAll(() => {
23
- rmSync(FIXTURE_ROOT, { recursive: true, force: true });
24
- });
25
-
26
- function writeJson(path: string, value: unknown) {
27
- writeFileSync(path, `${JSON.stringify(value, null, 2)}\n`);
28
- }
29
-
30
- describe("aggregate.ts user-mode (--skill-dir, isolated CWD)", () => {
31
- test("computes benchmark.json from a hand-built graded workspace under CWD", () => {
32
- const root = join(FIXTURE_ROOT, "agg-basic");
33
- // Skill dir + skill-under-test (detectRunContext validates SKILL.md exists)
34
- const skillDir = join(root, "skill-dir");
35
- const skillSub = join(skillDir, "mr-review");
36
- mkdirSync(skillSub, { recursive: true });
37
- writeFileSync(
38
- join(skillSub, "SKILL.md"),
39
- "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
40
- );
41
-
42
- // Working dir that holds the workspace (mirrors stageRoot/workspaceRoot = CWD)
43
- const cwd = join(root, "work");
44
- const iterationDir = join(
45
- cwd,
46
- "skills-workspace",
47
- "mr-review",
48
- "iteration-1",
49
- );
50
- mkdirSync(iterationDir, { recursive: true });
51
- writeJson(join(iterationDir, "conditions.json"), {
52
- mode: "new-skill",
53
- conditions: [
54
- { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
55
- { name: "without_skill", skill_path: null },
56
- ],
57
- timestamp: new Date().toISOString(),
58
- harness: "claude-code",
59
- });
60
-
61
- const mkCond = (cond: string, passRate: number, tokens: number) => {
62
- const condDir = join(iterationDir, "eval-e1", cond);
63
- mkdirSync(condDir, { recursive: true });
64
- writeJson(join(condDir, "grading.json"), {
65
- assertion_results: [],
66
- summary: { passed: 1, failed: 0, total: 1, pass_rate: passRate },
67
- });
68
- writeJson(join(condDir, "timing.json"), {
69
- total_tokens: tokens,
70
- duration_ms: 1000,
71
- });
72
- };
73
- mkCond("with_skill", 1, 5000);
74
- mkCond("without_skill", 0, 3000);
75
-
76
- const res = Bun.spawnSync(
77
- [
78
- "bun",
79
- "run",
80
- AGGREGATE_TS,
81
- "--skill-dir",
82
- skillDir,
83
- "--skill",
84
- "mr-review",
85
- "--iteration",
86
- "1",
87
- ],
88
- { cwd, stdout: "pipe", stderr: "pipe" },
89
- );
90
- expect(res.exitCode).toBe(0);
91
-
92
- const benchmarkPath = join(iterationDir, "benchmark.json");
93
- expect(existsSync(benchmarkPath)).toBe(true);
94
- const benchmark = JSON.parse(readFileSync(benchmarkPath, "utf8")) as {
95
- delta: { pass_rate: number; total_tokens: number };
96
- run_summary: Record<string, { pass_rate: { mean: number } }>;
97
- };
98
- expect(benchmark.run_summary.with_skill.pass_rate.mean).toBe(1);
99
- expect(benchmark.run_summary.without_skill.pass_rate.mean).toBe(0);
100
- expect(benchmark.delta.pass_rate).toBe(1);
101
- expect(benchmark.delta.total_tokens).toBe(2000);
102
- });
103
-
104
- test("surfaces stray-writes violations as validity_warnings", () => {
105
- const root = join(FIXTURE_ROOT, "agg-stray");
106
- const skillDir = join(root, "skill-dir");
107
- const skillSub = join(skillDir, "mr-review");
108
- mkdirSync(skillSub, { recursive: true });
109
- writeFileSync(
110
- join(skillSub, "SKILL.md"),
111
- "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
112
- );
113
-
114
- const cwd = join(root, "work");
115
- const iterationDir = join(
116
- cwd,
117
- "skills-workspace",
118
- "mr-review",
119
- "iteration-1",
120
- );
121
- mkdirSync(iterationDir, { recursive: true });
122
- writeJson(join(iterationDir, "conditions.json"), {
123
- mode: "new-skill",
124
- conditions: [
125
- { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
126
- { name: "without_skill", skill_path: null },
127
- ],
128
- timestamp: new Date().toISOString(),
129
- harness: "claude-code",
130
- });
131
- for (const cond of ["with_skill", "without_skill"]) {
132
- const condDir = join(iterationDir, "eval-e1", cond);
133
- mkdirSync(condDir, { recursive: true });
134
- writeJson(join(condDir, "grading.json"), {
135
- assertion_results: [],
136
- summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
137
- });
138
- writeJson(join(condDir, "timing.json"), {
139
- total_tokens: 100,
140
- duration_ms: 1,
141
- });
142
- }
143
- writeJson(join(iterationDir, "stray-writes.json"), {
144
- generated: new Date().toISOString(),
145
- iteration: 1,
146
- totals: { violations: 1, warnings: 0 },
147
- runs: [
148
- {
149
- eval_id: "e1",
150
- condition: "with_skill",
151
- violations: [
152
- {
153
- tool: "Write",
154
- path: "/repo/runner/run.ts",
155
- ordinal: 3,
156
- reason: "x",
157
- },
158
- ],
159
- warnings: [],
160
- },
161
- ],
162
- });
163
-
164
- const res = Bun.spawnSync(
165
- [
166
- "bun",
167
- "run",
168
- AGGREGATE_TS,
169
- "--skill-dir",
170
- skillDir,
171
- "--skill",
172
- "mr-review",
173
- "--iteration",
174
- "1",
175
- ],
176
- { cwd, stdout: "pipe", stderr: "pipe" },
177
- );
178
- expect(res.exitCode).toBe(0);
179
- const benchmark = JSON.parse(
180
- readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
181
- ) as { validity_warnings: string[] };
182
- expect(
183
- benchmark.validity_warnings.some(
184
- (w) => w.includes("e1/with_skill") && w.includes("outside"),
185
- ),
186
- ).toBe(true);
187
- });
188
-
189
- test("surfaces live-source reads as validity_warnings", () => {
190
- const root = join(FIXTURE_ROOT, "agg-live-reads");
191
- const skillDir = join(root, "skill-dir");
192
- const skillSub = join(skillDir, "mr-review");
193
- mkdirSync(skillSub, { recursive: true });
194
- writeFileSync(
195
- join(skillSub, "SKILL.md"),
196
- "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
197
- );
198
-
199
- const cwd = join(root, "work");
200
- const iterationDir = join(
201
- cwd,
202
- "skills-workspace",
203
- "mr-review",
204
- "iteration-1",
205
- );
206
- mkdirSync(iterationDir, { recursive: true });
207
- writeJson(join(iterationDir, "conditions.json"), {
208
- mode: "revision",
209
- conditions: [
210
- { name: "old_skill", skill_path: join(skillSub, "SKILL.md") },
211
- { name: "new_skill", skill_path: join(skillSub, "SKILL.md") },
212
- ],
213
- timestamp: new Date().toISOString(),
214
- harness: "claude-code",
215
- });
216
- for (const cond of ["old_skill", "new_skill"]) {
217
- const condDir = join(iterationDir, "eval-e1", cond);
218
- mkdirSync(condDir, { recursive: true });
219
- writeJson(join(condDir, "grading.json"), {
220
- assertion_results: [],
221
- summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
222
- });
223
- writeJson(join(condDir, "timing.json"), {
224
- total_tokens: 100,
225
- duration_ms: 1,
226
- });
227
- }
228
- writeJson(join(iterationDir, "stray-writes.json"), {
229
- generated: new Date().toISOString(),
230
- iteration: 1,
231
- totals: { violations: 0, warnings: 0, live_source_reads: 1 },
232
- runs: [
233
- {
234
- eval_id: "e1",
235
- condition: "old_skill",
236
- violations: [],
237
- warnings: [],
238
- live_source_reads: [
239
- {
240
- tool: "Read",
241
- path: join(skillSub, "SKILL.md"),
242
- ordinal: 0,
243
- reason: "x",
244
- },
245
- ],
246
- },
247
- ],
248
- });
249
-
250
- const res = Bun.spawnSync(
251
- [
252
- "bun",
253
- "run",
254
- AGGREGATE_TS,
255
- "--skill-dir",
256
- skillDir,
257
- "--skill",
258
- "mr-review",
259
- "--iteration",
260
- "1",
261
- ],
262
- { cwd, stdout: "pipe", stderr: "pipe" },
263
- );
264
- expect(res.exitCode).toBe(0);
265
- const benchmark = JSON.parse(
266
- readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
267
- ) as { validity_warnings: string[] };
268
- expect(
269
- benchmark.validity_warnings.some(
270
- (w) => w.includes("e1/old_skill") && /live skill source/i.test(w),
271
- ),
272
- ).toBe(true);
273
- });
274
-
275
- test("warns when timing sources are mixed across the compared runs", () => {
276
- const root = join(FIXTURE_ROOT, "agg-mixed-timing");
277
- const skillDir = join(root, "skill-dir");
278
- const skillSub = join(skillDir, "mr-review");
279
- mkdirSync(skillSub, { recursive: true });
280
- writeFileSync(
281
- join(skillSub, "SKILL.md"),
282
- "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
283
- );
284
-
285
- const cwd = join(root, "work");
286
- const iterationDir = join(
287
- cwd,
288
- "skills-workspace",
289
- "mr-review",
290
- "iteration-1",
291
- );
292
- mkdirSync(iterationDir, { recursive: true });
293
- writeJson(join(iterationDir, "conditions.json"), {
294
- mode: "new-skill",
295
- conditions: [
296
- { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
297
- { name: "without_skill", skill_path: null },
298
- ],
299
- timestamp: new Date().toISOString(),
300
- harness: "claude-code",
301
- });
302
- // One arm has agent-captured completion-event timing (no source field, the
303
- // pre-provenance shape); the other was backfilled from the transcript.
304
- const mkCond = (cond: string, timing: unknown) => {
305
- const condDir = join(iterationDir, "eval-e1", cond);
306
- mkdirSync(condDir, { recursive: true });
307
- writeJson(join(condDir, "grading.json"), {
308
- assertion_results: [],
309
- summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
310
- });
311
- writeJson(join(condDir, "timing.json"), timing);
312
- };
313
- mkCond("with_skill", { total_tokens: 5000, duration_ms: 1000 });
314
- mkCond("without_skill", {
315
- total_tokens: 90000,
316
- duration_ms: 1200,
317
- source: "transcript",
318
- });
319
-
320
- const res = Bun.spawnSync(
321
- [
322
- "bun",
323
- "run",
324
- AGGREGATE_TS,
325
- "--skill-dir",
326
- skillDir,
327
- "--skill",
328
- "mr-review",
329
- "--iteration",
330
- "1",
331
- ],
332
- { cwd, stdout: "pipe", stderr: "pipe" },
333
- );
334
- expect(res.exitCode).toBe(0);
335
- const benchmark = JSON.parse(
336
- readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
337
- ) as { validity_warnings: string[] };
338
- expect(
339
- benchmark.validity_warnings.some(
340
- (w) => w.includes("timing source") && w.includes("transcript"),
341
- ),
342
- ).toBe(true);
343
- });
344
-
345
- test("does not warn when all timing comes from one source", () => {
346
- const root = join(FIXTURE_ROOT, "agg-same-timing");
347
- const skillDir = join(root, "skill-dir");
348
- const skillSub = join(skillDir, "mr-review");
349
- mkdirSync(skillSub, { recursive: true });
350
- writeFileSync(
351
- join(skillSub, "SKILL.md"),
352
- "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
353
- );
354
-
355
- const cwd = join(root, "work");
356
- const iterationDir = join(
357
- cwd,
358
- "skills-workspace",
359
- "mr-review",
360
- "iteration-1",
361
- );
362
- mkdirSync(iterationDir, { recursive: true });
363
- writeJson(join(iterationDir, "conditions.json"), {
364
- mode: "new-skill",
365
- conditions: [
366
- { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
367
- { name: "without_skill", skill_path: null },
368
- ],
369
- timestamp: new Date().toISOString(),
370
- harness: "claude-code",
371
- });
372
- for (const cond of ["with_skill", "without_skill"]) {
373
- const condDir = join(iterationDir, "eval-e1", cond);
374
- mkdirSync(condDir, { recursive: true });
375
- writeJson(join(condDir, "grading.json"), {
376
- assertion_results: [],
377
- summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
378
- });
379
- writeJson(join(condDir, "timing.json"), {
380
- total_tokens: 100,
381
- duration_ms: 1,
382
- source: "transcript",
383
- });
384
- }
385
-
386
- const res = Bun.spawnSync(
387
- [
388
- "bun",
389
- "run",
390
- AGGREGATE_TS,
391
- "--skill-dir",
392
- skillDir,
393
- "--skill",
394
- "mr-review",
395
- "--iteration",
396
- "1",
397
- ],
398
- { cwd, stdout: "pipe", stderr: "pipe" },
399
- );
400
- expect(res.exitCode).toBe(0);
401
- const benchmark = JSON.parse(
402
- readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
403
- ) as { validity_warnings: string[] };
404
- expect(
405
- benchmark.validity_warnings.some((w) => w.includes("timing source")),
406
- ).toBe(false);
407
- });
408
-
409
- test("surfaces plugin-shadow findings as validity_warnings", () => {
410
- const root = join(FIXTURE_ROOT, "agg-shadow");
411
- const skillDir = join(root, "skill-dir");
412
- const skillSub = join(skillDir, "mr-review");
413
- mkdirSync(skillSub, { recursive: true });
414
- writeFileSync(
415
- join(skillSub, "SKILL.md"),
416
- "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
417
- );
418
-
419
- const cwd = join(root, "work");
420
- const iterationDir = join(
421
- cwd,
422
- "skills-workspace",
423
- "mr-review",
424
- "iteration-1",
425
- );
426
- mkdirSync(iterationDir, { recursive: true });
427
- writeJson(join(iterationDir, "conditions.json"), {
428
- mode: "new-skill",
429
- conditions: [
430
- { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
431
- { name: "without_skill", skill_path: null },
432
- ],
433
- timestamp: new Date().toISOString(),
434
- harness: "claude-code",
435
- });
436
- for (const cond of ["with_skill", "without_skill"]) {
437
- const condDir = join(iterationDir, "eval-e1", cond);
438
- mkdirSync(condDir, { recursive: true });
439
- writeJson(join(condDir, "grading.json"), {
440
- assertion_results: [],
441
- summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
442
- });
443
- writeJson(join(condDir, "timing.json"), {
444
- total_tokens: 100,
445
- duration_ms: 1,
446
- });
447
- }
448
- writeJson(join(iterationDir, "plugin-shadow.json"), {
449
- config_dir: "/home/u/.claude",
450
- shadowed: [
451
- {
452
- kind: "plugin",
453
- plugin: "slow-powers@slowdini",
454
- skill_name: "mr-review",
455
- path: "/home/u/.claude/plugins/cache/slowdini/slow-powers/skills/mr-review",
456
- },
457
- ],
458
- });
459
-
460
- const res = Bun.spawnSync(
461
- [
462
- "bun",
463
- "run",
464
- AGGREGATE_TS,
465
- "--skill-dir",
466
- skillDir,
467
- "--skill",
468
- "mr-review",
469
- "--iteration",
470
- "1",
471
- ],
472
- { cwd, stdout: "pipe", stderr: "pipe" },
473
- );
474
- expect(res.exitCode).toBe(0);
475
- const benchmark = JSON.parse(
476
- readFileSync(join(iterationDir, "benchmark.json"), "utf8"),
477
- ) as { validity_warnings: string[] };
478
- expect(
479
- benchmark.validity_warnings.some(
480
- (w) => w.includes("mr-review") && /contaminat/i.test(w),
481
- ),
482
- ).toBe(true);
483
- });
484
- });