@slowdini/slow-powers-opencode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +174 -0
  3. package/bootstrap.md +16 -0
  4. package/opencode/plugins/slow-powers.js +86 -0
  5. package/package.json +66 -0
  6. package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
  7. package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
  8. package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
  9. package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
  10. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
  11. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
  12. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
  13. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
  14. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
  15. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
  16. package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
  17. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
  18. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
  19. package/skills/evaluating-skills/SKILL.md +448 -0
  20. package/skills/evaluating-skills/evals/evals.json +52 -0
  21. package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
  22. package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
  23. package/skills/evaluating-skills/harness-details/claude.md +135 -0
  24. package/skills/evaluating-skills/pressure-scenarios.md +163 -0
  25. package/skills/evaluating-skills/runner/README.md +140 -0
  26. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
  27. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
  28. package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
  29. package/skills/evaluating-skills/runner/aggregate.ts +228 -0
  30. package/skills/evaluating-skills/runner/context.test.ts +181 -0
  31. package/skills/evaluating-skills/runner/context.ts +90 -0
  32. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
  33. package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
  34. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
  35. package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
  36. package/skills/evaluating-skills/runner/grade.test.ts +347 -0
  37. package/skills/evaluating-skills/runner/grade.ts +603 -0
  38. package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
  39. package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
  40. package/skills/evaluating-skills/runner/guard/install.ts +147 -0
  41. package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
  42. package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
  43. package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
  44. package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
  45. package/skills/evaluating-skills/runner/run.test.ts +716 -0
  46. package/skills/evaluating-skills/runner/run.ts +814 -0
  47. package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
  48. package/skills/evaluating-skills/runner/types.ts +104 -0
  49. package/skills/evaluating-skills/runner/validate-all.ts +54 -0
  50. package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
  51. package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
  52. package/skills/evaluating-skills/runner/validate.test.ts +56 -0
  53. package/skills/evaluating-skills/runner/validate.ts +21 -0
  54. package/skills/evaluating-skills/schema/evals.schema.json +105 -0
  55. package/skills/evaluating-skills/schema/grading.schema.json +84 -0
  56. package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
  57. package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
  58. package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
  59. package/skills/evaluating-skills/templates/evals.json.example +17 -0
  60. package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
  61. package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
  62. package/skills/finishing-a-development-branch/SKILL.md +96 -0
  63. package/skills/finishing-a-development-branch/evals/evals.json +41 -0
  64. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
  65. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
  66. package/skills/hardening-plans/SKILL.md +72 -0
  67. package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
  68. package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
  69. package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
  70. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
  71. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
  72. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
  73. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
  74. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
  75. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
  76. package/skills/hardening-plans/evals/evals.json +114 -0
  77. package/skills/systematic-debugging/CREATION-LOG.md +119 -0
  78. package/skills/systematic-debugging/SKILL.md +84 -0
  79. package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
  80. package/skills/systematic-debugging/condition-based-waiting.md +115 -0
  81. package/skills/systematic-debugging/defense-in-depth.md +122 -0
  82. package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
  83. package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
  84. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
  85. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
  86. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
  87. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
  88. package/skills/systematic-debugging/evals/evals.json +45 -0
  89. package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
  90. package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
  91. package/skills/systematic-debugging/find-polluter.sh +63 -0
  92. package/skills/systematic-debugging/root-cause-tracing.md +169 -0
  93. package/skills/systematic-debugging/test-academic.md +14 -0
  94. package/skills/systematic-debugging/test-pressure-1.md +58 -0
  95. package/skills/systematic-debugging/test-pressure-2.md +68 -0
  96. package/skills/systematic-debugging/test-pressure-3.md +69 -0
  97. package/skills/test-driven-development/SKILL.md +93 -0
  98. package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
  99. package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
  100. package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
  101. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
  102. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
  103. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
  104. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
  105. package/skills/test-driven-development/evals/evals.json +77 -0
  106. package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
  107. package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
  108. package/skills/test-driven-development/testing-anti-patterns.md +299 -0
  109. package/skills/using-git-worktrees/SKILL.md +70 -0
  110. package/skills/using-git-worktrees/evals/evals.json +40 -0
  111. package/skills/verification-before-completion/SKILL.md +65 -0
  112. package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
  113. package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
  114. package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
  115. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
  116. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
  117. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
  118. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
  119. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
  120. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
  121. package/skills/verification-before-completion/evals/evals.json +77 -0
  122. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
  123. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
  124. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
  125. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
  126. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
  127. package/skills/writing-skills/SKILL.md +306 -0
  128. package/skills/writing-skills/evals/evals.json +40 -0
  129. package/skills/writing-skills/graphviz-conventions.dot +172 -0
  130. package/skills/writing-skills/persuasion-principles.md +187 -0
  131. package/skills/writing-skills/scripts/render-graphs.js +181 -0
@@ -0,0 +1,347 @@
1
+ import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
+ import {
3
+ existsSync,
4
+ mkdirSync,
5
+ readFileSync,
6
+ rmSync,
7
+ writeFileSync,
8
+ } from "node:fs";
9
+ import { tmpdir } from "node:os";
10
+ import { join } from "node:path";
11
+ import { checkSkillInvokedFromTranscript } from "./grade";
12
+ import type { ToolInvocation } from "./types";
13
+
14
// Unit tests for checkSkillInvokedFromTranscript: given the recorded tool
// invocations of a run and a skill slug, it should report true only when a
// `Skill` invocation carries `args.skill` equal to that slug.
describe("checkSkillInvokedFromTranscript", () => {
  // Slugs used as the lookup target across the cases below.
  const verificationSlug =
    "slow-powers-eval-1-with_skill__verification-before-completion";
  const fooSlug = "slow-powers-eval-1-with_skill__foo";

  test("returns true when transcript contains a Skill call with input.skill matching the slug", () => {
    const transcript: ToolInvocation[] = [
      { name: "Bash", args: { command: "ls" }, ordinal: 0 },
      { name: "Skill", args: { skill: verificationSlug }, ordinal: 1 },
      { name: "Read", args: { file_path: "/tmp/x" }, ordinal: 2 },
    ];

    const invoked = checkSkillInvokedFromTranscript(
      transcript,
      verificationSlug,
    );

    expect(invoked).toBe(true);
  });

  test("returns false when transcript has no Skill calls", () => {
    const transcript: ToolInvocation[] = [
      { name: "Bash", args: { command: "ls" }, ordinal: 0 },
      { name: "Read", args: { file_path: "/tmp/x" }, ordinal: 1 },
    ];

    const invoked = checkSkillInvokedFromTranscript(transcript, fooSlug);

    expect(invoked).toBe(false);
  });

  test("returns false when Skill call references a different slug", () => {
    // Both entries are Skill calls, but neither names the slug under test.
    const transcript: ToolInvocation[] = [
      {
        name: "Skill",
        args: { skill: "slow-powers:writing-skills" },
        ordinal: 0,
      },
      {
        name: "Skill",
        args: { skill: "slow-powers-eval-2-old_skill__other" },
        ordinal: 1,
      },
    ];

    const invoked = checkSkillInvokedFromTranscript(
      transcript,
      verificationSlug,
    );

    expect(invoked).toBe(false);
  });

  test("returns false on empty invocations array", () => {
    const invoked = checkSkillInvokedFromTranscript([], "anything");

    expect(invoked).toBe(false);
  });

  test("tolerates Skill invocations whose args are missing or malformed", () => {
    // Three degenerate shapes: no `args` key at all, a non-object `args`,
    // and an object `args` without a `skill` field.
    const transcript: ToolInvocation[] = [
      { name: "Skill", ordinal: 0 },
      { name: "Skill", args: "not-an-object", ordinal: 1 },
      { name: "Skill", args: { other: "field" }, ordinal: 2 },
    ];

    const invoked = checkSkillInvokedFromTranscript(transcript, fooSlug);

    expect(invoked).toBe(false);
  });
});
71
+
72
// Scratch directory for the on-disk fixtures built by the tests below. The
// process id is part of the name so concurrent test processes do not share it.
const GRADE_FIXTURE_ROOT = join(
  tmpdir(),
  `slow-powers-grade-test-${process.pid}`,
);

// Path of the grader script located in the same directory as this test file.
const GRADE_TS = join(import.meta.dir, "grade.ts");

// Create the scratch directory once before any test runs.
beforeAll(function createFixtureRoot() {
  mkdirSync(GRADE_FIXTURE_ROOT, { recursive: true });
});

// Remove the scratch directory and everything in it once all tests finished;
// `force` keeps the cleanup quiet when the directory is already gone.
afterAll(function removeFixtureRoot() {
  rmSync(GRADE_FIXTURE_ROOT, { force: true, recursive: true });
});
85
+
86
/**
 * Serialize `value` as pretty-printed JSON (2-space indent, trailing newline)
 * and write it to `path`, replacing any existing file.
 *
 * @param path - Destination file path.
 * @param value - The value to serialize.
 * @throws TypeError when `value` has no JSON representation (`undefined`, a
 *   function, or a symbol). Nothing is written in that case.
 */
function writeJsonFile(path: string, value: unknown): void {
  // JSON.stringify is typed as returning `string`, but at runtime it yields
  // `undefined` for values with no JSON form. Interpolating that would write
  // the literal text "undefined" — not valid JSON — so reject it up front.
  const serialized: string | undefined = JSON.stringify(value, null, 2);
  if (serialized === undefined) {
    throw new TypeError(
      `writeJsonFile: value for ${path} is not JSON-serializable`,
    );
  }
  writeFileSync(path, `${serialized}\n`);
}
89
+
90
/** Paths of a throwaway skill directory plus iteration workspace for one test. */
interface GradeFixture {
  /** Skill name passed to grade.ts via `--skill`. */
  skill: string;
  /** Directory passed to grade.ts via `--skill-dir`; contains `<skill>/`. */
  skillDir: string;
  /** Path of the fixture skill's SKILL.md. */
  skillMdPath: string;
  /** Working directory grade.ts is spawned in. */
  cwd: string;
  /** `<cwd>/skills-workspace/<skill>/iteration-1`. */
  iterationDir: string;
}

/** The two conditions every fixture declares in conditions.json. */
const FIXTURE_CONDITIONS = ["with_skill", "without_skill"] as const;
type FixtureCondition = (typeof FIXTURE_CONDITIONS)[number];

/** Eval without `skill_should_trigger`, used by all three suites below. */
const POSITIVE_EVAL = {
  id: "pos-eval",
  prompt: "Fix the failing build.",
  expected_output: "Agent debugs systematically.",
  assertions: [{ id: "a1", type: "llm_judge", rubric: "Did it debug?" }],
};

/** Eval marked `skill_should_trigger: false` (non-invocation is desired). */
const NEGATIVE_EVAL = {
  id: "neg-eval",
  prompt: "Add a --verbose flag.",
  expected_output: "Agent treats it as a feature, no debugging.",
  skill_should_trigger: false,
  assertions: [
    { id: "a2", type: "llm_judge", rubric: "Did it avoid debugging?" },
  ],
};

/**
 * Build the on-disk layout shared by every test here: a skill directory with
 * SKILL.md and evals/evals.json, and an iteration-1 workspace containing
 * conditions.json. run.json files are written separately per test.
 *
 * @param name - Sub-directory of GRADE_FIXTURE_ROOT that isolates this test.
 * @param evals - Eval definitions stored under `evals` in evals.json.
 */
function createGradeFixture(name: string, evals: unknown[]): GradeFixture {
  const root = join(GRADE_FIXTURE_ROOT, name);
  const skill = "mr-review";
  const skillDir = join(root, "skill-dir");
  const skillSub = join(skillDir, skill);
  const skillMdPath = join(skillSub, "SKILL.md");

  mkdirSync(join(skillSub, "evals"), { recursive: true });
  writeFileSync(
    skillMdPath,
    "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
  );
  writeJsonFile(join(skillSub, "evals", "evals.json"), {
    skill_name: skill,
    evals,
  });

  const cwd = join(root, "work");
  const iterationDir = join(cwd, "skills-workspace", skill, "iteration-1");
  mkdirSync(iterationDir, { recursive: true });
  writeJsonFile(join(iterationDir, "conditions.json"), {
    mode: "new-skill",
    conditions: [
      { name: "with_skill", skill_path: skillMdPath },
      { name: "without_skill", skill_path: null },
    ],
    timestamp: new Date().toISOString(),
    harness: "claude-code",
  });

  return { skill, skillDir, skillMdPath, cwd, iterationDir };
}

/**
 * Write `<iterationDir>/eval-<evalId>/<cond>/run.json`. `eval_id` and
 * `condition` are filled in; `fields` supplies the rest of the record.
 */
function writeRunRecord(
  fixture: GradeFixture,
  evalId: string,
  cond: FixtureCondition,
  fields: Record<string, unknown>,
): void {
  const condDir = join(fixture.iterationDir, `eval-${evalId}`, cond);
  mkdirSync(condDir, { recursive: true });
  writeJsonFile(join(condDir, "run.json"), {
    eval_id: evalId,
    condition: cond,
    ...fields,
  });
}

/** Remaining fields of a complete run record for the given condition. */
function completeRunFields(
  fixture: GradeFixture,
  cond: FixtureCondition,
): Record<string, unknown> {
  return {
    skill_path: cond === "with_skill" ? fixture.skillMdPath : null,
    prompt: "p",
    files: [],
    final_message: "done",
    tool_invocations: [],
    total_tokens: 100,
    duration_ms: 1000,
  };
}

/** Run grade.ts for iteration 1 of the fixture, capturing stdout and stderr. */
function runGrade(fixture: GradeFixture) {
  return Bun.spawnSync(
    [
      "bun",
      "run",
      GRADE_TS,
      "--skill-dir",
      fixture.skillDir,
      "--skill",
      fixture.skill,
      "--iteration",
      "1",
    ],
    { cwd: fixture.cwd, stdout: "pipe", stderr: "pipe" },
  );
}

/**
 * Assert that grade.ts exited with code 0. On failure the captured stderr is
 * printed first, since a bare exit-code mismatch says nothing about the cause.
 */
function expectGradeSucceeded(res: ReturnType<typeof runGrade>): void {
  if (res.exitCode !== 0) {
    console.error(res.stderr.toString());
  }
  expect(res.exitCode).toBe(0);
}

/** Parse the judge-tasks.json found in the fixture's iteration directory. */
function readJudgeTasks(fixture: GradeFixture): unknown {
  return JSON.parse(
    readFileSync(join(fixture.iterationDir, "judge-tasks.json"), "utf8"),
  );
}

describe("emitJudgeTasks skill-invocation meta-check gating", () => {
  test("omits the skill-invocation meta-check for evals marked skill_should_trigger: false", () => {
    // Two evals: a positive one (skill should fire) and a negative one
    // (skill should NOT fire — non-invocation is the desired behavior).
    const fixture = createGradeFixture("negative-eval", [
      POSITIVE_EVAL,
      NEGATIVE_EVAL,
    ]);

    for (const evalId of ["pos-eval", "neg-eval"]) {
      for (const cond of FIXTURE_CONDITIONS) {
        // Empty tool_invocations => meta routed to a judge task (not code-checked).
        writeRunRecord(fixture, evalId, cond, completeRunFields(fixture, cond));
      }
    }

    expectGradeSucceeded(runGrade(fixture));

    const tasks = readJudgeTasks(fixture) as {
      tasks: Array<{ eval_id: string; is_meta: boolean }>;
    };
    const metaTasks = tasks.tasks.filter((t) => t.is_meta);
    // Exactly one meta-check, and only for the positive eval.
    expect(metaTasks.map((t) => t.eval_id)).toEqual(["pos-eval"]);
  });
});

describe("emitJudgeTasks run.json validation", () => {
  test("fails fast with a schema error when a run.json is malformed", () => {
    const fixture = createGradeFixture("bad-run-record", [POSITIVE_EVAL]);

    for (const cond of FIXTURE_CONDITIONS) {
      // Missing required `final_message` and `files` — must be rejected.
      writeRunRecord(fixture, "pos-eval", cond, {
        skill_path: null,
        prompt: "p",
        tool_invocations: [],
      });
    }

    const res = runGrade(fixture);
    expect(res.exitCode).not.toBe(0);
    expect(res.stderr.toString()).toContain("run-record schema");
  });
});

describe("emitJudgeTasks file-pointer dispatch", () => {
  test("writes each judge prompt to a file and drops the inline prompt from judge-tasks.json", () => {
    const fixture = createGradeFixture("judge-prompt-file", [POSITIVE_EVAL]);

    for (const cond of FIXTURE_CONDITIONS) {
      writeRunRecord(
        fixture,
        "pos-eval",
        cond,
        completeRunFields(fixture, cond),
      );
    }

    expectGradeSucceeded(runGrade(fixture));

    const tasks = readJudgeTasks(fixture) as {
      tasks: Array<{
        assertion_id: string;
        response_path: string;
        dispatch_prompt?: string;
        dispatch_prompt_path: string;
      }>;
    };

    expect(tasks.tasks.length).toBeGreaterThan(0);
    for (const t of tasks.tasks) {
      // Nothing inlined; the orchestrator reads the prompt from a file.
      expect(t.dispatch_prompt).toBeUndefined();
      expect(t.dispatch_prompt_path.endsWith(`${t.assertion_id}.txt`)).toBe(
        true,
      );
      expect(existsSync(t.dispatch_prompt_path)).toBe(true);
      const contents = readFileSync(t.dispatch_prompt_path, "utf8");
      // The judge still learns where to write its verdict from the prompt text.
      expect(contents).toContain(t.response_path);
    }
  });
});