@slowdini/slow-powers-opencode 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -65
- package/bootstrap.md +1 -7
- package/opencode/plugins/slow-powers.js +1 -1
- package/package.json +14 -13
- package/skills/evaluating-skills/SKILL.md +91 -337
- package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
- package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
- package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
- package/skills/verifying-development-work/SKILL.md +17 -6
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
- package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
- package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +34 -2
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
- package/skills/evaluating-skills/harness-details/claude.md +0 -158
- package/skills/evaluating-skills/runner/README.md +0 -154
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -263
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -146
- package/skills/evaluating-skills/runner/aggregate.test.ts +0 -264
- package/skills/evaluating-skills/runner/aggregate.ts +0 -248
- package/skills/evaluating-skills/runner/context.test.ts +0 -181
- package/skills/evaluating-skills/runner/context.ts +0 -90
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -103
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -192
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
- package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
- package/skills/evaluating-skills/runner/grade.test.ts +0 -347
- package/skills/evaluating-skills/runner/grade.ts +0 -603
- package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
- package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
- package/skills/evaluating-skills/runner/guard/install.ts +0 -147
- package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -71
- package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
- package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -230
- package/skills/evaluating-skills/runner/promote-baseline.ts +0 -186
- package/skills/evaluating-skills/runner/run.test.ts +0 -1180
- package/skills/evaluating-skills/runner/run.ts +0 -1029
- package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -74
- package/skills/evaluating-skills/runner/types.ts +0 -112
- package/skills/evaluating-skills/runner/validate-all.ts +0 -54
- package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
- package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
- package/skills/evaluating-skills/runner/validate.test.ts +0 -56
- package/skills/evaluating-skills/runner/validate.ts +0 -21
- package/skills/evaluating-skills/schema/evals.schema.json +0 -105
- package/skills/evaluating-skills/schema/grading.schema.json +0 -84
- package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
- package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -68
- package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -67
- package/skills/evaluating-skills/templates/evals.json.example +0 -17
- package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
|
@@ -1,347 +0,0 @@
|
|
|
1
|
-
import { afterAll, beforeAll, describe, expect, test } from "bun:test";
|
|
2
|
-
import {
|
|
3
|
-
existsSync,
|
|
4
|
-
mkdirSync,
|
|
5
|
-
readFileSync,
|
|
6
|
-
rmSync,
|
|
7
|
-
writeFileSync,
|
|
8
|
-
} from "node:fs";
|
|
9
|
-
import { tmpdir } from "node:os";
|
|
10
|
-
import { join } from "node:path";
|
|
11
|
-
import { checkSkillInvokedFromTranscript } from "./grade";
|
|
12
|
-
import type { ToolInvocation } from "./types";
|
|
13
|
-
|
|
14
|
-
describe("checkSkillInvokedFromTranscript", () => {
|
|
15
|
-
test("returns true when transcript contains a Skill call with input.skill matching the slug", () => {
|
|
16
|
-
const slug =
|
|
17
|
-
"slow-powers-eval-1-with_skill__verification-before-completion";
|
|
18
|
-
const invocations: ToolInvocation[] = [
|
|
19
|
-
{ name: "Bash", args: { command: "ls" }, ordinal: 0 },
|
|
20
|
-
{ name: "Skill", args: { skill: slug }, ordinal: 1 },
|
|
21
|
-
{ name: "Read", args: { file_path: "/tmp/x" }, ordinal: 2 },
|
|
22
|
-
];
|
|
23
|
-
expect(checkSkillInvokedFromTranscript(invocations, slug)).toBe(true);
|
|
24
|
-
});
|
|
25
|
-
|
|
26
|
-
test("returns false when transcript has no Skill calls", () => {
|
|
27
|
-
const invocations: ToolInvocation[] = [
|
|
28
|
-
{ name: "Bash", args: { command: "ls" }, ordinal: 0 },
|
|
29
|
-
{ name: "Read", args: { file_path: "/tmp/x" }, ordinal: 1 },
|
|
30
|
-
];
|
|
31
|
-
expect(
|
|
32
|
-
checkSkillInvokedFromTranscript(
|
|
33
|
-
invocations,
|
|
34
|
-
"slow-powers-eval-1-with_skill__foo",
|
|
35
|
-
),
|
|
36
|
-
).toBe(false);
|
|
37
|
-
});
|
|
38
|
-
|
|
39
|
-
test("returns false when Skill call references a different slug", () => {
|
|
40
|
-
const slug =
|
|
41
|
-
"slow-powers-eval-1-with_skill__verification-before-completion";
|
|
42
|
-
const invocations: ToolInvocation[] = [
|
|
43
|
-
{
|
|
44
|
-
name: "Skill",
|
|
45
|
-
args: { skill: "slow-powers:writing-skills" },
|
|
46
|
-
ordinal: 0,
|
|
47
|
-
},
|
|
48
|
-
{
|
|
49
|
-
name: "Skill",
|
|
50
|
-
args: { skill: "slow-powers-eval-2-old_skill__other" },
|
|
51
|
-
ordinal: 1,
|
|
52
|
-
},
|
|
53
|
-
];
|
|
54
|
-
expect(checkSkillInvokedFromTranscript(invocations, slug)).toBe(false);
|
|
55
|
-
});
|
|
56
|
-
|
|
57
|
-
test("returns false on empty invocations array", () => {
|
|
58
|
-
expect(checkSkillInvokedFromTranscript([], "anything")).toBe(false);
|
|
59
|
-
});
|
|
60
|
-
|
|
61
|
-
test("tolerates Skill invocations whose args are missing or malformed", () => {
|
|
62
|
-
const slug = "slow-powers-eval-1-with_skill__foo";
|
|
63
|
-
const invocations: ToolInvocation[] = [
|
|
64
|
-
{ name: "Skill", ordinal: 0 },
|
|
65
|
-
{ name: "Skill", args: "not-an-object", ordinal: 1 },
|
|
66
|
-
{ name: "Skill", args: { other: "field" }, ordinal: 2 },
|
|
67
|
-
];
|
|
68
|
-
expect(checkSkillInvokedFromTranscript(invocations, slug)).toBe(false);
|
|
69
|
-
});
|
|
70
|
-
});
|
|
71
|
-
|
|
72
|
-
const GRADE_FIXTURE_ROOT = join(
|
|
73
|
-
tmpdir(),
|
|
74
|
-
`slow-powers-grade-test-${process.pid}`,
|
|
75
|
-
);
|
|
76
|
-
const GRADE_TS = join(import.meta.dir, "grade.ts");
|
|
77
|
-
|
|
78
|
-
beforeAll(() => {
|
|
79
|
-
mkdirSync(GRADE_FIXTURE_ROOT, { recursive: true });
|
|
80
|
-
});
|
|
81
|
-
|
|
82
|
-
afterAll(() => {
|
|
83
|
-
rmSync(GRADE_FIXTURE_ROOT, { recursive: true, force: true });
|
|
84
|
-
});
|
|
85
|
-
|
|
86
|
-
function writeJsonFile(path: string, value: unknown) {
|
|
87
|
-
writeFileSync(path, `${JSON.stringify(value, null, 2)}\n`);
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
describe("emitJudgeTasks skill-invocation meta-check gating", () => {
|
|
91
|
-
test("omits the skill-invocation meta-check for evals marked skill_should_trigger: false", () => {
|
|
92
|
-
const root = join(GRADE_FIXTURE_ROOT, "negative-eval");
|
|
93
|
-
const skill = "mr-review";
|
|
94
|
-
const skillDir = join(root, "skill-dir");
|
|
95
|
-
const skillSub = join(skillDir, skill);
|
|
96
|
-
mkdirSync(join(skillSub, "evals"), { recursive: true });
|
|
97
|
-
writeFileSync(
|
|
98
|
-
join(skillSub, "SKILL.md"),
|
|
99
|
-
"---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
|
|
100
|
-
);
|
|
101
|
-
// Two evals: a positive one (skill should fire) and a negative one
|
|
102
|
-
// (skill should NOT fire — non-invocation is the desired behavior).
|
|
103
|
-
writeJsonFile(join(skillSub, "evals", "evals.json"), {
|
|
104
|
-
skill_name: skill,
|
|
105
|
-
evals: [
|
|
106
|
-
{
|
|
107
|
-
id: "pos-eval",
|
|
108
|
-
prompt: "Fix the failing build.",
|
|
109
|
-
expected_output: "Agent debugs systematically.",
|
|
110
|
-
assertions: [
|
|
111
|
-
{ id: "a1", type: "llm_judge", rubric: "Did it debug?" },
|
|
112
|
-
],
|
|
113
|
-
},
|
|
114
|
-
{
|
|
115
|
-
id: "neg-eval",
|
|
116
|
-
prompt: "Add a --verbose flag.",
|
|
117
|
-
expected_output: "Agent treats it as a feature, no debugging.",
|
|
118
|
-
skill_should_trigger: false,
|
|
119
|
-
assertions: [
|
|
120
|
-
{ id: "a2", type: "llm_judge", rubric: "Did it avoid debugging?" },
|
|
121
|
-
],
|
|
122
|
-
},
|
|
123
|
-
],
|
|
124
|
-
});
|
|
125
|
-
|
|
126
|
-
const cwd = join(root, "work");
|
|
127
|
-
const iterationDir = join(cwd, "skills-workspace", skill, "iteration-1");
|
|
128
|
-
mkdirSync(iterationDir, { recursive: true });
|
|
129
|
-
writeJsonFile(join(iterationDir, "conditions.json"), {
|
|
130
|
-
mode: "new-skill",
|
|
131
|
-
conditions: [
|
|
132
|
-
{ name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
|
|
133
|
-
{ name: "without_skill", skill_path: null },
|
|
134
|
-
],
|
|
135
|
-
timestamp: new Date().toISOString(),
|
|
136
|
-
harness: "claude-code",
|
|
137
|
-
});
|
|
138
|
-
|
|
139
|
-
for (const evalId of ["pos-eval", "neg-eval"]) {
|
|
140
|
-
for (const cond of ["with_skill", "without_skill"]) {
|
|
141
|
-
const condDir = join(iterationDir, `eval-${evalId}`, cond);
|
|
142
|
-
mkdirSync(condDir, { recursive: true });
|
|
143
|
-
// Empty tool_invocations => meta routed to a judge task (not code-checked).
|
|
144
|
-
writeJsonFile(join(condDir, "run.json"), {
|
|
145
|
-
eval_id: evalId,
|
|
146
|
-
condition: cond,
|
|
147
|
-
skill_path: cond === "with_skill" ? join(skillSub, "SKILL.md") : null,
|
|
148
|
-
prompt: "p",
|
|
149
|
-
files: [],
|
|
150
|
-
final_message: "done",
|
|
151
|
-
tool_invocations: [],
|
|
152
|
-
total_tokens: 100,
|
|
153
|
-
duration_ms: 1000,
|
|
154
|
-
});
|
|
155
|
-
}
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
const res = Bun.spawnSync(
|
|
159
|
-
[
|
|
160
|
-
"bun",
|
|
161
|
-
"run",
|
|
162
|
-
GRADE_TS,
|
|
163
|
-
"--skill-dir",
|
|
164
|
-
skillDir,
|
|
165
|
-
"--skill",
|
|
166
|
-
skill,
|
|
167
|
-
"--iteration",
|
|
168
|
-
"1",
|
|
169
|
-
],
|
|
170
|
-
{ cwd, stdout: "pipe", stderr: "pipe" },
|
|
171
|
-
);
|
|
172
|
-
expect(res.exitCode).toBe(0);
|
|
173
|
-
|
|
174
|
-
const tasks = JSON.parse(
|
|
175
|
-
readFileSync(join(iterationDir, "judge-tasks.json"), "utf8"),
|
|
176
|
-
) as { tasks: Array<{ eval_id: string; is_meta: boolean }> };
|
|
177
|
-
const metaTasks = tasks.tasks.filter((t) => t.is_meta);
|
|
178
|
-
// Exactly one meta-check, and only for the positive eval.
|
|
179
|
-
expect(metaTasks.map((t) => t.eval_id)).toEqual(["pos-eval"]);
|
|
180
|
-
});
|
|
181
|
-
});
|
|
182
|
-
|
|
183
|
-
describe("emitJudgeTasks run.json validation", () => {
|
|
184
|
-
test("fails fast with a schema error when a run.json is malformed", () => {
|
|
185
|
-
const root = join(GRADE_FIXTURE_ROOT, "bad-run-record");
|
|
186
|
-
const skill = "mr-review";
|
|
187
|
-
const skillDir = join(root, "skill-dir");
|
|
188
|
-
const skillSub = join(skillDir, skill);
|
|
189
|
-
mkdirSync(join(skillSub, "evals"), { recursive: true });
|
|
190
|
-
writeFileSync(
|
|
191
|
-
join(skillSub, "SKILL.md"),
|
|
192
|
-
"---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
|
|
193
|
-
);
|
|
194
|
-
writeJsonFile(join(skillSub, "evals", "evals.json"), {
|
|
195
|
-
skill_name: skill,
|
|
196
|
-
evals: [
|
|
197
|
-
{
|
|
198
|
-
id: "pos-eval",
|
|
199
|
-
prompt: "Fix the failing build.",
|
|
200
|
-
expected_output: "Agent debugs systematically.",
|
|
201
|
-
assertions: [
|
|
202
|
-
{ id: "a1", type: "llm_judge", rubric: "Did it debug?" },
|
|
203
|
-
],
|
|
204
|
-
},
|
|
205
|
-
],
|
|
206
|
-
});
|
|
207
|
-
|
|
208
|
-
const cwd = join(root, "work");
|
|
209
|
-
const iterationDir = join(cwd, "skills-workspace", skill, "iteration-1");
|
|
210
|
-
mkdirSync(iterationDir, { recursive: true });
|
|
211
|
-
writeJsonFile(join(iterationDir, "conditions.json"), {
|
|
212
|
-
mode: "new-skill",
|
|
213
|
-
conditions: [
|
|
214
|
-
{ name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
|
|
215
|
-
{ name: "without_skill", skill_path: null },
|
|
216
|
-
],
|
|
217
|
-
timestamp: new Date().toISOString(),
|
|
218
|
-
harness: "claude-code",
|
|
219
|
-
});
|
|
220
|
-
|
|
221
|
-
for (const cond of ["with_skill", "without_skill"]) {
|
|
222
|
-
const condDir = join(iterationDir, "eval-pos-eval", cond);
|
|
223
|
-
mkdirSync(condDir, { recursive: true });
|
|
224
|
-
// Missing required `final_message` and `files` — must be rejected.
|
|
225
|
-
writeJsonFile(join(condDir, "run.json"), {
|
|
226
|
-
eval_id: "pos-eval",
|
|
227
|
-
condition: cond,
|
|
228
|
-
skill_path: null,
|
|
229
|
-
prompt: "p",
|
|
230
|
-
tool_invocations: [],
|
|
231
|
-
});
|
|
232
|
-
}
|
|
233
|
-
|
|
234
|
-
const res = Bun.spawnSync(
|
|
235
|
-
[
|
|
236
|
-
"bun",
|
|
237
|
-
"run",
|
|
238
|
-
GRADE_TS,
|
|
239
|
-
"--skill-dir",
|
|
240
|
-
skillDir,
|
|
241
|
-
"--skill",
|
|
242
|
-
skill,
|
|
243
|
-
"--iteration",
|
|
244
|
-
"1",
|
|
245
|
-
],
|
|
246
|
-
{ cwd, stdout: "pipe", stderr: "pipe" },
|
|
247
|
-
);
|
|
248
|
-
expect(res.exitCode).not.toBe(0);
|
|
249
|
-
expect(res.stderr.toString()).toContain("run-record schema");
|
|
250
|
-
});
|
|
251
|
-
});
|
|
252
|
-
|
|
253
|
-
describe("emitJudgeTasks file-pointer dispatch", () => {
|
|
254
|
-
test("writes each judge prompt to a file and drops the inline prompt from judge-tasks.json", () => {
|
|
255
|
-
const root = join(GRADE_FIXTURE_ROOT, "judge-prompt-file");
|
|
256
|
-
const skill = "mr-review";
|
|
257
|
-
const skillDir = join(root, "skill-dir");
|
|
258
|
-
const skillSub = join(skillDir, skill);
|
|
259
|
-
mkdirSync(join(skillSub, "evals"), { recursive: true });
|
|
260
|
-
writeFileSync(
|
|
261
|
-
join(skillSub, "SKILL.md"),
|
|
262
|
-
"---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
|
|
263
|
-
);
|
|
264
|
-
writeJsonFile(join(skillSub, "evals", "evals.json"), {
|
|
265
|
-
skill_name: skill,
|
|
266
|
-
evals: [
|
|
267
|
-
{
|
|
268
|
-
id: "pos-eval",
|
|
269
|
-
prompt: "Fix the failing build.",
|
|
270
|
-
expected_output: "Agent debugs systematically.",
|
|
271
|
-
assertions: [
|
|
272
|
-
{ id: "a1", type: "llm_judge", rubric: "Did it debug?" },
|
|
273
|
-
],
|
|
274
|
-
},
|
|
275
|
-
],
|
|
276
|
-
});
|
|
277
|
-
|
|
278
|
-
const cwd = join(root, "work");
|
|
279
|
-
const iterationDir = join(cwd, "skills-workspace", skill, "iteration-1");
|
|
280
|
-
mkdirSync(iterationDir, { recursive: true });
|
|
281
|
-
writeJsonFile(join(iterationDir, "conditions.json"), {
|
|
282
|
-
mode: "new-skill",
|
|
283
|
-
conditions: [
|
|
284
|
-
{ name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
|
|
285
|
-
{ name: "without_skill", skill_path: null },
|
|
286
|
-
],
|
|
287
|
-
timestamp: new Date().toISOString(),
|
|
288
|
-
harness: "claude-code",
|
|
289
|
-
});
|
|
290
|
-
|
|
291
|
-
for (const cond of ["with_skill", "without_skill"]) {
|
|
292
|
-
const condDir = join(iterationDir, "eval-pos-eval", cond);
|
|
293
|
-
mkdirSync(condDir, { recursive: true });
|
|
294
|
-
writeJsonFile(join(condDir, "run.json"), {
|
|
295
|
-
eval_id: "pos-eval",
|
|
296
|
-
condition: cond,
|
|
297
|
-
skill_path: cond === "with_skill" ? join(skillSub, "SKILL.md") : null,
|
|
298
|
-
prompt: "p",
|
|
299
|
-
files: [],
|
|
300
|
-
final_message: "done",
|
|
301
|
-
tool_invocations: [],
|
|
302
|
-
total_tokens: 100,
|
|
303
|
-
duration_ms: 1000,
|
|
304
|
-
});
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
const res = Bun.spawnSync(
|
|
308
|
-
[
|
|
309
|
-
"bun",
|
|
310
|
-
"run",
|
|
311
|
-
GRADE_TS,
|
|
312
|
-
"--skill-dir",
|
|
313
|
-
skillDir,
|
|
314
|
-
"--skill",
|
|
315
|
-
skill,
|
|
316
|
-
"--iteration",
|
|
317
|
-
"1",
|
|
318
|
-
],
|
|
319
|
-
{ cwd, stdout: "pipe", stderr: "pipe" },
|
|
320
|
-
);
|
|
321
|
-
expect(res.exitCode).toBe(0);
|
|
322
|
-
|
|
323
|
-
const tasks = JSON.parse(
|
|
324
|
-
readFileSync(join(iterationDir, "judge-tasks.json"), "utf8"),
|
|
325
|
-
) as {
|
|
326
|
-
tasks: Array<{
|
|
327
|
-
assertion_id: string;
|
|
328
|
-
response_path: string;
|
|
329
|
-
dispatch_prompt?: string;
|
|
330
|
-
dispatch_prompt_path: string;
|
|
331
|
-
}>;
|
|
332
|
-
};
|
|
333
|
-
|
|
334
|
-
expect(tasks.tasks.length).toBeGreaterThan(0);
|
|
335
|
-
for (const t of tasks.tasks) {
|
|
336
|
-
// Nothing inlined; the orchestrator reads the prompt from a file.
|
|
337
|
-
expect(t.dispatch_prompt).toBeUndefined();
|
|
338
|
-
expect(t.dispatch_prompt_path.endsWith(`${t.assertion_id}.txt`)).toBe(
|
|
339
|
-
true,
|
|
340
|
-
);
|
|
341
|
-
expect(existsSync(t.dispatch_prompt_path)).toBe(true);
|
|
342
|
-
const contents = readFileSync(t.dispatch_prompt_path, "utf8");
|
|
343
|
-
// The judge still learns where to write its verdict from the prompt text.
|
|
344
|
-
expect(contents).toContain(t.response_path);
|
|
345
|
-
}
|
|
346
|
-
});
|
|
347
|
-
});
|