@forwardimpact/libeval 0.1.32 → 0.1.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +167 -0
- package/package.json +5 -3
- package/src/benchmark/apm-installer.js +39 -0
- package/src/benchmark/judge.js +146 -0
- package/src/benchmark/report.js +161 -0
- package/src/benchmark/result.js +108 -0
- package/src/benchmark/runner.js +396 -0
- package/src/benchmark/scorer.js +138 -0
- package/src/benchmark/task-family.js +259 -0
- package/src/benchmark/workdir.js +248 -0
- package/src/commands/benchmark-report.js +39 -0
- package/src/commands/benchmark-run.js +53 -0
- package/src/commands/benchmark-score.js +68 -0
- package/src/index.js +2 -0
- package/src/judge.js +211 -0
- package/src/orchestration-toolkit.js +25 -0
package/src/judge.js
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Judge — one agent session that inspects a completed agent's work and emits
|
|
3
|
+
* a verdict via the orchestration `Conclude` tool. Parallel concept to
|
|
4
|
+
* `Supervisor` and `Facilitator`, but post-hoc and solo: no peer agents,
|
|
5
|
+
* no message bus, no relay loop. The judge reads the task, optionally
|
|
6
|
+
* inspects the working directory and trace via read-only tools, and calls
|
|
7
|
+
* Conclude exactly once.
|
|
8
|
+
*
|
|
9
|
+
* Trace lines are tagged `source: "judge"` so consumers can distinguish
|
|
10
|
+
* judge sessions from supervisor or facilitator sessions in a unified
|
|
11
|
+
* NDJSON envelope.
|
|
12
|
+
*
|
|
13
|
+
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { resolve } from "node:path";
|
|
17
|
+
import { Writable } from "node:stream";
|
|
18
|
+
|
|
19
|
+
import { createAgentRunner } from "./agent-runner.js";
|
|
20
|
+
import { composeProfilePrompt } from "./profile-prompt.js";
|
|
21
|
+
import { SequenceCounter } from "./sequence-counter.js";
|
|
22
|
+
import {
|
|
23
|
+
createJudgeToolServer,
|
|
24
|
+
createOrchestrationContext,
|
|
25
|
+
} from "./orchestration-toolkit.js";
|
|
26
|
+
|
|
27
|
+
/**
 * Trailer appended to the judge's system prompt. It is applied whether or not
 * a `judgeProfile` is supplied; a profile is layered on top of it, mirroring
 * how `SUPERVISOR_SYSTEM_PROMPT` and `FACILITATOR_SYSTEM_PROMPT` are described
 * for their roles.
 *
 * Written as one sentence per entry and joined with a single space.
 */
export const JUDGE_SYSTEM_PROMPT = [
  "You are a post-hoc judge for an agent task benchmark.",
  "The agent has already completed its work and an objective scoring step has already run; your role is to confirm or override the verdict by inspecting the agent's working directory and trace.",
  "You have read-only inspection tools — Read, Glob, Grep, Bash — to investigate; do not modify the working directory.",
  "Conclude ends the session with a verdict ('success' or 'failure') and a one-paragraph summary; verdict='success' iff the agent's work meets the criteria stated in the task.",
  "Call Conclude as your final action — do not deliberate across multiple turns.",
].join(" ");

/** Tools the judge may use when the caller does not pass `allowedTools`. */
const DEFAULT_JUDGE_ALLOWED_TOOLS = ["Read", "Glob", "Grep", "Bash"];

/** Writable sink that acknowledges every chunk and discards it. */
const devNull = new Writable({
  write: (_chunk, _encoding, done) => done(),
});
|
|
47
|
+
|
|
48
|
+
/**
 * Post-hoc judge: drives one agent session through the injected runner and
 * converts whatever the orchestration context recorded into an outcome.
 */
export class Judge {
  /**
   * @param {object} deps
   * @param {import("./agent-runner.js").AgentRunner} deps.runner - AgentRunner that executes the judge session.
   * @param {import("stream").Writable} deps.output - Destination for the tagged NDJSON trace.
   * @param {object} deps.ctx - Orchestration context; `concluded`, `verdict` and `summary` are read from it after the session.
   * @param {import("./redaction.js").Redactor} deps.redactor - Applied to every envelope before it is written.
   * @param {string} [deps.taskAmend] - Optional text appended to the task, separated by a blank line.
   * @throws {Error} When `runner`, `output`, `ctx` or `redactor` is missing.
   */
  constructor({ runner, output, ctx, redactor, taskAmend }) {
    const required = { runner, output, ctx, redactor };
    // Checked in declaration order so the first missing dependency is reported.
    for (const [name, dep] of Object.entries(required)) {
      if (!dep) throw new Error(`${name} is required`);
    }
    Object.assign(this, required);
    this.taskAmend = taskAmend ?? null;
    this.counter = new SequenceCounter();
  }

  /**
   * Execute the judge session and report its outcome.
   *
   * When the context is marked concluded, the outcome mirrors the recorded
   * verdict. Otherwise the outcome carries `verdict: null`, so callers can
   * tell "judge said fail" apart from "judge never voted".
   *
   * @param {string} task - The judge prompt (with placeholders already substituted).
   * @returns {Promise<{success: boolean, verdict: string|null, summary: string|null, turns: number}>}
   */
  async run(task) {
    const prompt = this.taskAmend ? `${task}\n\n${this.taskAmend}` : task;
    const runResult = await this.runner.run(prompt);

    const outcome = this.ctx.concluded
      ? {
          success: this.ctx.verdict === "success",
          verdict: this.ctx.verdict,
          summary: this.ctx.summary ?? null,
          turns: 1,
        }
      : {
          success: false,
          verdict: null,
          summary: null,
          turns: runResult.success ? 1 : 0,
        };

    this.emitSummary(outcome);
    return outcome;
  }

  /**
   * Wrap one NDJSON line from the runner in a `source: "judge"` envelope and
   * write it to the judge's output stream.
   * @param {string} line - A single JSON-encoded event.
   */
  emitLine(line) {
    // Parse first: a malformed line must throw before a sequence number is consumed.
    const parsed = JSON.parse(line);
    const envelope = {
      source: "judge",
      seq: this.counter.next(),
      event: parsed,
    };
    const safe = this.redactor.redactValue(envelope);
    this.output.write(JSON.stringify(safe) + "\n");
  }

  /**
   * Write the closing `summary` event under `source: "orchestrator"`.
   * `verdict` and `summary` are included only when truthy.
   * @param {{success: boolean, verdict?: string|null, summary?: string|null, turns: number}} result
   */
  emitSummary(result) {
    // Keys are added in a fixed order so the serialized line layout is stable.
    const event = { type: "summary", success: result.success };
    if (result.verdict) event.verdict = result.verdict;
    event.turns = result.turns;
    if (result.summary) event.summary = result.summary;

    const envelope = this.redactor.redactValue({
      source: "orchestrator",
      seq: this.counter.next(),
      event,
    });
    this.output.write(JSON.stringify(envelope) + "\n");
  }
}
|
|
139
|
+
|
|
140
|
+
/**
|
|
141
|
+
* Factory function — wires the AgentRunner with the judge orchestration server
|
|
142
|
+
* and the JUDGE_SYSTEM_PROMPT trailer. A `judgeProfile` (when supplied) layers
|
|
143
|
+
* on top of the trailer via `composeProfilePrompt`, matching the
|
|
144
|
+
* supervisor/facilitator pattern.
|
|
145
|
+
*
|
|
146
|
+
* @param {object} deps
|
|
147
|
+
* @param {string} deps.cwd - Judge working directory (required; an error is thrown when missing). Also the base for the default `profilesDir`.
|
|
148
|
+
* @param {function} deps.query - SDK query function (injected for testing).
|
|
149
|
+
* @param {import("stream").Writable} deps.output - Trace output stream.
|
|
150
|
+
* @param {import("./redaction.js").Redactor} deps.redactor
|
|
151
|
+
* @param {string} [deps.model]
|
|
152
|
+
* @param {number} [deps.maxTurns] - Default 5 (the judge is expected to act in turn 1; 5 leaves headroom for tool inspection).
|
|
153
|
+
* @param {string[]} [deps.allowedTools] - Default `["Read","Glob","Grep","Bash"]` — read-only inspection.
|
|
154
|
+
* @param {string} [deps.judgeProfile] - Profile name; resolved into the system prompt via `composeProfilePrompt`.
|
|
155
|
+
* @param {string} [deps.profilesDir] - Defaults to `<cwd>/.claude/agents`.
|
|
156
|
+
* @param {string} [deps.taskAmend]
|
|
157
|
+
* @returns {Judge}
|
|
158
|
+
*/
|
|
159
|
+
export function createJudge({
  cwd,
  query,
  output,
  redactor,
  model,
  maxTurns,
  allowedTools,
  judgeProfile,
  profilesDir,
  taskAmend,
}) {
  // Mandatory dependencies, validated in declaration order.
  for (const [name, value] of Object.entries({ cwd, query, output, redactor })) {
    if (!value) throw new Error(`${name} is required`);
  }

  // Profiles live under `<cwd>/.claude/agents` unless the caller says otherwise.
  const agentsDir = profilesDir ?? resolve(cwd, ".claude/agents");

  // JUDGE_SYSTEM_PROMPT is always present: as the trailer of a composed
  // profile prompt, or appended to the `claude_code` preset.
  let systemPrompt;
  if (judgeProfile) {
    systemPrompt = composeProfilePrompt(judgeProfile, {
      profilesDir: agentsDir,
      trailer: JUDGE_SYSTEM_PROMPT,
    });
  } else {
    systemPrompt = {
      type: "preset",
      preset: "claude_code",
      append: JUDGE_SYSTEM_PROMPT,
    };
  }

  const ctx = createOrchestrationContext();
  ctx.participants = [{ name: "judge", role: "judge" }];
  const orchestration = createJudgeToolServer(ctx);

  // The runner needs `onLine` before the Judge exists, so the callback reads
  // the instance through a holder that is filled in right after construction.
  const holder = { judge: undefined };

  const runner = createAgentRunner({
    cwd,
    query,
    // The runner's own output is discarded; lines reach `output` via `onLine`.
    output: devNull,
    model,
    maxTurns: maxTurns ?? 5,
    allowedTools: allowedTools ?? DEFAULT_JUDGE_ALLOWED_TOOLS,
    onLine: (line) => holder.judge.emitLine(line),
    settingSources: ["project"],
    systemPrompt,
    mcpServers: { orchestration },
    redactor,
  });

  holder.judge = new Judge({ runner, output, ctx, redactor, taskAmend });
  return holder.judge;
}
|
|
@@ -279,6 +279,31 @@ export function createSupervisedAgentToolServer(ctx) {
|
|
|
279
279
|
});
|
|
280
280
|
}
|
|
281
281
|
|
|
282
|
+
/**
 * Build the judge's orchestration MCP server. It exposes one tool: Conclude.
 *
 * A judge session is solo and post-hoc, so the peer-oriented tools
 * (Ask/Answer/Announce/Redirect/RollCall) have nothing to act on. The judge
 * inspects the agent's working directory and trace with the host's read-only
 * tools and reports its verdict through Conclude.
 *
 * @param {object} ctx - Orchestration context handed to the Conclude handler.
 * @returns {object} MCP server config produced by `createSdkMcpServer`.
 */
export function createJudgeToolServer(ctx) {
  // Conclude accepts a two-valued verdict plus a free-text summary.
  const concludeInput = {
    verdict: z.enum(["success", "failure"]),
    summary: z.string(),
  };

  const concludeTool = tool(
    "Conclude",
    "End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
    concludeInput,
    createConcludeHandler(ctx),
  );

  return createSdkMcpServer({
    name: "orchestration",
    tools: [concludeTool],
  });
}
|
|
306
|
+
|
|
282
307
|
/**
|
|
283
308
|
* Facilitator tools: Ask + Announce + Conclude + RollCall.
|
|
284
309
|
*
|