@forwardimpact/libeval 0.1.32 → 0.1.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/judge.js ADDED
@@ -0,0 +1,211 @@
1
+ /**
2
+ * Judge — one agent session that inspects a completed agent's work and emits
3
+ * a verdict via the orchestration `Conclude` tool. Parallel concept to
4
+ * `Supervisor` and `Facilitator`, but post-hoc and solo: no peer agents,
5
+ * no message bus, no relay loop. The judge reads the task, optionally
6
+ * inspects the working directory and trace via read-only tools, and calls
7
+ * Conclude exactly once.
8
+ *
9
+ * Trace lines are tagged `source: "judge"` so consumers can distinguish
10
+ * judge sessions from supervisor or facilitator sessions in a unified
11
+ * NDJSON envelope.
12
+ *
13
+ * Follows OO+DI: constructor injection, factory function, tests bypass factory.
14
+ */
15
+
16
+ import { resolve } from "node:path";
17
+ import { Writable } from "node:stream";
18
+
19
+ import { createAgentRunner } from "./agent-runner.js";
20
+ import { composeProfilePrompt } from "./profile-prompt.js";
21
+ import { SequenceCounter } from "./sequence-counter.js";
22
+ import {
23
+ createJudgeToolServer,
24
+ createOrchestrationContext,
25
+ } from "./orchestration-toolkit.js";
26
+
27
/**
 * System-prompt trailer for the judge's main thread. `createJudge` always
 * applies it: as the `append` of the `claude_code` preset when no profile is
 * given, or as the `trailer` passed to `composeProfilePrompt` when a
 * `judgeProfile` is supplied.
 */
export const JUDGE_SYSTEM_PROMPT =
  "You are a post-hoc judge for an agent task benchmark. " +
  "The agent has already completed its work and an objective scoring step has already run; your role is to confirm or override the verdict by inspecting the agent's working directory and trace. " +
  "You have read-only inspection tools — Read, Glob, Grep, Bash — to investigate; do not modify the working directory. " +
  "Conclude ends the session with a verdict ('success' or 'failure') and a one-paragraph summary; verdict='success' iff the agent's work meets the criteria stated in the task. " +
  "Call Conclude as your final action — do not deliberate across multiple turns.";

/** Tool names given to the judge when the caller supplies no `allowedTools`. */
const DEFAULT_JUDGE_ALLOWED_TOOLS = ["Read", "Glob", "Grep", "Bash"];

/**
 * Sink stream that acknowledges every chunk and discards it. `createJudge`
 * passes it as the runner's `output`; the judge's trace is written through the
 * `onLine` callback instead.
 */
const devNull = new Writable({
  write(_chunk, _enc, cb) {
    // Nothing is stored; signal completion immediately so writers never stall.
    cb();
  },
});
47
+
48
/**
 * Run a single post-hoc judge session and report the verdict recorded on the
 * orchestration context by the Conclude tool.
 */
export class Judge {
  /**
   * @param {object} deps
   * @param {import("./agent-runner.js").AgentRunner} deps.runner - The judge's AgentRunner.
   * @param {import("stream").Writable} deps.output - Stream to emit tagged NDJSON to.
   * @param {object} deps.ctx - Orchestration context; `concluded`, `verdict` and `summary` are read from it after the run.
   * @param {import("./redaction.js").Redactor} deps.redactor - Applied to every envelope before it is serialised.
   * @param {string} [deps.taskAmend] - Opaque addendum appended to the task before delivery.
   * @throws {Error} When `runner`, `output`, `ctx` or `redactor` is missing.
   */
  constructor({ runner, output, ctx, redactor, taskAmend }) {
    if (!runner) throw new Error("runner is required");
    if (!output) throw new Error("output is required");
    if (!ctx) throw new Error("ctx is required");
    if (!redactor) throw new Error("redactor is required");
    this.runner = runner;
    this.output = output;
    this.ctx = ctx;
    this.redactor = redactor;
    this.taskAmend = taskAmend ?? null;
    // One counter per judge: trace lines and the final summary share a sequence.
    this.counter = new SequenceCounter();
  }

  /**
   * Run the judge session and emit a final summary line.
   * @param {string} task - The judge prompt (with placeholders already substituted).
   * @returns {Promise<{success: boolean, verdict: string|null, summary: string|null, turns: number}>}
   */
  async run(task) {
    // The amendment, when present, is separated from the task by a blank line.
    const fullTask = this.taskAmend ? `${task}\n\n${this.taskAmend}` : task;
    const result = await this.runner.run(fullTask);

    if (this.ctx.concluded) {
      const success = this.ctx.verdict === "success";
      const outcome = {
        success,
        verdict: this.ctx.verdict,
        summary: this.ctx.summary ?? null,
        turns: 1,
      };
      this.emitSummary(outcome);
      return outcome;
    }

    // The judge ended without calling Conclude. Surface that explicitly so
    // callers can distinguish "judge said fail" from "judge never voted."
    const outcome = {
      success: false,
      verdict: null,
      summary: null,
      turns: result.success ? 1 : 0,
    };
    this.emitSummary(outcome);
    return outcome;
  }

  /**
   * Tag a single NDJSON line with `source: "judge"` and emit it to the
   * judge's output stream. Wired into the underlying AgentRunner via the
   * `onLine` callback so the judge's stream is the single source of truth
   * for the session's trace.
   * @param {string} line - One JSON document; `JSON.parse` throws if it is not valid JSON.
   */
  emitLine(line) {
    const event = JSON.parse(line);
    const tagged = { source: "judge", seq: this.counter.next(), event };
    this.output.write(JSON.stringify(this.redactor.redactValue(tagged)) + "\n");
  }

  /**
   * Emit a final orchestrator summary line, wrapped in the universal envelope.
   * `verdict` and `summary` are included only when truthy.
   * @param {{success: boolean, verdict?: string|null, summary?: string|null, turns: number}} result
   */
  emitSummary(result) {
    this.output.write(
      JSON.stringify(
        this.redactor.redactValue({
          source: "orchestrator",
          seq: this.counter.next(),
          event: {
            type: "summary",
            success: result.success,
            // Spreading a falsy value adds no keys, so absent fields are omitted.
            ...(result.verdict && { verdict: result.verdict }),
            turns: result.turns,
            ...(result.summary && { summary: result.summary }),
          },
        }),
      ) + "\n",
    );
  }
}
139
+
140
/**
 * Factory function — wires the AgentRunner with the judge orchestration server
 * and the JUDGE_SYSTEM_PROMPT trailer. A `judgeProfile` (when supplied) layers
 * on top of the trailer via `composeProfilePrompt`; otherwise the trailer is
 * appended to the `claude_code` preset.
 *
 * @param {object} deps
 * @param {string} deps.cwd - Judge working directory. Required; also the base of the default `profilesDir`.
 * @param {function} deps.query - SDK query function (injected for testing). Required.
 * @param {import("stream").Writable} deps.output - Trace output stream. Required.
 * @param {import("./redaction.js").Redactor} deps.redactor - Required.
 * @param {string} [deps.model]
 * @param {number} [deps.maxTurns] - Default 5 (the judge is expected to act in turn 1; 5 leaves headroom for tool inspection).
 * @param {string[]} [deps.allowedTools] - Default `["Read","Glob","Grep","Bash"]`. A caller-supplied array is passed through as-is.
 * @param {string} [deps.judgeProfile] - Profile name; resolved into the system prompt via `composeProfilePrompt`.
 * @param {string} [deps.profilesDir] - Defaults to `<cwd>/.claude/agents`.
 * @param {string} [deps.taskAmend]
 * @returns {Judge}
 * @throws {Error} When `cwd`, `query`, `output` or `redactor` is missing.
 */
export function createJudge({
  cwd,
  query,
  output,
  redactor,
  model,
  maxTurns,
  allowedTools,
  judgeProfile,
  profilesDir,
  taskAmend,
}) {
  if (!cwd) throw new Error("cwd is required");
  if (!query) throw new Error("query is required");
  if (!output) throw new Error("output is required");
  if (!redactor) throw new Error("redactor is required");

  const resolvedProfilesDir = profilesDir ?? resolve(cwd, ".claude/agents");
  const systemPrompt = judgeProfile
    ? composeProfilePrompt(judgeProfile, {
        profilesDir: resolvedProfilesDir,
        trailer: JUDGE_SYSTEM_PROMPT,
      })
    : {
        type: "preset",
        preset: "claude_code",
        append: JUDGE_SYSTEM_PROMPT,
      };

  const ctx = createOrchestrationContext();
  ctx.participants = [{ name: "judge", role: "judge" }];
  const judgeServer = createJudgeToolServer(ctx);

  // The runner needs `onLine` before the Judge exists, and the Judge needs the
  // runner. The closure reads `judge` only when a line arrives, by which time
  // it has been assigned below.
  let judge;
  const onLine = (line) => judge.emitLine(line);

  const runner = createAgentRunner({
    cwd,
    query,
    // The runner's own output is discarded; the trace flows through `onLine`.
    output: devNull,
    model,
    maxTurns: maxTurns ?? 5,
    // Hand out a fresh copy of the defaults so nothing downstream can mutate
    // the module-level array shared by every judge.
    allowedTools: allowedTools ?? [...DEFAULT_JUDGE_ALLOWED_TOOLS],
    onLine,
    settingSources: ["project"],
    systemPrompt,
    mcpServers: { orchestration: judgeServer },
    redactor,
  });

  judge = new Judge({ runner, output, ctx, redactor, taskAmend });
  return judge;
}
@@ -279,6 +279,31 @@ export function createSupervisedAgentToolServer(ctx) {
279
279
  });
280
280
  }
281
281
 
282
/**
 * Judge tools: Conclude only.
 *
 * The judge runs a single post-hoc session with no peer participants —
 * Ask/Answer/Announce/Redirect/RollCall are all moot. The judge inspects
 * the agent's working directory and trace via the host's read-only tools
 * and emits its verdict via Conclude.
 *
 * @param {object} ctx - Orchestration context, handed to `createConcludeHandler`.
 * @returns {object} The value returned by `createSdkMcpServer` for a server
 *   named "orchestration" exposing the single `Conclude` tool.
 */
export function createJudgeToolServer(ctx) {
  return createSdkMcpServer({
    name: "orchestration",
    tools: [
      tool(
        "Conclude",
        "End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
        // Input schema: a two-valued verdict plus a free-text summary.
        { verdict: z.enum(["success", "failure"]), summary: z.string() },
        createConcludeHandler(ctx),
      ),
    ],
  });
}
306
+
282
307
  /**
283
308
  * Facilitator tools: Ask + Announce + Conclude + RollCall.
284
309
  *