@forwardimpact/libeval 0.1.31 → 0.1.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -0
- package/bin/fit-benchmark.js +167 -0
- package/package.json +5 -3
- package/src/agent-runner.js +7 -1
- package/src/benchmark/apm-installer.js +39 -0
- package/src/benchmark/judge.js +146 -0
- package/src/benchmark/report.js +161 -0
- package/src/benchmark/result.js +108 -0
- package/src/benchmark/runner.js +396 -0
- package/src/benchmark/scorer.js +138 -0
- package/src/benchmark/task-family.js +259 -0
- package/src/benchmark/workdir.js +248 -0
- package/src/commands/benchmark-report.js +39 -0
- package/src/commands/benchmark-run.js +53 -0
- package/src/commands/benchmark-score.js +68 -0
- package/src/commands/facilitate.js +7 -0
- package/src/commands/run.js +9 -3
- package/src/commands/supervise.js +7 -0
- package/src/facilitator.js +35 -21
- package/src/index.js +9 -0
- package/src/judge.js +211 -0
- package/src/orchestration-toolkit.js +25 -0
- package/src/redaction.js +163 -0
- package/src/supervisor.js +29 -17
package/src/commands/run.js
CHANGED
|
@@ -3,6 +3,7 @@ import { Writable } from "node:stream";
|
|
|
3
3
|
import { resolve } from "node:path";
|
|
4
4
|
import { createAgentRunner } from "../agent-runner.js";
|
|
5
5
|
import { composeProfilePrompt } from "../profile-prompt.js";
|
|
6
|
+
import { createRedactor } from "../redaction.js";
|
|
6
7
|
import { createTeeWriter } from "../tee-writer.js";
|
|
7
8
|
import { SequenceCounter } from "../sequence-counter.js";
|
|
8
9
|
import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
@@ -61,6 +62,11 @@ export async function runRunCommand(values, _args) {
|
|
|
61
62
|
mcpServer,
|
|
62
63
|
} = parseRunOptions(values);
|
|
63
64
|
|
|
65
|
+
// Build the redactor as the first observable side-effect after option
|
|
66
|
+
// parsing — the env snapshot must freeze BEFORE any in-process
|
|
67
|
+
// process.env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
|
|
68
|
+
const redactor = createRedactor();
|
|
69
|
+
|
|
64
70
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
65
71
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
66
72
|
const fileStream = outputPath ? createWriteStream(outputPath) : null;
|
|
@@ -76,9 +82,8 @@ export async function runRunCommand(values, _args) {
|
|
|
76
82
|
});
|
|
77
83
|
const onLine = (line) => {
|
|
78
84
|
const event = JSON.parse(line);
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
);
|
|
85
|
+
const tagged = { source: "agent", seq: counter.next(), event };
|
|
86
|
+
output.write(JSON.stringify(redactor.redactValue(tagged)) + "\n");
|
|
82
87
|
};
|
|
83
88
|
|
|
84
89
|
let mcpServers = null;
|
|
@@ -117,6 +122,7 @@ export async function runRunCommand(values, _args) {
|
|
|
117
122
|
systemPrompt,
|
|
118
123
|
taskAmend,
|
|
119
124
|
mcpServers,
|
|
125
|
+
redactor,
|
|
120
126
|
});
|
|
121
127
|
|
|
122
128
|
const result = await runner.run(taskContent);
|
|
@@ -2,6 +2,7 @@ import { readFileSync, createWriteStream, mkdtempSync } from "node:fs";
|
|
|
2
2
|
import { resolve, join } from "node:path";
|
|
3
3
|
import { tmpdir } from "node:os";
|
|
4
4
|
import { createSupervisor } from "../supervisor.js";
|
|
5
|
+
import { createRedactor } from "../redaction.js";
|
|
5
6
|
import { createTeeWriter } from "../tee-writer.js";
|
|
6
7
|
import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
7
8
|
|
|
@@ -60,6 +61,11 @@ function parseSuperviseOptions(values) {
|
|
|
60
61
|
export async function runSuperviseCommand(values, _args) {
|
|
61
62
|
const opts = parseSuperviseOptions(values);
|
|
62
63
|
|
|
64
|
+
// Build the redactor as the first observable side-effect after option
|
|
65
|
+
// parsing — the env snapshot must freeze BEFORE any in-process
|
|
66
|
+
// process.env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
|
|
67
|
+
const redactor = createRedactor();
|
|
68
|
+
|
|
63
69
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
64
70
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
65
71
|
const fileStream = opts.outputPath
|
|
@@ -104,6 +110,7 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
104
110
|
agentProfile: opts.agentProfile,
|
|
105
111
|
taskAmend: opts.taskAmend,
|
|
106
112
|
agentMcpServers,
|
|
113
|
+
redactor,
|
|
107
114
|
});
|
|
108
115
|
|
|
109
116
|
const result = await supervisor.run(opts.taskContent);
|
package/src/facilitator.js
CHANGED
|
@@ -59,7 +59,10 @@ export class Facilitator {
|
|
|
59
59
|
ctx,
|
|
60
60
|
eventQueue,
|
|
61
61
|
taskAmend,
|
|
62
|
+
redactor,
|
|
62
63
|
}) {
|
|
64
|
+
if (!redactor) throw new Error("redactor is required");
|
|
65
|
+
this.redactor = redactor;
|
|
63
66
|
this.facilitatorRunner = facilitatorRunner;
|
|
64
67
|
this.agents = agents;
|
|
65
68
|
this.messageBus = messageBus;
|
|
@@ -327,11 +330,13 @@ export class Facilitator {
|
|
|
327
330
|
emitLine(source, line) {
|
|
328
331
|
const event = JSON.parse(line);
|
|
329
332
|
this.output.write(
|
|
330
|
-
JSON.stringify(
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
333
|
+
JSON.stringify(
|
|
334
|
+
this.redactor.redactValue({
|
|
335
|
+
source,
|
|
336
|
+
seq: this.counter.next(),
|
|
337
|
+
event,
|
|
338
|
+
}),
|
|
339
|
+
) + "\n",
|
|
335
340
|
);
|
|
336
341
|
}
|
|
337
342
|
|
|
@@ -340,11 +345,13 @@ export class Facilitator {
|
|
|
340
345
|
*/
|
|
341
346
|
emitOrchestratorEvent(event) {
|
|
342
347
|
this.output.write(
|
|
343
|
-
JSON.stringify(
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
+
JSON.stringify(
|
|
349
|
+
this.redactor.redactValue({
|
|
350
|
+
source: "orchestrator",
|
|
351
|
+
seq: this.counter.next(),
|
|
352
|
+
event,
|
|
353
|
+
}),
|
|
354
|
+
) + "\n",
|
|
348
355
|
);
|
|
349
356
|
}
|
|
350
357
|
|
|
@@ -353,17 +360,19 @@ export class Facilitator {
|
|
|
353
360
|
*/
|
|
354
361
|
emitSummary(result) {
|
|
355
362
|
this.output.write(
|
|
356
|
-
JSON.stringify(
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
363
|
+
JSON.stringify(
|
|
364
|
+
this.redactor.redactValue({
|
|
365
|
+
source: "orchestrator",
|
|
366
|
+
seq: this.counter.next(),
|
|
367
|
+
event: {
|
|
368
|
+
type: "summary",
|
|
369
|
+
success: result.success,
|
|
370
|
+
...(result.verdict && { verdict: result.verdict }),
|
|
371
|
+
turns: result.turns,
|
|
372
|
+
...(result.summary && { summary: result.summary }),
|
|
373
|
+
},
|
|
374
|
+
}),
|
|
375
|
+
) + "\n",
|
|
367
376
|
);
|
|
368
377
|
}
|
|
369
378
|
}
|
|
@@ -398,7 +407,9 @@ export function createFacilitator({
|
|
|
398
407
|
facilitatorProfile,
|
|
399
408
|
profilesDir,
|
|
400
409
|
taskAmend,
|
|
410
|
+
redactor,
|
|
401
411
|
}) {
|
|
412
|
+
if (!redactor) throw new Error("redactor is required");
|
|
402
413
|
const resolvedProfilesDir =
|
|
403
414
|
profilesDir ?? resolve(facilitatorCwd, ".claude/agents");
|
|
404
415
|
const systemPromptFor = (profile, trailer) => {
|
|
@@ -446,6 +457,7 @@ export function createFacilitator({
|
|
|
446
457
|
mcpServers: { orchestration: agentServer },
|
|
447
458
|
settingSources: ["project"],
|
|
448
459
|
systemPrompt: systemPromptFor(config.agentProfile, agentTrailer),
|
|
460
|
+
redactor,
|
|
449
461
|
});
|
|
450
462
|
|
|
451
463
|
return { name: config.name, role: config.role, runner };
|
|
@@ -464,6 +476,7 @@ export function createFacilitator({
|
|
|
464
476
|
facilitatorProfile,
|
|
465
477
|
FACILITATOR_SYSTEM_PROMPT,
|
|
466
478
|
),
|
|
479
|
+
redactor,
|
|
467
480
|
});
|
|
468
481
|
|
|
469
482
|
facilitator = new Facilitator({
|
|
@@ -475,6 +488,7 @@ export function createFacilitator({
|
|
|
475
488
|
ctx,
|
|
476
489
|
eventQueue,
|
|
477
490
|
taskAmend,
|
|
491
|
+
redactor,
|
|
478
492
|
});
|
|
479
493
|
return facilitator;
|
|
480
494
|
}
|
package/src/index.js
CHANGED
|
@@ -23,6 +23,7 @@ export {
|
|
|
23
23
|
createSupervisedAgentToolServer,
|
|
24
24
|
createFacilitatorToolServer,
|
|
25
25
|
createFacilitatedAgentToolServer,
|
|
26
|
+
createJudgeToolServer,
|
|
26
27
|
} from "./orchestration-toolkit.js";
|
|
27
28
|
export { MessageBus, createMessageBus } from "./message-bus.js";
|
|
28
29
|
export {
|
|
@@ -31,3 +32,11 @@ export {
|
|
|
31
32
|
FACILITATOR_SYSTEM_PROMPT,
|
|
32
33
|
FACILITATED_AGENT_SYSTEM_PROMPT,
|
|
33
34
|
} from "./facilitator.js";
|
|
35
|
+
export { Judge, createJudge, JUDGE_SYSTEM_PROMPT } from "./judge.js";
|
|
36
|
+
export {
|
|
37
|
+
Redactor,
|
|
38
|
+
createRedactor,
|
|
39
|
+
createNoopRedactor,
|
|
40
|
+
DEFAULT_ENV_ALLOWLIST,
|
|
41
|
+
DEFAULT_PATTERNS,
|
|
42
|
+
} from "./redaction.js";
|
package/src/judge.js
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Judge — one agent session that inspects a completed agent's work and emits
|
|
3
|
+
* a verdict via the orchestration `Conclude` tool. Parallel concept to
|
|
4
|
+
* `Supervisor` and `Facilitator`, but post-hoc and solo: no peer agents,
|
|
5
|
+
* no message bus, no relay loop. The judge reads the task, optionally
|
|
6
|
+
* inspects the working directory and trace via read-only tools, and calls
|
|
7
|
+
* Conclude exactly once.
|
|
8
|
+
*
|
|
9
|
+
* Trace lines are tagged `source: "judge"` so consumers can distinguish
|
|
10
|
+
* judge sessions from supervisor or facilitator sessions in a unified
|
|
11
|
+
* NDJSON envelope.
|
|
12
|
+
*
|
|
13
|
+
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { resolve } from "node:path";
|
|
17
|
+
import { Writable } from "node:stream";
|
|
18
|
+
|
|
19
|
+
import { createAgentRunner } from "./agent-runner.js";
|
|
20
|
+
import { composeProfilePrompt } from "./profile-prompt.js";
|
|
21
|
+
import { SequenceCounter } from "./sequence-counter.js";
|
|
22
|
+
import {
|
|
23
|
+
createJudgeToolServer,
|
|
24
|
+
createOrchestrationContext,
|
|
25
|
+
} from "./orchestration-toolkit.js";
|
|
26
|
+
|
|
27
|
+
/**
 * Trailer text for the judge's system prompt.
 *
 * `createJudge` in this module always supplies it: as the `trailer` option of
 * `composeProfilePrompt` when a `judgeProfile` is given, and as the `append`
 * field of the `claude_code` preset otherwise. The sentences are kept as
 * separate array entries so each instruction can be read (and diffed) on its
 * own; they are joined without a separator because every entry except the
 * last carries its own trailing space.
 */
export const JUDGE_SYSTEM_PROMPT = [
  "You are a post-hoc judge for an agent task benchmark. ",
  "The agent has already completed its work and an objective scoring step has already run; your role is to confirm or override the verdict by inspecting the agent's working directory and trace. ",
  "You have read-only inspection tools — Read, Glob, Grep, Bash — to investigate; do not modify the working directory. ",
  "Conclude ends the session with a verdict ('success' or 'failure') and a one-paragraph summary; verdict='success' iff the agent's work meets the criteria stated in the task. ",
  "Call Conclude as your final action — do not deliberate across multiple turns.",
].join("");
|
|
39
|
+
|
|
40
|
+
// Tool names handed to the judge's runner when the caller of createJudge
// does not pass `allowedTools`.
const DEFAULT_JUDGE_ALLOWED_TOOLS = ["Read", "Glob", "Grep", "Bash"];

// Sink stream: acknowledges every chunk immediately and keeps nothing.
// createJudge passes it as the runner's `output`, while the runner's lines
// are forwarded to the judge through the `onLine` callback instead.
const devNull = new Writable({
  write: (_chunk, _encoding, done) => done(),
});
|
|
47
|
+
|
|
48
|
+
/**
 * Judge — drives one agent session and reports the verdict that session
 * recorded on the orchestration context. Every line it writes is wrapped in
 * a `{ source, seq, event }` envelope and passed through the redactor first.
 */
export class Judge {
  /**
   * @param {object} deps
   * @param {import("./agent-runner.js").AgentRunner} deps.runner - Runner whose `run(task)` executes the judge session.
   * @param {import("stream").Writable} deps.output - Destination for the tagged NDJSON lines.
   * @param {object} deps.ctx - Orchestration context; `concluded`, `verdict` and `summary` are read from it after the run.
   * @param {import("./redaction.js").Redactor} deps.redactor - Applied to every envelope before it is serialised.
   * @param {string} [deps.taskAmend] - Optional text appended to the task, separated by a blank line.
   * @throws {Error} When `runner`, `output`, `ctx` or `redactor` is missing.
   */
  constructor({ runner, output, ctx, redactor, taskAmend }) {
    // Checked in this order so the first missing dependency is the one reported.
    const required = { runner, output, ctx, redactor };
    for (const [name, dependency] of Object.entries(required)) {
      if (!dependency) throw new Error(`${name} is required`);
    }

    this.runner = runner;
    this.output = output;
    this.ctx = ctx;
    this.redactor = redactor;
    this.taskAmend = taskAmend ?? null;
    this.counter = new SequenceCounter();
  }

  /**
   * Execute the session, emit the summary line and return the outcome.
   * @param {string} task - The judge prompt (with placeholders already substituted).
   * @returns {Promise<{success: boolean, verdict: string|null, summary: string|null, turns: number}>}
   */
  async run(task) {
    const prompt = this.taskAmend ? `${task}\n\n${this.taskAmend}` : task;
    const runResult = await this.runner.run(prompt);

    // A session that never concluded is reported with verdict=null so that
    // "judge said failure" and "judge never voted" stay distinguishable.
    const outcome = this.ctx.concluded
      ? {
          success: this.ctx.verdict === "success",
          verdict: this.ctx.verdict,
          summary: this.ctx.summary ?? null,
          turns: 1,
        }
      : {
          success: false,
          verdict: null,
          summary: null,
          turns: runResult.success ? 1 : 0,
        };

    this.emitSummary(outcome);
    return outcome;
  }

  /**
   * Parse one NDJSON line, wrap it as a `source: "judge"` envelope and write
   * it to the output stream. `createJudge` wires this in as the runner's
   * `onLine` callback.
   * @param {string} line - A single JSON document.
   */
  emitLine(line) {
    // Parse before taking a sequence number: a malformed line must not
    // consume one.
    const event = JSON.parse(line);
    const seq = this.counter.next();
    const envelope = this.redactor.redactValue({ source: "judge", seq, event });
    this.output.write(`${JSON.stringify(envelope)}\n`);
  }

  /**
   * Write the closing `type: "summary"` event as a `source: "orchestrator"`
   * envelope. `verdict` and `summary` are only included when truthy.
   * @param {{success: boolean, verdict?: string|null, summary?: string|null, turns: number}} result
   */
  emitSummary(result) {
    const seq = this.counter.next();

    // Keys are added in the order type, success, verdict, turns, summary so
    // the serialised line keeps a stable field order.
    const event = { type: "summary", success: result.success };
    if (result.verdict) event.verdict = result.verdict;
    event.turns = result.turns;
    if (result.summary) event.summary = result.summary;

    const envelope = this.redactor.redactValue({
      source: "orchestrator",
      seq,
      event,
    });
    this.output.write(`${JSON.stringify(envelope)}\n`);
  }
}
|
|
139
|
+
|
|
140
|
+
/**
 * Factory — builds the orchestration context, the judge tool server and the
 * AgentRunner, then returns a `Judge` wired to them. The system prompt always
 * carries `JUDGE_SYSTEM_PROMPT`: as the trailer of the composed profile prompt
 * when `judgeProfile` is given, otherwise appended to the `claude_code` preset.
 *
 * @param {object} deps
 * @param {string} deps.cwd - Judge working directory (required); also the base of the default `profilesDir`.
 * @param {function} deps.query - SDK query function (required; injected for testing).
 * @param {import("stream").Writable} deps.output - Trace output stream the Judge writes to (required).
 * @param {import("./redaction.js").Redactor} deps.redactor - Required; handed to both the runner and the Judge.
 * @param {string} [deps.model]
 * @param {number} [deps.maxTurns] - Defaults to 5.
 * @param {string[]} [deps.allowedTools] - Defaults to `["Read","Glob","Grep","Bash"]`.
 * @param {string} [deps.judgeProfile] - Profile name; resolved into the system prompt via `composeProfilePrompt`.
 * @param {string} [deps.profilesDir] - Defaults to `<cwd>/.claude/agents`.
 * @param {string} [deps.taskAmend]
 * @returns {Judge}
 * @throws {Error} When `cwd`, `query`, `output` or `redactor` is missing.
 */
export function createJudge({
  cwd,
  query,
  output,
  redactor,
  model,
  maxTurns,
  allowedTools,
  judgeProfile,
  profilesDir,
  taskAmend,
}) {
  // Checked in this order so the first missing dependency is the one reported.
  for (const [name, dependency] of Object.entries({
    cwd,
    query,
    output,
    redactor,
  })) {
    if (!dependency) throw new Error(`${name} is required`);
  }

  const resolvedProfilesDir = profilesDir ?? resolve(cwd, ".claude/agents");

  let systemPrompt;
  if (judgeProfile) {
    systemPrompt = composeProfilePrompt(judgeProfile, {
      profilesDir: resolvedProfilesDir,
      trailer: JUDGE_SYSTEM_PROMPT,
    });
  } else {
    systemPrompt = {
      type: "preset",
      preset: "claude_code",
      append: JUDGE_SYSTEM_PROMPT,
    };
  }

  // The judge is the only participant of its session.
  const ctx = createOrchestrationContext();
  ctx.participants = [{ name: "judge", role: "judge" }];
  const orchestration = createJudgeToolServer(ctx);

  // The runner needs its line callback before the Judge exists, so the
  // callback reads `judge` lazily; it is assigned right after the runner.
  let judge;
  const runner = createAgentRunner({
    cwd,
    query,
    // The runner's own output is discarded: lines reach the trace through
    // onLine -> judge.emitLine instead.
    output: devNull,
    model,
    maxTurns: maxTurns ?? 5,
    allowedTools: allowedTools ?? DEFAULT_JUDGE_ALLOWED_TOOLS,
    onLine: (line) => judge.emitLine(line),
    settingSources: ["project"],
    systemPrompt,
    mcpServers: { orchestration },
    redactor,
  });

  judge = new Judge({ runner, output, ctx, redactor, taskAmend });
  return judge;
}
|
|
@@ -279,6 +279,31 @@ export function createSupervisedAgentToolServer(ctx) {
|
|
|
279
279
|
});
|
|
280
280
|
}
|
|
281
281
|
|
|
282
|
+
/**
 * Judge tool server — exposes a single tool, `Conclude`.
 *
 * The server is registered under the name "orchestration". `Conclude` takes
 * a `verdict` ("success" or "failure") and a `summary` string, and is
 * handled by `createConcludeHandler(ctx)`.
 *
 * @param {object} ctx - Orchestration context passed to the Conclude handler
 * @returns {object} MCP server config (type: "sdk")
 */
export function createJudgeToolServer(ctx) {
  const concludeTool = tool(
    "Conclude",
    "End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
    { verdict: z.enum(["success", "failure"]), summary: z.string() },
    createConcludeHandler(ctx),
  );

  return createSdkMcpServer({
    name: "orchestration",
    tools: [concludeTool],
  });
}
|
|
306
|
+
|
|
282
307
|
/**
|
|
283
308
|
* Facilitator tools: Ask + Announce + Conclude + RollCall.
|
|
284
309
|
*
|
package/src/redaction.js
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Redactor — replaces secrets in JSON-serialisable values before they reach
|
|
3
|
+
* the trace artifact. Composes two layers: an env-var value allowlist and a
|
|
4
|
+
* set of credential-shape regexes. Both run on every primitive string.
|
|
5
|
+
*
|
|
6
|
+
* Stateless after construction: `env` is captured once so in-process
|
|
7
|
+
* `process.env` writes (e.g. agent-runner.js LIBEVAL_SKILL, commands/run.js
|
|
8
|
+
* LIBEVAL_AGENT_PROFILE) cannot smuggle a value past the redactor.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
// Names of the environment variables whose values are redacted when neither
// an explicit `allowlist` option nor LIBEVAL_REDACTION_ENV_VARS selects others.
// Frozen so the shared default cannot be edited by a caller.
const DEFAULT_ENV_NAMES = ["ANTHROPIC_API_KEY", "GH_TOKEN", "GITHUB_TOKEN"];
export const DEFAULT_ENV_ALLOWLIST = Object.freeze(DEFAULT_ENV_NAMES);
|
|
16
|
+
|
|
17
|
+
// Credential-shape regexes; every match is replaced by
// `[REDACTED:pattern:<kind>]`.
//
// GitHub prefixes per
// https://github.blog/security/application-security/behind-githubs-new-authentication-token-formats/
// (ghp_ personal, gho_ OAuth, ghu_ user-to-server, ghs_ server-to-server,
// ghr_ refresh). The body lengths are LOWER bounds, not exact counts: with an
// exact count followed by `\b`, a token that is even one character longer
// does not match at all and would pass through unredacted.
//
// Anthropic prefix is heuristic — the env-allowlist layer is the primary
// defence for Anthropic keys.
//
// All regexes carry the `g` flag so every occurrence in a string is replaced.
export const DEFAULT_PATTERNS = Object.freeze([
  { kind: "anthropic", regex: /sk-ant-[A-Za-z0-9_-]{80,}/g },
  { kind: "gh-pat", regex: /\bghp_[A-Za-z0-9]{36,}\b/g },
  { kind: "gh-installation", regex: /\bghs_[A-Za-z0-9]{36,}\b/g },
  { kind: "gh-oauth", regex: /\bgho_[A-Za-z0-9]{36,}\b/g },
  { kind: "gh-user-to-server", regex: /\bghu_[A-Za-z0-9]{36,}\b/g },
  { kind: "gh-refresh", regex: /\bghr_[A-Za-z0-9]{36,}\b/g },
  { kind: "gh-fine-grained", regex: /\bgithub_pat_[A-Za-z0-9_]{82,}\b/g },
]);
|
|
28
|
+
|
|
29
|
+
/** Marker written in place of an env-var secret; names the variable only. */
function ENV_PLACEHOLDER(name) {
  return `[REDACTED:env:${name}]`;
}

/** Marker written in place of a credential-pattern match; names the pattern kind. */
function PATTERN_PLACEHOLDER(kind) {
  return `[REDACTED:pattern:${kind}]`;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Capture the current values of the listed env vars into a frozen
 * { name → value } object.
 *
 * Only non-empty string values are kept: an empty secret would match at
 * every position of every string and turn the whole trace into placeholders.
 *
 * @param {Record<string, string|undefined>} env - Environment to read from.
 * @param {Iterable<string>} allowlist - Names of the variables to capture.
 * @returns {Readonly<Record<string, string>>}
 */
function snapshotEnv(env, allowlist) {
  const captured = {};
  for (const name of allowlist) {
    const value = env[name];
    const isUsableSecret = typeof value === "string" && value !== "";
    if (!isUsableSecret) continue;
    captured[name] = value;
  }
  return Object.freeze(captured);
}
|
|
45
|
+
|
|
46
|
+
/**
 * Deep-copy a JSON-serialisable value, passing every string VALUE through
 * `redactString`. The input is never mutated; arrays and plain objects are
 * rebuilt, everything else (numbers, booleans, null, undefined) is returned
 * as is. Object keys are copied unchanged — only values are redacted.
 *
 * @param {unknown} value - Value to copy.
 * @param {(s: string) => string} redactString - Applied to each string value.
 * @returns {unknown} The redacted copy.
 */
function walk(value, redactString) {
  if (typeof value === "string") return redactString(value);
  if (Array.isArray(value)) return value.map((v) => walk(v, redactString));
  if (value && typeof value === "object") {
    // Object.fromEntries defines own data properties. A plain `out[k] = v`
    // assignment would instead invoke the prototype setter for an own
    // "__proto__" key (which JSON.parse can produce), silently dropping that
    // key and its whole subtree from the copy.
    return Object.fromEntries(
      Object.entries(value).map(([k, v]) => [k, walk(v, redactString)]),
    );
  }
  return value;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Secret redactor — composes two layers that both run on every string of a
 * value: literal env-var secrets first, then credential-shape regexes.
 * Immutable after construction.
 */
export class Redactor {
  /** Env secrets as `[name, value]` pairs, longest value first. */
  #secrets;

  /** The configured patterns, each guaranteed to carry the global flag. */
  #globalPatterns;

  /**
   * @param {object} deps
   * @param {Readonly<Record<string, string>>} deps.envSnapshot - Frozen { name → secret } map captured at construction time.
   * @param {ReadonlyArray<{kind: string, regex: RegExp}>} deps.patterns - Credential-shape regexes; each match becomes `[REDACTED:pattern:KIND]`.
   * @param {boolean} deps.enabled - When false, `redactValue` returns its input by reference.
   */
  constructor({ envSnapshot, patterns, enabled }) {
    this.envSnapshot = envSnapshot;
    this.patterns = patterns;
    this.enabled = enabled;

    // Longest secret first: when one secret is a prefix/substring of another
    // (e.g. two tokens sharing a stem), replacing the short one first would
    // break up the long one and leave its remainder in the trace. Empty or
    // non-string values are dropped — an empty secret matches everywhere.
    this.#secrets = Object.entries(envSnapshot ?? {})
      .filter(([, secret]) => typeof secret === "string" && secret.length > 0)
      .sort(([, a], [, b]) => b.length - a.length);

    // Without the `g` flag String.prototype.replace only rewrites the first
    // match, so a second occurrence of the same credential would leak.
    this.#globalPatterns = Array.from(patterns ?? [], ({ kind, regex }) => ({
      kind,
      regex: regex.global ? regex : new RegExp(regex.source, `${regex.flags}g`),
    }));
  }

  /**
   * Redact any JSON-serialisable value by deep-walking and replacing secrets
   * in every primitive string. Identity on the input when disabled.
   * @param {unknown} value
   * @returns {unknown}
   */
  redactValue(value) {
    if (!this.enabled) return value;
    return walk(value, (s) => this.#redactString(s));
  }

  /**
   * Apply the env-secret layer and then the pattern layer to a single string.
   * @param {string} s
   * @returns {string}
   */
  #redactString(s) {
    let out = s;
    // Replacer FUNCTIONS are used in both loops so that `$&`, `$1`, … inside
    // a placeholder are emitted literally instead of being expanded.
    for (const [name, secret] of this.#secrets) {
      out = out.replaceAll(secret, () => ENV_PLACEHOLDER(name));
    }
    for (const { kind, regex } of this.#globalPatterns) {
      out = out.replace(regex, () => PATTERN_PLACEHOLDER(kind));
    }
    return out;
  }
}
|
|
101
|
+
|
|
102
|
+
/**
 * Build a redactor from an environment.
 *
 * Redaction is on unless `env.LIBEVAL_REDACTION_DISABLED` is exactly "1";
 * an explicit `enabled` option takes precedence over that variable. When
 * the result is disabled, a warning is written to stderr and the env
 * snapshot is left empty. Use `createNoopRedactor()` for a silent disabled
 * instance.
 *
 * @param {object} [opts]
 * @param {Record<string, string|undefined>} [opts.env] - Environment to snapshot. Defaults to `process.env`.
 * @param {string[]} [opts.allowlist] - Env-var names to redact. Defaults to what `resolveAllowlistFromEnv(env)` returns.
 * @param {ReadonlyArray<{kind: string, regex: RegExp}>} [opts.patterns] - Credential-shape regexes. Defaults to `DEFAULT_PATTERNS`.
 * @param {boolean} [opts.enabled] - Force enabled/disabled; bypasses `LIBEVAL_REDACTION_DISABLED`.
 * @returns {Redactor}
 */
export function createRedactor({
  env = process.env,
  allowlist,
  patterns = DEFAULT_PATTERNS,
  enabled,
} = {}) {
  const resolvedEnabled = enabled ?? (env.LIBEVAL_REDACTION_DISABLED !== "1");

  if (!resolvedEnabled) {
    process.stderr.write(
      "libeval: trace redaction DISABLED via LIBEVAL_REDACTION_DISABLED — secrets may appear in trace artifact\n",
    );
    // Nothing is captured for a disabled redactor.
    return new Redactor({
      envSnapshot: Object.freeze({}),
      patterns,
      enabled: resolvedEnabled,
    });
  }

  const names = allowlist ?? resolveAllowlistFromEnv(env);
  return new Redactor({
    envSnapshot: snapshotEnv(env, names),
    patterns,
    enabled: resolvedEnabled,
  });
}
|
|
133
|
+
|
|
134
|
+
/**
 * Parse `LIBEVAL_REDACTION_ENV_VARS` (comma-separated names) into a trimmed
 * name list.
 *
 * Falls back to `DEFAULT_ENV_ALLOWLIST` when the variable is unset, empty,
 * or contains no name at all after trimming (e.g. `","` or `"  "`). The last
 * case matters: returning an empty list there would silently switch off the
 * env-secret layer while redaction still looks enabled.
 *
 * @param {Record<string, string|undefined>} env
 * @returns {ReadonlyArray<string>}
 */
function resolveAllowlistFromEnv(env) {
  const override = env.LIBEVAL_REDACTION_ENV_VARS;
  if (typeof override !== "string") return DEFAULT_ENV_ALLOWLIST;

  const names = override
    .split(",")
    .map((s) => s.trim())
    .filter(Boolean);
  return names.length > 0 ? names : DEFAULT_ENV_ALLOWLIST;
}
|
|
150
|
+
|
|
151
|
+
/**
 * Build a disabled redactor: no env secrets, no patterns, `enabled: false`,
 * so `redactValue` hands its input back unchanged.
 *
 * Constructs `Redactor` directly rather than going through `createRedactor`,
 * so the environment is not consulted and nothing is written to stderr.
 * Intended for test fixtures.
 *
 * @returns {Redactor}
 */
export function createNoopRedactor() {
  const envSnapshot = Object.freeze({});
  const patterns = [];
  return new Redactor({ envSnapshot, patterns, enabled: false });
}
|