@forwardimpact/libeval 0.1.31 → 0.1.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,396 @@
1
+ /**
2
+ * BenchmarkRunner — sole orchestrator for a task-family benchmark run.
3
+ *
4
+ * Phases per (task, runIndex):
5
+ * 1. WorkdirManager.start → seed CWD + run pre-flight probe
6
+ * 2. AgentRunner (bare; design Decision 14) → produce trace + submission
7
+ * 3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
8
+ * 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
9
+ * 5. WorkdirManager.teardown → process-group cleanup
10
+ *
11
+ * Results stream as an async iterable AND are appended to
12
+ * `<output>/results.jsonl` for durability. The two paths are different
13
+ * consumers of the same record — the iterator drives CLI stdout mirroring,
14
+ * the JSONL append is the system of record.
15
+ */
16
+
17
+ import { createReadStream, createWriteStream } from "node:fs";
18
+ import { access, constants, mkdir, readFile } from "node:fs/promises";
19
+ import { createInterface } from "node:readline";
20
+ import { join, resolve as resolvePath } from "node:path";
21
+
22
+ import { createAgentRunner } from "../agent-runner.js";
23
+ import { composeProfilePrompt } from "../profile-prompt.js";
24
+ import { createRedactor } from "../redaction.js";
25
+ import { AGENT_SYSTEM_PROMPT } from "../supervisor.js";
26
+ import { createTraceCollector } from "../trace-collector.js";
27
+ import { installApm } from "./apm-installer.js";
28
+ import { runJudge } from "./judge.js";
29
+ import { validateResultRecord } from "./result.js";
30
+ import { runScoring } from "./scorer.js";
31
+ import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
32
+ import { createWorkdirManager } from "./workdir.js";
33
+
34
+ // Tool allowlist passed as `allowedTools` to the default agent-under-test
+ // runner; also re-exported for tests as `__BASE_TOOLS`.
+ const BASE_TOOLS = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];
35
+
36
/**
 * Sole orchestrator for a task-family benchmark run.
 *
 * For every (task, runIndex) pair the runner starts a workdir, runs the
 * agent-under-test, scores the result, asks the judge, and tears the workdir
 * down again. Each resulting record is appended to `<output>/results.jsonl`
 * and then yielded from `run()`.
 */
export class BenchmarkRunner {
  /**
   * @param {object} opts
   * @param {import("./task-family.js").TaskFamily | string} opts.family
   * @param {number} opts.runs - Runs per task (≥ 1).
   * @param {string} opts.output - Run-output directory.
   * @param {string} opts.model
   * @param {{agent?: string, judge?: string}} [opts.profiles]
   * @param {Function} opts.query - SDK query (injected for testability).
   * @param {number} [opts.maxTurns] - Agent-under-test turn budget.
   * @param {number} [opts.termGraceMs] - SIGTERM→SIGKILL grace (ms) for the per-task process group.
   * @param {Function} [opts.runAgent] - Test seam: replaces the agent-under-test
   *   session. Must return `{costUsd, turns, submission, agentError?}` and
   *   write a valid NDJSON trace to `workdir.agentTracePath`. Default uses
   *   `createAgentRunner` with the harness `BASE_TOOLS` allowlist. Internal
   *   testing only — not part of the public API.
   * @param {Function} [opts.runScoring] - Test seam: replaces `runScoring`.
   *   Same contract as `runScoring(task, ctx)`. Internal testing only.
   * @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
   *   contract as `runJudge(task, workdir, scoring, deps)`. Internal testing
   *   only.
   * @throws {Error} When `family`, `output`, `model` or `query` is missing, or
   *   `runs` is not an integer ≥ 1.
   */
  constructor({
    family,
    runs,
    output,
    model,
    profiles,
    query,
    maxTurns,
    termGraceMs,
    // Test seams — default to the real implementations.
    runAgent,
    runScoring: runScoringHook,
    runJudge: runJudgeHook,
  }) {
    if (!family) throw new Error("family is required");
    if (!Number.isInteger(runs) || runs < 1)
      throw new Error("runs must be an integer ≥ 1");
    if (!output) throw new Error("output is required");
    if (!model) throw new Error("model is required");
    if (!query) throw new Error("query is required");
    this.familyInput = family;
    this.runs = runs;
    this.output = output;
    this.model = model;
    this.profiles = {
      agent: profiles?.agent ?? null,
      judge: profiles?.judge ?? null,
    };
    this.query = query;
    this.maxTurns = maxTurns;
    this.termGraceMs = termGraceMs;
    this._runAgentHook = runAgent ?? null;
    this._runScoringHook = runScoringHook ?? runScoring;
    this._runJudgeHook = runJudgeHook ?? runJudge;
  }

  /**
   * Yield one ResultRecord per (task, runIndex).
   *
   * Each record is appended to `<output>/results.jsonl` before it is yielded.
   * The results stream is always ended, including when the consumer stops
   * iterating early or an error propagates.
   *
   * @returns {AsyncGenerator<object>}
   */
  async *run() {
    const family =
      typeof this.familyInput === "string"
        ? await loadTaskFamily(this.familyInput)
        : this.familyInput;

    await mkdir(this.output, { recursive: true });
    const { stagingDir, skillSetHash } = await installApm(family, this.output);

    // Install-time gates: fail the whole family before any session starts.
    const tasks = family.tasks();
    for (const task of tasks) {
      await assertPreflightExecutable(task);
    }
    if (this.profiles.judge) {
      await assertJudgeProfileStaged(family, stagingDir, this.profiles.judge);
    }

    const wm = createWorkdirManager({
      stagingDir,
      runOutputDir: this.output,
      termGraceMs: this.termGraceMs,
    });

    const resultsPath = join(this.output, "results.jsonl");
    const resultsStream = createWriteStream(resultsPath, { flags: "a" });
    // Without a listener a stream 'error' is an uncaught exception that kills
    // the process. Remember it here; it reaches the caller either through the
    // failing write's callback or through the check after the loop.
    let streamError = null;
    resultsStream.on("error", (e) => {
      streamError ??= e;
    });
    try {
      for (const task of tasks) {
        for (let runIndex = 0; runIndex < this.runs; runIndex++) {
          const record = await this.#runOne(
            family,
            wm,
            task,
            runIndex,
            skillSetHash,
          );
          await writeRecord(resultsStream, record);
          yield record;
        }
      }
      // Covers failures no write reported (e.g. open failed with zero tasks).
      if (streamError) throw streamError;
    } finally {
      await new Promise((r) => resultsStream.end(r));
    }
  }

  /**
   * Execute a single (task, runIndex) and build its record. The workdir is
   * torn down afterwards regardless of outcome; teardown failures are
   * deliberately ignored (best-effort cleanup).
   */
  async #runOne(family, wm, task, runIndex, skillSetHash) {
    const t0 = Date.now();
    const workdir = await wm.start(task, runIndex);
    try {
      // A failed pre-flight probe short-circuits: no agent, scoring or judge.
      if (workdir.preflightError) {
        const record = this.#buildPreflightFailureRecord({
          task,
          runIndex,
          workdir,
          skillSetHash,
          familyRevision: family.familyRevision,
          durationMs: Date.now() - t0,
        });
        return this.#validateOrFallback(
          record,
          resultsRecordKey(task, runIndex),
        );
      }
      const { costUsd, turns, submission, agentError } =
        await this.#runAgentSafe(task, workdir);
      const scoring = await this._runScoringHook(task, {
        cwd: workdir.cwd,
        port: workdir.port,
        runDir: workdir.runDir,
      });
      const judgeVerdict = await this._runJudgeHook(task, workdir, scoring, {
        query: this.query,
        model: this.model,
        judgeProfile: this.profiles.judge ?? undefined,
      });
      const record = {
        taskId: task.id,
        runIndex,
        // Both the scorer and the judge must pass for an overall pass.
        verdict:
          scoring.verdict === "pass" && judgeVerdict.verdict === "pass"
            ? "pass"
            : "fail",
        scoring,
        submission,
        judgeVerdict,
        costUsd,
        turns,
        agentTracePath: workdir.agentTracePath,
        judgeTracePath: workdir.judgeTracePath,
        profiles: {
          agent: this.profiles.agent,
          supervisor: null,
          judge: this.profiles.judge,
        },
        model: this.model,
        skillSetHash,
        familyRevision: family.familyRevision,
        durationMs: Date.now() - t0,
        // Only present when the agent session actually failed.
        ...(agentError && { agentError }),
      };
      return this.#validateOrFallback(record, resultsRecordKey(task, runIndex));
    } finally {
      await wm.teardown(workdir).catch(() => {});
    }
  }

  /**
   * Dispatch to either the injected hook or the default `#runAgent`. Either
   * path can throw; catch here so a thrown error becomes an `agentError` on
   * the record (spec criterion 1: records on agent failure) rather than
   * aborting the whole iterator.
   */
  async #runAgentSafe(task, workdir) {
    try {
      if (this._runAgentHook) {
        const r = await this._runAgentHook(task, workdir, this);
        return { agentError: null, ...r };
      }
      return await this.#runAgent(task, workdir);
    } catch (e) {
      return {
        costUsd: 0,
        turns: 0,
        submission: "",
        // `e?.` guards against non-Error throws such as `throw null`, which
        // would otherwise raise a TypeError inside this handler.
        agentError: { message: e?.message ?? String(e), aborted: false },
      };
    }
  }

  /**
   * Run the agent-under-test as a bare AgentRunner (design Decision 14).
   * Recover cost/turns/submission from the trace by replaying it into a
   * fresh TraceCollector — the bare runner writes a single NDJSON stream
   * with one terminal `result` event.
   *
   * Inspects both thrown errors AND the resolved `{success, aborted, error}`
   * shape returned by `AgentRunner.run()`: a session can resolve with
   * `success: false`, so a try/catch alone would silently treat a failed
   * session as a successful one (plan Step 8.5.c).
   *
   * Set-up failures (prompt composition, runner construction, reading the
   * instructions) still propagate to the caller, but only after the trace
   * stream has been ended so it is never leaked.
   */
  async #runAgent(task, workdir) {
    const agentTraceStream = createWriteStream(workdir.agentTracePath);
    let agentError = null;
    try {
      const systemPrompt = this.profiles.agent
        ? composeProfilePrompt(this.profiles.agent, {
            profilesDir: resolvePath(workdir.cwd, ".claude/agents"),
            trailer: AGENT_SYSTEM_PROMPT,
          })
        : undefined;
      const runner = createAgentRunner({
        cwd: workdir.cwd,
        query: this.query,
        output: agentTraceStream,
        model: this.model,
        maxTurns: this.maxTurns ?? 50,
        allowedTools: BASE_TOOLS,
        settingSources: ["project"],
        systemPrompt,
        redactor: createRedactor(),
      });
      const instructions = await readFile(task.paths.instructions, "utf8");
      try {
        const result = await runner.run(instructions);
        if (!result.success) {
          agentError = {
            message:
              result.error?.message ??
              (result.aborted ? "aborted" : "agent did not succeed"),
            aborted: result.aborted ?? false,
          };
        }
      } catch (e) {
        agentError = { message: e?.message ?? String(e), aborted: false };
      }
    } finally {
      // Flush and close the trace before it is replayed (or before a set-up
      // error propagates).
      await new Promise((r) => agentTraceStream.end(r));
    }
    const summary = await readAgentSummary(workdir.agentTracePath);
    return { ...summary, agentError };
  }

  /**
   * Build the record emitted when the pre-flight probe failed: always a
   * "fail" verdict with zero cost and turns, carrying `preflightError`.
   */
  #buildPreflightFailureRecord({
    task,
    runIndex,
    workdir,
    skillSetHash,
    familyRevision,
    durationMs,
  }) {
    return {
      taskId: task.id,
      runIndex,
      verdict: "fail",
      costUsd: 0,
      turns: 0,
      preflightError: workdir.preflightError,
      profiles: {
        agent: this.profiles.agent,
        supervisor: null,
        judge: this.profiles.judge,
      },
      model: this.model,
      skillSetHash,
      familyRevision,
      durationMs,
      agentTracePath: workdir.agentTracePath,
      judgeTracePath: workdir.judgeTracePath,
    };
  }

  /**
   * Return `record` when it passes `validateResultRecord`; otherwise return a
   * minimal failing record that carries the validation message in
   * `schemaError`.
   */
  #validateOrFallback(record, key) {
    try {
      validateResultRecord(record);
      return record;
    } catch (e) {
      // The runner constructed the record — a schema failure is a real bug,
      // not bad family input. Emit a noisy fallback so the iterator stays
      // consumable and the agent budget isn't silently dropped.
      return {
        taskId: record.taskId ?? key.taskId,
        runIndex: record.runIndex ?? key.runIndex,
        verdict: "fail",
        schemaError: e?.message ?? String(e),
      };
    }
  }
}
325
+
326
/** Identify a record by its task id and run index. */
function resultsRecordKey(task, runIndex) {
  const taskId = task.id;
  return { taskId, runIndex };
}
329
+
330
/**
 * Serialise `record` as one JSON line and write it to `stream`. Resolves once
 * the stream's write callback fires; rejects with the error it reports.
 */
async function writeRecord(stream, record) {
  const payload = `${JSON.stringify(record)}\n`;
  await new Promise((resolve, reject) => {
    stream.write(payload, (error) => {
      if (error) {
        reject(error);
      } else {
        resolve();
      }
    });
  });
}
336
+
337
/**
 * Pre-flight install gate. Rejects if the task's preflight script
 * (`<task.paths.workdir>/scripts/preflight.sh`) is missing or not executable —
 * design § Pre-flight contract:
 * "The harness fails the family at install if any task's preflight script
 * is missing or non-executable, before any agent session starts."
 *
 * @param {{id: string, paths: {workdir: string}}} task
 * @returns {Promise<void>} Resolves when the script passes the X_OK check.
 * @throws {Error} Naming the task, the path and the failure code; the
 *   underlying filesystem error is attached as `cause`.
 */
async function assertPreflightExecutable(task) {
  const path = join(task.paths.workdir, "scripts", "preflight.sh");
  try {
    await access(path, constants.X_OK);
  } catch (e) {
    throw new Error(
      `task ${task.id}: preflight script not executable at ${path} (${e?.code ?? e?.message})`,
      { cause: e },
    );
  }
}
353
+
354
/**
 * Feed the agent trace file line by line into a fresh TraceCollector and
 * extract the cost, the turn count and the submission (the last assistant
 * text block). Non-numeric cost or turn values are reported as 0.
 */
async function readAgentSummary(tracePath) {
  const collector = createTraceCollector();
  const lines = createInterface({
    input: createReadStream(tracePath),
    crlfDelay: Infinity,
  });
  for await (const line of lines) {
    collector.addLine(line);
  }

  const trace = collector.toJSON();
  const { totalCostUsd, numTurns } = trace.summary ?? {};
  const numberOrZero = (value) => (typeof value === "number" ? value : 0);
  return {
    costUsd: numberOrZero(totalCostUsd),
    turns: numberOrZero(numTurns),
    submission: lastAssistantText(trace),
  };
}
372
+
373
/**
 * Walk the trace's turns from newest to oldest and return the last non-empty
 * text block of the most recent assistant turn that has one; "" when none.
 */
function lastAssistantText(json) {
  const turns = json.turns ?? [];
  let turnIdx = turns.length;
  while (turnIdx-- > 0) {
    const { role, content } = turns[turnIdx];
    if (role === "assistant") {
      const blocks = content ?? [];
      let blockIdx = blocks.length;
      while (blockIdx-- > 0) {
        const block = blocks[blockIdx];
        if (block.type === "text" && block.text) {
          return block.text;
        }
      }
    }
  }
  return "";
}
385
+
386
/**
 * Factory wrapper around the `BenchmarkRunner` constructor.
 * @param {ConstructorParameters<typeof BenchmarkRunner>[0]} opts - Forwarded
 *   unchanged to the constructor.
 * @returns {BenchmarkRunner} A newly constructed runner.
 */
export function createBenchmarkRunner(opts) {
  const runner = new BenchmarkRunner(opts);
  return runner;
}
394
+
395
+ // Internal exports used by tests. This re-exports the module-level
+ // BASE_TOOLS array itself (same reference, not a copy).
395
+ export const __BASE_TOOLS = BASE_TOOLS;
@@ -0,0 +1,138 @@
1
+ /**
2
+ * Scorer — runs `<task.paths.scoring>/run.sh` from the template path against
3
+ * the post-run agent CWD. The exit code is authoritative for the verdict;
4
+ * structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
5
+ *
6
+ * Scoring scripts are never copied into the agent CWD — they live only in the
7
+ * task template (design Decision 3).
8
+ */
9
+
10
+ import { spawn } from "node:child_process";
11
+ import {
12
+ closeSync,
13
+ createWriteStream,
14
+ openSync,
15
+ readFileSync,
16
+ unlinkSync,
17
+ } from "node:fs";
18
+ import { join } from "node:path";
19
+
20
+ /**
21
+ * @typedef {object} ScoringResult
22
+ * @property {"pass" | "fail"} verdict
23
+ * @property {Array<object>} details
24
+ * @property {number} exitCode
25
+ */
26
+
27
/**
 * Run the task's scoring script (`<task.paths.scoring>/run.sh`).
 *
 * The script receives `WORKDIR`, `PORT` and `RESULTS_FD=3` in its
 * environment. Its exit code alone decides the verdict (0 → "pass"); whatever
 * it writes to fd 3 is parsed as NDJSON into `details`. stderr is captured in
 * `<ctx.runDir>/scoring.stderr.log`; stdout is drained and discarded.
 *
 * @param {import("./task-family.js").Task} task
 * @param {{cwd: string, port: number, runDir: string}} ctx
 * @returns {Promise<ScoringResult>} Rejects when the fd-3 backing file cannot
 *   be opened or the script cannot be spawned.
 */
export function runScoring(task, ctx) {
  return new Promise((res, rej) => {
    const script = join(task.paths.scoring, "run.sh");

    // Bun's child_process pipe setup for fd >= 3 is racy under load (it
    // creates a unix socket pair and the connect() can return ENOENT). Use
    // a temp file as the fd-3 backing store instead — the script still
    // writes via `$RESULTS_FD`, but we hand it a real file descriptor.
    const fd3Path = join(ctx.runDir, "scoring.fd3.ndjson");
    let fd3File;
    try {
      fd3File = openSync(fd3Path, "w+");
    } catch (e) {
      rej(e);
      return;
    }

    // Close our copy of the descriptor exactly once: 'error' and 'close' can
    // both fire, and closing a stale number could hit an unrelated, reused fd.
    let fd3Open = true;
    const closeFd3 = () => {
      if (!fd3Open) return;
      fd3Open = false;
      tryClose(fd3File);
    };

    let child;
    try {
      child = spawn(script, [], {
        env: {
          ...process.env,
          WORKDIR: ctx.cwd,
          PORT: String(ctx.port),
          RESULTS_FD: "3",
        },
        stdio: ["inherit", "pipe", "pipe", fd3File],
      });
    } catch (e) {
      // spawn() can throw synchronously; do not leak the fd or temp file.
      closeFd3();
      readAndUnlink(fd3Path);
      rej(e);
      return;
    }

    // Register before the pid check: a failed spawn reports its reason via an
    // asynchronous 'error' event, and an unhandled 'error' event crashes the
    // process. Rejecting an already-settled promise is a no-op.
    child.on("error", (e) => {
      closeFd3();
      rej(e);
    });

    if (child.pid === undefined) {
      closeFd3();
      readAndUnlink(fd3Path);
      rej(new Error(`failed to spawn scoring script: ${script}`));
      return;
    }

    // Created only once the child exists, so no log stream is left open on
    // the failure paths above.
    const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
    child.stderr.pipe(stderrLog);
    // Drain stdout (do not require consumers to read it).
    child.stdout.on("data", () => {});

    child.on("close", (code) => {
      stderrLog.end();
      closeFd3();
      const raw = readAndUnlink(fd3Path);
      const details = [];
      parseFd3Buffer(raw, details);
      // A null code (e.g. killed by a signal) is reported as -1 → "fail".
      const exitCode = typeof code === "number" ? code : -1;
      res({
        verdict: exitCode === 0 ? "pass" : "fail",
        details,
        exitCode,
      });
    });
  });
}
93
+
94
/**
 * Append one fd-3 line to `details`: the parsed JSON value when the line is
 * valid JSON, otherwise a `{raw, parseError: true}` marker. Blank lines are
 * ignored.
 */
function pushRow(line, details) {
  const text = line.trim();
  if (text.length === 0) return;
  let row;
  try {
    row = JSON.parse(text);
  } catch {
    row = { raw: text, parseError: true };
  }
  details.push(row);
}
103
+
104
/** Close a file descriptor, ignoring any error `closeSync` raises. */
function tryClose(fd) {
  try {
    closeSync(fd);
  } catch (_ignored) {
    // Best-effort: the descriptor is usually already closed.
  }
}
111
+
112
/**
 * Read `path` as UTF-8 and then delete it. Both steps are best-effort: an
 * unreadable file yields "" and a failed delete is ignored.
 */
function readAndUnlink(path) {
  const attempt = (action, fallback) => {
    try {
      return action();
    } catch {
      return fallback;
    }
  };
  const contents = attempt(() => readFileSync(path, "utf8"), "");
  attempt(() => unlinkSync(path));
  return contents;
}
126
+
127
/**
 * Split the fd-3 capture (read back from its temp file) on newlines and hand
 * each line to `pushRow`. The final segment is forwarded only when it is not
 * blank, i.e. when the capture did not end with a newline.
 */
function parseFd3Buffer(buf, details) {
  if (!buf) return;
  buf.split("\n").forEach((segment, index, segments) => {
    const isFinal = index === segments.length - 1;
    if (!isFinal || segment.trim()) {
      pushRow(segment, details);
    }
  });
}