@forwardimpact/libeval 0.1.50 → 0.1.52
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/README.md +11 -8
- package/bin/fit-benchmark.js +26 -27
- package/bin/fit-eval.js +36 -30
- package/bin/fit-trace.js +83 -57
- package/package.json +1 -1
- package/src/agent-runner.js +20 -12
- package/src/benchmark/apm-installer.js +48 -44
- package/src/benchmark/env-loader.js +35 -23
- package/src/benchmark/invariants.js +128 -0
- package/src/benchmark/judge.js +18 -19
- package/src/benchmark/npm-installer.js +33 -33
- package/src/benchmark/report.js +40 -26
- package/src/benchmark/result.js +11 -11
- package/src/benchmark/runner.js +90 -46
- package/src/benchmark/task-family.js +78 -65
- package/src/benchmark/workdir.js +100 -93
- package/src/commands/assert.js +30 -22
- package/src/commands/benchmark-invariants.js +74 -0
- package/src/commands/benchmark-report.js +24 -15
- package/src/commands/benchmark-run.js +16 -9
- package/src/commands/by-discussion.js +33 -23
- package/src/commands/callback.js +20 -11
- package/src/commands/discuss.js +31 -13
- package/src/commands/facilitate.js +21 -14
- package/src/commands/output.js +15 -13
- package/src/commands/run.js +28 -14
- package/src/commands/supervise.js +29 -19
- package/src/commands/task-input.js +10 -5
- package/src/commands/tee.js +24 -9
- package/src/commands/trace.js +181 -99
- package/src/discuss-tools.js +48 -2
- package/src/discusser.js +53 -2
- package/src/events/github.js +27 -5
- package/src/facilitator.js +4 -0
- package/src/inbox-poller.js +84 -0
- package/src/judge.js +4 -1
- package/src/message-bus.js +6 -0
- package/src/orchestration-loop.js +14 -4
- package/src/orchestration-toolkit.js +14 -0
- package/src/profile-prompt.js +22 -9
- package/src/redaction.js +31 -9
- package/src/reply-emitter.js +47 -0
- package/src/supervisor.js +4 -0
- package/src/tee-writer.js +4 -2
- package/src/trace-collector.js +9 -2
- package/src/trace-github.js +47 -27
- package/src/benchmark/scorer.js +0 -138
- package/src/commands/benchmark-score.js +0 -68
package/src/benchmark/report.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* records by `taskId`, and compute pass@k via the OpenAI HumanEval
|
|
4
4
|
* unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
|
|
5
5
|
*
|
|
6
|
-
* When `includeRuns` is true, each task carries per-run detail (
|
|
6
|
+
* When `includeRuns` is true, each task carries per-run detail (invariant
|
|
7
7
|
* checks, judge commentary, cost, duration) and the text renderer produces
|
|
8
8
|
* a full markdown report instead of just the pass@k table.
|
|
9
9
|
*
|
|
@@ -12,9 +12,7 @@
|
|
|
12
12
|
* whole report.
|
|
13
13
|
*/
|
|
14
14
|
|
|
15
|
-
import { createReadStream } from "node:fs";
|
|
16
15
|
import { join } from "node:path";
|
|
17
|
-
import { createInterface } from "node:readline";
|
|
18
16
|
|
|
19
17
|
import { validateResultRecord } from "./result.js";
|
|
20
18
|
|
|
@@ -22,7 +20,7 @@ import { validateResultRecord } from "./result.js";
|
|
|
22
20
|
* @typedef {object} RunDetail
|
|
23
21
|
* @property {number} runIndex
|
|
24
22
|
* @property {"pass"|"fail"} verdict
|
|
25
|
-
* @property {{verdict: string, details: unknown[], exitCode: number}} [
|
|
23
|
+
* @property {{verdict: string, details: unknown[], exitCode: number}} [invariants]
|
|
26
24
|
* @property {{verdict: string, summary: string}} [judgeVerdict]
|
|
27
25
|
* @property {number} costUsd
|
|
28
26
|
* @property {number} turns
|
|
@@ -41,11 +39,17 @@ import { validateResultRecord } from "./result.js";
|
|
|
41
39
|
*/
|
|
42
40
|
|
|
43
41
|
/**
|
|
44
|
-
* @param {{inputDir: string, kValues: number[], includeRuns?: boolean}} opts
|
|
42
|
+
* @param {{inputDir: string, kValues: number[], includeRuns?: boolean, runtime: import("@forwardimpact/libutil/runtime").Runtime}} opts
|
|
45
43
|
* @returns {Promise<{tasks: TaskReport[], totals: object}>}
|
|
46
44
|
*/
|
|
47
|
-
export async function aggregate({
|
|
48
|
-
|
|
45
|
+
export async function aggregate({
|
|
46
|
+
inputDir,
|
|
47
|
+
kValues,
|
|
48
|
+
includeRuns = false,
|
|
49
|
+
runtime,
|
|
50
|
+
}) {
|
|
51
|
+
if (!runtime) throw new Error("runtime is required");
|
|
52
|
+
const records = await loadRecords(inputDir, runtime);
|
|
49
53
|
const grouped = groupByTask(records.records);
|
|
50
54
|
const tasks = [];
|
|
51
55
|
let totalRuns = 0;
|
|
@@ -112,7 +116,7 @@ function buildRunDetail(r, acc) {
|
|
|
112
116
|
return {
|
|
113
117
|
runIndex: r.runIndex,
|
|
114
118
|
verdict: r.verdict,
|
|
115
|
-
...(r.
|
|
119
|
+
...(r.invariants && { invariants: r.invariants }),
|
|
116
120
|
...(r.judgeVerdict && { judgeVerdict: r.judgeVerdict }),
|
|
117
121
|
costUsd: r.costUsd ?? 0,
|
|
118
122
|
turns: r.turns ?? 0,
|
|
@@ -262,7 +266,7 @@ function renderTaskDetail(task) {
|
|
|
262
266
|
|
|
263
267
|
lines.push("", renderRunsTable(runs));
|
|
264
268
|
|
|
265
|
-
const checks =
|
|
269
|
+
const checks = renderInvariantChecks(runs, singleRun);
|
|
266
270
|
if (checks) lines.push("", checks);
|
|
267
271
|
|
|
268
272
|
const commentary = renderJudgeCommentary(runs, singleRun);
|
|
@@ -278,7 +282,7 @@ function renderRunsTable(runs) {
|
|
|
278
282
|
const header = [
|
|
279
283
|
"Run",
|
|
280
284
|
"Verdict",
|
|
281
|
-
"
|
|
285
|
+
"Invariants",
|
|
282
286
|
"Judge",
|
|
283
287
|
"Cost",
|
|
284
288
|
"Turns",
|
|
@@ -286,10 +290,10 @@ function renderRunsTable(runs) {
|
|
|
286
290
|
];
|
|
287
291
|
const rows = [header, header.map(() => "---")];
|
|
288
292
|
for (const r of runs) {
|
|
289
|
-
const
|
|
293
|
+
const invariantsCell = r.preflightError
|
|
290
294
|
? "preflight error"
|
|
291
|
-
: r.
|
|
292
|
-
? statusIcon(r.
|
|
295
|
+
: r.invariants
|
|
296
|
+
? statusIcon(r.invariants.verdict === "pass")
|
|
293
297
|
: "—";
|
|
294
298
|
const judgeCell = r.preflightError
|
|
295
299
|
? "—"
|
|
@@ -299,7 +303,7 @@ function renderRunsTable(runs) {
|
|
|
299
303
|
rows.push([
|
|
300
304
|
String(r.runIndex),
|
|
301
305
|
statusIcon(r.verdict === "pass"),
|
|
302
|
-
|
|
306
|
+
invariantsCell,
|
|
303
307
|
judgeCell,
|
|
304
308
|
formatCost(r.costUsd),
|
|
305
309
|
String(r.turns),
|
|
@@ -309,15 +313,15 @@ function renderRunsTable(runs) {
|
|
|
309
313
|
return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
|
|
310
314
|
}
|
|
311
315
|
|
|
312
|
-
function
|
|
313
|
-
const rows =
|
|
316
|
+
function renderInvariantChecks(runs, singleRun) {
|
|
317
|
+
const rows = collectInvariantRows(runs);
|
|
314
318
|
if (!rows.length) return null;
|
|
315
319
|
|
|
316
320
|
const header = singleRun
|
|
317
321
|
? ["Check", "Result", "Message"]
|
|
318
322
|
: ["Run", "Check", "Result", "Message"];
|
|
319
323
|
const lines = [
|
|
320
|
-
"####
|
|
324
|
+
"#### Invariant Checks",
|
|
321
325
|
"",
|
|
322
326
|
`| ${header.join(" | ")} |`,
|
|
323
327
|
`| ${header.map(() => "---").join(" | ")} |`,
|
|
@@ -331,11 +335,11 @@ function renderScoringChecks(runs, singleRun) {
|
|
|
331
335
|
return lines.join("\n");
|
|
332
336
|
}
|
|
333
337
|
|
|
334
|
-
function
|
|
338
|
+
function collectInvariantRows(runs) {
|
|
335
339
|
const rows = [];
|
|
336
340
|
for (const r of runs) {
|
|
337
|
-
if (!r.
|
|
338
|
-
for (const d of r.
|
|
341
|
+
if (!r.invariants?.details?.length) continue;
|
|
342
|
+
for (const d of r.invariants.details) {
|
|
339
343
|
rows.push({
|
|
340
344
|
run: r.runIndex,
|
|
341
345
|
check: escapeCell(String(d.test ?? "(unnamed)")),
|
|
@@ -429,20 +433,30 @@ function median(arr) {
|
|
|
429
433
|
// Record loading
|
|
430
434
|
// ---------------------------------------------------------------------------
|
|
431
435
|
|
|
432
|
-
async function loadRecords(inputDir) {
|
|
436
|
+
async function loadRecords(inputDir, runtime) {
|
|
433
437
|
const path = join(inputDir, "results.jsonl");
|
|
434
|
-
|
|
435
|
-
|
|
438
|
+
let content;
|
|
439
|
+
try {
|
|
440
|
+
content = await runtime.fs.readFile(path, "utf8");
|
|
441
|
+
} catch (e) {
|
|
442
|
+
// Re-throw with the stack collapsed to the message line so the CLI's
|
|
443
|
+
// error rendering stays free of node-internal async `readFile` frames
|
|
444
|
+
// (matching the pre-1370 stream-error shape the golden captured).
|
|
445
|
+
const err = new Error(e.message);
|
|
446
|
+
if (e.code) err.code = e.code;
|
|
447
|
+
err.stack = `Error: ${e.message}`;
|
|
448
|
+
throw err;
|
|
449
|
+
}
|
|
436
450
|
const records = [];
|
|
437
451
|
let skipped = 0;
|
|
438
|
-
for
|
|
452
|
+
for (const line of content.split("\n")) {
|
|
439
453
|
const trimmed = line.trim();
|
|
440
454
|
if (!trimmed) continue;
|
|
441
455
|
let record;
|
|
442
456
|
try {
|
|
443
457
|
record = JSON.parse(trimmed);
|
|
444
458
|
} catch (e) {
|
|
445
|
-
|
|
459
|
+
runtime.proc.stderr.write(
|
|
446
460
|
`benchmark report: skipped malformed JSON line — ${e.message}\n`,
|
|
447
461
|
);
|
|
448
462
|
skipped++;
|
|
@@ -451,7 +465,7 @@ async function loadRecords(inputDir) {
|
|
|
451
465
|
try {
|
|
452
466
|
validateResultRecord(record);
|
|
453
467
|
} catch (e) {
|
|
454
|
-
|
|
468
|
+
runtime.proc.stderr.write(
|
|
455
469
|
`benchmark report: skipped record failing schema — ${describeError(e)}\n`,
|
|
456
470
|
);
|
|
457
471
|
skipped++;
|
package/src/benchmark/result.js
CHANGED
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Two schemas live here:
|
|
5
5
|
* - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
|
|
6
|
-
* benchmark run. Has a happy branch (
|
|
7
|
-
* pre-flight-failure branch (
|
|
8
|
-
* -
|
|
9
|
-
* ad-hoc grading without a full lifecycle.
|
|
6
|
+
* benchmark run. Has a happy branch (invariants + judge present) and a
|
|
7
|
+
* pre-flight-failure branch (invariants/judgeVerdict/submission absent).
|
|
8
|
+
* - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`
|
|
9
|
+
* (P7): ad-hoc grading without a full lifecycle.
|
|
10
10
|
*
|
|
11
11
|
* Validation is throw-on-mismatch so the runner can wrap every JSONL append
|
|
12
12
|
* in a guard and reject schema drift at write time.
|
|
@@ -16,7 +16,7 @@ import { z } from "zod";
|
|
|
16
16
|
|
|
17
17
|
const VERDICT_ENUM = z.enum(["pass", "fail"]);
|
|
18
18
|
|
|
19
|
-
const
|
|
19
|
+
const INVARIANTS_SHAPE = z.object({
|
|
20
20
|
verdict: VERDICT_ENUM,
|
|
21
21
|
details: z.array(z.unknown()),
|
|
22
22
|
exitCode: z.number().int(),
|
|
@@ -63,7 +63,7 @@ const AGENT_ERROR_SHAPE = z.object({
|
|
|
63
63
|
|
|
64
64
|
const HAPPY_RECORD = z.object({
|
|
65
65
|
...COMMON_FIELDS,
|
|
66
|
-
|
|
66
|
+
invariants: INVARIANTS_SHAPE,
|
|
67
67
|
submission: z.string(),
|
|
68
68
|
judgeVerdict: JUDGE_VERDICT_SHAPE.optional(),
|
|
69
69
|
agentTracePath: z.string(),
|
|
@@ -83,7 +83,7 @@ const PREFLIGHT_RECORD = z.object({
|
|
|
83
83
|
agentTracePath: z.string(),
|
|
84
84
|
supervisorTracePath: z.string(),
|
|
85
85
|
judgeTracePath: z.string(),
|
|
86
|
-
|
|
86
|
+
invariants: z.undefined().optional(),
|
|
87
87
|
submission: z.undefined().optional(),
|
|
88
88
|
judgeVerdict: z.undefined().optional(),
|
|
89
89
|
agentError: z.undefined().optional(),
|
|
@@ -91,9 +91,9 @@ const PREFLIGHT_RECORD = z.object({
|
|
|
91
91
|
|
|
92
92
|
export const RESULT_RECORD_SCHEMA = z.union([HAPPY_RECORD, PREFLIGHT_RECORD]);
|
|
93
93
|
|
|
94
|
-
export const
|
|
94
|
+
export const INVARIANTS_RECORD_SCHEMA = z.object({
|
|
95
95
|
taskId: z.string().min(1),
|
|
96
|
-
|
|
96
|
+
invariants: INVARIANTS_SHAPE,
|
|
97
97
|
exitCode: z.number().int(),
|
|
98
98
|
});
|
|
99
99
|
|
|
@@ -109,6 +109,6 @@ export function validateResultRecord(record) {
|
|
|
109
109
|
* Throw on schema mismatch.
|
|
110
110
|
* @param {object} record
|
|
111
111
|
*/
|
|
112
|
-
export function
|
|
113
|
-
|
|
112
|
+
export function validateInvariantsRecord(record) {
|
|
113
|
+
INVARIANTS_RECORD_SCHEMA.parse(record);
|
|
114
114
|
}
|
package/src/benchmark/runner.js
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Phases per (task, runIndex):
|
|
5
5
|
* 1. WorkdirManager.start → seed CWD + run pre-flight probe
|
|
6
6
|
* 2. Supervisor session (agent + supervisor) → produce traces + submission
|
|
7
|
-
* 3.
|
|
7
|
+
* 3. Invariants.runInvariants → exit-code-driven verdict via fd-3 NDJSON
|
|
8
8
|
* 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
|
|
9
9
|
* 5. WorkdirManager.teardown → process-group cleanup
|
|
10
10
|
*
|
|
@@ -14,8 +14,6 @@
|
|
|
14
14
|
* the JSONL append is the system of record.
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
|
-
import { createReadStream, createWriteStream } from "node:fs";
|
|
18
|
-
import { mkdir, readFile, unlink } from "node:fs/promises";
|
|
19
17
|
import { createInterface } from "node:readline";
|
|
20
18
|
import { join, resolve as resolvePath } from "node:path";
|
|
21
19
|
|
|
@@ -25,7 +23,7 @@ import { installApm as defaultInstallApm } from "./apm-installer.js";
|
|
|
25
23
|
import { installNpm as defaultInstallNpm } from "./npm-installer.js";
|
|
26
24
|
import { runJudge } from "./judge.js";
|
|
27
25
|
import { validateResultRecord } from "./result.js";
|
|
28
|
-
import {
|
|
26
|
+
import { runInvariants } from "./invariants.js";
|
|
29
27
|
import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
|
|
30
28
|
import { createWorkdirManager } from "./workdir.js";
|
|
31
29
|
|
|
@@ -60,17 +58,21 @@ export class BenchmarkRunner {
|
|
|
60
58
|
* write a valid NDJSON trace to `workdir.agentTracePath`. Default uses
|
|
61
59
|
* `createAgentRunner` with the harness `BASE_TOOLS` allowlist. Internal
|
|
62
60
|
* testing only — not part of the public API.
|
|
63
|
-
* @param {
|
|
64
|
-
*
|
|
61
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} opts.runtime -
|
|
62
|
+
* Injected ambient collaborators (`fs`, `subprocess`, `clock`, `proc`),
|
|
63
|
+
* threaded into the installers, workdir manager, invariants, and judge.
|
|
64
|
+
* @param {Function} [opts.runInvariants] - Test seam: replaces `runInvariants`.
|
|
65
|
+
* Same contract as `runInvariants(task, ctx, runtime)`. Internal testing only.
|
|
65
66
|
* @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
|
|
66
|
-
* contract as `runJudge(task, workdir,
|
|
67
|
-
* only.
|
|
67
|
+
* contract as `runJudge(task, workdir, invariants, deps)` (deps carries
|
|
68
|
+
* `runtime`). Internal testing only.
|
|
68
69
|
* @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
|
|
69
|
-
* Same contract as `installApm(family, outputDir)`. Lets tests
|
|
70
|
-
* fake
|
|
71
|
-
* shells out to a real `apm` binary. Internal testing only.
|
|
70
|
+
* Same contract as `installApm(family, outputDir, runtime)`. Lets tests
|
|
71
|
+
* inject a fake subprocess (or skip the install entirely) so the suite
|
|
72
|
+
* never shells out to a real `apm` binary. Internal testing only.
|
|
72
73
|
* @param {Function} [opts.installNpm] - Test seam: replaces `installNpm`.
|
|
73
|
-
* Same contract as `installNpm(family, stagingDir)`. Internal
|
|
74
|
+
* Same contract as `installNpm(family, stagingDir, runtime)`. Internal
|
|
75
|
+
* testing only.
|
|
74
76
|
*/
|
|
75
77
|
constructor({
|
|
76
78
|
family,
|
|
@@ -84,19 +86,16 @@ export class BenchmarkRunner {
|
|
|
84
86
|
allowedTools,
|
|
85
87
|
maxTurns,
|
|
86
88
|
termGraceMs,
|
|
89
|
+
runtime,
|
|
87
90
|
// Test seams — default to the real implementations.
|
|
88
91
|
runAgent,
|
|
89
|
-
|
|
92
|
+
runInvariants: runInvariantsHook,
|
|
90
93
|
runJudge: runJudgeHook,
|
|
91
94
|
installApm: installApmHook,
|
|
92
95
|
installNpm: installNpmHook,
|
|
93
96
|
}) {
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
throw new Error("runs must be an integer ≥ 1");
|
|
97
|
-
if (!output) throw new Error("output is required");
|
|
98
|
-
if (!agentModel) throw new Error("agentModel is required");
|
|
99
|
-
if (!query) throw new Error("query is required");
|
|
97
|
+
validateRunnerArgs({ family, runs, output, agentModel, query, runtime });
|
|
98
|
+
this.runtime = runtime;
|
|
100
99
|
this.familyInput = family;
|
|
101
100
|
this.runs = runs;
|
|
102
101
|
this.output = output;
|
|
@@ -112,7 +111,7 @@ export class BenchmarkRunner {
|
|
|
112
111
|
this.maxTurns = maxTurns;
|
|
113
112
|
this.termGraceMs = termGraceMs;
|
|
114
113
|
this._runAgentHook = runAgent ?? null;
|
|
115
|
-
this.
|
|
114
|
+
this._runInvariantsHook = runInvariantsHook ?? runInvariants;
|
|
116
115
|
this._runJudgeHook = runJudgeHook ?? runJudge;
|
|
117
116
|
this._installApmHook = installApmHook ?? defaultInstallApm;
|
|
118
117
|
this._installNpmHook = installNpmHook ?? defaultInstallNpm;
|
|
@@ -123,15 +122,16 @@ export class BenchmarkRunner {
|
|
|
123
122
|
* @returns {AsyncGenerator<object>}
|
|
124
123
|
*/
|
|
125
124
|
async *run() {
|
|
125
|
+
const runtime = this.runtime;
|
|
126
126
|
const family =
|
|
127
127
|
typeof this.familyInput === "string"
|
|
128
|
-
? await loadTaskFamily(this.familyInput)
|
|
128
|
+
? await loadTaskFamily(this.familyInput, runtime)
|
|
129
129
|
: this.familyInput;
|
|
130
130
|
|
|
131
|
-
await mkdir(this.output, { recursive: true });
|
|
131
|
+
await runtime.fs.mkdir(this.output, { recursive: true });
|
|
132
132
|
const { stagingDir, skillSetHash, judgeProfilesDir } =
|
|
133
|
-
await this._installApmHook(family, this.output);
|
|
134
|
-
await this._installNpmHook(family, stagingDir);
|
|
133
|
+
await this._installApmHook(family, this.output, runtime);
|
|
134
|
+
await this._installNpmHook(family, stagingDir, runtime);
|
|
135
135
|
|
|
136
136
|
const tasks = family.tasks();
|
|
137
137
|
if (this.profiles.judge) {
|
|
@@ -139,6 +139,7 @@ export class BenchmarkRunner {
|
|
|
139
139
|
family,
|
|
140
140
|
judgeProfilesDir,
|
|
141
141
|
this.profiles.judge,
|
|
142
|
+
runtime,
|
|
142
143
|
);
|
|
143
144
|
}
|
|
144
145
|
|
|
@@ -147,10 +148,13 @@ export class BenchmarkRunner {
|
|
|
147
148
|
runOutputDir: this.output,
|
|
148
149
|
termGraceMs: this.termGraceMs,
|
|
149
150
|
familyRootPath: family.rootPath,
|
|
151
|
+
runtime,
|
|
150
152
|
});
|
|
151
153
|
|
|
152
154
|
const resultsPath = join(this.output, "results.jsonl");
|
|
153
|
-
const resultsStream = createWriteStream(resultsPath, {
|
|
155
|
+
const resultsStream = runtime.fs.createWriteStream(resultsPath, {
|
|
156
|
+
flags: "a",
|
|
157
|
+
});
|
|
154
158
|
try {
|
|
155
159
|
for (const task of tasks) {
|
|
156
160
|
for (let runIndex = 0; runIndex < this.runs; runIndex++) {
|
|
@@ -172,7 +176,7 @@ export class BenchmarkRunner {
|
|
|
172
176
|
}
|
|
173
177
|
|
|
174
178
|
async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
|
|
175
|
-
const t0 =
|
|
179
|
+
const t0 = this.runtime.clock.now();
|
|
176
180
|
const workdir = await wm.start(task, runIndex);
|
|
177
181
|
try {
|
|
178
182
|
if (workdir.preflightError) {
|
|
@@ -182,7 +186,7 @@ export class BenchmarkRunner {
|
|
|
182
186
|
workdir,
|
|
183
187
|
skillSetHash,
|
|
184
188
|
familyRevision: family.familyRevision,
|
|
185
|
-
durationMs:
|
|
189
|
+
durationMs: this.runtime.clock.now() - t0,
|
|
186
190
|
});
|
|
187
191
|
return this.#validateOrFallback(
|
|
188
192
|
record,
|
|
@@ -191,11 +195,15 @@ export class BenchmarkRunner {
|
|
|
191
195
|
}
|
|
192
196
|
const { costUsd, turns, submission, agentError } =
|
|
193
197
|
await this.#runAgentSafe(task, workdir);
|
|
194
|
-
const
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
198
|
+
const invariants = await this._runInvariantsHook(
|
|
199
|
+
task,
|
|
200
|
+
{
|
|
201
|
+
cwd: workdir.cwd,
|
|
202
|
+
port: workdir.port,
|
|
203
|
+
runDir: workdir.runDir,
|
|
204
|
+
},
|
|
205
|
+
this.runtime,
|
|
206
|
+
);
|
|
199
207
|
let judgeVerdict = null;
|
|
200
208
|
if (task.paths.judge) {
|
|
201
209
|
const judgeContext = await this.#buildJudgeContext(
|
|
@@ -206,18 +214,19 @@ export class BenchmarkRunner {
|
|
|
206
214
|
judgeVerdict = await this._runJudgeHook(
|
|
207
215
|
task,
|
|
208
216
|
workdir,
|
|
209
|
-
|
|
217
|
+
invariants,
|
|
210
218
|
{
|
|
211
219
|
query: this.query,
|
|
212
220
|
model: this.judgeModel,
|
|
213
221
|
judgeProfile: this.profiles.judge ?? undefined,
|
|
214
222
|
profilesDir: judgeProfilesDir,
|
|
223
|
+
runtime: this.runtime,
|
|
215
224
|
},
|
|
216
225
|
judgeContext,
|
|
217
226
|
);
|
|
218
227
|
}
|
|
219
228
|
const verdict =
|
|
220
|
-
|
|
229
|
+
invariants.verdict === "pass" &&
|
|
221
230
|
(judgeVerdict === null || judgeVerdict.verdict === "pass")
|
|
222
231
|
? "pass"
|
|
223
232
|
: "fail";
|
|
@@ -225,7 +234,7 @@ export class BenchmarkRunner {
|
|
|
225
234
|
taskId: task.id,
|
|
226
235
|
runIndex,
|
|
227
236
|
verdict,
|
|
228
|
-
|
|
237
|
+
invariants,
|
|
229
238
|
submission,
|
|
230
239
|
...(judgeVerdict && { judgeVerdict }),
|
|
231
240
|
costUsd,
|
|
@@ -245,7 +254,7 @@ export class BenchmarkRunner {
|
|
|
245
254
|
},
|
|
246
255
|
skillSetHash,
|
|
247
256
|
familyRevision: family.familyRevision,
|
|
248
|
-
durationMs:
|
|
257
|
+
durationMs: this.runtime.clock.now() - t0,
|
|
249
258
|
...(agentError && { agentError }),
|
|
250
259
|
};
|
|
251
260
|
return this.#validateOrFallback(record, resultsRecordKey(task, runIndex));
|
|
@@ -283,10 +292,11 @@ export class BenchmarkRunner {
|
|
|
283
292
|
* agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
|
|
284
293
|
*/
|
|
285
294
|
async #runAgent(task, workdir) {
|
|
295
|
+
const fs = this.runtime.fs;
|
|
286
296
|
const combinedPath = join(workdir.runDir, ".combined.ndjson");
|
|
287
|
-
const combinedStream = createWriteStream(combinedPath);
|
|
297
|
+
const combinedStream = fs.createWriteStream(combinedPath);
|
|
288
298
|
const supervisorInstructions = task.paths.supervisor
|
|
289
|
-
? await readFile(task.paths.supervisor, "utf8").catch(() => null)
|
|
299
|
+
? await fs.readFile(task.paths.supervisor, "utf8").catch(() => null)
|
|
290
300
|
: null;
|
|
291
301
|
const supervisor = createSupervisor({
|
|
292
302
|
supervisorCwd: workdir.cwd,
|
|
@@ -301,9 +311,11 @@ export class BenchmarkRunner {
|
|
|
301
311
|
...(supervisorInstructions && { taskAmend: supervisorInstructions }),
|
|
302
312
|
redactor: createRedactor({
|
|
303
313
|
allowlist: [...DEFAULT_ENV_ALLOWLIST, ...(workdir.envNames ?? [])],
|
|
314
|
+
runtime: this.runtime,
|
|
304
315
|
}),
|
|
316
|
+
runtime: this.runtime,
|
|
305
317
|
});
|
|
306
|
-
const instructions = await readFile(task.paths.instructions, "utf8");
|
|
318
|
+
const instructions = await fs.readFile(task.paths.instructions, "utf8");
|
|
307
319
|
let agentError = null;
|
|
308
320
|
try {
|
|
309
321
|
const result = await supervisor.run(instructions);
|
|
@@ -316,16 +328,21 @@ export class BenchmarkRunner {
|
|
|
316
328
|
await new Promise((r) => combinedStream.end(r));
|
|
317
329
|
}
|
|
318
330
|
const summary = await splitAndSummarize(
|
|
331
|
+
this.runtime,
|
|
319
332
|
combinedPath,
|
|
320
333
|
workdir.agentTracePath,
|
|
321
334
|
workdir.supervisorTracePath,
|
|
322
335
|
);
|
|
323
|
-
await unlink(combinedPath).catch(() => {});
|
|
336
|
+
await fs.unlink(combinedPath).catch(() => {});
|
|
324
337
|
return { ...summary, agentError };
|
|
325
338
|
}
|
|
326
339
|
|
|
327
340
|
async #buildJudgeContext(task, workdir, skillSetHash) {
|
|
328
|
-
const
|
|
341
|
+
const fs = this.runtime.fs;
|
|
342
|
+
const agentInstructions = await fs.readFile(
|
|
343
|
+
task.paths.instructions,
|
|
344
|
+
"utf8",
|
|
345
|
+
);
|
|
329
346
|
let agentProfile = "";
|
|
330
347
|
if (this.profiles.agent) {
|
|
331
348
|
const profilePath = resolvePath(
|
|
@@ -333,7 +350,7 @@ export class BenchmarkRunner {
|
|
|
333
350
|
".claude/agents",
|
|
334
351
|
`${this.profiles.agent}.md`,
|
|
335
352
|
);
|
|
336
|
-
agentProfile = await readFile(profilePath, "utf8").catch(() => "");
|
|
353
|
+
agentProfile = await fs.readFile(profilePath, "utf8").catch(() => "");
|
|
337
354
|
}
|
|
338
355
|
return { agentInstructions, agentProfile, skillSetHash };
|
|
339
356
|
}
|
|
@@ -390,6 +407,27 @@ export class BenchmarkRunner {
|
|
|
390
407
|
}
|
|
391
408
|
}
|
|
392
409
|
|
|
410
|
+
/**
|
|
411
|
+
* Validate the required BenchmarkRunner constructor arguments. Extracted from
|
|
412
|
+
* the constructor to keep its cognitive complexity under the lint ceiling.
|
|
413
|
+
*/
|
|
414
|
+
function validateRunnerArgs({
|
|
415
|
+
family,
|
|
416
|
+
runs,
|
|
417
|
+
output,
|
|
418
|
+
agentModel,
|
|
419
|
+
query,
|
|
420
|
+
runtime,
|
|
421
|
+
}) {
|
|
422
|
+
if (!family) throw new Error("family is required");
|
|
423
|
+
if (!Number.isInteger(runs) || runs < 1)
|
|
424
|
+
throw new Error("runs must be an integer ≥ 1");
|
|
425
|
+
if (!output) throw new Error("output is required");
|
|
426
|
+
if (!agentModel) throw new Error("agentModel is required");
|
|
427
|
+
if (!query) throw new Error("query is required");
|
|
428
|
+
if (!runtime) throw new Error("runtime is required");
|
|
429
|
+
}
|
|
430
|
+
|
|
393
431
|
function resultsRecordKey(task, runIndex) {
|
|
394
432
|
return { taskId: task.id, runIndex };
|
|
395
433
|
}
|
|
@@ -408,11 +446,17 @@ async function writeRecord(stream, record) {
|
|
|
408
446
|
* `supervisorPath`.
|
|
409
447
|
*/
|
|
410
448
|
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
|
|
411
|
-
async function splitAndSummarize(
|
|
412
|
-
|
|
413
|
-
|
|
449
|
+
async function splitAndSummarize(
|
|
450
|
+
runtime,
|
|
451
|
+
combinedPath,
|
|
452
|
+
agentPath,
|
|
453
|
+
supervisorPath,
|
|
454
|
+
) {
|
|
455
|
+
const fs = runtime.fs;
|
|
456
|
+
const agentStream = fs.createWriteStream(agentPath);
|
|
457
|
+
const supStream = fs.createWriteStream(supervisorPath);
|
|
414
458
|
const rl = createInterface({
|
|
415
|
-
input: createReadStream(combinedPath),
|
|
459
|
+
input: fs.createReadStream(combinedPath),
|
|
416
460
|
crlfDelay: Infinity,
|
|
417
461
|
});
|
|
418
462
|
let agentCost = 0;
|