@forwardimpact/libeval 0.1.50 → 0.1.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/README.md +11 -8
  2. package/bin/fit-benchmark.js +26 -27
  3. package/bin/fit-eval.js +36 -30
  4. package/bin/fit-trace.js +83 -57
  5. package/package.json +1 -1
  6. package/src/agent-runner.js +20 -12
  7. package/src/benchmark/apm-installer.js +48 -44
  8. package/src/benchmark/env-loader.js +35 -23
  9. package/src/benchmark/invariants.js +128 -0
  10. package/src/benchmark/judge.js +18 -19
  11. package/src/benchmark/npm-installer.js +33 -33
  12. package/src/benchmark/report.js +40 -26
  13. package/src/benchmark/result.js +11 -11
  14. package/src/benchmark/runner.js +90 -46
  15. package/src/benchmark/task-family.js +78 -65
  16. package/src/benchmark/workdir.js +100 -93
  17. package/src/commands/assert.js +30 -22
  18. package/src/commands/benchmark-invariants.js +74 -0
  19. package/src/commands/benchmark-report.js +24 -15
  20. package/src/commands/benchmark-run.js +16 -9
  21. package/src/commands/by-discussion.js +33 -23
  22. package/src/commands/callback.js +20 -11
  23. package/src/commands/discuss.js +31 -13
  24. package/src/commands/facilitate.js +21 -14
  25. package/src/commands/output.js +15 -13
  26. package/src/commands/run.js +28 -14
  27. package/src/commands/supervise.js +29 -19
  28. package/src/commands/task-input.js +10 -5
  29. package/src/commands/tee.js +24 -9
  30. package/src/commands/trace.js +181 -99
  31. package/src/discuss-tools.js +48 -2
  32. package/src/discusser.js +53 -2
  33. package/src/events/github.js +27 -5
  34. package/src/facilitator.js +4 -0
  35. package/src/inbox-poller.js +84 -0
  36. package/src/judge.js +4 -1
  37. package/src/message-bus.js +6 -0
  38. package/src/orchestration-loop.js +14 -4
  39. package/src/orchestration-toolkit.js +14 -0
  40. package/src/profile-prompt.js +22 -9
  41. package/src/redaction.js +31 -9
  42. package/src/reply-emitter.js +47 -0
  43. package/src/supervisor.js +4 -0
  44. package/src/tee-writer.js +4 -2
  45. package/src/trace-collector.js +9 -2
  46. package/src/trace-github.js +47 -27
  47. package/src/benchmark/scorer.js +0 -138
  48. package/src/commands/benchmark-score.js +0 -68
@@ -3,7 +3,7 @@
3
3
  * records by `taskId`, and compute pass@k via the OpenAI HumanEval
4
4
  * unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
5
5
  *
6
- * When `includeRuns` is true, each task carries per-run detail (scoring
6
+ * When `includeRuns` is true, each task carries per-run detail (invariant
7
7
  * checks, judge commentary, cost, duration) and the text renderer produces
8
8
  * a full markdown report instead of just the pass@k table.
9
9
  *
@@ -12,9 +12,7 @@
12
12
  * whole report.
13
13
  */
14
14
 
15
- import { createReadStream } from "node:fs";
16
15
  import { join } from "node:path";
17
- import { createInterface } from "node:readline";
18
16
 
19
17
  import { validateResultRecord } from "./result.js";
20
18
 
@@ -22,7 +20,7 @@ import { validateResultRecord } from "./result.js";
22
20
  * @typedef {object} RunDetail
23
21
  * @property {number} runIndex
24
22
  * @property {"pass"|"fail"} verdict
25
- * @property {{verdict: string, details: unknown[], exitCode: number}} [scoring]
23
+ * @property {{verdict: string, details: unknown[], exitCode: number}} [invariants]
26
24
  * @property {{verdict: string, summary: string}} [judgeVerdict]
27
25
  * @property {number} costUsd
28
26
  * @property {number} turns
@@ -41,11 +39,17 @@ import { validateResultRecord } from "./result.js";
41
39
  */
42
40
 
43
41
  /**
44
- * @param {{inputDir: string, kValues: number[], includeRuns?: boolean}} opts
42
+ * @param {{inputDir: string, kValues: number[], includeRuns?: boolean, runtime: import("@forwardimpact/libutil/runtime").Runtime}} opts
45
43
  * @returns {Promise<{tasks: TaskReport[], totals: object}>}
46
44
  */
47
- export async function aggregate({ inputDir, kValues, includeRuns = false }) {
48
- const records = await loadRecords(inputDir);
45
+ export async function aggregate({
46
+ inputDir,
47
+ kValues,
48
+ includeRuns = false,
49
+ runtime,
50
+ }) {
51
+ if (!runtime) throw new Error("runtime is required");
52
+ const records = await loadRecords(inputDir, runtime);
49
53
  const grouped = groupByTask(records.records);
50
54
  const tasks = [];
51
55
  let totalRuns = 0;
@@ -112,7 +116,7 @@ function buildRunDetail(r, acc) {
112
116
  return {
113
117
  runIndex: r.runIndex,
114
118
  verdict: r.verdict,
115
- ...(r.scoring && { scoring: r.scoring }),
119
+ ...(r.invariants && { invariants: r.invariants }),
116
120
  ...(r.judgeVerdict && { judgeVerdict: r.judgeVerdict }),
117
121
  costUsd: r.costUsd ?? 0,
118
122
  turns: r.turns ?? 0,
@@ -262,7 +266,7 @@ function renderTaskDetail(task) {
262
266
 
263
267
  lines.push("", renderRunsTable(runs));
264
268
 
265
- const checks = renderScoringChecks(runs, singleRun);
269
+ const checks = renderInvariantChecks(runs, singleRun);
266
270
  if (checks) lines.push("", checks);
267
271
 
268
272
  const commentary = renderJudgeCommentary(runs, singleRun);
@@ -278,7 +282,7 @@ function renderRunsTable(runs) {
278
282
  const header = [
279
283
  "Run",
280
284
  "Verdict",
281
- "Scoring",
285
+ "Invariants",
282
286
  "Judge",
283
287
  "Cost",
284
288
  "Turns",
@@ -286,10 +290,10 @@ function renderRunsTable(runs) {
286
290
  ];
287
291
  const rows = [header, header.map(() => "---")];
288
292
  for (const r of runs) {
289
- const scoringCell = r.preflightError
293
+ const invariantsCell = r.preflightError
290
294
  ? "preflight error"
291
- : r.scoring
292
- ? statusIcon(r.scoring.verdict === "pass")
295
+ : r.invariants
296
+ ? statusIcon(r.invariants.verdict === "pass")
293
297
  : "—";
294
298
  const judgeCell = r.preflightError
295
299
  ? "—"
@@ -299,7 +303,7 @@ function renderRunsTable(runs) {
299
303
  rows.push([
300
304
  String(r.runIndex),
301
305
  statusIcon(r.verdict === "pass"),
302
- scoringCell,
306
+ invariantsCell,
303
307
  judgeCell,
304
308
  formatCost(r.costUsd),
305
309
  String(r.turns),
@@ -309,15 +313,15 @@ function renderRunsTable(runs) {
309
313
  return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
310
314
  }
311
315
 
312
- function renderScoringChecks(runs, singleRun) {
313
- const rows = collectScoringRows(runs);
316
+ function renderInvariantChecks(runs, singleRun) {
317
+ const rows = collectInvariantRows(runs);
314
318
  if (!rows.length) return null;
315
319
 
316
320
  const header = singleRun
317
321
  ? ["Check", "Result", "Message"]
318
322
  : ["Run", "Check", "Result", "Message"];
319
323
  const lines = [
320
- "#### Scoring Checks",
324
+ "#### Invariant Checks",
321
325
  "",
322
326
  `| ${header.join(" | ")} |`,
323
327
  `| ${header.map(() => "---").join(" | ")} |`,
@@ -331,11 +335,11 @@ function renderScoringChecks(runs, singleRun) {
331
335
  return lines.join("\n");
332
336
  }
333
337
 
334
- function collectScoringRows(runs) {
338
+ function collectInvariantRows(runs) {
335
339
  const rows = [];
336
340
  for (const r of runs) {
337
- if (!r.scoring?.details?.length) continue;
338
- for (const d of r.scoring.details) {
341
+ if (!r.invariants?.details?.length) continue;
342
+ for (const d of r.invariants.details) {
339
343
  rows.push({
340
344
  run: r.runIndex,
341
345
  check: escapeCell(String(d.test ?? "(unnamed)")),
@@ -429,20 +433,30 @@ function median(arr) {
429
433
  // Record loading
430
434
  // ---------------------------------------------------------------------------
431
435
 
432
- async function loadRecords(inputDir) {
436
+ async function loadRecords(inputDir, runtime) {
433
437
  const path = join(inputDir, "results.jsonl");
434
- const stream = createReadStream(path);
435
- const rl = createInterface({ input: stream, crlfDelay: Infinity });
438
+ let content;
439
+ try {
440
+ content = await runtime.fs.readFile(path, "utf8");
441
+ } catch (e) {
442
+ // Re-throw with the stack collapsed to the message line so the CLI's
443
+ // error rendering stays free of node-internal async `readFile` frames
444
+ // (matching the pre-1370 stream-error shape the golden captured).
445
+ const err = new Error(e.message);
446
+ if (e.code) err.code = e.code;
447
+ err.stack = `Error: ${e.message}`;
448
+ throw err;
449
+ }
436
450
  const records = [];
437
451
  let skipped = 0;
438
- for await (const line of rl) {
452
+ for (const line of content.split("\n")) {
439
453
  const trimmed = line.trim();
440
454
  if (!trimmed) continue;
441
455
  let record;
442
456
  try {
443
457
  record = JSON.parse(trimmed);
444
458
  } catch (e) {
445
- process.stderr.write(
459
+ runtime.proc.stderr.write(
446
460
  `benchmark report: skipped malformed JSON line — ${e.message}\n`,
447
461
  );
448
462
  skipped++;
@@ -451,7 +465,7 @@ async function loadRecords(inputDir) {
451
465
  try {
452
466
  validateResultRecord(record);
453
467
  } catch (e) {
454
- process.stderr.write(
468
+ runtime.proc.stderr.write(
455
469
  `benchmark report: skipped record failing schema — ${describeError(e)}\n`,
456
470
  );
457
471
  skipped++;
@@ -3,10 +3,10 @@
3
3
  *
4
4
  * Two schemas live here:
5
5
  * - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
6
- * benchmark run. Has a happy branch (scoring + judge present) and a
7
- * pre-flight-failure branch (scoring/judgeVerdict/submission absent).
8
- * - SCORING_RECORD_SCHEMA — narrower output of `benchmark-score` (P7):
9
- * ad-hoc grading without a full lifecycle.
6
+ * benchmark run. Has a happy branch (invariants + judge present) and a
7
+ * pre-flight-failure branch (invariants/judgeVerdict/submission absent).
8
+ * - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`
9
+ * (P7): ad-hoc grading without a full lifecycle.
10
10
  *
11
11
  * Validation is throw-on-mismatch so the runner can wrap every JSONL append
12
12
  * in a guard and reject schema drift at write time.
@@ -16,7 +16,7 @@ import { z } from "zod";
16
16
 
17
17
  const VERDICT_ENUM = z.enum(["pass", "fail"]);
18
18
 
19
- const SCORING_SHAPE = z.object({
19
+ const INVARIANTS_SHAPE = z.object({
20
20
  verdict: VERDICT_ENUM,
21
21
  details: z.array(z.unknown()),
22
22
  exitCode: z.number().int(),
@@ -63,7 +63,7 @@ const AGENT_ERROR_SHAPE = z.object({
63
63
 
64
64
  const HAPPY_RECORD = z.object({
65
65
  ...COMMON_FIELDS,
66
- scoring: SCORING_SHAPE,
66
+ invariants: INVARIANTS_SHAPE,
67
67
  submission: z.string(),
68
68
  judgeVerdict: JUDGE_VERDICT_SHAPE.optional(),
69
69
  agentTracePath: z.string(),
@@ -83,7 +83,7 @@ const PREFLIGHT_RECORD = z.object({
83
83
  agentTracePath: z.string(),
84
84
  supervisorTracePath: z.string(),
85
85
  judgeTracePath: z.string(),
86
- scoring: z.undefined().optional(),
86
+ invariants: z.undefined().optional(),
87
87
  submission: z.undefined().optional(),
88
88
  judgeVerdict: z.undefined().optional(),
89
89
  agentError: z.undefined().optional(),
@@ -91,9 +91,9 @@ const PREFLIGHT_RECORD = z.object({
91
91
 
92
92
  export const RESULT_RECORD_SCHEMA = z.union([HAPPY_RECORD, PREFLIGHT_RECORD]);
93
93
 
94
- export const SCORING_RECORD_SCHEMA = z.object({
94
+ export const INVARIANTS_RECORD_SCHEMA = z.object({
95
95
  taskId: z.string().min(1),
96
- scoring: SCORING_SHAPE,
96
+ invariants: INVARIANTS_SHAPE,
97
97
  exitCode: z.number().int(),
98
98
  });
99
99
 
@@ -109,6 +109,6 @@ export function validateResultRecord(record) {
109
109
  * Throw on schema mismatch.
110
110
  * @param {object} record
111
111
  */
112
- export function validateScoringRecord(record) {
113
- SCORING_RECORD_SCHEMA.parse(record);
112
+ export function validateInvariantsRecord(record) {
113
+ INVARIANTS_RECORD_SCHEMA.parse(record);
114
114
  }
@@ -4,7 +4,7 @@
4
4
  * Phases per (task, runIndex):
5
5
  * 1. WorkdirManager.start → seed CWD + run pre-flight probe
6
6
  * 2. Supervisor session (agent + supervisor) → produce traces + submission
7
- * 3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
7
+ * 3. Invariants.runInvariants → exit-code-driven verdict via fd-3 NDJSON
8
8
  * 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
9
9
  * 5. WorkdirManager.teardown → process-group cleanup
10
10
  *
@@ -14,8 +14,6 @@
14
14
  * the JSONL append is the system of record.
15
15
  */
16
16
 
17
- import { createReadStream, createWriteStream } from "node:fs";
18
- import { mkdir, readFile, unlink } from "node:fs/promises";
19
17
  import { createInterface } from "node:readline";
20
18
  import { join, resolve as resolvePath } from "node:path";
21
19
 
@@ -25,7 +23,7 @@ import { installApm as defaultInstallApm } from "./apm-installer.js";
25
23
  import { installNpm as defaultInstallNpm } from "./npm-installer.js";
26
24
  import { runJudge } from "./judge.js";
27
25
  import { validateResultRecord } from "./result.js";
28
- import { runScoring } from "./scorer.js";
26
+ import { runInvariants } from "./invariants.js";
29
27
  import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
30
28
  import { createWorkdirManager } from "./workdir.js";
31
29
 
@@ -60,17 +58,21 @@ export class BenchmarkRunner {
60
58
  * write a valid NDJSON trace to `workdir.agentTracePath`. Default uses
61
59
  * `createAgentRunner` with the harness `BASE_TOOLS` allowlist. Internal
62
60
  * testing only — not part of the public API.
63
- * @param {Function} [opts.runScoring] - Test seam: replaces `runScoring`.
64
- * Same contract as `runScoring(task, ctx)`. Internal testing only.
61
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} opts.runtime -
62
+ * Injected ambient collaborators (`fs`, `subprocess`, `clock`, `proc`),
63
+ * threaded into the installers, workdir manager, invariants, and judge.
64
+ * @param {Function} [opts.runInvariants] - Test seam: replaces `runInvariants`.
65
+ * Same contract as `runInvariants(task, ctx, runtime)`. Internal testing only.
65
66
  * @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
66
- * contract as `runJudge(task, workdir, scoring, deps)`. Internal testing
67
- * only.
67
+ * contract as `runJudge(task, workdir, invariants, deps)` (deps carries
68
+ * `runtime`). Internal testing only.
68
69
  * @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
69
- * Same contract as `installApm(family, outputDir)`. Lets tests inject a
70
- * fake `apm` spawn (or skip the install entirely) so the suite never
71
- * shells out to a real `apm` binary. Internal testing only.
70
+ * Same contract as `installApm(family, outputDir, runtime)`. Lets tests
71
+ * inject a fake subprocess (or skip the install entirely) so the suite
72
+ * never shells out to a real `apm` binary. Internal testing only.
72
73
  * @param {Function} [opts.installNpm] - Test seam: replaces `installNpm`.
73
- * Same contract as `installNpm(family, stagingDir)`. Internal testing only.
74
+ * Same contract as `installNpm(family, stagingDir, runtime)`. Internal
75
+ * testing only.
74
76
  */
75
77
  constructor({
76
78
  family,
@@ -84,19 +86,16 @@ export class BenchmarkRunner {
84
86
  allowedTools,
85
87
  maxTurns,
86
88
  termGraceMs,
89
+ runtime,
87
90
  // Test seams — default to the real implementations.
88
91
  runAgent,
89
- runScoring: runScoringHook,
92
+ runInvariants: runInvariantsHook,
90
93
  runJudge: runJudgeHook,
91
94
  installApm: installApmHook,
92
95
  installNpm: installNpmHook,
93
96
  }) {
94
- if (!family) throw new Error("family is required");
95
- if (!Number.isInteger(runs) || runs < 1)
96
- throw new Error("runs must be an integer ≥ 1");
97
- if (!output) throw new Error("output is required");
98
- if (!agentModel) throw new Error("agentModel is required");
99
- if (!query) throw new Error("query is required");
97
+ validateRunnerArgs({ family, runs, output, agentModel, query, runtime });
98
+ this.runtime = runtime;
100
99
  this.familyInput = family;
101
100
  this.runs = runs;
102
101
  this.output = output;
@@ -112,7 +111,7 @@ export class BenchmarkRunner {
112
111
  this.maxTurns = maxTurns;
113
112
  this.termGraceMs = termGraceMs;
114
113
  this._runAgentHook = runAgent ?? null;
115
- this._runScoringHook = runScoringHook ?? runScoring;
114
+ this._runInvariantsHook = runInvariantsHook ?? runInvariants;
116
115
  this._runJudgeHook = runJudgeHook ?? runJudge;
117
116
  this._installApmHook = installApmHook ?? defaultInstallApm;
118
117
  this._installNpmHook = installNpmHook ?? defaultInstallNpm;
@@ -123,15 +122,16 @@ export class BenchmarkRunner {
123
122
  * @returns {AsyncGenerator<object>}
124
123
  */
125
124
  async *run() {
125
+ const runtime = this.runtime;
126
126
  const family =
127
127
  typeof this.familyInput === "string"
128
- ? await loadTaskFamily(this.familyInput)
128
+ ? await loadTaskFamily(this.familyInput, runtime)
129
129
  : this.familyInput;
130
130
 
131
- await mkdir(this.output, { recursive: true });
131
+ await runtime.fs.mkdir(this.output, { recursive: true });
132
132
  const { stagingDir, skillSetHash, judgeProfilesDir } =
133
- await this._installApmHook(family, this.output);
134
- await this._installNpmHook(family, stagingDir);
133
+ await this._installApmHook(family, this.output, runtime);
134
+ await this._installNpmHook(family, stagingDir, runtime);
135
135
 
136
136
  const tasks = family.tasks();
137
137
  if (this.profiles.judge) {
@@ -139,6 +139,7 @@ export class BenchmarkRunner {
139
139
  family,
140
140
  judgeProfilesDir,
141
141
  this.profiles.judge,
142
+ runtime,
142
143
  );
143
144
  }
144
145
 
@@ -147,10 +148,13 @@ export class BenchmarkRunner {
147
148
  runOutputDir: this.output,
148
149
  termGraceMs: this.termGraceMs,
149
150
  familyRootPath: family.rootPath,
151
+ runtime,
150
152
  });
151
153
 
152
154
  const resultsPath = join(this.output, "results.jsonl");
153
- const resultsStream = createWriteStream(resultsPath, { flags: "a" });
155
+ const resultsStream = runtime.fs.createWriteStream(resultsPath, {
156
+ flags: "a",
157
+ });
154
158
  try {
155
159
  for (const task of tasks) {
156
160
  for (let runIndex = 0; runIndex < this.runs; runIndex++) {
@@ -172,7 +176,7 @@ export class BenchmarkRunner {
172
176
  }
173
177
 
174
178
  async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
175
- const t0 = Date.now();
179
+ const t0 = this.runtime.clock.now();
176
180
  const workdir = await wm.start(task, runIndex);
177
181
  try {
178
182
  if (workdir.preflightError) {
@@ -182,7 +186,7 @@ export class BenchmarkRunner {
182
186
  workdir,
183
187
  skillSetHash,
184
188
  familyRevision: family.familyRevision,
185
- durationMs: Date.now() - t0,
189
+ durationMs: this.runtime.clock.now() - t0,
186
190
  });
187
191
  return this.#validateOrFallback(
188
192
  record,
@@ -191,11 +195,15 @@ export class BenchmarkRunner {
191
195
  }
192
196
  const { costUsd, turns, submission, agentError } =
193
197
  await this.#runAgentSafe(task, workdir);
194
- const scoring = await this._runScoringHook(task, {
195
- cwd: workdir.cwd,
196
- port: workdir.port,
197
- runDir: workdir.runDir,
198
- });
198
+ const invariants = await this._runInvariantsHook(
199
+ task,
200
+ {
201
+ cwd: workdir.cwd,
202
+ port: workdir.port,
203
+ runDir: workdir.runDir,
204
+ },
205
+ this.runtime,
206
+ );
199
207
  let judgeVerdict = null;
200
208
  if (task.paths.judge) {
201
209
  const judgeContext = await this.#buildJudgeContext(
@@ -206,18 +214,19 @@ export class BenchmarkRunner {
206
214
  judgeVerdict = await this._runJudgeHook(
207
215
  task,
208
216
  workdir,
209
- scoring,
217
+ invariants,
210
218
  {
211
219
  query: this.query,
212
220
  model: this.judgeModel,
213
221
  judgeProfile: this.profiles.judge ?? undefined,
214
222
  profilesDir: judgeProfilesDir,
223
+ runtime: this.runtime,
215
224
  },
216
225
  judgeContext,
217
226
  );
218
227
  }
219
228
  const verdict =
220
- scoring.verdict === "pass" &&
229
+ invariants.verdict === "pass" &&
221
230
  (judgeVerdict === null || judgeVerdict.verdict === "pass")
222
231
  ? "pass"
223
232
  : "fail";
@@ -225,7 +234,7 @@ export class BenchmarkRunner {
225
234
  taskId: task.id,
226
235
  runIndex,
227
236
  verdict,
228
- scoring,
237
+ invariants,
229
238
  submission,
230
239
  ...(judgeVerdict && { judgeVerdict }),
231
240
  costUsd,
@@ -245,7 +254,7 @@ export class BenchmarkRunner {
245
254
  },
246
255
  skillSetHash,
247
256
  familyRevision: family.familyRevision,
248
- durationMs: Date.now() - t0,
257
+ durationMs: this.runtime.clock.now() - t0,
249
258
  ...(agentError && { agentError }),
250
259
  };
251
260
  return this.#validateOrFallback(record, resultsRecordKey(task, runIndex));
@@ -283,10 +292,11 @@ export class BenchmarkRunner {
283
292
  * agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
284
293
  */
285
294
  async #runAgent(task, workdir) {
295
+ const fs = this.runtime.fs;
286
296
  const combinedPath = join(workdir.runDir, ".combined.ndjson");
287
- const combinedStream = createWriteStream(combinedPath);
297
+ const combinedStream = fs.createWriteStream(combinedPath);
288
298
  const supervisorInstructions = task.paths.supervisor
289
- ? await readFile(task.paths.supervisor, "utf8").catch(() => null)
299
+ ? await fs.readFile(task.paths.supervisor, "utf8").catch(() => null)
290
300
  : null;
291
301
  const supervisor = createSupervisor({
292
302
  supervisorCwd: workdir.cwd,
@@ -301,9 +311,11 @@ export class BenchmarkRunner {
301
311
  ...(supervisorInstructions && { taskAmend: supervisorInstructions }),
302
312
  redactor: createRedactor({
303
313
  allowlist: [...DEFAULT_ENV_ALLOWLIST, ...(workdir.envNames ?? [])],
314
+ runtime: this.runtime,
304
315
  }),
316
+ runtime: this.runtime,
305
317
  });
306
- const instructions = await readFile(task.paths.instructions, "utf8");
318
+ const instructions = await fs.readFile(task.paths.instructions, "utf8");
307
319
  let agentError = null;
308
320
  try {
309
321
  const result = await supervisor.run(instructions);
@@ -316,16 +328,21 @@ export class BenchmarkRunner {
316
328
  await new Promise((r) => combinedStream.end(r));
317
329
  }
318
330
  const summary = await splitAndSummarize(
331
+ this.runtime,
319
332
  combinedPath,
320
333
  workdir.agentTracePath,
321
334
  workdir.supervisorTracePath,
322
335
  );
323
- await unlink(combinedPath).catch(() => {});
336
+ await fs.unlink(combinedPath).catch(() => {});
324
337
  return { ...summary, agentError };
325
338
  }
326
339
 
327
340
  async #buildJudgeContext(task, workdir, skillSetHash) {
328
- const agentInstructions = await readFile(task.paths.instructions, "utf8");
341
+ const fs = this.runtime.fs;
342
+ const agentInstructions = await fs.readFile(
343
+ task.paths.instructions,
344
+ "utf8",
345
+ );
329
346
  let agentProfile = "";
330
347
  if (this.profiles.agent) {
331
348
  const profilePath = resolvePath(
@@ -333,7 +350,7 @@ export class BenchmarkRunner {
333
350
  ".claude/agents",
334
351
  `${this.profiles.agent}.md`,
335
352
  );
336
- agentProfile = await readFile(profilePath, "utf8").catch(() => "");
353
+ agentProfile = await fs.readFile(profilePath, "utf8").catch(() => "");
337
354
  }
338
355
  return { agentInstructions, agentProfile, skillSetHash };
339
356
  }
@@ -390,6 +407,27 @@ export class BenchmarkRunner {
390
407
  }
391
408
  }
392
409
 
410
+ /**
411
+ * Validate the required BenchmarkRunner constructor arguments. Extracted from
412
+ * the constructor to keep its cognitive complexity under the lint ceiling.
413
+ */
414
+ function validateRunnerArgs({
415
+ family,
416
+ runs,
417
+ output,
418
+ agentModel,
419
+ query,
420
+ runtime,
421
+ }) {
422
+ if (!family) throw new Error("family is required");
423
+ if (!Number.isInteger(runs) || runs < 1)
424
+ throw new Error("runs must be an integer ≥ 1");
425
+ if (!output) throw new Error("output is required");
426
+ if (!agentModel) throw new Error("agentModel is required");
427
+ if (!query) throw new Error("query is required");
428
+ if (!runtime) throw new Error("runtime is required");
429
+ }
430
+
393
431
  function resultsRecordKey(task, runIndex) {
394
432
  return { taskId: task.id, runIndex };
395
433
  }
@@ -408,11 +446,17 @@ async function writeRecord(stream, record) {
408
446
  * `supervisorPath`.
409
447
  */
410
448
  // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
411
- async function splitAndSummarize(combinedPath, agentPath, supervisorPath) {
412
- const agentStream = createWriteStream(agentPath);
413
- const supStream = createWriteStream(supervisorPath);
449
+ async function splitAndSummarize(
450
+ runtime,
451
+ combinedPath,
452
+ agentPath,
453
+ supervisorPath,
454
+ ) {
455
+ const fs = runtime.fs;
456
+ const agentStream = fs.createWriteStream(agentPath);
457
+ const supStream = fs.createWriteStream(supervisorPath);
414
458
  const rl = createInterface({
415
- input: createReadStream(combinedPath),
459
+ input: fs.createReadStream(combinedPath),
416
460
  crlfDelay: Infinity,
417
461
  });
418
462
  let agentCost = 0;