@forwardimpact/libeval 0.1.51 → 0.1.53

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -3,23 +3,22 @@
3
3
  * is present, then copies the resulting `node_modules/` into the staging
4
4
  * directory so WorkdirManager can seed each per-task CWD.
5
5
  *
6
- * Symmetric to ApmInstaller: constructor injection of `spawn` for testability,
7
- * factory function, and a free-function shorthand.
6
+ * Symmetric to ApmInstaller: the subprocess and filesystem flow through the
7
+ * injected `runtime` bag (`runtime.subprocess.spawn` + `runtime.fs`).
8
8
  */
9
9
 
10
- import { spawn as nodeSpawn } from "node:child_process";
11
- import { access, cp } from "node:fs/promises";
12
10
  import { join } from "node:path";
13
11
 
14
12
  /** Run `bun install` in the family root and stage node_modules/ for per-task CWDs. */
15
13
  export class NpmInstaller {
16
14
  /**
17
- * @param {object} [deps]
18
- * @param {typeof nodeSpawn} [deps.spawn] - Spawn seam (defaults to
19
- * `node:child_process` spawn). Tests inject a fake to avoid shelling out.
15
+ * @param {object} deps
16
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} deps.runtime -
17
+ * Ambient collaborators; uses `subprocess.spawn` and `fs`.
20
18
  */
21
- constructor({ spawn } = {}) {
22
- this.spawn = spawn ?? nodeSpawn;
19
+ constructor({ runtime }) {
20
+ if (!runtime) throw new Error("runtime is required");
21
+ this.runtime = runtime;
23
22
  }
24
23
 
25
24
  /**
@@ -28,8 +27,10 @@ export class NpmInstaller {
28
27
  * @returns {Promise<void>}
29
28
  */
30
29
  async install(family, stagingDir) {
30
+ const fs = this.runtime.fs;
31
31
  const pkgJson = join(family.rootPath, "package.json");
32
- const hasPkg = await access(pkgJson)
32
+ const hasPkg = await fs
33
+ .access(pkgJson)
33
34
  .then(() => true)
34
35
  .catch(() => false);
35
36
  if (!hasPkg) return;
@@ -38,37 +39,35 @@ export class NpmInstaller {
38
39
 
39
40
  const sourceModules = join(family.rootPath, "node_modules");
40
41
  try {
41
- await access(sourceModules);
42
+ await fs.access(sourceModules);
42
43
  } catch {
43
44
  throw new Error(
44
45
  `bun install did not produce node_modules/ at ${sourceModules}; check the family's package.json`,
45
46
  );
46
47
  }
47
48
 
48
- await cp(sourceModules, join(stagingDir, "node_modules"), {
49
+ await fs.cp(sourceModules, join(stagingDir, "node_modules"), {
49
50
  recursive: true,
50
51
  });
51
52
  }
52
53
 
53
- #runBunInstall(cwd) {
54
- return new Promise((res, rej) => {
55
- const child = this.spawn("bun", ["install"], {
56
- cwd,
57
- stdio: ["ignore", "pipe", "pipe"],
58
- });
59
- let stderr = "";
60
- child.stdout.on("data", () => {});
61
- child.stderr.on("data", (d) => {
62
- stderr += d.toString();
63
- });
64
- child.on("error", (e) => {
65
- rej(new Error(`failed to spawn bun: ${e.message}`));
66
- });
67
- child.on("close", (code) => {
68
- if (code === 0) res();
69
- else rej(new Error(`bun install exited ${code}: ${stderr}`));
70
- });
54
+ async #runBunInstall(cwd) {
55
+ const child = this.runtime.subprocess.spawn("bun", ["install"], {
56
+ cwd,
57
+ stdio: ["ignore", "pipe", "pipe"],
71
58
  });
59
+ let stderr = "";
60
+ const drainStdout = (async () => {
61
+ for await (const _chunk of child.stdout) {
62
+ // discard
63
+ }
64
+ })();
65
+ for await (const chunk of child.stderr) stderr += chunk.toString();
66
+ await drainStdout;
67
+ const code = await child.exitCode;
68
+ if (code !== 0) {
69
+ throw new Error(`bun install exited ${code}: ${stderr}`);
70
+ }
72
71
  }
73
72
  }
74
73
 
@@ -78,10 +77,11 @@ export function createNpmInstaller(deps) {
78
77
  }
79
78
 
80
79
  /**
81
- * Free-function shorthand for callers that don't need to inject a spawn seam.
80
+ * Free-function shorthand for callers that thread a runtime bag.
82
81
  * @param {import("./task-family.js").TaskFamily} family
83
82
  * @param {string} stagingDir
83
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
84
84
  */
85
- export function installNpm(family, stagingDir) {
86
- return new NpmInstaller().install(family, stagingDir);
85
+ export function installNpm(family, stagingDir, runtime) {
86
+ return new NpmInstaller({ runtime }).install(family, stagingDir);
87
87
  }
@@ -12,9 +12,7 @@
12
12
  * whole report.
13
13
  */
14
14
 
15
- import { createReadStream } from "node:fs";
16
15
  import { join } from "node:path";
17
- import { createInterface } from "node:readline";
18
16
 
19
17
  import { validateResultRecord } from "./result.js";
20
18
 
@@ -41,11 +39,17 @@ import { validateResultRecord } from "./result.js";
41
39
  */
42
40
 
43
41
  /**
44
- * @param {{inputDir: string, kValues: number[], includeRuns?: boolean}} opts
42
+ * @param {{inputDir: string, kValues: number[], includeRuns?: boolean, runtime: import("@forwardimpact/libutil/runtime").Runtime}} opts
45
43
  * @returns {Promise<{tasks: TaskReport[], totals: object}>}
46
44
  */
47
- export async function aggregate({ inputDir, kValues, includeRuns = false }) {
48
- const records = await loadRecords(inputDir);
45
+ export async function aggregate({
46
+ inputDir,
47
+ kValues,
48
+ includeRuns = false,
49
+ runtime,
50
+ }) {
51
+ if (!runtime) throw new Error("runtime is required");
52
+ const records = await loadRecords(inputDir, runtime);
49
53
  const grouped = groupByTask(records.records);
50
54
  const tasks = [];
51
55
  let totalRuns = 0;
@@ -429,20 +433,30 @@ function median(arr) {
429
433
  // Record loading
430
434
  // ---------------------------------------------------------------------------
431
435
 
432
- async function loadRecords(inputDir) {
436
+ async function loadRecords(inputDir, runtime) {
433
437
  const path = join(inputDir, "results.jsonl");
434
- const stream = createReadStream(path);
435
- const rl = createInterface({ input: stream, crlfDelay: Infinity });
438
+ let content;
439
+ try {
440
+ content = await runtime.fs.readFile(path, "utf8");
441
+ } catch (e) {
442
+ // Re-throw with the stack collapsed to the message line so the CLI's
443
+ // error rendering stays free of node-internal async `readFile` frames
444
+ // (matching the pre-1370 stream-error shape the golden captured).
445
+ const err = new Error(e.message);
446
+ if (e.code) err.code = e.code;
447
+ err.stack = `Error: ${e.message}`;
448
+ throw err;
449
+ }
436
450
  const records = [];
437
451
  let skipped = 0;
438
- for await (const line of rl) {
452
+ for (const line of content.split("\n")) {
439
453
  const trimmed = line.trim();
440
454
  if (!trimmed) continue;
441
455
  let record;
442
456
  try {
443
457
  record = JSON.parse(trimmed);
444
458
  } catch (e) {
445
- process.stderr.write(
459
+ runtime.proc.stderr.write(
446
460
  `benchmark report: skipped malformed JSON line — ${e.message}\n`,
447
461
  );
448
462
  skipped++;
@@ -451,7 +465,7 @@ async function loadRecords(inputDir) {
451
465
  try {
452
466
  validateResultRecord(record);
453
467
  } catch (e) {
454
- process.stderr.write(
468
+ runtime.proc.stderr.write(
455
469
  `benchmark report: skipped record failing schema — ${describeError(e)}\n`,
456
470
  );
457
471
  skipped++;
@@ -5,8 +5,8 @@
5
5
  * - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
6
6
  * benchmark run. Has a happy branch (invariants + judge present) and a
7
7
  * pre-flight-failure branch (invariants/judgeVerdict/submission absent).
8
- * - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`
9
- * (P7): ad-hoc grading without a full lifecycle.
8
+ * - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`:
9
+ * ad-hoc grading without a full lifecycle.
10
10
  *
11
11
  * Validation is throw-on-mismatch so the runner can wrap every JSONL append
12
12
  * in a guard and reject schema drift at write time.
@@ -14,8 +14,6 @@
14
14
  * the JSONL append is the system of record.
15
15
  */
16
16
 
17
- import { createReadStream, createWriteStream } from "node:fs";
18
- import { mkdir, readFile, unlink } from "node:fs/promises";
19
17
  import { createInterface } from "node:readline";
20
18
  import { join, resolve as resolvePath } from "node:path";
21
19
 
@@ -60,17 +58,21 @@ export class BenchmarkRunner {
60
58
  * write a valid NDJSON trace to `workdir.agentTracePath`. Default uses
61
59
  * `createAgentRunner` with the harness `BASE_TOOLS` allowlist. Internal
62
60
  * testing only — not part of the public API.
61
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} opts.runtime -
62
+ * Injected ambient collaborators (`fs`, `subprocess`, `clock`, `proc`),
63
+ * threaded into the installers, workdir manager, invariants, and judge.
63
64
  * @param {Function} [opts.runInvariants] - Test seam: replaces `runInvariants`.
64
- * Same contract as `runInvariants(task, ctx)`. Internal testing only.
65
+ * Same contract as `runInvariants(task, ctx, runtime)`. Internal testing only.
65
66
  * @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
66
- * contract as `runJudge(task, workdir, invariants, deps)`. Internal testing
67
- * only.
67
+ * contract as `runJudge(task, workdir, invariants, deps)` (deps carries
68
+ * `runtime`). Internal testing only.
68
69
  * @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
69
- * Same contract as `installApm(family, outputDir)`. Lets tests inject a
70
- * fake `apm` spawn (or skip the install entirely) so the suite never
71
- * shells out to a real `apm` binary. Internal testing only.
70
+ * Same contract as `installApm(family, outputDir, runtime)`. Lets tests
71
+ * inject a fake subprocess (or skip the install entirely) so the suite
72
+ * never shells out to a real `apm` binary. Internal testing only.
72
73
  * @param {Function} [opts.installNpm] - Test seam: replaces `installNpm`.
73
- * Same contract as `installNpm(family, stagingDir)`. Internal testing only.
74
+ * Same contract as `installNpm(family, stagingDir, runtime)`. Internal
75
+ * testing only.
74
76
  */
75
77
  constructor({
76
78
  family,
@@ -84,6 +86,7 @@ export class BenchmarkRunner {
84
86
  allowedTools,
85
87
  maxTurns,
86
88
  termGraceMs,
89
+ runtime,
87
90
  // Test seams — default to the real implementations.
88
91
  runAgent,
89
92
  runInvariants: runInvariantsHook,
@@ -91,12 +94,8 @@ export class BenchmarkRunner {
91
94
  installApm: installApmHook,
92
95
  installNpm: installNpmHook,
93
96
  }) {
94
- if (!family) throw new Error("family is required");
95
- if (!Number.isInteger(runs) || runs < 1)
96
- throw new Error("runs must be an integer ≥ 1");
97
- if (!output) throw new Error("output is required");
98
- if (!agentModel) throw new Error("agentModel is required");
99
- if (!query) throw new Error("query is required");
97
+ validateRunnerArgs({ family, runs, output, agentModel, query, runtime });
98
+ this.runtime = runtime;
100
99
  this.familyInput = family;
101
100
  this.runs = runs;
102
101
  this.output = output;
@@ -123,15 +122,16 @@ export class BenchmarkRunner {
123
122
  * @returns {AsyncGenerator<object>}
124
123
  */
125
124
  async *run() {
125
+ const runtime = this.runtime;
126
126
  const family =
127
127
  typeof this.familyInput === "string"
128
- ? await loadTaskFamily(this.familyInput)
128
+ ? await loadTaskFamily(this.familyInput, runtime)
129
129
  : this.familyInput;
130
130
 
131
- await mkdir(this.output, { recursive: true });
131
+ await runtime.fs.mkdir(this.output, { recursive: true });
132
132
  const { stagingDir, skillSetHash, judgeProfilesDir } =
133
- await this._installApmHook(family, this.output);
134
- await this._installNpmHook(family, stagingDir);
133
+ await this._installApmHook(family, this.output, runtime);
134
+ await this._installNpmHook(family, stagingDir, runtime);
135
135
 
136
136
  const tasks = family.tasks();
137
137
  if (this.profiles.judge) {
@@ -139,6 +139,7 @@ export class BenchmarkRunner {
139
139
  family,
140
140
  judgeProfilesDir,
141
141
  this.profiles.judge,
142
+ runtime,
142
143
  );
143
144
  }
144
145
 
@@ -147,10 +148,13 @@ export class BenchmarkRunner {
147
148
  runOutputDir: this.output,
148
149
  termGraceMs: this.termGraceMs,
149
150
  familyRootPath: family.rootPath,
151
+ runtime,
150
152
  });
151
153
 
152
154
  const resultsPath = join(this.output, "results.jsonl");
153
- const resultsStream = createWriteStream(resultsPath, { flags: "a" });
155
+ const resultsStream = runtime.fs.createWriteStream(resultsPath, {
156
+ flags: "a",
157
+ });
154
158
  try {
155
159
  for (const task of tasks) {
156
160
  for (let runIndex = 0; runIndex < this.runs; runIndex++) {
@@ -172,7 +176,7 @@ export class BenchmarkRunner {
172
176
  }
173
177
 
174
178
  async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
175
- const t0 = Date.now();
179
+ const t0 = this.runtime.clock.now();
176
180
  const workdir = await wm.start(task, runIndex);
177
181
  try {
178
182
  if (workdir.preflightError) {
@@ -182,7 +186,7 @@ export class BenchmarkRunner {
182
186
  workdir,
183
187
  skillSetHash,
184
188
  familyRevision: family.familyRevision,
185
- durationMs: Date.now() - t0,
189
+ durationMs: this.runtime.clock.now() - t0,
186
190
  });
187
191
  return this.#validateOrFallback(
188
192
  record,
@@ -191,11 +195,15 @@ export class BenchmarkRunner {
191
195
  }
192
196
  const { costUsd, turns, submission, agentError } =
193
197
  await this.#runAgentSafe(task, workdir);
194
- const invariants = await this._runInvariantsHook(task, {
195
- cwd: workdir.cwd,
196
- port: workdir.port,
197
- runDir: workdir.runDir,
198
- });
198
+ const invariants = await this._runInvariantsHook(
199
+ task,
200
+ {
201
+ cwd: workdir.cwd,
202
+ port: workdir.port,
203
+ runDir: workdir.runDir,
204
+ },
205
+ this.runtime,
206
+ );
199
207
  let judgeVerdict = null;
200
208
  if (task.paths.judge) {
201
209
  const judgeContext = await this.#buildJudgeContext(
@@ -212,6 +220,7 @@ export class BenchmarkRunner {
212
220
  model: this.judgeModel,
213
221
  judgeProfile: this.profiles.judge ?? undefined,
214
222
  profilesDir: judgeProfilesDir,
223
+ runtime: this.runtime,
215
224
  },
216
225
  judgeContext,
217
226
  );
@@ -245,7 +254,7 @@ export class BenchmarkRunner {
245
254
  },
246
255
  skillSetHash,
247
256
  familyRevision: family.familyRevision,
248
- durationMs: Date.now() - t0,
257
+ durationMs: this.runtime.clock.now() - t0,
249
258
  ...(agentError && { agentError }),
250
259
  };
251
260
  return this.#validateOrFallback(record, resultsRecordKey(task, runIndex));
@@ -283,10 +292,11 @@ export class BenchmarkRunner {
283
292
  * agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
284
293
  */
285
294
  async #runAgent(task, workdir) {
295
+ const fs = this.runtime.fs;
286
296
  const combinedPath = join(workdir.runDir, ".combined.ndjson");
287
- const combinedStream = createWriteStream(combinedPath);
297
+ const combinedStream = fs.createWriteStream(combinedPath);
288
298
  const supervisorInstructions = task.paths.supervisor
289
- ? await readFile(task.paths.supervisor, "utf8").catch(() => null)
299
+ ? await fs.readFile(task.paths.supervisor, "utf8").catch(() => null)
290
300
  : null;
291
301
  const supervisor = createSupervisor({
292
302
  supervisorCwd: workdir.cwd,
@@ -301,9 +311,11 @@ export class BenchmarkRunner {
301
311
  ...(supervisorInstructions && { taskAmend: supervisorInstructions }),
302
312
  redactor: createRedactor({
303
313
  allowlist: [...DEFAULT_ENV_ALLOWLIST, ...(workdir.envNames ?? [])],
314
+ runtime: this.runtime,
304
315
  }),
316
+ runtime: this.runtime,
305
317
  });
306
- const instructions = await readFile(task.paths.instructions, "utf8");
318
+ const instructions = await fs.readFile(task.paths.instructions, "utf8");
307
319
  let agentError = null;
308
320
  try {
309
321
  const result = await supervisor.run(instructions);
@@ -316,16 +328,21 @@ export class BenchmarkRunner {
316
328
  await new Promise((r) => combinedStream.end(r));
317
329
  }
318
330
  const summary = await splitAndSummarize(
331
+ this.runtime,
319
332
  combinedPath,
320
333
  workdir.agentTracePath,
321
334
  workdir.supervisorTracePath,
322
335
  );
323
- await unlink(combinedPath).catch(() => {});
336
+ await fs.unlink(combinedPath).catch(() => {});
324
337
  return { ...summary, agentError };
325
338
  }
326
339
 
327
340
  async #buildJudgeContext(task, workdir, skillSetHash) {
328
- const agentInstructions = await readFile(task.paths.instructions, "utf8");
341
+ const fs = this.runtime.fs;
342
+ const agentInstructions = await fs.readFile(
343
+ task.paths.instructions,
344
+ "utf8",
345
+ );
329
346
  let agentProfile = "";
330
347
  if (this.profiles.agent) {
331
348
  const profilePath = resolvePath(
@@ -333,7 +350,7 @@ export class BenchmarkRunner {
333
350
  ".claude/agents",
334
351
  `${this.profiles.agent}.md`,
335
352
  );
336
- agentProfile = await readFile(profilePath, "utf8").catch(() => "");
353
+ agentProfile = await fs.readFile(profilePath, "utf8").catch(() => "");
337
354
  }
338
355
  return { agentInstructions, agentProfile, skillSetHash };
339
356
  }
@@ -390,6 +407,27 @@ export class BenchmarkRunner {
390
407
  }
391
408
  }
392
409
 
410
+ /**
411
+ * Validate the required BenchmarkRunner constructor arguments. Extracted from
412
+ * the constructor to keep its cognitive complexity under the lint ceiling.
413
+ */
414
+ function validateRunnerArgs({
415
+ family,
416
+ runs,
417
+ output,
418
+ agentModel,
419
+ query,
420
+ runtime,
421
+ }) {
422
+ if (!family) throw new Error("family is required");
423
+ if (!Number.isInteger(runs) || runs < 1)
424
+ throw new Error("runs must be an integer ≥ 1");
425
+ if (!output) throw new Error("output is required");
426
+ if (!agentModel) throw new Error("agentModel is required");
427
+ if (!query) throw new Error("query is required");
428
+ if (!runtime) throw new Error("runtime is required");
429
+ }
430
+
393
431
  function resultsRecordKey(task, runIndex) {
394
432
  return { taskId: task.id, runIndex };
395
433
  }
@@ -408,11 +446,17 @@ async function writeRecord(stream, record) {
408
446
  * `supervisorPath`.
409
447
  */
410
448
  // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
411
- async function splitAndSummarize(combinedPath, agentPath, supervisorPath) {
412
- const agentStream = createWriteStream(agentPath);
413
- const supStream = createWriteStream(supervisorPath);
449
+ async function splitAndSummarize(
450
+ runtime,
451
+ combinedPath,
452
+ agentPath,
453
+ supervisorPath,
454
+ ) {
455
+ const fs = runtime.fs;
456
+ const agentStream = fs.createWriteStream(agentPath);
457
+ const supStream = fs.createWriteStream(supervisorPath);
414
458
  const rl = createInterface({
415
- input: createReadStream(combinedPath),
459
+ input: fs.createReadStream(combinedPath),
416
460
  crlfDelay: Infinity,
417
461
  });
418
462
  let agentCost = 0;