@forwardimpact/libeval 0.1.51 → 0.1.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,8 +14,6 @@
14
14
  * the JSONL append is the system of record.
15
15
  */
16
16
 
17
- import { createReadStream, createWriteStream } from "node:fs";
18
- import { mkdir, readFile, unlink } from "node:fs/promises";
19
17
  import { createInterface } from "node:readline";
20
18
  import { join, resolve as resolvePath } from "node:path";
21
19
 
@@ -60,17 +58,21 @@ export class BenchmarkRunner {
60
58
  * write a valid NDJSON trace to `workdir.agentTracePath`. Default uses
61
59
  * `createAgentRunner` with the harness `BASE_TOOLS` allowlist. Internal
62
60
  * testing only — not part of the public API.
61
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} opts.runtime -
62
+ * Injected ambient collaborators (`fs`, `subprocess`, `clock`, `proc`),
63
+ * threaded into the installers, workdir manager, invariants, and judge.
63
64
  * @param {Function} [opts.runInvariants] - Test seam: replaces `runInvariants`.
64
- * Same contract as `runInvariants(task, ctx)`. Internal testing only.
65
+ * Same contract as `runInvariants(task, ctx, runtime)`. Internal testing only.
65
66
  * @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
66
- * contract as `runJudge(task, workdir, invariants, deps)`. Internal testing
67
- * only.
67
+ * contract as `runJudge(task, workdir, invariants, deps)` (deps carries
68
+ * `runtime`). Internal testing only.
68
69
  * @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
69
- * Same contract as `installApm(family, outputDir)`. Lets tests inject a
70
- * fake `apm` spawn (or skip the install entirely) so the suite never
71
- * shells out to a real `apm` binary. Internal testing only.
70
+ * Same contract as `installApm(family, outputDir, runtime)`. Lets tests
71
+ * inject a fake subprocess (or skip the install entirely) so the suite
72
+ * never shells out to a real `apm` binary. Internal testing only.
72
73
  * @param {Function} [opts.installNpm] - Test seam: replaces `installNpm`.
73
- * Same contract as `installNpm(family, stagingDir)`. Internal testing only.
74
+ * Same contract as `installNpm(family, stagingDir, runtime)`. Internal
75
+ * testing only.
74
76
  */
75
77
  constructor({
76
78
  family,
@@ -84,6 +86,7 @@ export class BenchmarkRunner {
84
86
  allowedTools,
85
87
  maxTurns,
86
88
  termGraceMs,
89
+ runtime,
87
90
  // Test seams — default to the real implementations.
88
91
  runAgent,
89
92
  runInvariants: runInvariantsHook,
@@ -91,12 +94,8 @@ export class BenchmarkRunner {
91
94
  installApm: installApmHook,
92
95
  installNpm: installNpmHook,
93
96
  }) {
94
- if (!family) throw new Error("family is required");
95
- if (!Number.isInteger(runs) || runs < 1)
96
- throw new Error("runs must be an integer ≥ 1");
97
- if (!output) throw new Error("output is required");
98
- if (!agentModel) throw new Error("agentModel is required");
99
- if (!query) throw new Error("query is required");
97
+ validateRunnerArgs({ family, runs, output, agentModel, query, runtime });
98
+ this.runtime = runtime;
100
99
  this.familyInput = family;
101
100
  this.runs = runs;
102
101
  this.output = output;
@@ -123,15 +122,16 @@ export class BenchmarkRunner {
123
122
  * @returns {AsyncGenerator<object>}
124
123
  */
125
124
  async *run() {
125
+ const runtime = this.runtime;
126
126
  const family =
127
127
  typeof this.familyInput === "string"
128
- ? await loadTaskFamily(this.familyInput)
128
+ ? await loadTaskFamily(this.familyInput, runtime)
129
129
  : this.familyInput;
130
130
 
131
- await mkdir(this.output, { recursive: true });
131
+ await runtime.fs.mkdir(this.output, { recursive: true });
132
132
  const { stagingDir, skillSetHash, judgeProfilesDir } =
133
- await this._installApmHook(family, this.output);
134
- await this._installNpmHook(family, stagingDir);
133
+ await this._installApmHook(family, this.output, runtime);
134
+ await this._installNpmHook(family, stagingDir, runtime);
135
135
 
136
136
  const tasks = family.tasks();
137
137
  if (this.profiles.judge) {
@@ -139,6 +139,7 @@ export class BenchmarkRunner {
139
139
  family,
140
140
  judgeProfilesDir,
141
141
  this.profiles.judge,
142
+ runtime,
142
143
  );
143
144
  }
144
145
 
@@ -147,10 +148,13 @@ export class BenchmarkRunner {
147
148
  runOutputDir: this.output,
148
149
  termGraceMs: this.termGraceMs,
149
150
  familyRootPath: family.rootPath,
151
+ runtime,
150
152
  });
151
153
 
152
154
  const resultsPath = join(this.output, "results.jsonl");
153
- const resultsStream = createWriteStream(resultsPath, { flags: "a" });
155
+ const resultsStream = runtime.fs.createWriteStream(resultsPath, {
156
+ flags: "a",
157
+ });
154
158
  try {
155
159
  for (const task of tasks) {
156
160
  for (let runIndex = 0; runIndex < this.runs; runIndex++) {
@@ -172,7 +176,7 @@ export class BenchmarkRunner {
172
176
  }
173
177
 
174
178
  async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
175
- const t0 = Date.now();
179
+ const t0 = this.runtime.clock.now();
176
180
  const workdir = await wm.start(task, runIndex);
177
181
  try {
178
182
  if (workdir.preflightError) {
@@ -182,7 +186,7 @@ export class BenchmarkRunner {
182
186
  workdir,
183
187
  skillSetHash,
184
188
  familyRevision: family.familyRevision,
185
- durationMs: Date.now() - t0,
189
+ durationMs: this.runtime.clock.now() - t0,
186
190
  });
187
191
  return this.#validateOrFallback(
188
192
  record,
@@ -191,11 +195,15 @@ export class BenchmarkRunner {
191
195
  }
192
196
  const { costUsd, turns, submission, agentError } =
193
197
  await this.#runAgentSafe(task, workdir);
194
- const invariants = await this._runInvariantsHook(task, {
195
- cwd: workdir.cwd,
196
- port: workdir.port,
197
- runDir: workdir.runDir,
198
- });
198
+ const invariants = await this._runInvariantsHook(
199
+ task,
200
+ {
201
+ cwd: workdir.cwd,
202
+ port: workdir.port,
203
+ runDir: workdir.runDir,
204
+ },
205
+ this.runtime,
206
+ );
199
207
  let judgeVerdict = null;
200
208
  if (task.paths.judge) {
201
209
  const judgeContext = await this.#buildJudgeContext(
@@ -212,6 +220,7 @@ export class BenchmarkRunner {
212
220
  model: this.judgeModel,
213
221
  judgeProfile: this.profiles.judge ?? undefined,
214
222
  profilesDir: judgeProfilesDir,
223
+ runtime: this.runtime,
215
224
  },
216
225
  judgeContext,
217
226
  );
@@ -245,7 +254,7 @@ export class BenchmarkRunner {
245
254
  },
246
255
  skillSetHash,
247
256
  familyRevision: family.familyRevision,
248
- durationMs: Date.now() - t0,
257
+ durationMs: this.runtime.clock.now() - t0,
249
258
  ...(agentError && { agentError }),
250
259
  };
251
260
  return this.#validateOrFallback(record, resultsRecordKey(task, runIndex));
@@ -283,10 +292,11 @@ export class BenchmarkRunner {
283
292
  * agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
284
293
  */
285
294
  async #runAgent(task, workdir) {
295
+ const fs = this.runtime.fs;
286
296
  const combinedPath = join(workdir.runDir, ".combined.ndjson");
287
- const combinedStream = createWriteStream(combinedPath);
297
+ const combinedStream = fs.createWriteStream(combinedPath);
288
298
  const supervisorInstructions = task.paths.supervisor
289
- ? await readFile(task.paths.supervisor, "utf8").catch(() => null)
299
+ ? await fs.readFile(task.paths.supervisor, "utf8").catch(() => null)
290
300
  : null;
291
301
  const supervisor = createSupervisor({
292
302
  supervisorCwd: workdir.cwd,
@@ -301,9 +311,11 @@ export class BenchmarkRunner {
301
311
  ...(supervisorInstructions && { taskAmend: supervisorInstructions }),
302
312
  redactor: createRedactor({
303
313
  allowlist: [...DEFAULT_ENV_ALLOWLIST, ...(workdir.envNames ?? [])],
314
+ runtime: this.runtime,
304
315
  }),
316
+ runtime: this.runtime,
305
317
  });
306
- const instructions = await readFile(task.paths.instructions, "utf8");
318
+ const instructions = await fs.readFile(task.paths.instructions, "utf8");
307
319
  let agentError = null;
308
320
  try {
309
321
  const result = await supervisor.run(instructions);
@@ -316,16 +328,21 @@ export class BenchmarkRunner {
316
328
  await new Promise((r) => combinedStream.end(r));
317
329
  }
318
330
  const summary = await splitAndSummarize(
331
+ this.runtime,
319
332
  combinedPath,
320
333
  workdir.agentTracePath,
321
334
  workdir.supervisorTracePath,
322
335
  );
323
- await unlink(combinedPath).catch(() => {});
336
+ await fs.unlink(combinedPath).catch(() => {});
324
337
  return { ...summary, agentError };
325
338
  }
326
339
 
327
340
  async #buildJudgeContext(task, workdir, skillSetHash) {
328
- const agentInstructions = await readFile(task.paths.instructions, "utf8");
341
+ const fs = this.runtime.fs;
342
+ const agentInstructions = await fs.readFile(
343
+ task.paths.instructions,
344
+ "utf8",
345
+ );
329
346
  let agentProfile = "";
330
347
  if (this.profiles.agent) {
331
348
  const profilePath = resolvePath(
@@ -333,7 +350,7 @@ export class BenchmarkRunner {
333
350
  ".claude/agents",
334
351
  `${this.profiles.agent}.md`,
335
352
  );
336
- agentProfile = await readFile(profilePath, "utf8").catch(() => "");
353
+ agentProfile = await fs.readFile(profilePath, "utf8").catch(() => "");
337
354
  }
338
355
  return { agentInstructions, agentProfile, skillSetHash };
339
356
  }
@@ -390,6 +407,27 @@ export class BenchmarkRunner {
390
407
  }
391
408
  }
392
409
 
410
+ /**
411
+ * Validate the required BenchmarkRunner constructor arguments. Extracted from
412
+ * the constructor to keep its cognitive complexity under the lint ceiling.
413
+ */
414
+ function validateRunnerArgs({
415
+ family,
416
+ runs,
417
+ output,
418
+ agentModel,
419
+ query,
420
+ runtime,
421
+ }) {
422
+ if (!family) throw new Error("family is required");
423
+ if (!Number.isInteger(runs) || runs < 1)
424
+ throw new Error("runs must be an integer ≥ 1");
425
+ if (!output) throw new Error("output is required");
426
+ if (!agentModel) throw new Error("agentModel is required");
427
+ if (!query) throw new Error("query is required");
428
+ if (!runtime) throw new Error("runtime is required");
429
+ }
430
+
393
431
  function resultsRecordKey(task, runIndex) {
394
432
  return { taskId: task.id, runIndex };
395
433
  }
@@ -408,11 +446,17 @@ async function writeRecord(stream, record) {
408
446
  * `supervisorPath`.
409
447
  */
410
448
  // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
411
- async function splitAndSummarize(combinedPath, agentPath, supervisorPath) {
412
- const agentStream = createWriteStream(agentPath);
413
- const supStream = createWriteStream(supervisorPath);
449
+ async function splitAndSummarize(
450
+ runtime,
451
+ combinedPath,
452
+ agentPath,
453
+ supervisorPath,
454
+ ) {
455
+ const fs = runtime.fs;
456
+ const agentStream = fs.createWriteStream(agentPath);
457
+ const supStream = fs.createWriteStream(supervisorPath);
414
458
  const rl = createInterface({
415
- input: createReadStream(combinedPath),
459
+ input: fs.createReadStream(combinedPath),
416
460
  crlfDelay: Infinity,
417
461
  });
418
462
  let agentCost = 0;
@@ -17,45 +17,55 @@
17
17
  * a temp dir and `familyRevision` becomes `git:<sha>` of HEAD at clone time.
18
18
  * Local paths use the canonical-tree algorithm from design § Family revision
19
19
  * algorithm so the result is stable across operating systems.
20
+ *
21
+ * Filesystem and subprocess access route through the injected `runtime` bag
22
+ * (`runtime.fs` async, `runtime.subprocess.run` one-shot, `tmpdir` derived
23
+ * from `runtime.proc.env`).
20
24
  */
21
25
 
22
- import { spawn } from "node:child_process";
23
26
  import { createHash } from "node:crypto";
24
- import {
25
- access,
26
- constants,
27
- lstat,
28
- mkdtemp,
29
- readdir,
30
- readFile,
31
- realpath,
32
- } from "node:fs/promises";
33
- import { tmpdir } from "node:os";
34
27
  import { join, posix, relative, resolve, sep } from "node:path";
35
28
 
36
29
  const GIT_URL_RE = /^(git@|https?:\/\/|ssh:\/\/|git:\/\/)/;
37
30
  const SKIP_DIRS = new Set([".git", "node_modules"]);
31
+ // POSIX `X_OK` (execute permission); node's fs honours the numeric mode, so we
32
+ // avoid importing `node:fs`'s `constants` (which would light the fs smell).
33
+ const X_OK = 1;
34
+
35
+ /**
36
+ * Derive the system temp dir from the env (node's `os.tmpdir()` is itself an
37
+ * env-respecting wrapper). The runtime bag has no `os` slot by design.
38
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
39
+ * @returns {string}
40
+ */
41
+ function tmpdir(runtime) {
42
+ return runtime.proc.env.TMPDIR ?? "/tmp";
43
+ }
38
44
 
39
45
  /**
40
46
  * Load a task family from a local path or git URL.
41
47
  * @param {string} rootPathOrGitUrl
48
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
42
49
  * @returns {Promise<TaskFamily>}
43
50
  */
44
- export async function loadTaskFamily(rootPathOrGitUrl) {
51
+ export async function loadTaskFamily(rootPathOrGitUrl, runtime) {
52
+ if (!runtime) throw new Error("runtime is required");
45
53
  const isGit = GIT_URL_RE.test(rootPathOrGitUrl);
46
54
  let rootPath;
47
55
  let familyRevision;
48
56
  if (isGit) {
49
- const dir = await mkdtemp(join(tmpdir(), "fit-benchmark-family-"));
50
- await gitClone(rootPathOrGitUrl, dir);
57
+ const dir = await runtime.fs.mkdtemp(
58
+ join(tmpdir(runtime), "fit-benchmark-family-"),
59
+ );
60
+ await gitClone(runtime, rootPathOrGitUrl, dir);
51
61
  rootPath = dir;
52
- familyRevision = "git:" + (await gitHead(dir));
62
+ familyRevision = "git:" + (await gitHead(runtime, dir));
53
63
  } else {
54
64
  rootPath = resolve(rootPathOrGitUrl);
55
- familyRevision = "sha256:" + (await canonicalTreeHash(rootPath));
65
+ familyRevision = "sha256:" + (await canonicalTreeHash(runtime, rootPath));
56
66
  }
57
67
 
58
- const tasks = await discoverTasks(rootPath);
68
+ const tasks = await discoverTasks(runtime, rootPath);
59
69
 
60
70
  return {
61
71
  rootPath,
@@ -73,27 +83,30 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
73
83
  * @param {TaskFamily} _family
74
84
  * @param {string} judgeProfilesDir
75
85
  * @param {string} judgeProfile
86
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
76
87
  * @returns {Promise<void>}
77
88
  */
78
89
  export async function assertJudgeProfileStaged(
79
90
  _family,
80
91
  judgeProfilesDir,
81
92
  judgeProfile,
93
+ runtime,
82
94
  ) {
83
95
  const candidate = join(judgeProfilesDir, `${judgeProfile}.md`);
84
96
  try {
85
- await access(candidate);
97
+ await runtime.fs.access(candidate);
86
98
  } catch {
87
99
  throw new Error(`judge profile not staged: ${candidate}`);
88
100
  }
89
101
  }
90
102
 
91
- async function discoverTasks(rootPath) {
103
+ async function discoverTasks(runtime, rootPath) {
104
+ const fs = runtime.fs;
92
105
  const tasksRoot = join(rootPath, "tasks");
93
106
  const tasks = [];
94
107
  let entries;
95
108
  try {
96
- entries = await readdir(tasksRoot, { withFileTypes: true });
109
+ entries = await fs.readdir(tasksRoot, { withFileTypes: true });
97
110
  } catch (e) {
98
111
  if (e.code === "ENOENT") return tasks;
99
112
  throw e;
@@ -110,11 +123,15 @@ async function discoverTasks(rootPath) {
110
123
  paths: {
111
124
  taskDir,
112
125
  instructions: join(taskDir, "agent.task.md"),
113
- supervisor: (await fileExists(supervisorPath)) ? supervisorPath : null,
114
- judge: (await fileExists(judgePath)) ? judgePath : null,
126
+ supervisor: (await fileExists(fs, supervisorPath))
127
+ ? supervisorPath
128
+ : null,
129
+ judge: (await fileExists(fs, judgePath)) ? judgePath : null,
115
130
  hooks: join(taskDir, "hooks"),
116
- preflight: (await fileExecutable(preflightPath)) ? preflightPath : null,
117
- invariants: (await fileExecutable(invariantsPath))
131
+ preflight: (await fileExecutable(fs, preflightPath))
132
+ ? preflightPath
133
+ : null,
134
+ invariants: (await fileExecutable(fs, invariantsPath))
118
135
  ? invariantsPath
119
136
  : null,
120
137
  specs: join(taskDir, "specs"),
@@ -126,18 +143,18 @@ async function discoverTasks(rootPath) {
126
143
  return tasks;
127
144
  }
128
145
 
129
- async function fileExists(path) {
146
+ async function fileExists(fs, path) {
130
147
  try {
131
- await access(path);
148
+ await fs.access(path);
132
149
  return true;
133
150
  } catch {
134
151
  return false;
135
152
  }
136
153
  }
137
154
 
138
- async function fileExecutable(path) {
155
+ async function fileExecutable(fs, path) {
139
156
  try {
140
- await access(path, constants.X_OK);
157
+ await fs.access(path, X_OK);
141
158
  return true;
142
159
  } catch {
143
160
  return false;
@@ -151,16 +168,18 @@ async function fileExecutable(path) {
151
168
  * sort by NFC-normalised POSIX-style root-relative path
152
169
  * row = <rel-path>\0<hex-sha256>\n
153
170
  * sha256(concat(rows))
171
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
154
172
  * @param {string} rootPath
155
173
  * @returns {Promise<string>} hex digest
156
174
  */
157
- async function canonicalTreeHash(rootPath) {
158
- const real = await realpath(rootPath);
175
+ async function canonicalTreeHash(runtime, rootPath) {
176
+ const fs = runtime.fs;
177
+ const real = await fs.realpath(rootPath);
159
178
  const rows = [];
160
- for await (const filePath of walkFiles(real)) {
179
+ for await (const filePath of walkFiles(fs, real)) {
161
180
  const rel = toPosix(relative(real, filePath)).normalize("NFC");
162
- const target = await realpath(filePath);
163
- const bytes = await readFile(target);
181
+ const target = await fs.realpath(filePath);
182
+ const bytes = await fs.readFile(target);
164
183
  const hex = createHash("sha256").update(bytes).digest("hex");
165
184
  rows.push({ rel, hex });
166
185
  }
@@ -170,15 +189,15 @@ async function canonicalTreeHash(rootPath) {
170
189
  return acc.digest("hex");
171
190
  }
172
191
 
173
- async function* walkFiles(dir) {
174
- const entries = await readdir(dir, { withFileTypes: true });
192
+ async function* walkFiles(fs, dir) {
193
+ const entries = await fs.readdir(dir, { withFileTypes: true });
175
194
  for (const entry of entries) {
176
195
  const full = join(dir, entry.name);
177
196
  if (entry.isDirectory()) {
178
197
  if (SKIP_DIRS.has(entry.name)) continue;
179
- yield* walkFiles(full);
198
+ yield* walkFiles(fs, full);
180
199
  } else if (entry.isSymbolicLink()) {
181
- const resolvedFile = await resolveSymlinkToFile(full);
200
+ const resolvedFile = await resolveSymlinkToFile(fs, full);
182
201
  if (resolvedFile) yield full;
183
202
  } else if (entry.isFile()) {
184
203
  yield full;
@@ -190,12 +209,12 @@ async function* walkFiles(dir) {
190
209
  * Return the resolved path if `linkPath` is a symlink to a regular file.
191
210
  * Returns null for dangling symlinks or links to non-file targets.
192
211
  */
193
- async function resolveSymlinkToFile(linkPath) {
194
- const st = await lstat(linkPath);
212
+ async function resolveSymlinkToFile(fs, linkPath) {
213
+ const st = await fs.lstat(linkPath);
195
214
  if (!st.isSymbolicLink()) return null;
196
215
  try {
197
- const resolved = await realpath(linkPath);
198
- const tstat = await lstat(resolved);
216
+ const resolved = await fs.realpath(linkPath);
217
+ const tstat = await fs.lstat(resolved);
199
218
  return tstat.isFile() ? resolved : null;
200
219
  } catch {
201
220
  return null;
@@ -207,32 +226,24 @@ function toPosix(p) {
207
226
  return p.split(sep).join(posix.sep);
208
227
  }
209
228
 
210
- async function gitClone(url, dir) {
211
- await run("git", ["clone", "--depth", "1", url, dir]);
229
+ async function gitClone(runtime, url, dir) {
230
+ await git(runtime, ["clone", "--depth", "1", url, dir]);
212
231
  }
213
232
 
214
- async function gitHead(dir) {
215
- const out = await run("git", ["-C", dir, "rev-parse", "HEAD"]);
233
+ async function gitHead(runtime, dir) {
234
+ const out = await git(runtime, ["-C", dir, "rev-parse", "HEAD"]);
216
235
  return out.trim();
217
236
  }
218
237
 
219
- function run(cmd, args) {
220
- return new Promise((res, rej) => {
221
- const child = spawn(cmd, args, { stdio: ["ignore", "pipe", "pipe"] });
222
- let stdout = "";
223
- let stderr = "";
224
- child.stdout.on("data", (d) => {
225
- stdout += d.toString();
226
- });
227
- child.stderr.on("data", (d) => {
228
- stderr += d.toString();
229
- });
230
- child.on("error", rej);
231
- child.on("close", (code) => {
232
- if (code === 0) res(stdout);
233
- else rej(new Error(`${cmd} ${args.join(" ")} exited ${code}: ${stderr}`));
234
- });
235
- });
238
+ async function git(runtime, args) {
239
+ const { stdout, stderr, exitCode } = await runtime.subprocess.run(
240
+ "git",
241
+ args,
242
+ );
243
+ if (exitCode !== 0) {
244
+ throw new Error(`git ${args.join(" ")} exited ${exitCode}: ${stderr}`);
245
+ }
246
+ return stdout;
236
247
  }
237
248
 
238
249
  /**