@forwardimpact/libeval 0.1.51 → 0.1.53
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +8 -14
- package/bin/fit-eval.js +8 -28
- package/bin/fit-selfedit.js +6 -4
- package/bin/fit-trace.js +7 -14
- package/package.json +1 -1
- package/src/benchmark/apm-installer.js +48 -44
- package/src/benchmark/invariants.js +51 -63
- package/src/benchmark/judge.js +13 -11
- package/src/benchmark/npm-installer.js +33 -33
- package/src/benchmark/report.js +25 -11
- package/src/benchmark/result.js +2 -2
- package/src/benchmark/runner.js +82 -38
- package/src/benchmark/task-family.js +74 -63
- package/src/benchmark/workdir.js +91 -99
- package/src/commands/benchmark-invariants.js +3 -3
- package/src/commands/benchmark-report.js +1 -0
- package/src/commands/benchmark-run.js +1 -1
- package/src/commands/by-discussion.js +10 -11
- package/src/commands/discuss.js +3 -2
- package/src/commands/facilitate.js +3 -2
- package/src/commands/output.js +4 -1
- package/src/commands/run.js +6 -2
- package/src/commands/supervise.js +3 -2
- package/src/commands/tee.js +24 -9
- package/src/commands/trace.js +7 -2
- package/src/discusser.js +7 -5
- package/src/events/github.js +7 -1
- package/src/facilitator.js +6 -5
- package/src/inbox-poller.js +5 -8
- package/src/judge.js +12 -13
- package/src/profile-prompt.js +124 -26
- package/src/redaction.js +3 -16
- package/src/supervisor.js +7 -0
- package/src/tee-writer.js +4 -2
- package/src/trace-collector.js +9 -2
- package/src/trace-github.js +47 -27
|
@@ -3,23 +3,22 @@
|
|
|
3
3
|
* is present, then copies the resulting `node_modules/` into the staging
|
|
4
4
|
* directory so WorkdirManager can seed each per-task CWD.
|
|
5
5
|
*
|
|
6
|
-
* Symmetric to ApmInstaller:
|
|
7
|
-
*
|
|
6
|
+
* Symmetric to ApmInstaller: the subprocess and filesystem flow through the
|
|
7
|
+
* injected `runtime` bag (`runtime.subprocess.spawn` + `runtime.fs`).
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
|
-
import { spawn as nodeSpawn } from "node:child_process";
|
|
11
|
-
import { access, cp } from "node:fs/promises";
|
|
12
10
|
import { join } from "node:path";
|
|
13
11
|
|
|
14
12
|
/** Run `bun install` in the family root and stage node_modules/ for per-task CWDs. */
|
|
15
13
|
export class NpmInstaller {
|
|
16
14
|
/**
|
|
17
|
-
* @param {object}
|
|
18
|
-
* @param {
|
|
19
|
-
*
|
|
15
|
+
* @param {object} deps
|
|
16
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} deps.runtime -
|
|
17
|
+
* Ambient collaborators; uses `subprocess.spawn` and `fs`.
|
|
20
18
|
*/
|
|
21
|
-
constructor({
|
|
22
|
-
|
|
19
|
+
constructor({ runtime }) {
|
|
20
|
+
if (!runtime) throw new Error("runtime is required");
|
|
21
|
+
this.runtime = runtime;
|
|
23
22
|
}
|
|
24
23
|
|
|
25
24
|
/**
|
|
@@ -28,8 +27,10 @@ export class NpmInstaller {
|
|
|
28
27
|
* @returns {Promise<void>}
|
|
29
28
|
*/
|
|
30
29
|
async install(family, stagingDir) {
|
|
30
|
+
const fs = this.runtime.fs;
|
|
31
31
|
const pkgJson = join(family.rootPath, "package.json");
|
|
32
|
-
const hasPkg = await
|
|
32
|
+
const hasPkg = await fs
|
|
33
|
+
.access(pkgJson)
|
|
33
34
|
.then(() => true)
|
|
34
35
|
.catch(() => false);
|
|
35
36
|
if (!hasPkg) return;
|
|
@@ -38,37 +39,35 @@ export class NpmInstaller {
|
|
|
38
39
|
|
|
39
40
|
const sourceModules = join(family.rootPath, "node_modules");
|
|
40
41
|
try {
|
|
41
|
-
await access(sourceModules);
|
|
42
|
+
await fs.access(sourceModules);
|
|
42
43
|
} catch {
|
|
43
44
|
throw new Error(
|
|
44
45
|
`bun install did not produce node_modules/ at ${sourceModules}; check the family's package.json`,
|
|
45
46
|
);
|
|
46
47
|
}
|
|
47
48
|
|
|
48
|
-
await cp(sourceModules, join(stagingDir, "node_modules"), {
|
|
49
|
+
await fs.cp(sourceModules, join(stagingDir, "node_modules"), {
|
|
49
50
|
recursive: true,
|
|
50
51
|
});
|
|
51
52
|
}
|
|
52
53
|
|
|
53
|
-
#runBunInstall(cwd) {
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
stdio: ["ignore", "pipe", "pipe"],
|
|
58
|
-
});
|
|
59
|
-
let stderr = "";
|
|
60
|
-
child.stdout.on("data", () => {});
|
|
61
|
-
child.stderr.on("data", (d) => {
|
|
62
|
-
stderr += d.toString();
|
|
63
|
-
});
|
|
64
|
-
child.on("error", (e) => {
|
|
65
|
-
rej(new Error(`failed to spawn bun: ${e.message}`));
|
|
66
|
-
});
|
|
67
|
-
child.on("close", (code) => {
|
|
68
|
-
if (code === 0) res();
|
|
69
|
-
else rej(new Error(`bun install exited ${code}: ${stderr}`));
|
|
70
|
-
});
|
|
54
|
+
async #runBunInstall(cwd) {
|
|
55
|
+
const child = this.runtime.subprocess.spawn("bun", ["install"], {
|
|
56
|
+
cwd,
|
|
57
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
71
58
|
});
|
|
59
|
+
let stderr = "";
|
|
60
|
+
const drainStdout = (async () => {
|
|
61
|
+
for await (const _chunk of child.stdout) {
|
|
62
|
+
// discard
|
|
63
|
+
}
|
|
64
|
+
})();
|
|
65
|
+
for await (const chunk of child.stderr) stderr += chunk.toString();
|
|
66
|
+
await drainStdout;
|
|
67
|
+
const code = await child.exitCode;
|
|
68
|
+
if (code !== 0) {
|
|
69
|
+
throw new Error(`bun install exited ${code}: ${stderr}`);
|
|
70
|
+
}
|
|
72
71
|
}
|
|
73
72
|
}
|
|
74
73
|
|
|
@@ -78,10 +77,11 @@ export function createNpmInstaller(deps) {
|
|
|
78
77
|
}
|
|
79
78
|
|
|
80
79
|
/**
|
|
81
|
-
* Free-function shorthand for callers that
|
|
80
|
+
* Free-function shorthand for callers that thread a runtime bag.
|
|
82
81
|
* @param {import("./task-family.js").TaskFamily} family
|
|
83
82
|
* @param {string} stagingDir
|
|
83
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
84
84
|
*/
|
|
85
|
-
export function installNpm(family, stagingDir) {
|
|
86
|
-
return new NpmInstaller().install(family, stagingDir);
|
|
85
|
+
export function installNpm(family, stagingDir, runtime) {
|
|
86
|
+
return new NpmInstaller({ runtime }).install(family, stagingDir);
|
|
87
87
|
}
|
package/src/benchmark/report.js
CHANGED
|
@@ -12,9 +12,7 @@
|
|
|
12
12
|
* whole report.
|
|
13
13
|
*/
|
|
14
14
|
|
|
15
|
-
import { createReadStream } from "node:fs";
|
|
16
15
|
import { join } from "node:path";
|
|
17
|
-
import { createInterface } from "node:readline";
|
|
18
16
|
|
|
19
17
|
import { validateResultRecord } from "./result.js";
|
|
20
18
|
|
|
@@ -41,11 +39,17 @@ import { validateResultRecord } from "./result.js";
|
|
|
41
39
|
*/
|
|
42
40
|
|
|
43
41
|
/**
|
|
44
|
-
* @param {{inputDir: string, kValues: number[], includeRuns?: boolean}} opts
|
|
42
|
+
* @param {{inputDir: string, kValues: number[], includeRuns?: boolean, runtime: import("@forwardimpact/libutil/runtime").Runtime}} opts
|
|
45
43
|
* @returns {Promise<{tasks: TaskReport[], totals: object}>}
|
|
46
44
|
*/
|
|
47
|
-
export async function aggregate({
|
|
48
|
-
|
|
45
|
+
export async function aggregate({
|
|
46
|
+
inputDir,
|
|
47
|
+
kValues,
|
|
48
|
+
includeRuns = false,
|
|
49
|
+
runtime,
|
|
50
|
+
}) {
|
|
51
|
+
if (!runtime) throw new Error("runtime is required");
|
|
52
|
+
const records = await loadRecords(inputDir, runtime);
|
|
49
53
|
const grouped = groupByTask(records.records);
|
|
50
54
|
const tasks = [];
|
|
51
55
|
let totalRuns = 0;
|
|
@@ -429,20 +433,30 @@ function median(arr) {
|
|
|
429
433
|
// Record loading
|
|
430
434
|
// ---------------------------------------------------------------------------
|
|
431
435
|
|
|
432
|
-
async function loadRecords(inputDir) {
|
|
436
|
+
async function loadRecords(inputDir, runtime) {
|
|
433
437
|
const path = join(inputDir, "results.jsonl");
|
|
434
|
-
|
|
435
|
-
|
|
438
|
+
let content;
|
|
439
|
+
try {
|
|
440
|
+
content = await runtime.fs.readFile(path, "utf8");
|
|
441
|
+
} catch (e) {
|
|
442
|
+
// Re-throw with the stack collapsed to the message line so the CLI's
|
|
443
|
+
// error rendering stays free of node-internal async `readFile` frames
|
|
444
|
+
// (matching the pre-1370 stream-error shape the golden captured).
|
|
445
|
+
const err = new Error(e.message);
|
|
446
|
+
if (e.code) err.code = e.code;
|
|
447
|
+
err.stack = `Error: ${e.message}`;
|
|
448
|
+
throw err;
|
|
449
|
+
}
|
|
436
450
|
const records = [];
|
|
437
451
|
let skipped = 0;
|
|
438
|
-
for
|
|
452
|
+
for (const line of content.split("\n")) {
|
|
439
453
|
const trimmed = line.trim();
|
|
440
454
|
if (!trimmed) continue;
|
|
441
455
|
let record;
|
|
442
456
|
try {
|
|
443
457
|
record = JSON.parse(trimmed);
|
|
444
458
|
} catch (e) {
|
|
445
|
-
|
|
459
|
+
runtime.proc.stderr.write(
|
|
446
460
|
`benchmark report: skipped malformed JSON line — ${e.message}\n`,
|
|
447
461
|
);
|
|
448
462
|
skipped++;
|
|
@@ -451,7 +465,7 @@ async function loadRecords(inputDir) {
|
|
|
451
465
|
try {
|
|
452
466
|
validateResultRecord(record);
|
|
453
467
|
} catch (e) {
|
|
454
|
-
|
|
468
|
+
runtime.proc.stderr.write(
|
|
455
469
|
`benchmark report: skipped record failing schema — ${describeError(e)}\n`,
|
|
456
470
|
);
|
|
457
471
|
skipped++;
|
package/src/benchmark/result.js
CHANGED
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
* - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
|
|
6
6
|
* benchmark run. Has a happy branch (invariants + judge present) and a
|
|
7
7
|
* pre-flight-failure branch (invariants/judgeVerdict/submission absent).
|
|
8
|
-
* - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants
|
|
9
|
-
*
|
|
8
|
+
* - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`:
|
|
9
|
+
* ad-hoc grading without a full lifecycle.
|
|
10
10
|
*
|
|
11
11
|
* Validation is throw-on-mismatch so the runner can wrap every JSONL append
|
|
12
12
|
* in a guard and reject schema drift at write time.
|
package/src/benchmark/runner.js
CHANGED
|
@@ -14,8 +14,6 @@
|
|
|
14
14
|
* the JSONL append is the system of record.
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
|
-
import { createReadStream, createWriteStream } from "node:fs";
|
|
18
|
-
import { mkdir, readFile, unlink } from "node:fs/promises";
|
|
19
17
|
import { createInterface } from "node:readline";
|
|
20
18
|
import { join, resolve as resolvePath } from "node:path";
|
|
21
19
|
|
|
@@ -60,17 +58,21 @@ export class BenchmarkRunner {
|
|
|
60
58
|
* write a valid NDJSON trace to `workdir.agentTracePath`. Default uses
|
|
61
59
|
* `createAgentRunner` with the harness `BASE_TOOLS` allowlist. Internal
|
|
62
60
|
* testing only — not part of the public API.
|
|
61
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} opts.runtime -
|
|
62
|
+
* Injected ambient collaborators (`fs`, `subprocess`, `clock`, `proc`),
|
|
63
|
+
* threaded into the installers, workdir manager, invariants, and judge.
|
|
63
64
|
* @param {Function} [opts.runInvariants] - Test seam: replaces `runInvariants`.
|
|
64
|
-
* Same contract as `runInvariants(task, ctx)`. Internal testing only.
|
|
65
|
+
* Same contract as `runInvariants(task, ctx, runtime)`. Internal testing only.
|
|
65
66
|
* @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
|
|
66
|
-
* contract as `runJudge(task, workdir, invariants, deps)
|
|
67
|
-
* only.
|
|
67
|
+
* contract as `runJudge(task, workdir, invariants, deps)` (deps carries
|
|
68
|
+
* `runtime`). Internal testing only.
|
|
68
69
|
* @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
|
|
69
|
-
* Same contract as `installApm(family, outputDir)`. Lets tests
|
|
70
|
-
* fake
|
|
71
|
-
* shells out to a real `apm` binary. Internal testing only.
|
|
70
|
+
* Same contract as `installApm(family, outputDir, runtime)`. Lets tests
|
|
71
|
+
* inject a fake subprocess (or skip the install entirely) so the suite
|
|
72
|
+
* never shells out to a real `apm` binary. Internal testing only.
|
|
72
73
|
* @param {Function} [opts.installNpm] - Test seam: replaces `installNpm`.
|
|
73
|
-
* Same contract as `installNpm(family, stagingDir)`. Internal
|
|
74
|
+
* Same contract as `installNpm(family, stagingDir, runtime)`. Internal
|
|
75
|
+
* testing only.
|
|
74
76
|
*/
|
|
75
77
|
constructor({
|
|
76
78
|
family,
|
|
@@ -84,6 +86,7 @@ export class BenchmarkRunner {
|
|
|
84
86
|
allowedTools,
|
|
85
87
|
maxTurns,
|
|
86
88
|
termGraceMs,
|
|
89
|
+
runtime,
|
|
87
90
|
// Test seams — default to the real implementations.
|
|
88
91
|
runAgent,
|
|
89
92
|
runInvariants: runInvariantsHook,
|
|
@@ -91,12 +94,8 @@ export class BenchmarkRunner {
|
|
|
91
94
|
installApm: installApmHook,
|
|
92
95
|
installNpm: installNpmHook,
|
|
93
96
|
}) {
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
throw new Error("runs must be an integer ≥ 1");
|
|
97
|
-
if (!output) throw new Error("output is required");
|
|
98
|
-
if (!agentModel) throw new Error("agentModel is required");
|
|
99
|
-
if (!query) throw new Error("query is required");
|
|
97
|
+
validateRunnerArgs({ family, runs, output, agentModel, query, runtime });
|
|
98
|
+
this.runtime = runtime;
|
|
100
99
|
this.familyInput = family;
|
|
101
100
|
this.runs = runs;
|
|
102
101
|
this.output = output;
|
|
@@ -123,15 +122,16 @@ export class BenchmarkRunner {
|
|
|
123
122
|
* @returns {AsyncGenerator<object>}
|
|
124
123
|
*/
|
|
125
124
|
async *run() {
|
|
125
|
+
const runtime = this.runtime;
|
|
126
126
|
const family =
|
|
127
127
|
typeof this.familyInput === "string"
|
|
128
|
-
? await loadTaskFamily(this.familyInput)
|
|
128
|
+
? await loadTaskFamily(this.familyInput, runtime)
|
|
129
129
|
: this.familyInput;
|
|
130
130
|
|
|
131
|
-
await mkdir(this.output, { recursive: true });
|
|
131
|
+
await runtime.fs.mkdir(this.output, { recursive: true });
|
|
132
132
|
const { stagingDir, skillSetHash, judgeProfilesDir } =
|
|
133
|
-
await this._installApmHook(family, this.output);
|
|
134
|
-
await this._installNpmHook(family, stagingDir);
|
|
133
|
+
await this._installApmHook(family, this.output, runtime);
|
|
134
|
+
await this._installNpmHook(family, stagingDir, runtime);
|
|
135
135
|
|
|
136
136
|
const tasks = family.tasks();
|
|
137
137
|
if (this.profiles.judge) {
|
|
@@ -139,6 +139,7 @@ export class BenchmarkRunner {
|
|
|
139
139
|
family,
|
|
140
140
|
judgeProfilesDir,
|
|
141
141
|
this.profiles.judge,
|
|
142
|
+
runtime,
|
|
142
143
|
);
|
|
143
144
|
}
|
|
144
145
|
|
|
@@ -147,10 +148,13 @@ export class BenchmarkRunner {
|
|
|
147
148
|
runOutputDir: this.output,
|
|
148
149
|
termGraceMs: this.termGraceMs,
|
|
149
150
|
familyRootPath: family.rootPath,
|
|
151
|
+
runtime,
|
|
150
152
|
});
|
|
151
153
|
|
|
152
154
|
const resultsPath = join(this.output, "results.jsonl");
|
|
153
|
-
const resultsStream = createWriteStream(resultsPath, {
|
|
155
|
+
const resultsStream = runtime.fs.createWriteStream(resultsPath, {
|
|
156
|
+
flags: "a",
|
|
157
|
+
});
|
|
154
158
|
try {
|
|
155
159
|
for (const task of tasks) {
|
|
156
160
|
for (let runIndex = 0; runIndex < this.runs; runIndex++) {
|
|
@@ -172,7 +176,7 @@ export class BenchmarkRunner {
|
|
|
172
176
|
}
|
|
173
177
|
|
|
174
178
|
async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
|
|
175
|
-
const t0 =
|
|
179
|
+
const t0 = this.runtime.clock.now();
|
|
176
180
|
const workdir = await wm.start(task, runIndex);
|
|
177
181
|
try {
|
|
178
182
|
if (workdir.preflightError) {
|
|
@@ -182,7 +186,7 @@ export class BenchmarkRunner {
|
|
|
182
186
|
workdir,
|
|
183
187
|
skillSetHash,
|
|
184
188
|
familyRevision: family.familyRevision,
|
|
185
|
-
durationMs:
|
|
189
|
+
durationMs: this.runtime.clock.now() - t0,
|
|
186
190
|
});
|
|
187
191
|
return this.#validateOrFallback(
|
|
188
192
|
record,
|
|
@@ -191,11 +195,15 @@ export class BenchmarkRunner {
|
|
|
191
195
|
}
|
|
192
196
|
const { costUsd, turns, submission, agentError } =
|
|
193
197
|
await this.#runAgentSafe(task, workdir);
|
|
194
|
-
const invariants = await this._runInvariantsHook(
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
198
|
+
const invariants = await this._runInvariantsHook(
|
|
199
|
+
task,
|
|
200
|
+
{
|
|
201
|
+
cwd: workdir.cwd,
|
|
202
|
+
port: workdir.port,
|
|
203
|
+
runDir: workdir.runDir,
|
|
204
|
+
},
|
|
205
|
+
this.runtime,
|
|
206
|
+
);
|
|
199
207
|
let judgeVerdict = null;
|
|
200
208
|
if (task.paths.judge) {
|
|
201
209
|
const judgeContext = await this.#buildJudgeContext(
|
|
@@ -212,6 +220,7 @@ export class BenchmarkRunner {
|
|
|
212
220
|
model: this.judgeModel,
|
|
213
221
|
judgeProfile: this.profiles.judge ?? undefined,
|
|
214
222
|
profilesDir: judgeProfilesDir,
|
|
223
|
+
runtime: this.runtime,
|
|
215
224
|
},
|
|
216
225
|
judgeContext,
|
|
217
226
|
);
|
|
@@ -245,7 +254,7 @@ export class BenchmarkRunner {
|
|
|
245
254
|
},
|
|
246
255
|
skillSetHash,
|
|
247
256
|
familyRevision: family.familyRevision,
|
|
248
|
-
durationMs:
|
|
257
|
+
durationMs: this.runtime.clock.now() - t0,
|
|
249
258
|
...(agentError && { agentError }),
|
|
250
259
|
};
|
|
251
260
|
return this.#validateOrFallback(record, resultsRecordKey(task, runIndex));
|
|
@@ -283,10 +292,11 @@ export class BenchmarkRunner {
|
|
|
283
292
|
* agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
|
|
284
293
|
*/
|
|
285
294
|
async #runAgent(task, workdir) {
|
|
295
|
+
const fs = this.runtime.fs;
|
|
286
296
|
const combinedPath = join(workdir.runDir, ".combined.ndjson");
|
|
287
|
-
const combinedStream = createWriteStream(combinedPath);
|
|
297
|
+
const combinedStream = fs.createWriteStream(combinedPath);
|
|
288
298
|
const supervisorInstructions = task.paths.supervisor
|
|
289
|
-
? await readFile(task.paths.supervisor, "utf8").catch(() => null)
|
|
299
|
+
? await fs.readFile(task.paths.supervisor, "utf8").catch(() => null)
|
|
290
300
|
: null;
|
|
291
301
|
const supervisor = createSupervisor({
|
|
292
302
|
supervisorCwd: workdir.cwd,
|
|
@@ -301,9 +311,11 @@ export class BenchmarkRunner {
|
|
|
301
311
|
...(supervisorInstructions && { taskAmend: supervisorInstructions }),
|
|
302
312
|
redactor: createRedactor({
|
|
303
313
|
allowlist: [...DEFAULT_ENV_ALLOWLIST, ...(workdir.envNames ?? [])],
|
|
314
|
+
runtime: this.runtime,
|
|
304
315
|
}),
|
|
316
|
+
runtime: this.runtime,
|
|
305
317
|
});
|
|
306
|
-
const instructions = await readFile(task.paths.instructions, "utf8");
|
|
318
|
+
const instructions = await fs.readFile(task.paths.instructions, "utf8");
|
|
307
319
|
let agentError = null;
|
|
308
320
|
try {
|
|
309
321
|
const result = await supervisor.run(instructions);
|
|
@@ -316,16 +328,21 @@ export class BenchmarkRunner {
|
|
|
316
328
|
await new Promise((r) => combinedStream.end(r));
|
|
317
329
|
}
|
|
318
330
|
const summary = await splitAndSummarize(
|
|
331
|
+
this.runtime,
|
|
319
332
|
combinedPath,
|
|
320
333
|
workdir.agentTracePath,
|
|
321
334
|
workdir.supervisorTracePath,
|
|
322
335
|
);
|
|
323
|
-
await unlink(combinedPath).catch(() => {});
|
|
336
|
+
await fs.unlink(combinedPath).catch(() => {});
|
|
324
337
|
return { ...summary, agentError };
|
|
325
338
|
}
|
|
326
339
|
|
|
327
340
|
async #buildJudgeContext(task, workdir, skillSetHash) {
|
|
328
|
-
const
|
|
341
|
+
const fs = this.runtime.fs;
|
|
342
|
+
const agentInstructions = await fs.readFile(
|
|
343
|
+
task.paths.instructions,
|
|
344
|
+
"utf8",
|
|
345
|
+
);
|
|
329
346
|
let agentProfile = "";
|
|
330
347
|
if (this.profiles.agent) {
|
|
331
348
|
const profilePath = resolvePath(
|
|
@@ -333,7 +350,7 @@ export class BenchmarkRunner {
|
|
|
333
350
|
".claude/agents",
|
|
334
351
|
`${this.profiles.agent}.md`,
|
|
335
352
|
);
|
|
336
|
-
agentProfile = await readFile(profilePath, "utf8").catch(() => "");
|
|
353
|
+
agentProfile = await fs.readFile(profilePath, "utf8").catch(() => "");
|
|
337
354
|
}
|
|
338
355
|
return { agentInstructions, agentProfile, skillSetHash };
|
|
339
356
|
}
|
|
@@ -390,6 +407,27 @@ export class BenchmarkRunner {
|
|
|
390
407
|
}
|
|
391
408
|
}
|
|
392
409
|
|
|
410
|
+
/**
|
|
411
|
+
* Validate the required BenchmarkRunner constructor arguments. Extracted from
|
|
412
|
+
* the constructor to keep its cognitive complexity under the lint ceiling.
|
|
413
|
+
*/
|
|
414
|
+
function validateRunnerArgs({
|
|
415
|
+
family,
|
|
416
|
+
runs,
|
|
417
|
+
output,
|
|
418
|
+
agentModel,
|
|
419
|
+
query,
|
|
420
|
+
runtime,
|
|
421
|
+
}) {
|
|
422
|
+
if (!family) throw new Error("family is required");
|
|
423
|
+
if (!Number.isInteger(runs) || runs < 1)
|
|
424
|
+
throw new Error("runs must be an integer ≥ 1");
|
|
425
|
+
if (!output) throw new Error("output is required");
|
|
426
|
+
if (!agentModel) throw new Error("agentModel is required");
|
|
427
|
+
if (!query) throw new Error("query is required");
|
|
428
|
+
if (!runtime) throw new Error("runtime is required");
|
|
429
|
+
}
|
|
430
|
+
|
|
393
431
|
function resultsRecordKey(task, runIndex) {
|
|
394
432
|
return { taskId: task.id, runIndex };
|
|
395
433
|
}
|
|
@@ -408,11 +446,17 @@ async function writeRecord(stream, record) {
|
|
|
408
446
|
* `supervisorPath`.
|
|
409
447
|
*/
|
|
410
448
|
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
|
|
411
|
-
async function splitAndSummarize(
|
|
412
|
-
|
|
413
|
-
|
|
449
|
+
async function splitAndSummarize(
|
|
450
|
+
runtime,
|
|
451
|
+
combinedPath,
|
|
452
|
+
agentPath,
|
|
453
|
+
supervisorPath,
|
|
454
|
+
) {
|
|
455
|
+
const fs = runtime.fs;
|
|
456
|
+
const agentStream = fs.createWriteStream(agentPath);
|
|
457
|
+
const supStream = fs.createWriteStream(supervisorPath);
|
|
414
458
|
const rl = createInterface({
|
|
415
|
-
input: createReadStream(combinedPath),
|
|
459
|
+
input: fs.createReadStream(combinedPath),
|
|
416
460
|
crlfDelay: Infinity,
|
|
417
461
|
});
|
|
418
462
|
let agentCost = 0;
|