@forwardimpact/libeval 0.1.36 → 0.1.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +27 -7
- package/bin/fit-eval.js +24 -3
- package/bin/fit-trace.js +42 -0
- package/package.json +2 -1
- package/src/benchmark/apm-installer.js +56 -10
- package/src/benchmark/judge.js +4 -3
- package/src/benchmark/report.js +43 -17
- package/src/benchmark/result.js +7 -1
- package/src/benchmark/runner.js +120 -75
- package/src/benchmark/scorer.js +2 -5
- package/src/benchmark/task-family.js +14 -47
- package/src/benchmark/workdir.js +7 -6
- package/src/commands/assert.js +145 -0
- package/src/commands/benchmark-report.js +1 -2
- package/src/commands/benchmark-run.js +5 -4
- package/src/commands/facilitate.js +4 -2
- package/src/commands/run.js +3 -3
- package/src/commands/supervise.js +5 -2
- package/src/facilitator.js +7 -3
- package/src/supervisor.js +42 -12
package/src/benchmark/runner.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Phases per (task, runIndex):
|
|
5
5
|
* 1. WorkdirManager.start → seed CWD + run pre-flight probe
|
|
6
|
-
* 2.
|
|
6
|
+
* 2. Supervisor relay (agent + supervisor) → produce traces + submission
|
|
7
7
|
* 3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
|
|
8
8
|
* 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
|
|
9
9
|
* 5. WorkdirManager.teardown → process-group cleanup
|
|
@@ -15,15 +15,12 @@
|
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
17
|
import { createReadStream, createWriteStream } from "node:fs";
|
|
18
|
-
import { access, constants, mkdir, readFile } from "node:fs/promises";
|
|
18
|
+
import { access, constants, mkdir, readFile, unlink } from "node:fs/promises";
|
|
19
19
|
import { createInterface } from "node:readline";
|
|
20
20
|
import { join, resolve as resolvePath } from "node:path";
|
|
21
21
|
|
|
22
|
-
import { createAgentRunner } from "../agent-runner.js";
|
|
23
|
-
import { composeProfilePrompt } from "../profile-prompt.js";
|
|
24
22
|
import { createRedactor } from "../redaction.js";
|
|
25
|
-
import {
|
|
26
|
-
import { createTraceCollector } from "../trace-collector.js";
|
|
23
|
+
import { createSupervisor } from "../supervisor.js";
|
|
27
24
|
import { installApm } from "./apm-installer.js";
|
|
28
25
|
import { runJudge } from "./judge.js";
|
|
29
26
|
import { validateResultRecord } from "./result.js";
|
|
@@ -40,7 +37,9 @@ export class BenchmarkRunner {
|
|
|
40
37
|
* @param {import("./task-family.js").TaskFamily | string} opts.family
|
|
41
38
|
* @param {number} opts.runs - Runs per task (≥ 1).
|
|
42
39
|
* @param {string} opts.output - Run-output directory.
|
|
43
|
-
* @param {string} opts.
|
|
40
|
+
* @param {string} opts.agentModel
|
|
41
|
+
* @param {string} opts.supervisorModel
|
|
42
|
+
* @param {string} opts.judgeModel
|
|
44
43
|
* @param {{agent?: string, judge?: string}} [opts.profiles]
|
|
45
44
|
* @param {Function} opts.query - SDK query (injected for testability).
|
|
46
45
|
* @param {number} [opts.maxTurns] - Agent-under-test turn budget.
|
|
@@ -60,7 +59,9 @@ export class BenchmarkRunner {
|
|
|
60
59
|
family,
|
|
61
60
|
runs,
|
|
62
61
|
output,
|
|
63
|
-
|
|
62
|
+
agentModel,
|
|
63
|
+
supervisorModel,
|
|
64
|
+
judgeModel,
|
|
64
65
|
profiles,
|
|
65
66
|
query,
|
|
66
67
|
maxTurns,
|
|
@@ -74,12 +75,16 @@ export class BenchmarkRunner {
|
|
|
74
75
|
if (!Number.isInteger(runs) || runs < 1)
|
|
75
76
|
throw new Error("runs must be an integer ≥ 1");
|
|
76
77
|
if (!output) throw new Error("output is required");
|
|
77
|
-
if (!
|
|
78
|
+
if (!agentModel) throw new Error("agentModel is required");
|
|
79
|
+
if (!supervisorModel) throw new Error("supervisorModel is required");
|
|
80
|
+
if (!judgeModel) throw new Error("judgeModel is required");
|
|
78
81
|
if (!query) throw new Error("query is required");
|
|
79
82
|
this.familyInput = family;
|
|
80
83
|
this.runs = runs;
|
|
81
84
|
this.output = output;
|
|
82
|
-
this.
|
|
85
|
+
this.agentModel = agentModel;
|
|
86
|
+
this.supervisorModel = supervisorModel;
|
|
87
|
+
this.judgeModel = judgeModel;
|
|
83
88
|
this.profiles = {
|
|
84
89
|
agent: profiles?.agent ?? null,
|
|
85
90
|
judge: profiles?.judge ?? null,
|
|
@@ -103,14 +108,21 @@ export class BenchmarkRunner {
|
|
|
103
108
|
: this.familyInput;
|
|
104
109
|
|
|
105
110
|
await mkdir(this.output, { recursive: true });
|
|
106
|
-
const { stagingDir, skillSetHash } = await installApm(
|
|
111
|
+
const { stagingDir, skillSetHash, judgeProfilesDir } = await installApm(
|
|
112
|
+
family,
|
|
113
|
+
this.output,
|
|
114
|
+
);
|
|
107
115
|
|
|
108
116
|
const tasks = family.tasks();
|
|
109
117
|
for (const task of tasks) {
|
|
110
118
|
await assertPreflightExecutable(task);
|
|
111
119
|
}
|
|
112
120
|
if (this.profiles.judge) {
|
|
113
|
-
await assertJudgeProfileStaged(
|
|
121
|
+
await assertJudgeProfileStaged(
|
|
122
|
+
family,
|
|
123
|
+
judgeProfilesDir,
|
|
124
|
+
this.profiles.judge,
|
|
125
|
+
);
|
|
114
126
|
}
|
|
115
127
|
|
|
116
128
|
const wm = createWorkdirManager({
|
|
@@ -130,6 +142,7 @@ export class BenchmarkRunner {
|
|
|
130
142
|
task,
|
|
131
143
|
runIndex,
|
|
132
144
|
skillSetHash,
|
|
145
|
+
judgeProfilesDir,
|
|
133
146
|
);
|
|
134
147
|
await writeRecord(resultsStream, record);
|
|
135
148
|
yield record;
|
|
@@ -140,7 +153,7 @@ export class BenchmarkRunner {
|
|
|
140
153
|
}
|
|
141
154
|
}
|
|
142
155
|
|
|
143
|
-
async #runOne(family, wm, task, runIndex, skillSetHash) {
|
|
156
|
+
async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
|
|
144
157
|
const t0 = Date.now();
|
|
145
158
|
const workdir = await wm.start(task, runIndex);
|
|
146
159
|
try {
|
|
@@ -176,8 +189,9 @@ export class BenchmarkRunner {
|
|
|
176
189
|
scoring,
|
|
177
190
|
{
|
|
178
191
|
query: this.query,
|
|
179
|
-
model: this.
|
|
192
|
+
model: this.judgeModel,
|
|
180
193
|
judgeProfile: this.profiles.judge ?? undefined,
|
|
194
|
+
profilesDir: judgeProfilesDir,
|
|
181
195
|
},
|
|
182
196
|
judgeContext,
|
|
183
197
|
);
|
|
@@ -194,13 +208,18 @@ export class BenchmarkRunner {
|
|
|
194
208
|
costUsd,
|
|
195
209
|
turns,
|
|
196
210
|
agentTracePath: workdir.agentTracePath,
|
|
211
|
+
supervisorTracePath: workdir.supervisorTracePath,
|
|
197
212
|
judgeTracePath: workdir.judgeTracePath,
|
|
198
213
|
profiles: {
|
|
199
214
|
agent: this.profiles.agent,
|
|
200
215
|
supervisor: null,
|
|
201
216
|
judge: this.profiles.judge,
|
|
202
217
|
},
|
|
203
|
-
model:
|
|
218
|
+
model: {
|
|
219
|
+
agent: this.agentModel,
|
|
220
|
+
supervisor: this.supervisorModel,
|
|
221
|
+
judge: this.judgeModel,
|
|
222
|
+
},
|
|
204
223
|
skillSetHash,
|
|
205
224
|
familyRevision: family.familyRevision,
|
|
206
225
|
durationMs: Date.now() - t0,
|
|
@@ -236,54 +255,43 @@ export class BenchmarkRunner {
|
|
|
236
255
|
}
|
|
237
256
|
|
|
238
257
|
/**
|
|
239
|
-
* Run the agent-under-test
|
|
240
|
-
*
|
|
241
|
-
*
|
|
242
|
-
* with one terminal `result` event.
|
|
243
|
-
*
|
|
244
|
-
* Inspects both thrown errors AND the resolved `{success, aborted, error}`
|
|
245
|
-
* shape returned by `AgentRunner.run()` (agent-runner.js:69, 166–194):
|
|
246
|
-
* the SDK iterator catches its own errors and resolves with `success:
|
|
247
|
-
* false`, so a try/catch alone would silently treat a failed session as
|
|
248
|
-
* a successful one (plan Step 8.5.c).
|
|
258
|
+
* Run the agent-under-test via a Supervisor relay. The supervisor writes
|
|
259
|
+
* a combined tagged NDJSON trace; after the session we split it into
|
|
260
|
+
* agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
|
|
249
261
|
*/
|
|
250
262
|
async #runAgent(task, workdir) {
|
|
251
|
-
const
|
|
252
|
-
const
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
})
|
|
257
|
-
: undefined;
|
|
258
|
-
const runner = createAgentRunner({
|
|
259
|
-
cwd: workdir.cwd,
|
|
263
|
+
const combinedPath = join(workdir.runDir, ".combined.ndjson");
|
|
264
|
+
const combinedStream = createWriteStream(combinedPath);
|
|
265
|
+
const supervisor = createSupervisor({
|
|
266
|
+
supervisorCwd: workdir.cwd,
|
|
267
|
+
agentCwd: workdir.cwd,
|
|
260
268
|
query: this.query,
|
|
261
|
-
output:
|
|
262
|
-
|
|
269
|
+
output: combinedStream,
|
|
270
|
+
agentModel: this.agentModel,
|
|
271
|
+
supervisorModel: this.supervisorModel,
|
|
263
272
|
maxTurns: this.maxTurns ?? 50,
|
|
264
273
|
allowedTools: BASE_TOOLS,
|
|
265
|
-
|
|
266
|
-
systemPrompt,
|
|
274
|
+
...(this.profiles.agent && { agentProfile: this.profiles.agent }),
|
|
267
275
|
redactor: createRedactor(),
|
|
268
276
|
});
|
|
269
277
|
const instructions = await readFile(task.paths.instructions, "utf8");
|
|
270
278
|
let agentError = null;
|
|
271
279
|
try {
|
|
272
|
-
const result = await
|
|
273
|
-
if (!result.success) {
|
|
274
|
-
agentError = {
|
|
275
|
-
message:
|
|
276
|
-
result.error?.message ??
|
|
277
|
-
(result.aborted ? "aborted" : "agent did not succeed"),
|
|
278
|
-
aborted: result.aborted ?? false,
|
|
279
|
-
};
|
|
280
|
+
const result = await supervisor.run(instructions);
|
|
281
|
+
if (!result.success && !result.concluded) {
|
|
282
|
+
agentError = { message: "supervisor did not succeed", aborted: false };
|
|
280
283
|
}
|
|
281
284
|
} catch (e) {
|
|
282
285
|
agentError = { message: e.message ?? String(e), aborted: false };
|
|
283
286
|
} finally {
|
|
284
|
-
await new Promise((r) =>
|
|
287
|
+
await new Promise((r) => combinedStream.end(r));
|
|
285
288
|
}
|
|
286
|
-
const summary = await
|
|
289
|
+
const summary = await splitAndSummarize(
|
|
290
|
+
combinedPath,
|
|
291
|
+
workdir.agentTracePath,
|
|
292
|
+
workdir.supervisorTracePath,
|
|
293
|
+
);
|
|
294
|
+
await unlink(combinedPath).catch(() => {});
|
|
287
295
|
return { ...summary, agentError };
|
|
288
296
|
}
|
|
289
297
|
|
|
@@ -321,11 +329,16 @@ export class BenchmarkRunner {
|
|
|
321
329
|
supervisor: null,
|
|
322
330
|
judge: this.profiles.judge,
|
|
323
331
|
},
|
|
324
|
-
model:
|
|
332
|
+
model: {
|
|
333
|
+
agent: this.agentModel,
|
|
334
|
+
supervisor: this.supervisorModel,
|
|
335
|
+
judge: this.judgeModel,
|
|
336
|
+
},
|
|
325
337
|
skillSetHash,
|
|
326
338
|
familyRevision,
|
|
327
339
|
durationMs,
|
|
328
340
|
agentTracePath: workdir.agentTracePath,
|
|
341
|
+
supervisorTracePath: workdir.supervisorTracePath,
|
|
329
342
|
judgeTracePath: workdir.judgeTracePath,
|
|
330
343
|
};
|
|
331
344
|
}
|
|
@@ -366,7 +379,7 @@ async function writeRecord(stream, record) {
|
|
|
366
379
|
* is missing or non-executable, before any agent session starts."
|
|
367
380
|
*/
|
|
368
381
|
async function assertPreflightExecutable(task) {
|
|
369
|
-
const path = join(task.paths.
|
|
382
|
+
const path = join(task.paths.hooks, "preflight.sh");
|
|
370
383
|
try {
|
|
371
384
|
await access(path, constants.X_OK);
|
|
372
385
|
} catch (e) {
|
|
@@ -377,35 +390,67 @@ async function assertPreflightExecutable(task) {
|
|
|
377
390
|
}
|
|
378
391
|
|
|
379
392
|
/**
 * Split the combined supervisor trace into agent and supervisor files, and
 * extract cost, turn count, and submission in a single pass. Agent-source
 * events go to `agentPath`; every other parseable line (supervisor,
 * orchestrator, or untagged) goes to `supervisorPath`. Blank lines and lines
 * that are not valid JSON are dropped.
 *
 * @param {string} combinedPath - Combined NDJSON trace to read.
 * @param {string} agentPath - Destination for lines tagged `source: "agent"`.
 * @param {string} supervisorPath - Destination for all other parsed lines.
 * @returns {Promise<{costUsd: number, turns: number, submission: string}>}
 *   `costUsd` is the sum of the last agent and last supervisor `result`
 *   costs; `turns` comes from the last orchestrator `summary` event;
 *   `submission` is the last non-empty assistant text from the agent.
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
async function splitAndSummarize(combinedPath, agentPath, supervisorPath) {
  const agentStream = createWriteStream(agentPath);
  const supStream = createWriteStream(supervisorPath);
  const input = createReadStream(combinedPath);
  const rl = createInterface({ input, crlfDelay: Infinity });
  let agentCost = 0;
  let supervisorCost = 0;
  let turns = 0;
  let submission = "";
  try {
    for await (const line of rl) {
      if (!line.trim()) continue;
      let event;
      try {
        event = JSON.parse(line);
      } catch {
        continue;
      }
      // `null` is valid JSON; optional chaining routes it to the supervisor
      // file like any other untagged line instead of throwing.
      const source = event?.source;
      const target = source === "agent" ? agentStream : supStream;
      target.write(`${line}\n`);
      const inner = event?.event;
      if (!inner) continue;
      const hasCost =
        inner.type === "result" && typeof inner.total_cost_usd === "number";
      if (source === "agent") {
        if (hasCost) agentCost = inner.total_cost_usd;
        if (inner.type === "assistant") {
          const text = extractText(inner);
          if (text) submission = text;
        }
      }
      if (source === "supervisor" && hasCost) {
        supervisorCost = inner.total_cost_usd;
      }
      if (source === "orchestrator" && inner.type === "summary") {
        turns = inner.turns ?? 0;
      }
    }
  } finally {
    // Release the reader and close both outputs even when the loop throws,
    // so a failed split does not leave open file handles behind.
    rl.close();
    input.destroy();
    await Promise.all([
      new Promise((r) => agentStream.end(r)),
      new Promise((r) => supStream.end(r)),
    ]);
  }
  return { costUsd: agentCost + supervisorCost, turns, submission };
}

/**
 * Return the last non-empty text part of an assistant event, or `null`.
 * Content is taken from `inner.message.content` when present, otherwise
 * from `inner.content`; anything that is not an array yields `null`.
 *
 * @param {object} inner - Assistant event payload.
 * @returns {string | null}
 */
function extractText(inner) {
  const content = inner.message?.content ?? inner.content;
  if (!Array.isArray(content)) return null;
  for (let i = content.length - 1; i >= 0; i--) {
    const part = content[i];
    // `part?.` tolerates null/primitive entries in the content array.
    if (part?.type === "text" && part.text) return part.text;
  }
  return null;
}
|
|
410
455
|
|
|
411
456
|
/**
|
package/src/benchmark/scorer.js
CHANGED
|
@@ -1,10 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Scorer — runs `<task.paths.
|
|
2
|
+
* Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
|
|
3
3
|
* the post-run agent CWD. The exit code is authoritative for the verdict;
|
|
4
4
|
* structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
|
|
5
|
-
*
|
|
6
|
-
* Scoring scripts are never copied into the agent CWD — they live only in the
|
|
7
|
-
* task template (design Decision 3).
|
|
8
5
|
*/
|
|
9
6
|
|
|
10
7
|
import { spawn } from "node:child_process";
|
|
@@ -32,7 +29,7 @@ import { join } from "node:path";
|
|
|
32
29
|
*/
|
|
33
30
|
export function runScoring(task, ctx) {
|
|
34
31
|
return new Promise((res, rej) => {
|
|
35
|
-
const script = join(task.paths.
|
|
32
|
+
const script = join(task.paths.hooks, "score.sh");
|
|
36
33
|
const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
|
|
37
34
|
|
|
38
35
|
// Bun's child_process pipe setup for fd >= 3 is racy under load (it
|
|
@@ -4,13 +4,14 @@
|
|
|
4
4
|
* apm.lock.yaml
|
|
5
5
|
* .claude/ # pre-staged skills + agents (P1)
|
|
6
6
|
* tasks/<task_name>/
|
|
7
|
-
*
|
|
7
|
+
* agent.task.md
|
|
8
8
|
* supervisor.task.md # preserved for v2; not read in v1
|
|
9
9
|
* judge.task.md
|
|
10
|
+
* hooks/ # harness-only; never copied to agent CWD
|
|
11
|
+
* preflight.sh
|
|
12
|
+
* score.sh
|
|
10
13
|
* specs/ # copied into agent CWD
|
|
11
|
-
* workdir/ # copied into agent CWD
|
|
12
|
-
* scripts/preflight.sh
|
|
13
|
-
* scoring/ # template-only; never copied
|
|
14
|
+
* workdir/ # copied into agent CWD
|
|
14
15
|
*
|
|
15
16
|
* Local paths or git URLs are both accepted; git URLs are shallow-cloned into
|
|
16
17
|
* a temp dir and `familyRevision` becomes `git:<sha>` of HEAD at clone time.
|
|
@@ -53,13 +54,11 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
|
|
|
53
54
|
familyRevision = "sha256:" + (await canonicalTreeHash(rootPath));
|
|
54
55
|
}
|
|
55
56
|
|
|
56
|
-
const apmLockBytes = await readApmLockBytes(rootPath);
|
|
57
57
|
const tasks = await discoverTasks(rootPath);
|
|
58
58
|
|
|
59
59
|
return {
|
|
60
60
|
rootPath,
|
|
61
61
|
familyRevision,
|
|
62
|
-
apmLockBytes,
|
|
63
62
|
tasks() {
|
|
64
63
|
return tasks;
|
|
65
64
|
},
|
|
@@ -67,58 +66,27 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
|
|
|
67
66
|
}
|
|
68
67
|
|
|
69
68
|
/**
 * Assert that `<judgeProfilesDir>/<judgeProfile>.md` exists. Called from
 * `BenchmarkRunner.run()` so a missing judge profile fails the family
 * install before any agent session starts.
 * @param {TaskFamily} _family - Unused; kept for call-site compatibility.
 * @param {string} judgeProfilesDir
 * @param {string} judgeProfile
 * @returns {Promise<void>}
 * @throws {Error} `judge profile not staged: <path>` when the file cannot
 *   be accessed; the underlying filesystem error is attached as `cause`.
 */
export async function assertJudgeProfileStaged(
  _family,
  judgeProfilesDir,
  judgeProfile,
) {
  const candidate = join(judgeProfilesDir, `${judgeProfile}.md`);
  try {
    await access(candidate);
  } catch (err) {
    // Keep the original error so a permission problem is distinguishable
    // from a file that is simply missing.
    throw new Error(`judge profile not staged: ${candidate}`, { cause: err });
  }
}
|
|
92
89
|
|
|
93
|
-
async function readApmLockBytes(rootPath) {
|
|
94
|
-
const lockPath = join(rootPath, "apm.lock.yaml");
|
|
95
|
-
try {
|
|
96
|
-
const raw = await readFile(lockPath);
|
|
97
|
-
return normalizeLf(raw);
|
|
98
|
-
} catch (e) {
|
|
99
|
-
if (e.code === "ENOENT") {
|
|
100
|
-
throw new Error(
|
|
101
|
-
`task family missing apm.lock.yaml at ${lockPath} (matches libpack stager.js:126; .yml is not accepted)`,
|
|
102
|
-
);
|
|
103
|
-
}
|
|
104
|
-
throw e;
|
|
105
|
-
}
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
/**
|
|
109
|
-
* Replace CRLF with LF so cross-OS authored lockfiles hash identically.
|
|
110
|
-
* @param {Buffer} buf
|
|
111
|
-
* @returns {Buffer}
|
|
112
|
-
*/
|
|
113
|
-
function normalizeLf(buf) {
|
|
114
|
-
const out = [];
|
|
115
|
-
for (let i = 0; i < buf.length; i++) {
|
|
116
|
-
if (buf[i] === 0x0d && i + 1 < buf.length && buf[i + 1] === 0x0a) continue;
|
|
117
|
-
out.push(buf[i]);
|
|
118
|
-
}
|
|
119
|
-
return Buffer.from(out);
|
|
120
|
-
}
|
|
121
|
-
|
|
122
90
|
async function discoverTasks(rootPath) {
|
|
123
91
|
const tasksRoot = join(rootPath, "tasks");
|
|
124
92
|
const tasks = [];
|
|
@@ -135,12 +103,12 @@ async function discoverTasks(rootPath) {
|
|
|
135
103
|
tasks.push({
|
|
136
104
|
id: entry.name,
|
|
137
105
|
paths: {
|
|
138
|
-
instructions: join(taskDir, "
|
|
106
|
+
instructions: join(taskDir, "agent.task.md"),
|
|
139
107
|
supervisor: join(taskDir, "supervisor.task.md"),
|
|
140
108
|
judge: join(taskDir, "judge.task.md"),
|
|
109
|
+
hooks: join(taskDir, "hooks"),
|
|
141
110
|
specs: join(taskDir, "specs"),
|
|
142
111
|
workdir: join(taskDir, "workdir"),
|
|
143
|
-
scoring: join(taskDir, "scoring"),
|
|
144
112
|
},
|
|
145
113
|
});
|
|
146
114
|
}
|
|
@@ -242,13 +210,12 @@ function run(cmd, args) {
|
|
|
242
210
|
/**
|
|
243
211
|
* @typedef {object} Task
|
|
244
212
|
* @property {string} id - Task name (directory name under tasks/)
|
|
245
|
-
* @property {{instructions: string, supervisor: string, judge: string,
|
|
213
|
+
* @property {{instructions: string, supervisor: string, judge: string, hooks: string, specs: string, workdir: string}} paths
|
|
246
214
|
*/
|
|
247
215
|
|
|
248
216
|
/**
|
|
249
217
|
* @typedef {object} TaskFamily
|
|
250
218
|
* @property {string} rootPath
|
|
251
219
|
* @property {string} familyRevision - `git:<sha>` or `sha256:<hex>`
|
|
252
|
-
* @property {Buffer} apmLockBytes - LF-normalised
|
|
253
220
|
* @property {() => Task[]} tasks
|
|
254
221
|
*/
|
package/src/benchmark/workdir.js
CHANGED
|
@@ -11,9 +11,8 @@ import { spawn } from "node:child_process";
|
|
|
11
11
|
import { cp, mkdir } from "node:fs/promises";
|
|
12
12
|
import { createServer } from "node:net";
|
|
13
13
|
import { connect } from "node:net";
|
|
14
|
-
import { join
|
|
14
|
+
import { join } from "node:path";
|
|
15
15
|
|
|
16
|
-
const PREFLIGHT_REL = join("workdir", "scripts");
|
|
17
16
|
const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
18
17
|
|
|
19
18
|
/**
|
|
@@ -24,6 +23,7 @@ const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
|
24
23
|
* @property {number} pgid - Process-group id captured from the preflight child.
|
|
25
24
|
* @property {*} scaffold - Reserved per design § Components; v1 sets null.
|
|
26
25
|
* @property {string} agentTracePath
|
|
26
|
+
* @property {string} supervisorTracePath
|
|
27
27
|
* @property {string} judgeTracePath
|
|
28
28
|
* @property {{phase: string, message: string, exitCode: number}} [preflightError]
|
|
29
29
|
*/
|
|
@@ -55,9 +55,8 @@ export class WorkdirManager {
|
|
|
55
55
|
const cwd = join(runDir, "cwd");
|
|
56
56
|
await mkdir(cwd, { recursive: true });
|
|
57
57
|
|
|
58
|
-
await cp(task.paths.workdir, cwd, {
|
|
59
|
-
|
|
60
|
-
filter: (src) => !src.endsWith(sep + PREFLIGHT_REL),
|
|
58
|
+
await cp(task.paths.workdir, cwd, { recursive: true }).catch((e) => {
|
|
59
|
+
if (e.code !== "ENOENT") throw e;
|
|
61
60
|
});
|
|
62
61
|
await cp(task.paths.specs, join(cwd, "specs"), {
|
|
63
62
|
recursive: true,
|
|
@@ -70,9 +69,10 @@ export class WorkdirManager {
|
|
|
70
69
|
|
|
71
70
|
const port = await allocatePort();
|
|
72
71
|
const agentTracePath = join(runDir, "agent.ndjson");
|
|
72
|
+
const supervisorTracePath = join(runDir, "supervisor.ndjson");
|
|
73
73
|
const judgeTracePath = join(runDir, "judge.ndjson");
|
|
74
74
|
|
|
75
|
-
const preflightScript = join(task.paths.
|
|
75
|
+
const preflightScript = join(task.paths.hooks, "preflight.sh");
|
|
76
76
|
const preflight = await runPreflight(preflightScript, cwd, port);
|
|
77
77
|
|
|
78
78
|
return {
|
|
@@ -82,6 +82,7 @@ export class WorkdirManager {
|
|
|
82
82
|
pgid: preflight.pgid,
|
|
83
83
|
scaffold: null,
|
|
84
84
|
agentTracePath,
|
|
85
|
+
supervisorTracePath,
|
|
85
86
|
judgeTracePath,
|
|
86
87
|
...(preflight.error && { preflightError: preflight.error }),
|
|
87
88
|
};
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
2
|
+
import { basename } from "node:path";
|
|
3
|
+
import jmespath from "jmespath";
|
|
4
|
+
|
|
5
|
+
/**
 * Evaluate an assertion and return the structured result.
 *
 * Exactly one mode (`grep`, `query`, `exists`, `cites-job`) must be selected.
 * A mode counts as selected when its value is neither `undefined` nor
 * `false`, and the same rule decides which check runs, so an empty-string
 * value is handled by its own mode rather than falling through to `query`.
 *
 * @param {object} values - { grep?: string, query?: string, exists?: boolean, "cites-job"?: string, not?: boolean, message?: string }
 * @param {string[]} args - [testName, file]
 * @returns {{ test: string, pass: boolean, message?: string }}
 * @throws {Error} When the test name is missing, when no mode or more than
 *   one mode is selected, or when the file argument is missing.
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: assertion dispatch by type
export function evaluateAssertion(values, args) {
  const testName = args[0];
  if (!testName) throw new Error("assert: missing test name");

  const file = args[1];
  const selected = ["grep", "query", "exists", "cites-job"].filter(
    (name) => values[name] !== undefined && values[name] !== false,
  );
  if (selected.length === 0) {
    throw new Error(
      "assert: specify one of --grep, --query, --exists, or --cites-job",
    );
  }
  if (selected.length > 1) {
    throw new Error(
      "assert: specify only one of --grep, --query, --exists, or --cites-job",
    );
  }

  // Dispatch on the mode that was counted above, not on value truthiness.
  let result;
  switch (selected[0]) {
    case "exists":
      if (!file) throw new Error("assert: missing file argument");
      result = assertExists(file);
      break;
    case "grep":
      if (!file) throw new Error("assert: missing file argument for --grep");
      result = assertGrep(values.grep, file);
      break;
    case "cites-job":
      if (!file) {
        throw new Error("assert: missing file argument for --cites-job");
      }
      result = assertCitesJob(values["cites-job"], file);
      break;
    default:
      if (!file) throw new Error("assert: missing file argument for --query");
      result = assertQuery(values.query, file);
  }

  if (values.not) {
    // Invert the verdict; a now-passing result carries no failure message.
    result.pass = !result.pass;
    if (result.pass) {
      delete result.message;
    } else {
      result.message =
        result.message ?? `inverted assertion failed for ${basename(file)}`;
    }
  }

  // A caller-supplied message overrides the generated one on failure only.
  if (!result.pass && values.message) {
    result.message = values.message;
  }

  const output = { test: testName, pass: result.pass };
  if (result.message) output.message = result.message;
  return output;
}
|
|
67
|
+
|
|
68
|
+
/**
 * CLI entry point for a single assertion: evaluates it, prints the result
 * as one JSON line on stdout, and marks the process as failed (exit code 1)
 * when the assertion does not pass.
 * @param {object} values - Parsed option values, forwarded to `evaluateAssertion`.
 * @param {string[]} args - Positional arguments, forwarded to `evaluateAssertion`.
 */
export async function runAssertCommand(values, args) {
  const outcome = evaluateAssertion(values, args);
  process.stdout.write(`${JSON.stringify(outcome)}\n`);
  if (outcome.pass) return;
  process.exitCode = 1;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Check that a path exists on disk.
 * @param {string} file - Path to test.
 * @returns {{ pass: boolean, message?: string }}
 */
function assertExists(file) {
  return existsSync(file)
    ? { pass: true }
    : { pass: false, message: `${file} not found` };
}
|
|
83
|
+
|
|
84
|
+
/**
 * Check that a file's text matches a regular expression. The pattern is
 * compiled with the `i` (case-insensitive) and `m` (multiline) flags.
 * @param {string} pattern - Regular-expression source.
 * @param {string} file - Path of the file to search (read as UTF-8).
 * @returns {{ pass: boolean, message?: string }}
 */
function assertGrep(pattern, file) {
  const text = readFileSync(file, "utf8");
  const matched = new RegExp(pattern, "im").test(text);
  return matched
    ? { pass: true }
    : {
        pass: false,
        message: `pattern "${pattern}" not found in ${basename(file)}`,
      };
}
|
|
93
|
+
|
|
94
|
+
/**
 * Run a JMESPath expression against a JSON or NDJSON file. The assertion
 * fails when the query yields `null`, `undefined`, `false`, or an empty
 * array; every other value passes.
 * @param {string} expression - JMESPath expression.
 * @param {string} file - Path of the JSON/NDJSON file (read as UTF-8).
 * @returns {{ pass: boolean, message?: string }}
 */
function assertQuery(expression, file) {
  const parsed = parseJsonOrNdjson(readFileSync(file, "utf8"));
  const found = jmespath.search(parsed, expression);
  const isEmptyArray = Array.isArray(found) && found.length === 0;
  const failed =
    found === null || found === undefined || found === false || isEmptyArray;
  if (!failed) return { pass: true };
  return {
    pass: false,
    message: `query returned ${JSON.stringify(found)}`,
  };
}
|
|
109
|
+
|
|
110
|
+
// Matches an opening `<job user="…" goal="…">` tag, capturing both values.
const JOB_TAG_RE = /<job\s+user="([^"]*)"\s+goal="([^"]*)">/;

/**
 * Check that `file` cites the job declared in `jobFile`. The expected
 * citation is `<user>: <goal>`, built from the first `<job>` tag found in
 * the job file.
 * @param {string} jobFile - Path of the file containing the `<job>` tag.
 * @param {string} file - Path of the file that must contain the citation.
 * @returns {{ pass: boolean, message?: string }}
 */
function assertCitesJob(jobFile, file) {
  const tag = readFileSync(jobFile, "utf8").match(JOB_TAG_RE);
  if (tag === null) {
    return {
      pass: false,
      message: `no <job> tag found in ${basename(jobFile)}`,
    };
  }
  const [, user, goal] = tag;
  const citation = `${user}: ${goal}`;
  const cited = readFileSync(file, "utf8").includes(citation);
  return cited ? { pass: true } : { pass: false, message: `missing "${citation}"` };
}
|
|
126
|
+
|
|
127
|
+
/**
 * Parse text as a single JSON document, falling back to NDJSON. In the
 * fallback, each non-blank line is parsed on its own and lines that are not
 * valid JSON are skipped.
 * @param {string} content - Raw file text.
 * @returns {*} The parsed document, or an array of parsed lines for NDJSON.
 * @throws {Error} `assert: no valid JSON in file` when nothing parses.
 */
function parseJsonOrNdjson(content) {
  try {
    return JSON.parse(content);
  } catch {
    // Not a single JSON document; try line-by-line below.
  }
  const records = content
    .split("\n")
    .map((raw) => raw.trim())
    .filter((text) => text !== "")
    .flatMap((text) => {
      try {
        // Wrap in an array so a parsed array value stays one record.
        return [JSON.parse(text)];
      } catch {
        return [];
      }
    });
  if (records.length === 0) throw new Error("assert: no valid JSON in file");
  return records;
}
|
|
@@ -13,8 +13,7 @@ import { aggregate, renderTextReport } from "../benchmark/report.js";
|
|
|
13
13
|
* @param {string[]} _args
|
|
14
14
|
*/
|
|
15
15
|
export async function runBenchmarkReportCommand(values, _args) {
|
|
16
|
-
const inputDir = values.input;
|
|
17
|
-
if (!inputDir) throw new Error("--input is required");
|
|
16
|
+
const inputDir = values.input ?? "benchmark-runs";
|
|
18
17
|
const kRaw = values.k ?? "1,3,5";
|
|
19
18
|
const kValues = kRaw.split(",").map((t) => {
|
|
20
19
|
const n = Number.parseInt(t.trim(), 10);
|