@forwardimpact/libeval 0.1.35 → 0.1.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +27 -7
- package/bin/fit-eval.js +24 -3
- package/bin/fit-trace.js +42 -0
- package/package.json +2 -1
- package/src/benchmark/apm-installer.js +56 -10
- package/src/benchmark/judge.js +35 -8
- package/src/benchmark/report.js +364 -17
- package/src/benchmark/result.js +7 -1
- package/src/benchmark/runner.js +149 -79
- package/src/benchmark/scorer.js +2 -5
- package/src/benchmark/task-family.js +14 -47
- package/src/benchmark/workdir.js +7 -6
- package/src/commands/assert.js +145 -0
- package/src/commands/benchmark-report.js +6 -3
- package/src/commands/benchmark-run.js +5 -4
- package/src/commands/facilitate.js +4 -2
- package/src/commands/run.js +3 -3
- package/src/commands/supervise.js +5 -2
- package/src/facilitator.js +7 -3
- package/src/supervisor.js +47 -14
package/src/benchmark/runner.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Phases per (task, runIndex):
|
|
5
5
|
* 1. WorkdirManager.start → seed CWD + run pre-flight probe
|
|
6
|
-
* 2.
|
|
6
|
+
* 2. Supervisor relay (agent + supervisor) → produce traces + submission
|
|
7
7
|
* 3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
|
|
8
8
|
* 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
|
|
9
9
|
* 5. WorkdirManager.teardown → process-group cleanup
|
|
@@ -15,15 +15,12 @@
|
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
17
|
import { createReadStream, createWriteStream } from "node:fs";
|
|
18
|
-
import { access, constants, mkdir, readFile } from "node:fs/promises";
|
|
18
|
+
import { access, constants, mkdir, readFile, unlink } from "node:fs/promises";
|
|
19
19
|
import { createInterface } from "node:readline";
|
|
20
20
|
import { join, resolve as resolvePath } from "node:path";
|
|
21
21
|
|
|
22
|
-
import { createAgentRunner } from "../agent-runner.js";
|
|
23
|
-
import { composeProfilePrompt } from "../profile-prompt.js";
|
|
24
22
|
import { createRedactor } from "../redaction.js";
|
|
25
|
-
import {
|
|
26
|
-
import { createTraceCollector } from "../trace-collector.js";
|
|
23
|
+
import { createSupervisor } from "../supervisor.js";
|
|
27
24
|
import { installApm } from "./apm-installer.js";
|
|
28
25
|
import { runJudge } from "./judge.js";
|
|
29
26
|
import { validateResultRecord } from "./result.js";
|
|
@@ -40,7 +37,9 @@ export class BenchmarkRunner {
|
|
|
40
37
|
* @param {import("./task-family.js").TaskFamily | string} opts.family
|
|
41
38
|
* @param {number} opts.runs - Runs per task (≥ 1).
|
|
42
39
|
* @param {string} opts.output - Run-output directory.
|
|
43
|
-
* @param {string} opts.
|
|
40
|
+
* @param {string} opts.agentModel
|
|
41
|
+
* @param {string} opts.supervisorModel
|
|
42
|
+
* @param {string} opts.judgeModel
|
|
44
43
|
* @param {{agent?: string, judge?: string}} [opts.profiles]
|
|
45
44
|
* @param {Function} opts.query - SDK query (injected for testability).
|
|
46
45
|
* @param {number} [opts.maxTurns] - Agent-under-test turn budget.
|
|
@@ -60,7 +59,9 @@ export class BenchmarkRunner {
|
|
|
60
59
|
family,
|
|
61
60
|
runs,
|
|
62
61
|
output,
|
|
63
|
-
|
|
62
|
+
agentModel,
|
|
63
|
+
supervisorModel,
|
|
64
|
+
judgeModel,
|
|
64
65
|
profiles,
|
|
65
66
|
query,
|
|
66
67
|
maxTurns,
|
|
@@ -74,12 +75,16 @@ export class BenchmarkRunner {
|
|
|
74
75
|
if (!Number.isInteger(runs) || runs < 1)
|
|
75
76
|
throw new Error("runs must be an integer ≥ 1");
|
|
76
77
|
if (!output) throw new Error("output is required");
|
|
77
|
-
if (!
|
|
78
|
+
if (!agentModel) throw new Error("agentModel is required");
|
|
79
|
+
if (!supervisorModel) throw new Error("supervisorModel is required");
|
|
80
|
+
if (!judgeModel) throw new Error("judgeModel is required");
|
|
78
81
|
if (!query) throw new Error("query is required");
|
|
79
82
|
this.familyInput = family;
|
|
80
83
|
this.runs = runs;
|
|
81
84
|
this.output = output;
|
|
82
|
-
this.
|
|
85
|
+
this.agentModel = agentModel;
|
|
86
|
+
this.supervisorModel = supervisorModel;
|
|
87
|
+
this.judgeModel = judgeModel;
|
|
83
88
|
this.profiles = {
|
|
84
89
|
agent: profiles?.agent ?? null,
|
|
85
90
|
judge: profiles?.judge ?? null,
|
|
@@ -103,14 +108,21 @@ export class BenchmarkRunner {
|
|
|
103
108
|
: this.familyInput;
|
|
104
109
|
|
|
105
110
|
await mkdir(this.output, { recursive: true });
|
|
106
|
-
const { stagingDir, skillSetHash } = await installApm(
|
|
111
|
+
const { stagingDir, skillSetHash, judgeProfilesDir } = await installApm(
|
|
112
|
+
family,
|
|
113
|
+
this.output,
|
|
114
|
+
);
|
|
107
115
|
|
|
108
116
|
const tasks = family.tasks();
|
|
109
117
|
for (const task of tasks) {
|
|
110
118
|
await assertPreflightExecutable(task);
|
|
111
119
|
}
|
|
112
120
|
if (this.profiles.judge) {
|
|
113
|
-
await assertJudgeProfileStaged(
|
|
121
|
+
await assertJudgeProfileStaged(
|
|
122
|
+
family,
|
|
123
|
+
judgeProfilesDir,
|
|
124
|
+
this.profiles.judge,
|
|
125
|
+
);
|
|
114
126
|
}
|
|
115
127
|
|
|
116
128
|
const wm = createWorkdirManager({
|
|
@@ -130,6 +142,7 @@ export class BenchmarkRunner {
|
|
|
130
142
|
task,
|
|
131
143
|
runIndex,
|
|
132
144
|
skillSetHash,
|
|
145
|
+
judgeProfilesDir,
|
|
133
146
|
);
|
|
134
147
|
await writeRecord(resultsStream, record);
|
|
135
148
|
yield record;
|
|
@@ -140,7 +153,7 @@ export class BenchmarkRunner {
|
|
|
140
153
|
}
|
|
141
154
|
}
|
|
142
155
|
|
|
143
|
-
async #runOne(family, wm, task, runIndex, skillSetHash) {
|
|
156
|
+
async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
|
|
144
157
|
const t0 = Date.now();
|
|
145
158
|
const workdir = await wm.start(task, runIndex);
|
|
146
159
|
try {
|
|
@@ -165,11 +178,23 @@ export class BenchmarkRunner {
|
|
|
165
178
|
port: workdir.port,
|
|
166
179
|
runDir: workdir.runDir,
|
|
167
180
|
});
|
|
168
|
-
const
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
181
|
+
const judgeContext = await this.#buildJudgeContext(
|
|
182
|
+
task,
|
|
183
|
+
workdir,
|
|
184
|
+
skillSetHash,
|
|
185
|
+
);
|
|
186
|
+
const judgeVerdict = await this._runJudgeHook(
|
|
187
|
+
task,
|
|
188
|
+
workdir,
|
|
189
|
+
scoring,
|
|
190
|
+
{
|
|
191
|
+
query: this.query,
|
|
192
|
+
model: this.judgeModel,
|
|
193
|
+
judgeProfile: this.profiles.judge ?? undefined,
|
|
194
|
+
profilesDir: judgeProfilesDir,
|
|
195
|
+
},
|
|
196
|
+
judgeContext,
|
|
197
|
+
);
|
|
173
198
|
const record = {
|
|
174
199
|
taskId: task.id,
|
|
175
200
|
runIndex,
|
|
@@ -183,13 +208,18 @@ export class BenchmarkRunner {
|
|
|
183
208
|
costUsd,
|
|
184
209
|
turns,
|
|
185
210
|
agentTracePath: workdir.agentTracePath,
|
|
211
|
+
supervisorTracePath: workdir.supervisorTracePath,
|
|
186
212
|
judgeTracePath: workdir.judgeTracePath,
|
|
187
213
|
profiles: {
|
|
188
214
|
agent: this.profiles.agent,
|
|
189
215
|
supervisor: null,
|
|
190
216
|
judge: this.profiles.judge,
|
|
191
217
|
},
|
|
192
|
-
model:
|
|
218
|
+
model: {
|
|
219
|
+
agent: this.agentModel,
|
|
220
|
+
supervisor: this.supervisorModel,
|
|
221
|
+
judge: this.judgeModel,
|
|
222
|
+
},
|
|
193
223
|
skillSetHash,
|
|
194
224
|
familyRevision: family.familyRevision,
|
|
195
225
|
durationMs: Date.now() - t0,
|
|
@@ -225,57 +255,60 @@ export class BenchmarkRunner {
|
|
|
225
255
|
}
|
|
226
256
|
|
|
227
257
|
/**
|
|
228
|
-
* Run the agent-under-test
|
|
229
|
-
*
|
|
230
|
-
*
|
|
231
|
-
* with one terminal `result` event.
|
|
232
|
-
*
|
|
233
|
-
* Inspects both thrown errors AND the resolved `{success, aborted, error}`
|
|
234
|
-
* shape returned by `AgentRunner.run()` (agent-runner.js:69, 166–194):
|
|
235
|
-
* the SDK iterator catches its own errors and resolves with `success:
|
|
236
|
-
* false`, so a try/catch alone would silently treat a failed session as
|
|
237
|
-
* a successful one (plan Step 8.5.c).
|
|
258
|
+
* Run the agent-under-test via a Supervisor relay. The supervisor writes
|
|
259
|
+
* a combined tagged NDJSON trace; after the session we split it into
|
|
260
|
+
* agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
|
|
238
261
|
*/
|
|
239
262
|
async #runAgent(task, workdir) {
|
|
240
|
-
const
|
|
241
|
-
const
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
})
|
|
246
|
-
: undefined;
|
|
247
|
-
const runner = createAgentRunner({
|
|
248
|
-
cwd: workdir.cwd,
|
|
263
|
+
const combinedPath = join(workdir.runDir, ".combined.ndjson");
|
|
264
|
+
const combinedStream = createWriteStream(combinedPath);
|
|
265
|
+
const supervisor = createSupervisor({
|
|
266
|
+
supervisorCwd: workdir.cwd,
|
|
267
|
+
agentCwd: workdir.cwd,
|
|
249
268
|
query: this.query,
|
|
250
|
-
output:
|
|
251
|
-
|
|
269
|
+
output: combinedStream,
|
|
270
|
+
agentModel: this.agentModel,
|
|
271
|
+
supervisorModel: this.supervisorModel,
|
|
252
272
|
maxTurns: this.maxTurns ?? 50,
|
|
253
273
|
allowedTools: BASE_TOOLS,
|
|
254
|
-
|
|
255
|
-
systemPrompt,
|
|
274
|
+
...(this.profiles.agent && { agentProfile: this.profiles.agent }),
|
|
256
275
|
redactor: createRedactor(),
|
|
257
276
|
});
|
|
258
277
|
const instructions = await readFile(task.paths.instructions, "utf8");
|
|
259
278
|
let agentError = null;
|
|
260
279
|
try {
|
|
261
|
-
const result = await
|
|
262
|
-
if (!result.success) {
|
|
263
|
-
agentError = {
|
|
264
|
-
message:
|
|
265
|
-
result.error?.message ??
|
|
266
|
-
(result.aborted ? "aborted" : "agent did not succeed"),
|
|
267
|
-
aborted: result.aborted ?? false,
|
|
268
|
-
};
|
|
280
|
+
const result = await supervisor.run(instructions);
|
|
281
|
+
if (!result.success && !result.concluded) {
|
|
282
|
+
agentError = { message: "supervisor did not succeed", aborted: false };
|
|
269
283
|
}
|
|
270
284
|
} catch (e) {
|
|
271
285
|
agentError = { message: e.message ?? String(e), aborted: false };
|
|
272
286
|
} finally {
|
|
273
|
-
await new Promise((r) =>
|
|
287
|
+
await new Promise((r) => combinedStream.end(r));
|
|
274
288
|
}
|
|
275
|
-
const summary = await
|
|
289
|
+
const summary = await splitAndSummarize(
|
|
290
|
+
combinedPath,
|
|
291
|
+
workdir.agentTracePath,
|
|
292
|
+
workdir.supervisorTracePath,
|
|
293
|
+
);
|
|
294
|
+
await unlink(combinedPath).catch(() => {});
|
|
276
295
|
return { ...summary, agentError };
|
|
277
296
|
}
|
|
278
297
|
|
|
298
|
+
async #buildJudgeContext(task, workdir, skillSetHash) {
|
|
299
|
+
const agentInstructions = await readFile(task.paths.instructions, "utf8");
|
|
300
|
+
let agentProfile = "";
|
|
301
|
+
if (this.profiles.agent) {
|
|
302
|
+
const profilePath = resolvePath(
|
|
303
|
+
workdir.cwd,
|
|
304
|
+
".claude/agents",
|
|
305
|
+
`${this.profiles.agent}.md`,
|
|
306
|
+
);
|
|
307
|
+
agentProfile = await readFile(profilePath, "utf8").catch(() => "");
|
|
308
|
+
}
|
|
309
|
+
return { agentInstructions, agentProfile, skillSetHash };
|
|
310
|
+
}
|
|
311
|
+
|
|
279
312
|
#buildPreflightFailureRecord({
|
|
280
313
|
task,
|
|
281
314
|
runIndex,
|
|
@@ -296,11 +329,16 @@ export class BenchmarkRunner {
|
|
|
296
329
|
supervisor: null,
|
|
297
330
|
judge: this.profiles.judge,
|
|
298
331
|
},
|
|
299
|
-
model:
|
|
332
|
+
model: {
|
|
333
|
+
agent: this.agentModel,
|
|
334
|
+
supervisor: this.supervisorModel,
|
|
335
|
+
judge: this.judgeModel,
|
|
336
|
+
},
|
|
300
337
|
skillSetHash,
|
|
301
338
|
familyRevision,
|
|
302
339
|
durationMs,
|
|
303
340
|
agentTracePath: workdir.agentTracePath,
|
|
341
|
+
supervisorTracePath: workdir.supervisorTracePath,
|
|
304
342
|
judgeTracePath: workdir.judgeTracePath,
|
|
305
343
|
};
|
|
306
344
|
}
|
|
@@ -341,7 +379,7 @@ async function writeRecord(stream, record) {
|
|
|
341
379
|
* is missing or non-executable, before any agent session starts."
|
|
342
380
|
*/
|
|
343
381
|
async function assertPreflightExecutable(task) {
|
|
344
|
-
const path = join(task.paths.
|
|
382
|
+
const path = join(task.paths.hooks, "preflight.sh");
|
|
345
383
|
try {
|
|
346
384
|
await access(path, constants.X_OK);
|
|
347
385
|
} catch (e) {
|
|
@@ -352,35 +390,67 @@ async function assertPreflightExecutable(task) {
|
|
|
352
390
|
}
|
|
353
391
|
|
|
354
392
|
/**
 * Split the combined supervisor trace into agent and supervisor files, and
 * extract cost, turn count, and submission in a single pass.
 *
 * Routing: every parseable line whose `source` is "agent" is copied to
 * `agentPath`; every other parseable line (supervisor, orchestrator, or
 * untagged) is copied to `supervisorPath`. Blank lines and lines that are
 * not valid JSON are dropped.
 *
 * Summary:
 *  - `costUsd` is the last agent `result.total_cost_usd` plus the last
 *    supervisor `result.total_cost_usd` (each 0 when absent);
 *  - `turns` comes from the orchestrator `summary` event (0 when absent);
 *  - `submission` is the text of the last agent `assistant` event that
 *    carried text ("" when there was none).
 *
 * @param {string} combinedPath - Combined NDJSON trace to read.
 * @param {string} agentPath - Destination for agent-source lines.
 * @param {string} supervisorPath - Destination for all other lines.
 * @returns {Promise<{costUsd: number, turns: number, submission: string}>}
 * @throws Rejects with the first read/write stream error, after both output
 *   streams have been closed.
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
async function splitAndSummarize(combinedPath, agentPath, supervisorPath) {
  const agentStream = createWriteStream(agentPath);
  const supStream = createWriteStream(supervisorPath);
  const input = createReadStream(combinedPath);
  const rl = createInterface({ input, crlfDelay: Infinity });

  // A stream 'error' with no listener is an uncaught exception that would
  // take down the whole benchmark process. Record the first failure, stop
  // reading, and surface it as a rejection once everything is closed.
  let failure = null;
  const recordFailure = (err) => {
    failure ??= err;
    rl.close();
  };
  input.on("error", recordFailure);
  agentStream.on("error", recordFailure);
  supStream.on("error", recordFailure);

  // Resolve once `stream` is finished or closed, whichever is reported first.
  const finish = (stream) => {
    if (stream.destroyed) {
      if (stream.errored) recordFailure(stream.errored);
      return Promise.resolve();
    }
    return new Promise((resolve) => {
      stream.once("close", resolve);
      stream.end((err) => {
        if (err) recordFailure(err);
        resolve();
      });
    });
  };

  let agentCost = 0;
  let supervisorCost = 0;
  let turns = 0;
  let submission = "";
  try {
    for await (const line of rl) {
      if (failure) break;
      if (!line.trim()) continue;
      let event;
      try {
        event = JSON.parse(line);
      } catch {
        continue;
      }
      // `JSON.parse` also accepts a bare `null`; optional chaining keeps
      // such a line from throwing and routes it like any untagged line.
      const source = event?.source;
      const target = source === "agent" ? agentStream : supStream;
      target.write(`${line}\n`);
      const inner = event?.event;
      if (!inner) continue;
      if (source === "agent") {
        if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
          agentCost = inner.total_cost_usd;
        }
        if (inner.type === "assistant") {
          // Later assistant text overwrites earlier text: last one wins.
          const text = extractText(inner);
          if (text) submission = text;
        }
      }
      if (source === "supervisor") {
        if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
          supervisorCost = inner.total_cost_usd;
        }
      }
      if (source === "orchestrator" && inner.type === "summary") {
        turns = inner.turns ?? 0;
      }
    }
  } finally {
    // Always release the reader and both writers, even when the loop threw.
    rl.close();
    input.destroy();
    await Promise.all([finish(agentStream), finish(supStream)]);
  }
  if (failure) throw failure;
  return { costUsd: agentCost + supervisorCost, turns, submission };
}
|
|
372
446
|
|
|
373
|
-
function
|
|
374
|
-
const
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
if (
|
|
378
|
-
const content = turn.content ?? [];
|
|
379
|
-
for (let j = content.length - 1; j >= 0; j--) {
|
|
380
|
-
if (content[j].type === "text" && content[j].text) return content[j].text;
|
|
381
|
-
}
|
|
447
|
+
/**
 * Return the text of the last non-empty `text` entry in an event's content
 * array, or `null` when there is none.
 *
 * Content is read from `inner.message.content`, falling back to
 * `inner.content`. Anything that is not an array (for example a plain
 * string) yields `null`, and malformed entries inside the array are skipped
 * rather than throwing, so one bad trace line cannot abort the whole split.
 *
 * @param {object | null | undefined} inner - The wrapped trace event.
 * @returns {string | null}
 */
function extractText(inner) {
  const content = inner?.message?.content ?? inner?.content;
  if (!Array.isArray(content)) return null;
  // Walk backwards so the last text entry wins.
  for (let i = content.length - 1; i >= 0; i--) {
    const part = content[i];
    if (part?.type === "text" && part.text) return part.text;
  }
  return null;
}
|
|
385
455
|
|
|
386
456
|
/**
|
package/src/benchmark/scorer.js
CHANGED
|
@@ -1,10 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Scorer — runs `<task.paths.
|
|
2
|
+
* Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
|
|
3
3
|
* the post-run agent CWD. The exit code is authoritative for the verdict;
|
|
4
4
|
* structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
|
|
5
|
-
*
|
|
6
|
-
* Scoring scripts are never copied into the agent CWD — they live only in the
|
|
7
|
-
* task template (design Decision 3).
|
|
8
5
|
*/
|
|
9
6
|
|
|
10
7
|
import { spawn } from "node:child_process";
|
|
@@ -32,7 +29,7 @@ import { join } from "node:path";
|
|
|
32
29
|
*/
|
|
33
30
|
export function runScoring(task, ctx) {
|
|
34
31
|
return new Promise((res, rej) => {
|
|
35
|
-
const script = join(task.paths.
|
|
32
|
+
const script = join(task.paths.hooks, "score.sh");
|
|
36
33
|
const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
|
|
37
34
|
|
|
38
35
|
// Bun's child_process pipe setup for fd >= 3 is racy under load (it
|
|
@@ -4,13 +4,14 @@
|
|
|
4
4
|
* apm.lock.yaml
|
|
5
5
|
* .claude/ # pre-staged skills + agents (P1)
|
|
6
6
|
* tasks/<task_name>/
|
|
7
|
-
*
|
|
7
|
+
* agent.task.md
|
|
8
8
|
* supervisor.task.md # preserved for v2; not read in v1
|
|
9
9
|
* judge.task.md
|
|
10
|
+
* hooks/ # harness-only; never copied to agent CWD
|
|
11
|
+
* preflight.sh
|
|
12
|
+
* score.sh
|
|
10
13
|
* specs/ # copied into agent CWD
|
|
11
|
-
* workdir/ # copied into agent CWD
|
|
12
|
-
* scripts/preflight.sh
|
|
13
|
-
* scoring/ # template-only; never copied
|
|
14
|
+
* workdir/ # copied into agent CWD
|
|
14
15
|
*
|
|
15
16
|
* Local paths or git URLs are both accepted; git URLs are shallow-cloned into
|
|
16
17
|
* a temp dir and `familyRevision` becomes `git:<sha>` of HEAD at clone time.
|
|
@@ -53,13 +54,11 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
|
|
|
53
54
|
familyRevision = "sha256:" + (await canonicalTreeHash(rootPath));
|
|
54
55
|
}
|
|
55
56
|
|
|
56
|
-
const apmLockBytes = await readApmLockBytes(rootPath);
|
|
57
57
|
const tasks = await discoverTasks(rootPath);
|
|
58
58
|
|
|
59
59
|
return {
|
|
60
60
|
rootPath,
|
|
61
61
|
familyRevision,
|
|
62
|
-
apmLockBytes,
|
|
63
62
|
tasks() {
|
|
64
63
|
return tasks;
|
|
65
64
|
},
|
|
@@ -67,58 +66,27 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
|
|
|
67
66
|
}
|
|
68
67
|
|
|
69
68
|
/**
 * Assert that `<judgeProfilesDir>/<judgeProfile>.md` exists. Called from
 * `BenchmarkRunner.run()` so a missing judge profile fails the family
 * install before any agent session starts.
 *
 * @param {TaskFamily} _family - Unused; kept so existing callers keep the
 *   same positional signature.
 * @param {string} judgeProfilesDir - Directory holding staged judge profiles.
 * @param {string} judgeProfile - Profile name, without the `.md` extension.
 * @returns {Promise<void>}
 * @throws {Error} `judge profile not staged: <path>` when the file cannot be
 *   accessed; the underlying filesystem error is attached as `cause`.
 */
export async function assertJudgeProfileStaged(
  _family,
  judgeProfilesDir,
  judgeProfile,
) {
  const candidate = join(judgeProfilesDir, `${judgeProfile}.md`);
  try {
    await access(candidate);
  } catch (cause) {
    // Keep the original failure (ENOENT, EACCES, ...) reachable for
    // debugging instead of discarding it.
    throw new Error(`judge profile not staged: ${candidate}`, { cause });
  }
}
|
|
92
89
|
|
|
93
|
-
async function readApmLockBytes(rootPath) {
|
|
94
|
-
const lockPath = join(rootPath, "apm.lock.yaml");
|
|
95
|
-
try {
|
|
96
|
-
const raw = await readFile(lockPath);
|
|
97
|
-
return normalizeLf(raw);
|
|
98
|
-
} catch (e) {
|
|
99
|
-
if (e.code === "ENOENT") {
|
|
100
|
-
throw new Error(
|
|
101
|
-
`task family missing apm.lock.yaml at ${lockPath} (matches libpack stager.js:126; .yml is not accepted)`,
|
|
102
|
-
);
|
|
103
|
-
}
|
|
104
|
-
throw e;
|
|
105
|
-
}
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
/**
|
|
109
|
-
* Replace CRLF with LF so cross-OS authored lockfiles hash identically.
|
|
110
|
-
* @param {Buffer} buf
|
|
111
|
-
* @returns {Buffer}
|
|
112
|
-
*/
|
|
113
|
-
function normalizeLf(buf) {
|
|
114
|
-
const out = [];
|
|
115
|
-
for (let i = 0; i < buf.length; i++) {
|
|
116
|
-
if (buf[i] === 0x0d && i + 1 < buf.length && buf[i + 1] === 0x0a) continue;
|
|
117
|
-
out.push(buf[i]);
|
|
118
|
-
}
|
|
119
|
-
return Buffer.from(out);
|
|
120
|
-
}
|
|
121
|
-
|
|
122
90
|
async function discoverTasks(rootPath) {
|
|
123
91
|
const tasksRoot = join(rootPath, "tasks");
|
|
124
92
|
const tasks = [];
|
|
@@ -135,12 +103,12 @@ async function discoverTasks(rootPath) {
|
|
|
135
103
|
tasks.push({
|
|
136
104
|
id: entry.name,
|
|
137
105
|
paths: {
|
|
138
|
-
instructions: join(taskDir, "
|
|
106
|
+
instructions: join(taskDir, "agent.task.md"),
|
|
139
107
|
supervisor: join(taskDir, "supervisor.task.md"),
|
|
140
108
|
judge: join(taskDir, "judge.task.md"),
|
|
109
|
+
hooks: join(taskDir, "hooks"),
|
|
141
110
|
specs: join(taskDir, "specs"),
|
|
142
111
|
workdir: join(taskDir, "workdir"),
|
|
143
|
-
scoring: join(taskDir, "scoring"),
|
|
144
112
|
},
|
|
145
113
|
});
|
|
146
114
|
}
|
|
@@ -242,13 +210,12 @@ function run(cmd, args) {
|
|
|
242
210
|
/**
|
|
243
211
|
* @typedef {object} Task
|
|
244
212
|
* @property {string} id - Task name (directory name under tasks/)
|
|
245
|
-
* @property {{instructions: string, supervisor: string, judge: string,
|
|
213
|
+
* @property {{instructions: string, supervisor: string, judge: string, hooks: string, specs: string, workdir: string}} paths
|
|
246
214
|
*/
|
|
247
215
|
|
|
248
216
|
/**
|
|
249
217
|
* @typedef {object} TaskFamily
|
|
250
218
|
* @property {string} rootPath
|
|
251
219
|
* @property {string} familyRevision - `git:<sha>` or `sha256:<hex>`
|
|
252
|
-
* @property {Buffer} apmLockBytes - LF-normalised
|
|
253
220
|
* @property {() => Task[]} tasks
|
|
254
221
|
*/
|
package/src/benchmark/workdir.js
CHANGED
|
@@ -11,9 +11,8 @@ import { spawn } from "node:child_process";
|
|
|
11
11
|
import { cp, mkdir } from "node:fs/promises";
|
|
12
12
|
import { createServer } from "node:net";
|
|
13
13
|
import { connect } from "node:net";
|
|
14
|
-
import { join
|
|
14
|
+
import { join } from "node:path";
|
|
15
15
|
|
|
16
|
-
const PREFLIGHT_REL = join("workdir", "scripts");
|
|
17
16
|
const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
18
17
|
|
|
19
18
|
/**
|
|
@@ -24,6 +23,7 @@ const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
|
24
23
|
* @property {number} pgid - Process-group id captured from the preflight child.
|
|
25
24
|
* @property {*} scaffold - Reserved per design § Components; v1 sets null.
|
|
26
25
|
* @property {string} agentTracePath
|
|
26
|
+
* @property {string} supervisorTracePath
|
|
27
27
|
* @property {string} judgeTracePath
|
|
28
28
|
* @property {{phase: string, message: string, exitCode: number}} [preflightError]
|
|
29
29
|
*/
|
|
@@ -55,9 +55,8 @@ export class WorkdirManager {
|
|
|
55
55
|
const cwd = join(runDir, "cwd");
|
|
56
56
|
await mkdir(cwd, { recursive: true });
|
|
57
57
|
|
|
58
|
-
await cp(task.paths.workdir, cwd, {
|
|
59
|
-
|
|
60
|
-
filter: (src) => !src.endsWith(sep + PREFLIGHT_REL),
|
|
58
|
+
await cp(task.paths.workdir, cwd, { recursive: true }).catch((e) => {
|
|
59
|
+
if (e.code !== "ENOENT") throw e;
|
|
61
60
|
});
|
|
62
61
|
await cp(task.paths.specs, join(cwd, "specs"), {
|
|
63
62
|
recursive: true,
|
|
@@ -70,9 +69,10 @@ export class WorkdirManager {
|
|
|
70
69
|
|
|
71
70
|
const port = await allocatePort();
|
|
72
71
|
const agentTracePath = join(runDir, "agent.ndjson");
|
|
72
|
+
const supervisorTracePath = join(runDir, "supervisor.ndjson");
|
|
73
73
|
const judgeTracePath = join(runDir, "judge.ndjson");
|
|
74
74
|
|
|
75
|
-
const preflightScript = join(task.paths.
|
|
75
|
+
const preflightScript = join(task.paths.hooks, "preflight.sh");
|
|
76
76
|
const preflight = await runPreflight(preflightScript, cwd, port);
|
|
77
77
|
|
|
78
78
|
return {
|
|
@@ -82,6 +82,7 @@ export class WorkdirManager {
|
|
|
82
82
|
pgid: preflight.pgid,
|
|
83
83
|
scaffold: null,
|
|
84
84
|
agentTracePath,
|
|
85
|
+
supervisorTracePath,
|
|
85
86
|
judgeTracePath,
|
|
86
87
|
...(preflight.error && { preflightError: preflight.error }),
|
|
87
88
|
};
|