@forwardimpact/libeval 0.1.36 → 0.1.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +32 -7
- package/bin/fit-eval.js +24 -3
- package/bin/fit-trace.js +42 -0
- package/package.json +2 -1
- package/src/benchmark/apm-installer.js +78 -16
- package/src/benchmark/env-loader.js +146 -0
- package/src/benchmark/judge.js +4 -3
- package/src/benchmark/report.js +43 -17
- package/src/benchmark/result.js +9 -3
- package/src/benchmark/runner.js +164 -117
- package/src/benchmark/scorer.js +5 -5
- package/src/benchmark/task-family.js +43 -50
- package/src/benchmark/workdir.js +21 -8
- package/src/commands/assert.js +145 -0
- package/src/commands/benchmark-report.js +1 -2
- package/src/commands/benchmark-run.js +11 -4
- package/src/commands/facilitate.js +4 -2
- package/src/commands/run.js +3 -3
- package/src/commands/supervise.js +5 -2
- package/src/facilitator.js +7 -3
- package/src/supervisor.js +42 -12
package/src/benchmark/runner.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Phases per (task, runIndex):
|
|
5
5
|
* 1. WorkdirManager.start → seed CWD + run pre-flight probe
|
|
6
|
-
* 2.
|
|
6
|
+
* 2. Supervisor relay (agent + supervisor) → produce traces + submission
|
|
7
7
|
* 3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
|
|
8
8
|
* 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
|
|
9
9
|
* 5. WorkdirManager.teardown → process-group cleanup
|
|
@@ -15,15 +15,12 @@
|
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
17
|
import { createReadStream, createWriteStream } from "node:fs";
|
|
18
|
-
import {
|
|
18
|
+
import { mkdir, readFile, unlink } from "node:fs/promises";
|
|
19
19
|
import { createInterface } from "node:readline";
|
|
20
20
|
import { join, resolve as resolvePath } from "node:path";
|
|
21
21
|
|
|
22
|
-
import {
|
|
23
|
-
import {
|
|
24
|
-
import { createRedactor } from "../redaction.js";
|
|
25
|
-
import { AGENT_SYSTEM_PROMPT } from "../supervisor.js";
|
|
26
|
-
import { createTraceCollector } from "../trace-collector.js";
|
|
22
|
+
import { DEFAULT_ENV_ALLOWLIST, createRedactor } from "../redaction.js";
|
|
23
|
+
import { createSupervisor } from "../supervisor.js";
|
|
27
24
|
import { installApm } from "./apm-installer.js";
|
|
28
25
|
import { runJudge } from "./judge.js";
|
|
29
26
|
import { validateResultRecord } from "./result.js";
|
|
@@ -31,7 +28,16 @@ import { runScoring } from "./scorer.js";
|
|
|
31
28
|
import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
|
|
32
29
|
import { createWorkdirManager } from "./workdir.js";
|
|
33
30
|
|
|
34
|
-
const BASE_TOOLS = [
|
|
31
|
+
const BASE_TOOLS = [
|
|
32
|
+
"Bash",
|
|
33
|
+
"Read",
|
|
34
|
+
"Glob",
|
|
35
|
+
"Grep",
|
|
36
|
+
"Write",
|
|
37
|
+
"Edit",
|
|
38
|
+
"Agent",
|
|
39
|
+
"TodoWrite",
|
|
40
|
+
];
|
|
35
41
|
|
|
36
42
|
/** Sole orchestrator for a task-family benchmark run. */
|
|
37
43
|
export class BenchmarkRunner {
|
|
@@ -40,9 +46,12 @@ export class BenchmarkRunner {
|
|
|
40
46
|
* @param {import("./task-family.js").TaskFamily | string} opts.family
|
|
41
47
|
* @param {number} opts.runs - Runs per task (≥ 1).
|
|
42
48
|
* @param {string} opts.output - Run-output directory.
|
|
43
|
-
* @param {string} opts.
|
|
49
|
+
* @param {string} opts.agentModel
|
|
50
|
+
* @param {string} opts.supervisorModel
|
|
51
|
+
* @param {string} opts.judgeModel
|
|
44
52
|
* @param {{agent?: string, judge?: string}} [opts.profiles]
|
|
45
53
|
* @param {Function} opts.query - SDK query (injected for testability).
|
|
54
|
+
* @param {string[]} [opts.allowedTools] - Agent tool allowlist (default: BASE_TOOLS).
|
|
46
55
|
* @param {number} [opts.maxTurns] - Agent-under-test turn budget.
|
|
47
56
|
* @param {number} [opts.termGraceMs] - SIGTERM→SIGKILL grace (ms) for the per-task process group.
|
|
48
57
|
* @param {Function} [opts.runAgent] - Test seam: replaces the agent-under-test
|
|
@@ -60,9 +69,12 @@ export class BenchmarkRunner {
|
|
|
60
69
|
family,
|
|
61
70
|
runs,
|
|
62
71
|
output,
|
|
63
|
-
|
|
72
|
+
agentModel,
|
|
73
|
+
supervisorModel,
|
|
74
|
+
judgeModel,
|
|
64
75
|
profiles,
|
|
65
76
|
query,
|
|
77
|
+
allowedTools,
|
|
66
78
|
maxTurns,
|
|
67
79
|
termGraceMs,
|
|
68
80
|
// Test seams — default to the real implementations.
|
|
@@ -74,12 +86,15 @@ export class BenchmarkRunner {
|
|
|
74
86
|
if (!Number.isInteger(runs) || runs < 1)
|
|
75
87
|
throw new Error("runs must be an integer ≥ 1");
|
|
76
88
|
if (!output) throw new Error("output is required");
|
|
77
|
-
if (!
|
|
89
|
+
if (!agentModel) throw new Error("agentModel is required");
|
|
78
90
|
if (!query) throw new Error("query is required");
|
|
79
91
|
this.familyInput = family;
|
|
80
92
|
this.runs = runs;
|
|
81
93
|
this.output = output;
|
|
82
|
-
this.
|
|
94
|
+
this.agentModel = agentModel;
|
|
95
|
+
this.supervisorModel = supervisorModel;
|
|
96
|
+
this.judgeModel = judgeModel;
|
|
97
|
+
this.allowedTools = allowedTools ?? BASE_TOOLS;
|
|
83
98
|
this.profiles = {
|
|
84
99
|
agent: profiles?.agent ?? null,
|
|
85
100
|
judge: profiles?.judge ?? null,
|
|
@@ -103,20 +118,25 @@ export class BenchmarkRunner {
|
|
|
103
118
|
: this.familyInput;
|
|
104
119
|
|
|
105
120
|
await mkdir(this.output, { recursive: true });
|
|
106
|
-
const { stagingDir, skillSetHash } = await installApm(
|
|
121
|
+
const { stagingDir, skillSetHash, judgeProfilesDir } = await installApm(
|
|
122
|
+
family,
|
|
123
|
+
this.output,
|
|
124
|
+
);
|
|
107
125
|
|
|
108
126
|
const tasks = family.tasks();
|
|
109
|
-
for (const task of tasks) {
|
|
110
|
-
await assertPreflightExecutable(task);
|
|
111
|
-
}
|
|
112
127
|
if (this.profiles.judge) {
|
|
113
|
-
await assertJudgeProfileStaged(
|
|
128
|
+
await assertJudgeProfileStaged(
|
|
129
|
+
family,
|
|
130
|
+
judgeProfilesDir,
|
|
131
|
+
this.profiles.judge,
|
|
132
|
+
);
|
|
114
133
|
}
|
|
115
134
|
|
|
116
135
|
const wm = createWorkdirManager({
|
|
117
136
|
stagingDir,
|
|
118
137
|
runOutputDir: this.output,
|
|
119
138
|
termGraceMs: this.termGraceMs,
|
|
139
|
+
familyRootPath: family.rootPath,
|
|
120
140
|
});
|
|
121
141
|
|
|
122
142
|
const resultsPath = join(this.output, "results.jsonl");
|
|
@@ -130,6 +150,7 @@ export class BenchmarkRunner {
|
|
|
130
150
|
task,
|
|
131
151
|
runIndex,
|
|
132
152
|
skillSetHash,
|
|
153
|
+
judgeProfilesDir,
|
|
133
154
|
);
|
|
134
155
|
await writeRecord(resultsStream, record);
|
|
135
156
|
yield record;
|
|
@@ -140,7 +161,7 @@ export class BenchmarkRunner {
|
|
|
140
161
|
}
|
|
141
162
|
}
|
|
142
163
|
|
|
143
|
-
async #runOne(family, wm, task, runIndex, skillSetHash) {
|
|
164
|
+
async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
|
|
144
165
|
const t0 = Date.now();
|
|
145
166
|
const workdir = await wm.start(task, runIndex);
|
|
146
167
|
try {
|
|
@@ -165,42 +186,53 @@ export class BenchmarkRunner {
|
|
|
165
186
|
port: workdir.port,
|
|
166
187
|
runDir: workdir.runDir,
|
|
167
188
|
});
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
189
|
+
let judgeVerdict = null;
|
|
190
|
+
if (task.paths.judge) {
|
|
191
|
+
const judgeContext = await this.#buildJudgeContext(
|
|
192
|
+
task,
|
|
193
|
+
workdir,
|
|
194
|
+
skillSetHash,
|
|
195
|
+
);
|
|
196
|
+
judgeVerdict = await this._runJudgeHook(
|
|
197
|
+
task,
|
|
198
|
+
workdir,
|
|
199
|
+
scoring,
|
|
200
|
+
{
|
|
201
|
+
query: this.query,
|
|
202
|
+
model: this.judgeModel,
|
|
203
|
+
judgeProfile: this.profiles.judge ?? undefined,
|
|
204
|
+
profilesDir: judgeProfilesDir,
|
|
205
|
+
},
|
|
206
|
+
judgeContext,
|
|
207
|
+
);
|
|
208
|
+
}
|
|
209
|
+
const verdict =
|
|
210
|
+
scoring.verdict === "pass" &&
|
|
211
|
+
(judgeVerdict === null || judgeVerdict.verdict === "pass")
|
|
212
|
+
? "pass"
|
|
213
|
+
: "fail";
|
|
184
214
|
const record = {
|
|
185
215
|
taskId: task.id,
|
|
186
216
|
runIndex,
|
|
187
|
-
verdict
|
|
188
|
-
scoring.verdict === "pass" && judgeVerdict.verdict === "pass"
|
|
189
|
-
? "pass"
|
|
190
|
-
: "fail",
|
|
217
|
+
verdict,
|
|
191
218
|
scoring,
|
|
192
219
|
submission,
|
|
193
|
-
judgeVerdict,
|
|
220
|
+
...(judgeVerdict && { judgeVerdict }),
|
|
194
221
|
costUsd,
|
|
195
222
|
turns,
|
|
196
223
|
agentTracePath: workdir.agentTracePath,
|
|
224
|
+
supervisorTracePath: workdir.supervisorTracePath,
|
|
197
225
|
judgeTracePath: workdir.judgeTracePath,
|
|
198
226
|
profiles: {
|
|
199
227
|
agent: this.profiles.agent,
|
|
200
228
|
supervisor: null,
|
|
201
229
|
judge: this.profiles.judge,
|
|
202
230
|
},
|
|
203
|
-
model:
|
|
231
|
+
model: {
|
|
232
|
+
agent: this.agentModel,
|
|
233
|
+
supervisor: this.supervisorModel,
|
|
234
|
+
judge: this.judgeModel,
|
|
235
|
+
},
|
|
204
236
|
skillSetHash,
|
|
205
237
|
familyRevision: family.familyRevision,
|
|
206
238
|
durationMs: Date.now() - t0,
|
|
@@ -236,54 +268,49 @@ export class BenchmarkRunner {
|
|
|
236
268
|
}
|
|
237
269
|
|
|
238
270
|
/**
|
|
239
|
-
* Run the agent-under-test
|
|
240
|
-
*
|
|
241
|
-
*
|
|
242
|
-
* with one terminal `result` event.
|
|
243
|
-
*
|
|
244
|
-
* Inspects both thrown errors AND the resolved `{success, aborted, error}`
|
|
245
|
-
* shape returned by `AgentRunner.run()` (agent-runner.js:69, 166–194):
|
|
246
|
-
* the SDK iterator catches its own errors and resolves with `success:
|
|
247
|
-
* false`, so a try/catch alone would silently treat a failed session as
|
|
248
|
-
* a successful one (plan Step 8.5.c).
|
|
271
|
+
* Run the agent-under-test via a Supervisor relay. The supervisor writes
|
|
272
|
+
* a combined tagged NDJSON trace; after the session we split it into
|
|
273
|
+
* agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
|
|
249
274
|
*/
|
|
250
275
|
async #runAgent(task, workdir) {
|
|
251
|
-
const
|
|
252
|
-
const
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
:
|
|
258
|
-
|
|
259
|
-
cwd: workdir.cwd,
|
|
276
|
+
const combinedPath = join(workdir.runDir, ".combined.ndjson");
|
|
277
|
+
const combinedStream = createWriteStream(combinedPath);
|
|
278
|
+
const supervisorInstructions = task.paths.supervisor
|
|
279
|
+
? await readFile(task.paths.supervisor, "utf8").catch(() => null)
|
|
280
|
+
: null;
|
|
281
|
+
const supervisor = createSupervisor({
|
|
282
|
+
supervisorCwd: workdir.cwd,
|
|
283
|
+
agentCwd: workdir.cwd,
|
|
260
284
|
query: this.query,
|
|
261
|
-
output:
|
|
262
|
-
|
|
285
|
+
output: combinedStream,
|
|
286
|
+
agentModel: this.agentModel,
|
|
287
|
+
supervisorModel: this.supervisorModel,
|
|
263
288
|
maxTurns: this.maxTurns ?? 50,
|
|
264
|
-
allowedTools:
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
redactor: createRedactor(
|
|
289
|
+
allowedTools: this.allowedTools,
|
|
290
|
+
...(this.profiles.agent && { agentProfile: this.profiles.agent }),
|
|
291
|
+
...(supervisorInstructions && { taskAmend: supervisorInstructions }),
|
|
292
|
+
redactor: createRedactor({
|
|
293
|
+
allowlist: [...DEFAULT_ENV_ALLOWLIST, ...(workdir.envNames ?? [])],
|
|
294
|
+
}),
|
|
268
295
|
});
|
|
269
296
|
const instructions = await readFile(task.paths.instructions, "utf8");
|
|
270
297
|
let agentError = null;
|
|
271
298
|
try {
|
|
272
|
-
const result = await
|
|
273
|
-
if (!result.success) {
|
|
274
|
-
agentError = {
|
|
275
|
-
message:
|
|
276
|
-
result.error?.message ??
|
|
277
|
-
(result.aborted ? "aborted" : "agent did not succeed"),
|
|
278
|
-
aborted: result.aborted ?? false,
|
|
279
|
-
};
|
|
299
|
+
const result = await supervisor.run(instructions);
|
|
300
|
+
if (!result.success && !result.concluded) {
|
|
301
|
+
agentError = { message: "supervisor did not succeed", aborted: false };
|
|
280
302
|
}
|
|
281
303
|
} catch (e) {
|
|
282
304
|
agentError = { message: e.message ?? String(e), aborted: false };
|
|
283
305
|
} finally {
|
|
284
|
-
await new Promise((r) =>
|
|
306
|
+
await new Promise((r) => combinedStream.end(r));
|
|
285
307
|
}
|
|
286
|
-
const summary = await
|
|
308
|
+
const summary = await splitAndSummarize(
|
|
309
|
+
combinedPath,
|
|
310
|
+
workdir.agentTracePath,
|
|
311
|
+
workdir.supervisorTracePath,
|
|
312
|
+
);
|
|
313
|
+
await unlink(combinedPath).catch(() => {});
|
|
287
314
|
return { ...summary, agentError };
|
|
288
315
|
}
|
|
289
316
|
|
|
@@ -321,11 +348,16 @@ export class BenchmarkRunner {
|
|
|
321
348
|
supervisor: null,
|
|
322
349
|
judge: this.profiles.judge,
|
|
323
350
|
},
|
|
324
|
-
model:
|
|
351
|
+
model: {
|
|
352
|
+
agent: this.agentModel,
|
|
353
|
+
supervisor: this.supervisorModel,
|
|
354
|
+
judge: this.judgeModel,
|
|
355
|
+
},
|
|
325
356
|
skillSetHash,
|
|
326
357
|
familyRevision,
|
|
327
358
|
durationMs,
|
|
328
359
|
agentTracePath: workdir.agentTracePath,
|
|
360
|
+
supervisorTracePath: workdir.supervisorTracePath,
|
|
329
361
|
judgeTracePath: workdir.judgeTracePath,
|
|
330
362
|
};
|
|
331
363
|
}
|
|
@@ -360,52 +392,67 @@ async function writeRecord(stream, record) {
|
|
|
360
392
|
}
|
|
361
393
|
|
|
362
394
|
/**
|
|
363
|
-
*
|
|
364
|
-
*
|
|
365
|
-
*
|
|
366
|
-
*
|
|
395
|
+
* Split the combined supervisor trace into agent and supervisor files, and
|
|
396
|
+
* extract cost, turn count, and submission in a single pass. Agent-source
|
|
397
|
+
* events go to `agentPath`; supervisor and orchestrator events go to
|
|
398
|
+
* `supervisorPath`.
|
|
367
399
|
*/
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
400
|
+
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
|
|
401
|
+
async function splitAndSummarize(combinedPath, agentPath, supervisorPath) {
  const agentStream = createWriteStream(agentPath);
  const supStream = createWriteStream(supervisorPath);
  const input = createReadStream(combinedPath);
  const rl = createInterface({ input, crlfDelay: Infinity });

  let agentCost = 0;
  let supervisorCost = 0;
  let turns = 0;
  let submission = "";

  try {
    for await (const line of rl) {
      if (!line.trim()) continue;
      let event;
      try {
        event = JSON.parse(line);
      } catch {
        // Lines that are not valid JSON are dropped from both outputs.
        continue;
      }
      // `null` (and other JSON scalars) parse successfully but carry no
      // `source`; optional chaining routes them like any other untagged
      // line instead of throwing a TypeError.
      const source = event?.source;
      // Only agent-tagged lines go to the agent trace; everything else
      // (supervisor, orchestrator, untagged) goes to the supervisor trace.
      const target = source === "agent" ? agentStream : supStream;
      target.write(`${line}\n`);

      const inner = event?.event;
      if (!inner) continue;

      const hasCost =
        inner.type === "result" && typeof inner.total_cost_usd === "number";
      if (source === "agent") {
        // The last `result` event wins; it carries the running total.
        if (hasCost) agentCost = inner.total_cost_usd;
        if (inner.type === "assistant") {
          // The submission is the last non-empty assistant text seen.
          const text = extractText(inner);
          if (text) submission = text;
        }
      } else if (source === "supervisor") {
        if (hasCost) supervisorCost = inner.total_cost_usd;
      } else if (source === "orchestrator" && inner.type === "summary") {
        turns = inner.turns ?? 0;
      }
    }
  } finally {
    // Always release the file handles, even when the loop fails part-way.
    rl.close();
    input.destroy();
    await Promise.all([
      new Promise((r) => agentStream.end(r)),
      new Promise((r) => supStream.end(r)),
    ]);
  }

  return { costUsd: agentCost + supervisorCost, turns, submission };
}
|
|
378
448
|
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
const collector = createTraceCollector();
|
|
385
|
-
const stream = createReadStream(tracePath);
|
|
386
|
-
const rl = createInterface({ input: stream, crlfDelay: Infinity });
|
|
387
|
-
for await (const line of rl) collector.addLine(line);
|
|
388
|
-
const json = collector.toJSON();
|
|
389
|
-
const summary = json.summary ?? {};
|
|
390
|
-
return {
|
|
391
|
-
costUsd:
|
|
392
|
-
typeof summary.totalCostUsd === "number" ? summary.totalCostUsd : 0,
|
|
393
|
-
turns: typeof summary.numTurns === "number" ? summary.numTurns : 0,
|
|
394
|
-
submission: lastAssistantText(json),
|
|
395
|
-
};
|
|
396
|
-
}
|
|
397
|
-
|
|
398
|
-
function lastAssistantText(json) {
|
|
399
|
-
const turns = json.turns ?? [];
|
|
400
|
-
for (let i = turns.length - 1; i >= 0; i--) {
|
|
401
|
-
const turn = turns[i];
|
|
402
|
-
if (turn.role !== "assistant") continue;
|
|
403
|
-
const content = turn.content ?? [];
|
|
404
|
-
for (let j = content.length - 1; j >= 0; j--) {
|
|
405
|
-
if (content[j].type === "text" && content[j].text) return content[j].text;
|
|
406
|
-
}
|
|
449
|
+
/**
 * Return the last non-empty text block of an event's content, or `null`.
 *
 * Content is read from `inner.message.content`, falling back to
 * `inner.content`. Anything other than an array yields `null`.
 *
 * @param {object} inner - Inner event object of a trace line.
 * @returns {string | null} Text of the last `type: "text"` entry with
 *   non-empty `text`, or `null` when there is none.
 */
function extractText(inner) {
  const content = inner.message?.content ?? inner.content;
  if (!Array.isArray(content)) return null;
  // Walk backwards so the most recent text block wins.
  for (let i = content.length - 1; i >= 0; i--) {
    const block = content[i];
    // Entries may be null or non-objects in malformed traces; skip them
    // rather than throwing.
    if (block?.type === "text" && block.text) return block.text;
  }
  return null;
}
|
|
410
457
|
|
|
411
458
|
/**
|
package/src/benchmark/scorer.js
CHANGED
|
@@ -1,10 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Scorer — runs `<task.paths.
|
|
2
|
+
* Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
|
|
3
3
|
* the post-run agent CWD. The exit code is authoritative for the verdict;
|
|
4
4
|
* structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
|
|
5
|
-
*
|
|
6
|
-
* Scoring scripts are never copied into the agent CWD — they live only in the
|
|
7
|
-
* task template (design Decision 3).
|
|
8
5
|
*/
|
|
9
6
|
|
|
10
7
|
import { spawn } from "node:child_process";
|
|
@@ -31,8 +28,11 @@ import { join } from "node:path";
|
|
|
31
28
|
* @returns {Promise<ScoringResult>}
|
|
32
29
|
*/
|
|
33
30
|
export function runScoring(task, ctx) {
|
|
31
|
+
if (!task.paths.score) {
|
|
32
|
+
return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
|
|
33
|
+
}
|
|
34
34
|
return new Promise((res, rej) => {
|
|
35
|
-
const script =
|
|
35
|
+
const script = task.paths.score;
|
|
36
36
|
const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
|
|
37
37
|
|
|
38
38
|
// Bun's child_process pipe setup for fd >= 3 is racy under load (it
|
|
@@ -4,13 +4,14 @@
|
|
|
4
4
|
* apm.lock.yaml
|
|
5
5
|
* .claude/ # pre-staged skills + agents (P1)
|
|
6
6
|
* tasks/<task_name>/
|
|
7
|
-
*
|
|
8
|
-
* supervisor.task.md #
|
|
7
|
+
* agent.task.md
|
|
8
|
+
* supervisor.task.md # optional; appended to the task as supervisor context
|
|
9
9
|
* judge.task.md
|
|
10
|
+
* hooks/ # harness-only; never copied to agent CWD
|
|
11
|
+
* preflight.sh
|
|
12
|
+
* score.sh
|
|
10
13
|
* specs/ # copied into agent CWD
|
|
11
|
-
* workdir/ # copied into agent CWD
|
|
12
|
-
* scripts/preflight.sh
|
|
13
|
-
* scoring/ # template-only; never copied
|
|
14
|
+
* workdir/ # copied into agent CWD
|
|
14
15
|
*
|
|
15
16
|
* Local paths or git URLs are both accepted; git URLs are shallow-cloned into
|
|
16
17
|
* a temp dir and `familyRevision` becomes `git:<sha>` of HEAD at clone time.
|
|
@@ -22,6 +23,7 @@ import { spawn } from "node:child_process";
|
|
|
22
23
|
import { createHash } from "node:crypto";
|
|
23
24
|
import {
|
|
24
25
|
access,
|
|
26
|
+
constants,
|
|
25
27
|
lstat,
|
|
26
28
|
mkdtemp,
|
|
27
29
|
readdir,
|
|
@@ -53,13 +55,11 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
|
|
|
53
55
|
familyRevision = "sha256:" + (await canonicalTreeHash(rootPath));
|
|
54
56
|
}
|
|
55
57
|
|
|
56
|
-
const apmLockBytes = await readApmLockBytes(rootPath);
|
|
57
58
|
const tasks = await discoverTasks(rootPath);
|
|
58
59
|
|
|
59
60
|
return {
|
|
60
61
|
rootPath,
|
|
61
62
|
familyRevision,
|
|
62
|
-
apmLockBytes,
|
|
63
63
|
tasks() {
|
|
64
64
|
return tasks;
|
|
65
65
|
},
|
|
@@ -67,58 +67,27 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
|
|
|
67
67
|
}
|
|
68
68
|
|
|
69
69
|
/**
|
|
70
|
-
* Assert that `<
|
|
71
|
-
*
|
|
70
|
+
* Assert that `<judgeProfilesDir>/<judgeProfile>.md` exists. Called from
|
|
71
|
+
* `BenchmarkRunner.run()` so a missing judge profile fails the family
|
|
72
72
|
* install before any agent session starts.
|
|
73
73
|
* @param {TaskFamily} _family
|
|
74
|
-
* @param {string}
|
|
74
|
+
* @param {string} judgeProfilesDir
|
|
75
75
|
* @param {string} judgeProfile
|
|
76
76
|
* @returns {Promise<void>}
|
|
77
77
|
*/
|
|
78
78
|
export async function assertJudgeProfileStaged(
  _family,
  judgeProfilesDir,
  judgeProfile,
) {
  // `_family` is unused; it is kept so existing call sites stay valid.
  const candidate = join(judgeProfilesDir, `${judgeProfile}.md`);
  try {
    await access(candidate);
  } catch (cause) {
    // Attach the underlying fs error so a missing file (ENOENT) can be told
    // apart from e.g. a permission problem (EACCES) when debugging.
    throw new Error(`judge profile not staged: ${candidate}`, { cause });
  }
}
|
|
92
90
|
|
|
93
|
-
async function readApmLockBytes(rootPath) {
|
|
94
|
-
const lockPath = join(rootPath, "apm.lock.yaml");
|
|
95
|
-
try {
|
|
96
|
-
const raw = await readFile(lockPath);
|
|
97
|
-
return normalizeLf(raw);
|
|
98
|
-
} catch (e) {
|
|
99
|
-
if (e.code === "ENOENT") {
|
|
100
|
-
throw new Error(
|
|
101
|
-
`task family missing apm.lock.yaml at ${lockPath} (matches libpack stager.js:126; .yml is not accepted)`,
|
|
102
|
-
);
|
|
103
|
-
}
|
|
104
|
-
throw e;
|
|
105
|
-
}
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
/**
|
|
109
|
-
* Replace CRLF with LF so cross-OS authored lockfiles hash identically.
|
|
110
|
-
* @param {Buffer} buf
|
|
111
|
-
* @returns {Buffer}
|
|
112
|
-
*/
|
|
113
|
-
function normalizeLf(buf) {
|
|
114
|
-
const out = [];
|
|
115
|
-
for (let i = 0; i < buf.length; i++) {
|
|
116
|
-
if (buf[i] === 0x0d && i + 1 < buf.length && buf[i + 1] === 0x0a) continue;
|
|
117
|
-
out.push(buf[i]);
|
|
118
|
-
}
|
|
119
|
-
return Buffer.from(out);
|
|
120
|
-
}
|
|
121
|
-
|
|
122
91
|
async function discoverTasks(rootPath) {
|
|
123
92
|
const tasksRoot = join(rootPath, "tasks");
|
|
124
93
|
const tasks = [];
|
|
@@ -132,15 +101,22 @@ async function discoverTasks(rootPath) {
|
|
|
132
101
|
for (const entry of entries) {
|
|
133
102
|
if (!entry.isDirectory()) continue;
|
|
134
103
|
const taskDir = join(tasksRoot, entry.name);
|
|
104
|
+
const supervisorPath = join(taskDir, "supervisor.task.md");
|
|
105
|
+
const judgePath = join(taskDir, "judge.task.md");
|
|
106
|
+
const preflightPath = join(taskDir, "hooks", "preflight.sh");
|
|
107
|
+
const scorePath = join(taskDir, "hooks", "score.sh");
|
|
135
108
|
tasks.push({
|
|
136
109
|
id: entry.name,
|
|
137
110
|
paths: {
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
111
|
+
taskDir,
|
|
112
|
+
instructions: join(taskDir, "agent.task.md"),
|
|
113
|
+
supervisor: (await fileExists(supervisorPath)) ? supervisorPath : null,
|
|
114
|
+
judge: (await fileExists(judgePath)) ? judgePath : null,
|
|
115
|
+
hooks: join(taskDir, "hooks"),
|
|
116
|
+
preflight: (await fileExecutable(preflightPath)) ? preflightPath : null,
|
|
117
|
+
score: (await fileExecutable(scorePath)) ? scorePath : null,
|
|
141
118
|
specs: join(taskDir, "specs"),
|
|
142
119
|
workdir: join(taskDir, "workdir"),
|
|
143
|
-
scoring: join(taskDir, "scoring"),
|
|
144
120
|
},
|
|
145
121
|
});
|
|
146
122
|
}
|
|
@@ -148,6 +124,24 @@ async function discoverTasks(rootPath) {
|
|
|
148
124
|
return tasks;
|
|
149
125
|
}
|
|
150
126
|
|
|
127
|
+
/**
 * Report whether `path` can be accessed with the given mode.
 *
 * Any failure of `access` (missing file, permission denied, …) is reported
 * as `false`; this function never rejects.
 *
 * @param {string} path - File-system path to probe.
 * @param {number} [mode=constants.F_OK] - `fs.constants` access mode.
 * @returns {Promise<boolean>}
 */
async function fileExists(path, mode = constants.F_OK) {
  try {
    await access(path, mode);
    return true;
  } catch {
    return false;
  }
}

/**
 * Report whether `path` passes an `X_OK` access check.
 *
 * @param {string} path - File-system path to probe.
 * @returns {Promise<boolean>}
 */
async function fileExecutable(path) {
  // Same probe as fileExists, only with the execute-permission mode.
  return fileExists(path, constants.X_OK);
}
|
|
144
|
+
|
|
151
145
|
/**
|
|
152
146
|
* Canonical-tree hash per design § Family revision algorithm:
|
|
153
147
|
* list regular files (excluding .git/, node_modules/)
|
|
@@ -242,13 +236,12 @@ function run(cmd, args) {
|
|
|
242
236
|
/**
|
|
243
237
|
* @typedef {object} Task
|
|
244
238
|
* @property {string} id - Task name (directory name under tasks/)
|
|
245
|
-
* @property {{instructions: string, supervisor: string, judge: string,
|
|
239
|
+
* @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null, score: string|null, specs: string, workdir: string}} paths
|
|
246
240
|
*/
|
|
247
241
|
|
|
248
242
|
/**
|
|
249
243
|
* @typedef {object} TaskFamily
|
|
250
244
|
* @property {string} rootPath
|
|
251
245
|
* @property {string} familyRevision - `git:<sha>` or `sha256:<hex>`
|
|
252
|
-
* @property {Buffer} apmLockBytes - LF-normalised
|
|
253
246
|
* @property {() => Task[]} tasks
|
|
254
247
|
*/
|