@forwardimpact/libeval 0.1.35 → 0.1.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/benchmark/judge.js +33 -7
- package/src/benchmark/report.js +338 -17
- package/src/benchmark/runner.js +30 -5
- package/src/commands/benchmark-report.js +5 -1
- package/src/supervisor.js +5 -2
package/package.json
CHANGED
package/src/benchmark/judge.js
CHANGED
|
@@ -1,9 +1,20 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Benchmark adapter for the libeval `Judge`. Templates the family's
|
|
3
|
-
* `judge.task.md`
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
3
|
+
* `judge.task.md` with structured context variables, runs the judge against
|
|
4
|
+
* the post-run agent CWD, and returns the verdict in the benchmark's
|
|
5
|
+
* `pass`/`fail` vocabulary (mapped from libeval's `success`/`failure`).
|
|
6
|
+
*
|
|
7
|
+
* Template variables available in `judge.task.md`:
|
|
8
|
+
*
|
|
9
|
+
* {{AGENT_INSTRUCTIONS}} — contents of instructions.md
|
|
10
|
+
* {{AGENT_PROFILE}} — agent profile body (empty string if none)
|
|
11
|
+
* {{AGENT_TRACE_PATH}} — path to agent.ndjson
|
|
12
|
+
* {{SCORING_RESULT}} — JSON scoring object
|
|
13
|
+
* {{SKILL_SET_HASH}} — SHA-256 from apm.lock.yaml
|
|
14
|
+
* {{TASK_ID}} — task name (directory under tasks/)
|
|
15
|
+
* {{TASK_DIR}} — agent working directory path
|
|
16
|
+
*
|
|
17
|
+
* Legacy alias: {{SCORING}} is accepted as an alias for {{SCORING_RESULT}}.
|
|
7
18
|
*
|
|
8
19
|
* The judge verdict is captured from the orchestration context's
|
|
9
20
|
* `concluded` flag directly — no trace parsing on the happy path.
|
|
@@ -24,19 +35,34 @@ import { createRedactor } from "../redaction.js";
|
|
|
24
35
|
* @property {string} summary
|
|
25
36
|
*/
|
|
26
37
|
|
|
38
|
+
/**
|
|
39
|
+
* @typedef {object} JudgeContext
|
|
40
|
+
* @property {string} agentInstructions - Contents of instructions.md.
|
|
41
|
+
* @property {string} agentProfile - Agent profile body (empty string if none).
|
|
42
|
+
* @property {string} skillSetHash - SHA-256 fingerprint from apm.lock.yaml.
|
|
43
|
+
*/
|
|
44
|
+
|
|
27
45
|
/**
|
|
28
46
|
* Run the judge over a completed task run.
|
|
29
47
|
* @param {import("./task-family.js").Task} task
|
|
30
48
|
* @param {import("./workdir.js").Workdir} workdir
|
|
31
49
|
* @param {import("./scorer.js").ScoringResult} scoring
|
|
32
50
|
* @param {{query: Function, model: string, judgeProfile?: string}} deps
|
|
51
|
+
* @param {JudgeContext} [context]
|
|
33
52
|
* @returns {Promise<JudgeVerdict>}
|
|
34
53
|
*/
|
|
35
|
-
export async function runJudge(task, workdir, scoring, deps) {
|
|
54
|
+
export async function runJudge(task, workdir, scoring, deps, context) {
|
|
36
55
|
const template = await readFile(task.paths.judge, "utf8");
|
|
56
|
+
const scoringJson = JSON.stringify(scoring, null, 2);
|
|
37
57
|
const taskText = template
|
|
38
|
-
.replaceAll("{{
|
|
39
|
-
.replaceAll("{{
|
|
58
|
+
.replaceAll("{{SCORING_RESULT}}", scoringJson)
|
|
59
|
+
.replaceAll("{{SCORING}}", scoringJson)
|
|
60
|
+
.replaceAll("{{AGENT_TRACE_PATH}}", workdir.agentTracePath)
|
|
61
|
+
.replaceAll("{{AGENT_INSTRUCTIONS}}", context?.agentInstructions ?? "")
|
|
62
|
+
.replaceAll("{{AGENT_PROFILE}}", context?.agentProfile ?? "")
|
|
63
|
+
.replaceAll("{{SKILL_SET_HASH}}", context?.skillSetHash ?? "")
|
|
64
|
+
.replaceAll("{{TASK_ID}}", task.id)
|
|
65
|
+
.replaceAll("{{TASK_DIR}}", workdir.cwd);
|
|
40
66
|
|
|
41
67
|
const output = createWriteStream(workdir.judgeTracePath);
|
|
42
68
|
const judge = createJudge({
|
package/src/benchmark/report.js
CHANGED
|
@@ -3,6 +3,10 @@
|
|
|
3
3
|
* records by `taskId`, and compute pass@k via the OpenAI HumanEval
|
|
4
4
|
* unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
|
|
5
5
|
*
|
|
6
|
+
* When `includeRuns` is true, each task carries per-run detail (scoring
|
|
7
|
+
* checks, judge commentary, cost, duration) and the text renderer produces
|
|
8
|
+
* a full markdown report instead of just the pass@k table.
|
|
9
|
+
*
|
|
6
10
|
* Records that fail schema validation are skipped with a stderr warning
|
|
7
11
|
* (counted under `totals.skipped`) so a corrupt line cannot abort the
|
|
8
12
|
* whole report.
|
|
@@ -14,48 +18,194 @@ import { createInterface } from "node:readline";
|
|
|
14
18
|
|
|
15
19
|
import { validateResultRecord } from "./result.js";
|
|
16
20
|
|
|
21
|
+
/**
|
|
22
|
+
* @typedef {object} RunDetail
|
|
23
|
+
* @property {number} runIndex
|
|
24
|
+
* @property {"pass"|"fail"} verdict
|
|
25
|
+
* @property {{verdict: string, details: unknown[], exitCode: number}} [scoring]
|
|
26
|
+
* @property {{verdict: string, summary: string}} [judgeVerdict]
|
|
27
|
+
* @property {number} costUsd
|
|
28
|
+
* @property {number} turns
|
|
29
|
+
* @property {number} durationMs
|
|
30
|
+
* @property {{message: string, aborted: boolean}} [agentError]
|
|
31
|
+
* @property {{phase: string, message: string, exitCode: number}} [preflightError]
|
|
32
|
+
*/
|
|
33
|
+
|
|
17
34
|
/**
|
|
18
35
|
* @typedef {object} TaskReport
|
|
19
36
|
* @property {string} taskId
|
|
20
37
|
* @property {number} n - Total runs.
|
|
21
38
|
* @property {number} c - Passing runs.
|
|
22
39
|
* @property {Record<string|number, number|null>} passAtK
|
|
40
|
+
* @property {RunDetail[]} [runs] - Per-run detail (only when includeRuns).
|
|
23
41
|
*/
|
|
24
42
|
|
|
25
43
|
/**
 * Aggregate benchmark result records into per-task pass@k statistics.
 *
 * Records are loaded with `loadRecords(inputDir)`, grouped with
 * `groupByTask`, and each group yields one task entry holding its run
 * count `n`, pass count `c`, and `passAtKValue(n, c, k)` for every
 * requested `k`. Tasks are returned ordered by `taskId`.
 *
 * With `includeRuns` set, every task additionally carries a `runs` array
 * (ordered by `runIndex`) and `totals` gains cost, median duration/turns,
 * and the `model` / `skillSetHash` / `familyRevision` of the first record
 * encountered.
 *
 * @param {{inputDir: string, kValues: number[], includeRuns?: boolean}} opts
 * @returns {Promise<{tasks: TaskReport[], totals: object}>}
 */
export async function aggregate({ inputDir, kValues, includeRuns = false }) {
  const loaded = await loadRecords(inputDir);
  const byTask = groupByTask(loaded.records);

  const tasks = [];
  const allDurations = [];
  const allTurns = [];
  const accumulators = { allDurations, allTurns };
  let totalRuns = 0;
  let totalCost = 0;
  let firstRecord = null;

  for (const [taskId, group] of byTask) {
    const n = group.length;
    const c = group.filter((r) => r.verdict === "pass").length;
    totalRuns += n;

    const passAtK = {};
    for (const k of kValues) passAtK[k] = passAtKValue(n, c, k);

    const task = { taskId, n, c, passAtK };

    if (includeRuns) {
      if (!firstRecord) firstRecord = group[0];
      const details = [];
      for (const record of group) {
        totalCost += record.costUsd ?? 0;
        details.push(buildRunDetail(record, accumulators));
      }
      details.sort((a, b) => a.runIndex - b.runIndex);
      task.runs = details;
    }

    tasks.push(task);
  }

  tasks.sort((a, b) => {
    if (a.taskId < b.taskId) return -1;
    if (a.taskId > b.taskId) return 1;
    return 0;
  });

  const totals = {
    tasks: tasks.length,
    runs: totalRuns,
    skipped: loaded.skipped,
  };

  if (includeRuns) {
    Object.assign(totals, {
      costUsd: totalCost,
      medianDurationMs: median(allDurations),
      medianTurns: median(allTurns),
      model: firstRecord?.model ?? "",
      skillSetHash: firstRecord?.skillSetHash ?? "",
      familyRevision: firstRecord?.familyRevision ?? "",
    });
  }

  return { tasks, totals };
}
|
|
100
|
+
|
|
101
|
+
/**
 * Turn one raw result record into a normalized per-run detail object.
 *
 * As a side effect, the record's `durationMs` and `turns` (when not
 * null/undefined) are appended to the accumulator arrays so the caller
 * can compute medians afterwards. Missing cost/turns/duration default to
 * 0 in the returned object; `scoring`, `judgeVerdict`, `agentError`, and
 * `preflightError` are copied only when truthy on the record.
 *
 * @param {object} r - Raw record.
 * @param {{allDurations: number[], allTurns: number[]}} acc
 * @returns {RunDetail}
 */
function buildRunDetail(r, acc) {
  const { durationMs, turns } = r;
  if (durationMs != null) acc.allDurations.push(durationMs);
  if (turns != null) acc.allTurns.push(turns);

  // Keys are assigned in a fixed order so serialized output stays stable.
  const detail = { runIndex: r.runIndex, verdict: r.verdict };
  if (r.scoring) detail.scoring = r.scoring;
  if (r.judgeVerdict) detail.judgeVerdict = r.judgeVerdict;
  detail.costUsd = r.costUsd ?? 0;
  detail.turns = turns ?? 0;
  detail.durationMs = durationMs ?? 0;
  if (r.agentError) detail.agentError = r.agentError;
  if (r.preflightError) detail.preflightError = r.preflightError;
  return detail;
}
|
|
50
124
|
|
|
51
125
|
/**
 * Render an aggregate report as markdown text.
 *
 * The full report (summary, pass@k table, per-task sections) is chosen
 * when the first task of the report carries a `runs` array, i.e. the
 * report was built with `includeRuns: true`. Otherwise the compact
 * pass@k table is rendered.
 *
 * @param {Awaited<ReturnType<typeof aggregate>>} report
 * @param {number[]} kValues
 * @returns {string}
 */
export function renderTextReport(report, kValues) {
  const hasRunDetail = Boolean(report.tasks[0]?.runs);
  const render = hasRunDetail ? renderFullReport : renderCompactReport;
  return render(report, kValues);
}
|
|
140
|
+
|
|
141
|
+
// ---------------------------------------------------------------------------
|
|
142
|
+
// Compact report (legacy path)
|
|
143
|
+
// ---------------------------------------------------------------------------
|
|
144
|
+
|
|
145
|
+
/**
 * Render the compact report: the pass@k table, a blank line, and the
 * totals line.
 *
 * @param {Awaited<ReturnType<typeof aggregate>>} report
 * @param {number[]} kValues
 * @returns {string}
 */
function renderCompactReport(report, kValues) {
  const table = renderPassAtKTable(report, kValues);
  const totalsLine = renderTotalsLine(report);
  return `${table}\n\n${totalsLine}`;
}
|
|
153
|
+
|
|
154
|
+
// ---------------------------------------------------------------------------
|
|
155
|
+
// Full report
|
|
156
|
+
// ---------------------------------------------------------------------------
|
|
157
|
+
|
|
158
|
+
/**
 * Render the full markdown report: summary header, "Pass@k" section with
 * the table and totals line, then a "Task Details" section containing one
 * detail block per task, each preceded by a blank line.
 *
 * @param {Awaited<ReturnType<typeof aggregate>>} report
 * @param {number[]} kValues
 * @returns {string}
 */
function renderFullReport(report, kValues) {
  const preamble = [
    renderSummary(report),
    "## Pass@k",
    "",
    renderPassAtKTable(report, kValues),
    "",
    renderTotalsLine(report),
    "",
    "## Task Details",
  ];
  const taskSections = report.tasks.flatMap((task) => [
    "",
    renderTaskDetail(task),
  ]);
  return [...preamble, ...taskSections].join("\n");
}
|
|
177
|
+
|
|
178
|
+
/**
 * Render the report heading and headline lines.
 *
 * A task counts as passing when it has at least one passing run and all
 * of its runs passed. Metadata (model, skill set, family) and stats
 * (cost, median duration, median turns) lines are emitted only when the
 * corresponding totals fields are present. The result ends with a
 * trailing blank line.
 *
 * @param {Awaited<ReturnType<typeof aggregate>>} report
 * @returns {string}
 */
function renderSummary(report) {
  const { totals } = report;

  let passing = 0;
  for (const t of report.tasks) {
    if (t.c > 0 && t.c === t.n) passing += 1;
  }

  const skippedNote = totals.skipped ? ` | ${totals.skipped} skipped` : "";
  const out = [
    "# Benchmark Report",
    "",
    `**Result: ${passing}/${totals.tasks} tasks passing** | ${totals.runs} runs${skippedNote}`,
  ];

  const meta = [];
  if (totals.model) {
    meta.push(`Model: \`${totals.model}\``);
  }
  if (totals.skillSetHash) {
    meta.push(`Skill set: \`${totals.skillSetHash}\``);
  }
  if (totals.familyRevision) {
    meta.push(`Family: \`${totals.familyRevision}\``);
  }
  if (meta.length > 0) out.push(meta.join(" | "));

  const stats = [];
  if (totals.costUsd != null) {
    stats.push(`Cost: ${formatCost(totals.costUsd)}`);
  }
  if (totals.medianDurationMs != null) {
    stats.push(`Median duration: ${formatDuration(totals.medianDurationMs)}`);
  }
  if (totals.medianTurns != null) {
    stats.push(`Median turns: ${totals.medianTurns}`);
  }
  if (stats.length > 0) out.push(stats.join(" | "));

  out.push("");
  return out.join("\n");
}
|
|
203
|
+
|
|
204
|
+
// ---------------------------------------------------------------------------
|
|
205
|
+
// Pass@k table (shared between compact and full)
|
|
206
|
+
// ---------------------------------------------------------------------------
|
|
207
|
+
|
|
208
|
+
function renderPassAtKTable(report, kValues) {
|
|
59
209
|
const header = ["taskId", "n", "c", ...kValues.map((k) => `pass@${k}`)];
|
|
60
210
|
const rows = [header, header.map(() => "---")];
|
|
61
211
|
for (const t of report.tasks) {
|
|
@@ -66,20 +216,193 @@ export function renderTextReport(report, kValues) {
|
|
|
66
216
|
...kValues.map((k) => formatPassAt(t.passAtK[k])),
|
|
67
217
|
]);
|
|
68
218
|
}
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
219
|
+
return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
/**
 * Render the one-line totals footer (task, run, and skipped counts).
 *
 * @param {{totals: {tasks: number, runs: number, skipped: number}}} report
 * @returns {string}
 */
function renderTotalsLine(report) {
  const { tasks, runs, skipped } = report.totals;
  return `Totals — tasks: ${tasks}, runs: ${runs}, skipped: ${skipped}`;
}
|
|
225
|
+
|
|
226
|
+
// ---------------------------------------------------------------------------
|
|
227
|
+
// Per-task detail
|
|
228
|
+
// ---------------------------------------------------------------------------
|
|
229
|
+
|
|
230
|
+
/**
 * Render the markdown detail section for one task: heading, PASS/FAIL
 * status line, the runs table, and — when they produce output — the
 * scoring checks, judge commentary, and errors subsections.
 *
 * @param {TaskReport} task
 * @returns {string}
 */
function renderTaskDetail(task) {
  const runs = task.runs ?? [];
  // Same rule `renderSummary` applies for its headline count: a task
  // passes only when it has at least one passing run and every run
  // passed, so a task with zero runs is never reported as PASS.
  const status = task.c > 0 && task.c === task.n ? "PASS" : "FAIL";
  // Single-run tasks omit the "Run" column/prefix in the subsections.
  const singleRun = runs.length === 1;

  const lines = [
    `### ${task.taskId}`,
    "",
    `**${status} — ${task.c}/${task.n} runs passed**`,
  ];

  lines.push("", renderRunsTable(runs));

  const checks = renderScoringChecks(runs, singleRun);
  if (checks) lines.push("", checks);

  const commentary = renderJudgeCommentary(runs, singleRun);
  if (commentary) lines.push("", commentary);

  const errors = renderErrors(runs);
  if (errors) lines.push("", errors);

  return lines.join("\n");
}
|
|
76
254
|
|
|
255
|
+
/**
 * Render the per-run markdown table (one row per run).
 *
 * For a run with a preflight error the Scoring cell reads
 * "preflight error" and the Judge cell is a dash; otherwise each cell
 * shows the respective verdict, or a dash when that result is absent.
 *
 * @param {RunDetail[]} runs
 * @returns {string}
 */
function renderRunsTable(runs) {
  const DASH = "—";
  const toRow = (cells) => `| ${cells.join(" | ")} |`;
  const header = [
    "Run",
    "Verdict",
    "Scoring",
    "Judge",
    "Cost",
    "Turns",
    "Duration",
  ];

  const body = runs.map((r) => {
    let scoringCell = DASH;
    let judgeCell = DASH;
    if (r.preflightError) {
      scoringCell = "preflight error";
    } else {
      if (r.scoring) scoringCell = r.scoring.verdict;
      if (r.judgeVerdict) judgeCell = r.judgeVerdict.verdict;
    }
    return [
      String(r.runIndex),
      r.verdict.toUpperCase(),
      scoringCell,
      judgeCell,
      formatCost(r.costUsd),
      String(r.turns),
      formatDuration(r.durationMs),
    ];
  });

  const separator = header.map(() => "---");
  return [header, separator, ...body].map(toRow).join("\n");
}
|
|
289
|
+
|
|
290
|
+
/**
 * Render the "Scoring Checks" subsection as a markdown table.
 *
 * Returns null when no run has scoring details. For multi-run tasks a
 * leading "Run" column identifies the run each check belongs to; for a
 * single run that column is omitted.
 *
 * @param {RunDetail[]} runs
 * @param {boolean} singleRun
 * @returns {string|null}
 */
function renderScoringChecks(runs, singleRun) {
  const rows = collectScoringRows(runs);
  if (rows.length === 0) return null;

  const toRow = (cells) => `| ${cells.join(" | ")} |`;
  const columns = ["Check", "Result", "Message"];
  if (!singleRun) columns.unshift("Run");

  const out = [
    "#### Scoring Checks",
    "",
    toRow(columns),
    toRow(columns.map(() => "---")),
  ];
  for (const { run, check, result, message } of rows) {
    const cells = [check, result, message];
    if (!singleRun) cells.unshift(String(run));
    out.push(toRow(cells));
  }
  return out.join("\n");
}
|
|
311
|
+
|
|
312
|
+
/**
 * Flatten the scoring details of all runs into table rows.
 *
 * Runs without a non-empty `scoring.details` contribute nothing. Each
 * detail becomes `{run, check, result, message}` where `check` falls back
 * to "(unnamed)", `result` is "PASS"/"FAIL" from the detail's `pass`
 * flag, and text fields are passed through `escapeCell`.
 *
 * @param {RunDetail[]} runs
 * @returns {{run: number, check: string, result: string, message: string}[]}
 */
function collectScoringRows(runs) {
  return runs.flatMap((r) => {
    const details = r.scoring?.details;
    if (!details?.length) return [];
    return Array.from(details, (d) => ({
      run: r.runIndex,
      check: escapeCell(String(d.test ?? "(unnamed)")),
      result: d.pass ? "PASS" : "FAIL",
      message: escapeCell(String(d.message ?? "")),
    }));
  });
}
|
|
327
|
+
|
|
328
|
+
/**
 * Render the "Judge Commentary" subsection as a markdown blockquote.
 *
 * Only runs whose judge verdict has a non-empty summary are included;
 * returns null when there are none. Newlines inside a summary are
 * continued with the quote marker, entries are separated by a bare ">"
 * line, and for multi-run tasks each entry is prefixed with its run
 * index.
 *
 * @param {RunDetail[]} runs
 * @param {boolean} singleRun
 * @returns {string|null}
 */
function renderJudgeCommentary(runs, singleRun) {
  const withSummary = runs.filter((r) => r.judgeVerdict?.summary);
  if (withSummary.length === 0) return null;

  const quotes = withSummary.map((r) => {
    const text = r.judgeVerdict.summary.replace(/\n/g, "\n> ");
    return singleRun ? `> ${text}` : `> **Run ${r.runIndex}:** ${text}`;
  });

  return ["#### Judge Commentary", "", quotes.join("\n>\n")].join("\n");
}
|
|
345
|
+
|
|
346
|
+
/**
 * Render the "Errors" subsection as a markdown bullet list.
 *
 * Each run contributes one bullet for its agent error and one for its
 * preflight error, when present. Returns null if no run has either.
 *
 * @param {RunDetail[]} runs
 * @returns {string|null}
 */
function renderErrors(runs) {
  const bullets = runs.flatMap((r) => {
    const found = [];
    if (r.agentError) {
      found.push(
        `- **Run ${r.runIndex}:** Agent error — "${escapeCell(r.agentError.message)}" (aborted: ${r.agentError.aborted})`,
      );
    }
    if (r.preflightError) {
      found.push(
        `- **Run ${r.runIndex}:** Preflight error — "${escapeCell(r.preflightError.message)}" (exit ${r.preflightError.exitCode})`,
      );
    }
    return found;
  });

  if (bullets.length === 0) return null;
  return ["#### Errors", "", ...bullets].join("\n");
}
|
|
363
|
+
|
|
364
|
+
// ---------------------------------------------------------------------------
|
|
365
|
+
// Formatting helpers
|
|
366
|
+
// ---------------------------------------------------------------------------
|
|
367
|
+
|
|
77
368
|
/**
 * Format a pass@k cell: a dash for null/undefined, the `error` text for
 * an object carrying an `error` key, otherwise the number with four
 * decimal places.
 *
 * @param {number|{error: string}|null|undefined} v
 * @returns {string}
 */
function formatPassAt(v) {
  if (v === null || v === undefined) return "—";
  const isErrorObject = typeof v === "object" && "error" in v;
  return isErrorObject ? v.error : Number(v).toFixed(4);
}
|
|
82
373
|
|
|
374
|
+
/**
 * Format a millisecond duration as whole seconds ("42s") or as minutes
 * with optional seconds ("3m", "3m 7s"). The input is rounded to the
 * nearest second; null, undefined, and 0 all render as "0s".
 *
 * @param {number|null|undefined} ms
 * @returns {string}
 */
function formatDuration(ms) {
  if (ms == null || ms === 0) return "0s";

  const wholeSeconds = Math.round(ms / 1000);
  if (wholeSeconds < 60) return `${wholeSeconds}s`;

  const minutes = Math.floor(wholeSeconds / 60);
  const remainder = wholeSeconds % 60;
  if (remainder > 0) return `${minutes}m ${remainder}s`;
  return `${minutes}m`;
}
|
|
382
|
+
|
|
383
|
+
/**
 * Format a US-dollar amount with two decimal places; null/undefined
 * render as "$0.00".
 *
 * @param {number|null|undefined} usd
 * @returns {string}
 */
function formatCost(usd) {
  return usd == null ? "$0.00" : `$${usd.toFixed(2)}`;
}
|
|
387
|
+
|
|
388
|
+
/**
 * Make a value safe to embed in a single-line markdown table cell (or
 * bullet item).
 *
 * - Line breaks (`\r\n`, `\n`, `\r`) are collapsed to a single space,
 *   because a raw newline ends the table row and breaks the layout.
 * - Pipes are backslash-escaped so they are not read as column
 *   separators.
 * - Non-string input is coerced with `String()`; null/undefined become
 *   the empty string instead of throwing.
 *
 * Strings without line breaks produce the same output as before.
 *
 * @param {unknown} str
 * @returns {string}
 */
function escapeCell(str) {
  return String(str ?? "")
    .replace(/\r\n|\r|\n/g, " ")
    .replace(/\|/g, "\\|");
}
|
|
391
|
+
|
|
392
|
+
/**
 * Median of a list of numbers, leaving the input untouched.
 *
 * Returns 0 for an empty list. For an even count the mean of the two
 * middle values is rounded to the nearest integer with `Math.round`.
 *
 * @param {number[]} arr
 * @returns {number}
 */
function median(arr) {
  const count = arr.length;
  if (!count) return 0;

  const ordered = [...arr];
  ordered.sort((x, y) => x - y);

  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) return ordered[upper];
  return Math.round((ordered[upper - 1] + ordered[upper]) / 2);
}
|
|
401
|
+
|
|
402
|
+
// ---------------------------------------------------------------------------
|
|
403
|
+
// Record loading
|
|
404
|
+
// ---------------------------------------------------------------------------
|
|
405
|
+
|
|
83
406
|
async function loadRecords(inputDir) {
|
|
84
407
|
const path = join(inputDir, "results.jsonl");
|
|
85
408
|
const stream = createReadStream(path);
|
|
@@ -142,8 +465,6 @@ function passAtKValue(n, c, k) {
|
|
|
142
465
|
if (n - c < k) return 1;
|
|
143
466
|
const total = binomial(BigInt(n), BigInt(k));
|
|
144
467
|
const fail = binomial(BigInt(n - c), BigInt(k));
|
|
145
|
-
// Compute the ratio as a single division so we avoid `1 - x` which
|
|
146
|
-
// accumulates IEEE-754 error (e.g. 1 - 0.6 = 0.39999...).
|
|
147
468
|
const passing = total - fail;
|
|
148
469
|
return Number(passing) / Number(total);
|
|
149
470
|
}
|
package/src/benchmark/runner.js
CHANGED
|
@@ -165,11 +165,22 @@ export class BenchmarkRunner {
|
|
|
165
165
|
port: workdir.port,
|
|
166
166
|
runDir: workdir.runDir,
|
|
167
167
|
});
|
|
168
|
-
const
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
168
|
+
const judgeContext = await this.#buildJudgeContext(
|
|
169
|
+
task,
|
|
170
|
+
workdir,
|
|
171
|
+
skillSetHash,
|
|
172
|
+
);
|
|
173
|
+
const judgeVerdict = await this._runJudgeHook(
|
|
174
|
+
task,
|
|
175
|
+
workdir,
|
|
176
|
+
scoring,
|
|
177
|
+
{
|
|
178
|
+
query: this.query,
|
|
179
|
+
model: this.model,
|
|
180
|
+
judgeProfile: this.profiles.judge ?? undefined,
|
|
181
|
+
},
|
|
182
|
+
judgeContext,
|
|
183
|
+
);
|
|
173
184
|
const record = {
|
|
174
185
|
taskId: task.id,
|
|
175
186
|
runIndex,
|
|
@@ -276,6 +287,20 @@ export class BenchmarkRunner {
|
|
|
276
287
|
return { ...summary, agentError };
|
|
277
288
|
}
|
|
278
289
|
|
|
290
|
+
async #buildJudgeContext(task, workdir, skillSetHash) {
|
|
291
|
+
const agentInstructions = await readFile(task.paths.instructions, "utf8");
|
|
292
|
+
let agentProfile = "";
|
|
293
|
+
if (this.profiles.agent) {
|
|
294
|
+
const profilePath = resolvePath(
|
|
295
|
+
workdir.cwd,
|
|
296
|
+
".claude/agents",
|
|
297
|
+
`${this.profiles.agent}.md`,
|
|
298
|
+
);
|
|
299
|
+
agentProfile = await readFile(profilePath, "utf8").catch(() => "");
|
|
300
|
+
}
|
|
301
|
+
return { agentInstructions, agentProfile, skillSetHash };
|
|
302
|
+
}
|
|
303
|
+
|
|
279
304
|
#buildPreflightFailureRecord({
|
|
280
305
|
task,
|
|
281
306
|
runIndex,
|
|
@@ -30,7 +30,11 @@ export async function runBenchmarkReportCommand(values, _args) {
|
|
|
30
30
|
throw new Error("--format must be 'json' or 'text'");
|
|
31
31
|
}
|
|
32
32
|
|
|
33
|
-
const report = await aggregate({
|
|
33
|
+
const report = await aggregate({
|
|
34
|
+
inputDir: resolve(inputDir),
|
|
35
|
+
kValues,
|
|
36
|
+
includeRuns: format === "text",
|
|
37
|
+
});
|
|
34
38
|
if (format === "text") {
|
|
35
39
|
process.stdout.write(renderTextReport(report, kValues) + "\n");
|
|
36
40
|
} else {
|
package/src/supervisor.js
CHANGED
|
@@ -536,12 +536,15 @@ export function createSupervisor({
|
|
|
536
536
|
|
|
537
537
|
const onLine = (line) => supervisor.emitLine(line);
|
|
538
538
|
|
|
539
|
+
const perInvocationTurns =
|
|
540
|
+
maxTurns === 0 ? 0 : Math.max(maxTurns ?? 100, 200);
|
|
541
|
+
|
|
539
542
|
const agentRunner = createAgentRunner({
|
|
540
543
|
cwd: agentCwd,
|
|
541
544
|
query,
|
|
542
545
|
output: devNull,
|
|
543
546
|
model,
|
|
544
|
-
maxTurns:
|
|
547
|
+
maxTurns: perInvocationTurns,
|
|
545
548
|
allowedTools,
|
|
546
549
|
onLine,
|
|
547
550
|
settingSources: ["project"],
|
|
@@ -560,7 +563,7 @@ export function createSupervisor({
|
|
|
560
563
|
query,
|
|
561
564
|
output: devNull,
|
|
562
565
|
model,
|
|
563
|
-
maxTurns:
|
|
566
|
+
maxTurns: perInvocationTurns,
|
|
564
567
|
allowedTools: supervisorAllowedTools ?? [
|
|
565
568
|
"Bash",
|
|
566
569
|
"Read",
|