@forwardimpact/libeval 0.1.35 → 0.1.38

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -3,6 +3,10 @@
3
3
  * records by `taskId`, and compute pass@k via the OpenAI HumanEval
4
4
  * unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
5
5
  *
6
+ * When `includeRuns` is true, each task carries per-run detail (scoring
7
+ * checks, judge commentary, cost, duration) and the text renderer produces
8
+ * a full markdown report instead of just the pass@k table.
9
+ *
6
10
  * Records that fail schema validation are skipped with a stderr warning
7
11
  * (counted under `totals.skipped`) so a corrupt line cannot abort the
8
12
  * whole report.
@@ -14,48 +18,216 @@ import { createInterface } from "node:readline";
14
18
 
15
19
  import { validateResultRecord } from "./result.js";
16
20
 
21
+ /**
22
+ * @typedef {object} RunDetail
23
+ * @property {number} runIndex
24
+ * @property {"pass"|"fail"} verdict
25
+ * @property {{verdict: string, details: unknown[], exitCode: number}} [scoring]
26
+ * @property {{verdict: string, summary: string}} [judgeVerdict]
27
+ * @property {number} costUsd
28
+ * @property {number} turns
29
+ * @property {number} durationMs
30
+ * @property {{message: string, aborted: boolean}} [agentError]
31
+ * @property {{phase: string, message: string, exitCode: number}} [preflightError]
32
+ */
33
+
17
34
  /**
18
35
  * @typedef {object} TaskReport
19
36
  * @property {string} taskId
20
37
  * @property {number} n - Total runs.
21
38
  * @property {number} c - Passing runs.
22
39
  * @property {Record<string|number, number|null>} passAtK
40
+ * @property {RunDetail[]} [runs] - Per-run detail (only when includeRuns).
23
41
  */
24
42
 
25
43
  /**
26
- * @param {{inputDir: string, kValues: number[]}} opts
27
- * @returns {Promise<{tasks: TaskReport[], totals: {tasks: number, runs: number, skipped: number}}>}
44
+ * @param {{inputDir: string, kValues: number[], includeRuns?: boolean}} opts
45
+ * @returns {Promise<{tasks: TaskReport[], totals: object}>}
28
46
  */
29
- export async function aggregate({ inputDir, kValues }) {
47
+ export async function aggregate({ inputDir, kValues, includeRuns = false }) {
30
48
  const records = await loadRecords(inputDir);
31
49
  const grouped = groupByTask(records.records);
32
50
  const tasks = [];
33
- let runs = 0;
51
+ let totalRuns = 0;
52
+ let totalCost = 0;
53
+ const allDurations = [];
54
+ const allTurns = [];
55
+ let firstRecord = null;
56
+
34
57
  for (const [taskId, group] of grouped) {
35
58
  const n = group.length;
36
59
  const c = group.filter((r) => r.verdict === "pass").length;
37
- runs += n;
60
+ totalRuns += n;
38
61
  const passAtK = {};
39
62
  for (const k of kValues) passAtK[k] = passAtKValue(n, c, k);
40
- tasks.push({ taskId, n, c, passAtK });
63
+
64
+ const task = { taskId, n, c, passAtK };
65
+
66
+ if (includeRuns) {
67
+ if (!firstRecord) firstRecord = group[0];
68
+ const accumulators = { allDurations, allTurns };
69
+ task.runs = group
70
+ .map((r) => {
71
+ totalCost += r.costUsd ?? 0;
72
+ return buildRunDetail(r, accumulators);
73
+ })
74
+ .sort((a, b) => a.runIndex - b.runIndex);
75
+ }
76
+
77
+ tasks.push(task);
41
78
  }
42
79
  tasks.sort((a, b) =>
43
80
  a.taskId < b.taskId ? -1 : a.taskId > b.taskId ? 1 : 0,
44
81
  );
82
+
83
+ const totals = {
84
+ tasks: tasks.length,
85
+ runs: totalRuns,
86
+ skipped: records.skipped,
87
+ };
88
+
89
+ if (includeRuns) {
90
+ totals.costUsd = totalCost;
91
+ totals.medianDurationMs = median(allDurations);
92
+ totals.medianTurns = median(allTurns);
93
+ totals.model = firstRecord?.model ?? "";
94
+ totals.skillSetHash = firstRecord?.skillSetHash ?? "";
95
+ totals.familyRevision = firstRecord?.familyRevision ?? "";
96
+ }
97
+
98
+ return { tasks, totals };
99
+ }
100
+
101
+ /**
102
+ * Build a normalized per-run detail object and accumulate duration/turn
103
+ * samples for median calculation. Extracted from `aggregate` to keep its
104
+ * cognitive complexity below the lint ceiling.
105
+ * @param {object} r - Raw record.
106
+ * @param {{allDurations: number[], allTurns: number[]}} acc
107
+ * @returns {RunDetail}
108
+ */
109
+ function buildRunDetail(r, acc) {
110
+ if (r.durationMs != null) acc.allDurations.push(r.durationMs);
111
+ if (r.turns != null) acc.allTurns.push(r.turns);
45
112
  return {
46
- tasks,
47
- totals: { tasks: tasks.length, runs, skipped: records.skipped },
113
+ runIndex: r.runIndex,
114
+ verdict: r.verdict,
115
+ ...(r.scoring && { scoring: r.scoring }),
116
+ ...(r.judgeVerdict && { judgeVerdict: r.judgeVerdict }),
117
+ costUsd: r.costUsd ?? 0,
118
+ turns: r.turns ?? 0,
119
+ durationMs: r.durationMs ?? 0,
120
+ ...(r.agentError && { agentError: r.agentError }),
121
+ ...(r.preflightError && { preflightError: r.preflightError }),
48
122
  };
49
123
  }
50
124
 
51
125
  /**
52
- * Render an aggregate report as a Markdown table. Columns: taskId | n | c |
53
- * pass@k1 | pass@k2 ... one column per kValues entry, in the same order.
126
+ * Render an aggregate report as markdown. When the report contains per-run
127
+ * detail (from `includeRuns: true`), renders a full report with summary,
128
+ * pass@k table, and per-task detail sections. Otherwise falls back to the
129
+ * compact pass@k table.
54
130
  * @param {Awaited<ReturnType<typeof aggregate>>} report
55
131
  * @param {number[]} kValues
56
132
  * @returns {string}
57
133
  */
58
134
  export function renderTextReport(report, kValues) {
135
+ if (report.tasks[0]?.runs) {
136
+ return renderFullReport(report, kValues);
137
+ }
138
+ return renderCompactReport(report, kValues);
139
+ }
140
+
141
+ // ---------------------------------------------------------------------------
142
+ // Compact report (legacy path)
143
+ // ---------------------------------------------------------------------------
144
+
145
+ function renderCompactReport(report, kValues) {
146
+ const lines = [
147
+ renderPassAtKTable(report, kValues),
148
+ "",
149
+ renderTotalsLine(report),
150
+ ];
151
+ return lines.join("\n");
152
+ }
153
+
154
+ // ---------------------------------------------------------------------------
155
+ // Full report
156
+ // ---------------------------------------------------------------------------
157
+
158
+ function renderFullReport(report, kValues) {
159
+ const sections = [
160
+ renderSummary(report),
161
+ "## Pass@k",
162
+ "",
163
+ renderPassAtKTable(report, kValues),
164
+ "",
165
+ renderTotalsLine(report),
166
+ "",
167
+ "## Task Details",
168
+ ];
169
+
170
+ for (const task of report.tasks) {
171
+ sections.push("");
172
+ sections.push(renderTaskDetail(task));
173
+ }
174
+
175
+ return sections.join("\n");
176
+ }
177
+
178
+ function renderSummary(report) {
179
+ const { totals } = report;
180
+ const passing = report.tasks.filter((t) => t.c > 0 && t.c === t.n).length;
181
+ const icon = statusIcon(passing === totals.tasks);
182
+ const lines = [
183
+ "# Benchmark Report",
184
+ "",
185
+ `${icon} **${passing}/${totals.tasks} tasks passing** | ${totals.runs} runs${totals.skipped ? ` | ${totals.skipped} skipped` : ""}`,
186
+ ];
187
+
188
+ const headers = [];
189
+ const values = [];
190
+ if (totals.costUsd != null) {
191
+ headers.push("Cost");
192
+ values.push(formatCost(totals.costUsd));
193
+ }
194
+ if (totals.medianDurationMs != null) {
195
+ headers.push("Median Duration");
196
+ values.push(formatDuration(totals.medianDurationMs));
197
+ }
198
+ if (totals.medianTurns != null) {
199
+ headers.push("Median Turns");
200
+ values.push(String(totals.medianTurns));
201
+ }
202
+ if (headers.length) {
203
+ lines.push("");
204
+ lines.push(`| ${headers.join(" | ")} |`);
205
+ lines.push(`| ${headers.map(() => "---").join(" | ")} |`);
206
+ lines.push(`| ${values.join(" | ")} |`);
207
+ }
208
+
209
+ const meta = [];
210
+ if (totals.model) {
211
+ meta.push(`Agent: \`${totals.model.agent}\``);
212
+ meta.push(`Supervisor: \`${totals.model.supervisor}\``);
213
+ meta.push(`Judge: \`${totals.model.judge}\``);
214
+ }
215
+ if (totals.skillSetHash) meta.push(`Skill set: \`${totals.skillSetHash}\``);
216
+ if (totals.familyRevision) meta.push(`Family: \`${totals.familyRevision}\``);
217
+ if (meta.length) {
218
+ lines.push("");
219
+ lines.push(meta.join(" | "));
220
+ }
221
+
222
+ lines.push("");
223
+ return lines.join("\n");
224
+ }
225
+
226
+ // ---------------------------------------------------------------------------
227
+ // Pass@k table (shared between compact and full)
228
+ // ---------------------------------------------------------------------------
229
+
230
+ function renderPassAtKTable(report, kValues) {
59
231
  const header = ["taskId", "n", "c", ...kValues.map((k) => `pass@${k}`)];
60
232
  const rows = [header, header.map(() => "---")];
61
233
  for (const t of report.tasks) {
@@ -66,20 +238,197 @@ export function renderTextReport(report, kValues) {
66
238
  ...kValues.map((k) => formatPassAt(t.passAtK[k])),
67
239
  ]);
68
240
  }
69
- const lines = rows.map((r) => `| ${r.join(" | ")} |`);
70
- lines.push("");
71
- lines.push(
72
- `Totals — tasks: ${report.totals.tasks}, runs: ${report.totals.runs}, skipped: ${report.totals.skipped}`,
73
- );
241
+ return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
242
+ }
243
+
244
+ function renderTotalsLine(report) {
245
+ return `Totals — tasks: ${report.totals.tasks}, runs: ${report.totals.runs}, skipped: ${report.totals.skipped}`;
246
+ }
247
+
248
+ // ---------------------------------------------------------------------------
249
+ // Per-task detail
250
+ // ---------------------------------------------------------------------------
251
+
252
+ function renderTaskDetail(task) {
253
+ const runs = task.runs ?? [];
254
+ const icon = statusIcon(task.c === task.n);
255
+ const singleRun = runs.length === 1;
256
+
257
+ const lines = [
258
+ `### ${task.taskId}`,
259
+ "",
260
+ `${icon} **${task.c}/${task.n} runs passed**`,
261
+ ];
262
+
263
+ lines.push("", renderRunsTable(runs));
264
+
265
+ const checks = renderScoringChecks(runs, singleRun);
266
+ if (checks) lines.push("", checks);
267
+
268
+ const commentary = renderJudgeCommentary(runs, singleRun);
269
+ if (commentary) lines.push("", commentary);
270
+
271
+ const errors = renderErrors(runs);
272
+ if (errors) lines.push("", errors);
273
+
74
274
  return lines.join("\n");
75
275
  }
76
276
 
277
+ function renderRunsTable(runs) {
278
+ const header = [
279
+ "Run",
280
+ "Verdict",
281
+ "Scoring",
282
+ "Judge",
283
+ "Cost",
284
+ "Turns",
285
+ "Duration",
286
+ ];
287
+ const rows = [header, header.map(() => "---")];
288
+ for (const r of runs) {
289
+ const scoringCell = r.preflightError
290
+ ? "preflight error"
291
+ : r.scoring
292
+ ? statusIcon(r.scoring.verdict === "pass")
293
+ : "—";
294
+ const judgeCell = r.preflightError
295
+ ? "—"
296
+ : r.judgeVerdict
297
+ ? statusIcon(r.judgeVerdict.verdict === "pass")
298
+ : "—";
299
+ rows.push([
300
+ String(r.runIndex),
301
+ statusIcon(r.verdict === "pass"),
302
+ scoringCell,
303
+ judgeCell,
304
+ formatCost(r.costUsd),
305
+ String(r.turns),
306
+ formatDuration(r.durationMs),
307
+ ]);
308
+ }
309
+ return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
310
+ }
311
+
312
+ function renderScoringChecks(runs, singleRun) {
313
+ const rows = collectScoringRows(runs);
314
+ if (!rows.length) return null;
315
+
316
+ const header = singleRun
317
+ ? ["Check", "Result", "Message"]
318
+ : ["Run", "Check", "Result", "Message"];
319
+ const lines = [
320
+ "#### Scoring Checks",
321
+ "",
322
+ `| ${header.join(" | ")} |`,
323
+ `| ${header.map(() => "---").join(" | ")} |`,
324
+ ];
325
+ for (const row of rows) {
326
+ const cells = singleRun
327
+ ? [row.check, row.result, row.message]
328
+ : [String(row.run), row.check, row.result, row.message];
329
+ lines.push(`| ${cells.join(" | ")} |`);
330
+ }
331
+ return lines.join("\n");
332
+ }
333
+
334
+ function collectScoringRows(runs) {
335
+ const rows = [];
336
+ for (const r of runs) {
337
+ if (!r.scoring?.details?.length) continue;
338
+ for (const d of r.scoring.details) {
339
+ rows.push({
340
+ run: r.runIndex,
341
+ check: escapeCell(String(d.test ?? "(unnamed)")),
342
+ result: statusIcon(d.pass),
343
+ message: escapeCell(String(d.message ?? "")),
344
+ });
345
+ }
346
+ }
347
+ return rows;
348
+ }
349
+
350
+ function renderJudgeCommentary(runs, singleRun) {
351
+ const entries = runs.filter((r) => r.judgeVerdict?.summary);
352
+ if (!entries.length) return null;
353
+
354
+ const lines = ["#### Judge Commentary", ""];
355
+ for (let i = 0; i < entries.length; i++) {
356
+ const r = entries[i];
357
+ const summary = r.judgeVerdict.summary.replace(/\n/g, "\n> ");
358
+ if (singleRun) {
359
+ lines.push(`> ${summary}`);
360
+ } else {
361
+ lines.push(`> **Run ${r.runIndex}:** ${summary}`);
362
+ }
363
+ if (i < entries.length - 1) lines.push(">");
364
+ }
365
+ return lines.join("\n");
366
+ }
367
+
368
+ function renderErrors(runs) {
369
+ const lines = [];
370
+ for (const r of runs) {
371
+ if (r.agentError) {
372
+ lines.push(
373
+ `- **Run ${r.runIndex}:** Agent error — "${escapeCell(r.agentError.message)}" (aborted: ${r.agentError.aborted})`,
374
+ );
375
+ }
376
+ if (r.preflightError) {
377
+ lines.push(
378
+ `- **Run ${r.runIndex}:** Preflight error — "${escapeCell(r.preflightError.message)}" (exit ${r.preflightError.exitCode})`,
379
+ );
380
+ }
381
+ }
382
+ if (!lines.length) return null;
383
+ return ["#### Errors", "", ...lines].join("\n");
384
+ }
385
+
386
+ // ---------------------------------------------------------------------------
387
+ // Formatting helpers
388
+ // ---------------------------------------------------------------------------
389
+
390
+ function statusIcon(pass) {
391
+ return pass ? "✅" : "❌";
392
+ }
393
+
77
394
  function formatPassAt(v) {
78
395
  if (v == null) return "—";
79
396
  if (typeof v === "object" && "error" in v) return v.error;
80
397
  return Number(v).toFixed(4);
81
398
  }
82
399
 
400
+ function formatDuration(ms) {
401
+ if (ms == null || ms === 0) return "0s";
402
+ const totalSeconds = Math.round(ms / 1000);
403
+ if (totalSeconds < 60) return `${totalSeconds}s`;
404
+ const minutes = Math.floor(totalSeconds / 60);
405
+ const seconds = totalSeconds % 60;
406
+ return seconds > 0 ? `${minutes}m ${seconds}s` : `${minutes}m`;
407
+ }
408
+
409
+ function formatCost(usd) {
410
+ if (usd == null) return "$0.00";
411
+ return `$${usd.toFixed(2)}`;
412
+ }
413
+
414
+ function escapeCell(str) {
415
+ return str.replace(/\|/g, "\\|");
416
+ }
417
+
418
+ function median(arr) {
419
+ if (!arr.length) return 0;
420
+ const sorted = [...arr].sort((a, b) => a - b);
421
+ const mid = Math.floor(sorted.length / 2);
422
+ if (sorted.length % 2 === 0) {
423
+ return Math.round((sorted[mid - 1] + sorted[mid]) / 2);
424
+ }
425
+ return sorted[mid];
426
+ }
427
+
428
+ // ---------------------------------------------------------------------------
429
+ // Record loading
430
+ // ---------------------------------------------------------------------------
431
+
83
432
  async function loadRecords(inputDir) {
84
433
  const path = join(inputDir, "results.jsonl");
85
434
  const stream = createReadStream(path);
@@ -142,8 +491,6 @@ function passAtKValue(n, c, k) {
142
491
  if (n - c < k) return 1;
143
492
  const total = binomial(BigInt(n), BigInt(k));
144
493
  const fail = binomial(BigInt(n - c), BigInt(k));
145
- // Compute the ratio as a single division so we avoid `1 - x` which
146
- // accumulates IEEE-754 error (e.g. 1 - 0.6 = 0.39999...).
147
494
  const passing = total - fail;
148
495
  return Number(passing) / Number(total);
149
496
  }
@@ -46,7 +46,11 @@ const COMMON_FIELDS = {
46
46
  costUsd: z.number(),
47
47
  turns: z.number().int().min(0),
48
48
  profiles: PROFILES_SHAPE,
49
- model: z.string(),
49
+ model: z.object({
50
+ agent: z.string(),
51
+ supervisor: z.string(),
52
+ judge: z.string(),
53
+ }),
50
54
  skillSetHash: z.string(),
51
55
  familyRevision: z.string(),
52
56
  durationMs: z.number().int().min(0),
@@ -63,6 +67,7 @@ const HAPPY_RECORD = z.object({
63
67
  submission: z.string(),
64
68
  judgeVerdict: JUDGE_VERDICT_SHAPE,
65
69
  agentTracePath: z.string(),
70
+ supervisorTracePath: z.string(),
66
71
  judgeTracePath: z.string(),
67
72
  agentError: AGENT_ERROR_SHAPE.optional(),
68
73
  preflightError: z.undefined().optional(),
@@ -76,6 +81,7 @@ const PREFLIGHT_RECORD = z.object({
76
81
  // them in WorkdirManager.start) so the record is uniform across branches
77
82
  // and downstream consumers can reference them without conditional fields.
78
83
  agentTracePath: z.string(),
84
+ supervisorTracePath: z.string(),
79
85
  judgeTracePath: z.string(),
80
86
  scoring: z.undefined().optional(),
81
87
  submission: z.undefined().optional(),