@forwardimpact/libeval 0.1.34 → 0.1.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -71,7 +71,7 @@ export const definition = {
71
71
  },
72
72
  task: {
73
73
  type: "string",
74
- description: "METR-style task id (task_family_name/task_name)",
74
+ description: "Task id (directory name under tasks/)",
75
75
  },
76
76
  workdir: {
77
77
  type: "string",
@@ -112,7 +112,7 @@ export const definition = {
112
112
  },
113
113
  examples: [
114
114
  "fit-benchmark run --family=./families/coding --output=./runs/2026-05-11 --runs=5",
115
- "fit-benchmark score --family=./families/coding --task=coding/todo-api --workdir=./runs/2026-05-11/runs/coding__todo-api/0",
115
+ "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./runs/2026-05-11/runs/todo-api/0",
116
116
  "fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
117
117
  ],
118
118
  documentation: [
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.34",
3
+ "version": "0.1.36",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -1,9 +1,20 @@
1
1
  /**
2
2
  * Benchmark adapter for the libeval `Judge`. Templates the family's
3
- * `judge.task.md` ({{SCORING}} / {{AGENT_TRACE_PATH}} substitution), runs the
4
- * judge against the post-run agent CWD, and returns the verdict in the
5
- * benchmark's `pass`/`fail` vocabulary (mapped from libeval's
6
- * `success`/`failure`).
3
+ * `judge.task.md` with structured context variables, runs the judge against
4
+ * the post-run agent CWD, and returns the verdict in the benchmark's
5
+ * `pass`/`fail` vocabulary (mapped from libeval's `success`/`failure`).
6
+ *
7
+ * Template variables available in `judge.task.md`:
8
+ *
9
+ * {{AGENT_INSTRUCTIONS}} — contents of instructions.md
10
+ * {{AGENT_PROFILE}} — agent profile body (empty string if none)
11
+ * {{AGENT_TRACE_PATH}} — path to agent.ndjson
12
+ * {{SCORING_RESULT}} — JSON scoring object
13
+ * {{SKILL_SET_HASH}} — SHA-256 from apm.lock.yaml
14
+ * {{TASK_ID}} — task name (directory under tasks/)
15
+ * {{TASK_DIR}} — agent working directory path
16
+ *
17
+ * Legacy alias: {{SCORING}} is accepted as an alias for {{SCORING_RESULT}}.
7
18
  *
8
19
  * The judge verdict is captured from the orchestration context's
9
20
  * `concluded` flag directly — no trace parsing on the happy path.
@@ -24,19 +35,34 @@ import { createRedactor } from "../redaction.js";
24
35
  * @property {string} summary
25
36
  */
26
37
 
38
+ /**
39
+ * @typedef {object} JudgeContext
40
+ * @property {string} agentInstructions - Contents of instructions.md.
41
+ * @property {string} agentProfile - Agent profile body (empty string if none).
42
+ * @property {string} skillSetHash - SHA-256 fingerprint from apm.lock.yaml.
43
+ */
44
+
27
45
  /**
28
46
  * Run the judge over a completed task run.
29
47
  * @param {import("./task-family.js").Task} task
30
48
  * @param {import("./workdir.js").Workdir} workdir
31
49
  * @param {import("./scorer.js").ScoringResult} scoring
32
50
  * @param {{query: Function, model: string, judgeProfile?: string}} deps
51
+ * @param {JudgeContext} [context]
33
52
  * @returns {Promise<JudgeVerdict>}
34
53
  */
35
- export async function runJudge(task, workdir, scoring, deps) {
54
+ export async function runJudge(task, workdir, scoring, deps, context) {
36
55
  const template = await readFile(task.paths.judge, "utf8");
56
+ const scoringJson = JSON.stringify(scoring, null, 2);
37
57
  const taskText = template
38
- .replaceAll("{{SCORING}}", JSON.stringify(scoring, null, 2))
39
- .replaceAll("{{AGENT_TRACE_PATH}}", workdir.agentTracePath);
58
+ .replaceAll("{{SCORING_RESULT}}", scoringJson)
59
+ .replaceAll("{{SCORING}}", scoringJson)
60
+ .replaceAll("{{AGENT_TRACE_PATH}}", workdir.agentTracePath)
61
+ .replaceAll("{{AGENT_INSTRUCTIONS}}", context?.agentInstructions ?? "")
62
+ .replaceAll("{{AGENT_PROFILE}}", context?.agentProfile ?? "")
63
+ .replaceAll("{{SKILL_SET_HASH}}", context?.skillSetHash ?? "")
64
+ .replaceAll("{{TASK_ID}}", task.id)
65
+ .replaceAll("{{TASK_DIR}}", workdir.cwd);
40
66
 
41
67
  const output = createWriteStream(workdir.judgeTracePath);
42
68
  const judge = createJudge({
@@ -45,7 +71,7 @@ export async function runJudge(task, workdir, scoring, deps) {
45
71
  output,
46
72
  model: deps.model,
47
73
  judgeProfile: deps.judgeProfile,
48
- maxTurns: 5,
74
+ maxTurns: 25,
49
75
  redactor: createRedactor(),
50
76
  });
51
77
 
@@ -3,6 +3,10 @@
3
3
  * records by `taskId`, and compute pass@k via the OpenAI HumanEval
4
4
  * unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
5
5
  *
6
+ * When `includeRuns` is true, each task carries per-run detail (scoring
7
+ * checks, judge commentary, cost, duration) and the text renderer produces
8
+ * a full markdown report instead of just the pass@k table.
9
+ *
6
10
  * Records that fail schema validation are skipped with a stderr warning
7
11
  * (counted under `totals.skipped`) so a corrupt line cannot abort the
8
12
  * whole report.
@@ -14,48 +18,194 @@ import { createInterface } from "node:readline";
14
18
 
15
19
  import { validateResultRecord } from "./result.js";
16
20
 
21
+ /**
22
+ * @typedef {object} RunDetail
23
+ * @property {number} runIndex
24
+ * @property {"pass"|"fail"} verdict
25
+ * @property {{verdict: string, details: unknown[], exitCode: number}} [scoring]
26
+ * @property {{verdict: string, summary: string}} [judgeVerdict]
27
+ * @property {number} costUsd
28
+ * @property {number} turns
29
+ * @property {number} durationMs
30
+ * @property {{message: string, aborted: boolean}} [agentError]
31
+ * @property {{phase: string, message: string, exitCode: number}} [preflightError]
32
+ */
33
+
17
34
  /**
18
35
  * @typedef {object} TaskReport
19
36
  * @property {string} taskId
20
37
  * @property {number} n - Total runs.
21
38
  * @property {number} c - Passing runs.
22
39
  * @property {Record<string|number, number|null>} passAtK
40
+ * @property {RunDetail[]} [runs] - Per-run detail (only when includeRuns).
23
41
  */
24
42
 
25
43
  /**
26
- * @param {{inputDir: string, kValues: number[]}} opts
27
- * @returns {Promise<{tasks: TaskReport[], totals: {tasks: number, runs: number, skipped: number}}>}
44
+ * @param {{inputDir: string, kValues: number[], includeRuns?: boolean}} opts
45
+ * @returns {Promise<{tasks: TaskReport[], totals: object}>}
28
46
  */
29
- export async function aggregate({ inputDir, kValues }) {
47
+ export async function aggregate({ inputDir, kValues, includeRuns = false }) {
30
48
  const records = await loadRecords(inputDir);
31
49
  const grouped = groupByTask(records.records);
32
50
  const tasks = [];
33
- let runs = 0;
51
+ let totalRuns = 0;
52
+ let totalCost = 0;
53
+ const allDurations = [];
54
+ const allTurns = [];
55
+ let firstRecord = null;
56
+
34
57
  for (const [taskId, group] of grouped) {
35
58
  const n = group.length;
36
59
  const c = group.filter((r) => r.verdict === "pass").length;
37
- runs += n;
60
+ totalRuns += n;
38
61
  const passAtK = {};
39
62
  for (const k of kValues) passAtK[k] = passAtKValue(n, c, k);
40
- tasks.push({ taskId, n, c, passAtK });
63
+
64
+ const task = { taskId, n, c, passAtK };
65
+
66
+ if (includeRuns) {
67
+ if (!firstRecord) firstRecord = group[0];
68
+ const accumulators = { allDurations, allTurns };
69
+ task.runs = group
70
+ .map((r) => {
71
+ totalCost += r.costUsd ?? 0;
72
+ return buildRunDetail(r, accumulators);
73
+ })
74
+ .sort((a, b) => a.runIndex - b.runIndex);
75
+ }
76
+
77
+ tasks.push(task);
41
78
  }
42
79
  tasks.sort((a, b) =>
43
80
  a.taskId < b.taskId ? -1 : a.taskId > b.taskId ? 1 : 0,
44
81
  );
82
+
83
+ const totals = {
84
+ tasks: tasks.length,
85
+ runs: totalRuns,
86
+ skipped: records.skipped,
87
+ };
88
+
89
+ if (includeRuns) {
90
+ totals.costUsd = totalCost;
91
+ totals.medianDurationMs = median(allDurations);
92
+ totals.medianTurns = median(allTurns);
93
+ totals.model = firstRecord?.model ?? "";
94
+ totals.skillSetHash = firstRecord?.skillSetHash ?? "";
95
+ totals.familyRevision = firstRecord?.familyRevision ?? "";
96
+ }
97
+
98
+ return { tasks, totals };
99
+ }
100
+
101
+ /**
102
+ * Build a normalized per-run detail object and accumulate duration/turn
103
+ * samples for median calculation. Extracted from `aggregate` to keep its
104
+ * cognitive complexity below the lint ceiling.
105
+ * @param {object} r - Raw record.
106
+ * @param {{allDurations: number[], allTurns: number[]}} acc
107
+ * @returns {RunDetail}
108
+ */
109
+ function buildRunDetail(r, acc) {
110
+ if (r.durationMs != null) acc.allDurations.push(r.durationMs);
111
+ if (r.turns != null) acc.allTurns.push(r.turns);
45
112
  return {
46
- tasks,
47
- totals: { tasks: tasks.length, runs, skipped: records.skipped },
113
+ runIndex: r.runIndex,
114
+ verdict: r.verdict,
115
+ ...(r.scoring && { scoring: r.scoring }),
116
+ ...(r.judgeVerdict && { judgeVerdict: r.judgeVerdict }),
117
+ costUsd: r.costUsd ?? 0,
118
+ turns: r.turns ?? 0,
119
+ durationMs: r.durationMs ?? 0,
120
+ ...(r.agentError && { agentError: r.agentError }),
121
+ ...(r.preflightError && { preflightError: r.preflightError }),
48
122
  };
49
123
  }
50
124
 
51
125
  /**
52
- * Render an aggregate report as a Markdown table. Columns: taskId | n | c |
53
- * pass@k1 | pass@k2 ... one column per kValues entry, in the same order.
126
+ * Render an aggregate report as markdown. When the report contains per-run
127
+ * detail (from `includeRuns: true`), renders a full report with summary,
128
+ * pass@k table, and per-task detail sections. Otherwise falls back to the
129
+ * compact pass@k table.
54
130
  * @param {Awaited<ReturnType<typeof aggregate>>} report
55
131
  * @param {number[]} kValues
56
132
  * @returns {string}
57
133
  */
58
134
export function renderTextReport(report, kValues) {
  // Per-run detail on the first task signals that the full report is wanted;
  // without it (or with no tasks at all) the compact table is rendered.
  const hasRunDetail = Boolean(report.tasks[0]?.runs);
  const render = hasRunDetail ? renderFullReport : renderCompactReport;
  return render(report, kValues);
}
140
+
141
+ // ---------------------------------------------------------------------------
142
+ // Compact report (legacy path)
143
+ // ---------------------------------------------------------------------------
144
+
145
/**
 * Render the compact report: the pass@k table, a blank line, then the
 * totals line.
 * @param {object} report - Aggregate report (`tasks` and `totals`).
 * @param {number[]} kValues - k values, one pass@k column each.
 * @returns {string}
 */
function renderCompactReport(report, kValues) {
  const table = renderPassAtKTable(report, kValues);
  const totalsLine = renderTotalsLine(report);
  return [table, "", totalsLine].join("\n");
}
153
+
154
+ // ---------------------------------------------------------------------------
155
+ // Full report
156
+ // ---------------------------------------------------------------------------
157
+
158
/**
 * Render the full markdown report: summary header, a "Pass@k" section with
 * the table and totals line, then a "Task Details" section holding one
 * detail block per task, each preceded by a blank line.
 * @param {object} report - Aggregate report whose tasks carry per-run detail.
 * @param {number[]} kValues - k values, one pass@k column each.
 * @returns {string}
 */
function renderFullReport(report, kValues) {
  const head = [
    renderSummary(report),
    "## Pass@k",
    "",
    renderPassAtKTable(report, kValues),
    "",
    renderTotalsLine(report),
    "",
    "## Task Details",
  ];
  const details = report.tasks.flatMap((task) => ["", renderTaskDetail(task)]);
  return [...head, ...details].join("\n");
}
177
+
178
/**
 * Render the report header: title, headline result line, an optional
 * metadata line (model / skill set / family) and an optional stats line
 * (cost, median duration, median turns), ending with a blank line.
 *
 * A task counts as passing only when it has at least one passing run and
 * every run passed.
 * @param {object} report - Aggregate report (`tasks` and `totals`).
 * @returns {string}
 */
function renderSummary(report) {
  const { tasks, totals } = report;
  const fullyPassing = tasks.reduce(
    (count, t) => (t.c > 0 && t.c === t.n ? count + 1 : count),
    0,
  );
  const skippedNote = totals.skipped ? ` | ${totals.skipped} skipped` : "";
  const out = [
    "# Benchmark Report",
    "",
    `**Result: ${fullyPassing}/${totals.tasks} tasks passing** | ${totals.runs} runs${skippedNote}`,
  ];

  // Metadata entries are shown only when they carry a non-empty value.
  const provenance = [
    ["Model", totals.model],
    ["Skill set", totals.skillSetHash],
    ["Family", totals.familyRevision],
  ]
    .filter(([, value]) => value)
    .map(([label, value]) => `${label}: \`${value}\``);
  if (provenance.length > 0) out.push(provenance.join(" | "));

  const figures = [];
  if (totals.costUsd != null) {
    figures.push(`Cost: ${formatCost(totals.costUsd)}`);
  }
  if (totals.medianDurationMs != null) {
    figures.push(`Median duration: ${formatDuration(totals.medianDurationMs)}`);
  }
  if (totals.medianTurns != null) {
    figures.push(`Median turns: ${totals.medianTurns}`);
  }
  if (figures.length > 0) out.push(figures.join(" | "));

  out.push("");
  return out.join("\n");
}
203
+
204
+ // ---------------------------------------------------------------------------
205
+ // Pass@k table (shared between compact and full)
206
+ // ---------------------------------------------------------------------------
207
+
208
+ function renderPassAtKTable(report, kValues) {
59
209
  const header = ["taskId", "n", "c", ...kValues.map((k) => `pass@${k}`)];
60
210
  const rows = [header, header.map(() => "---")];
61
211
  for (const t of report.tasks) {
@@ -66,20 +216,193 @@ export function renderTextReport(report, kValues) {
66
216
  ...kValues.map((k) => formatPassAt(t.passAtK[k])),
67
217
  ]);
68
218
  }
69
- const lines = rows.map((r) => `| ${r.join(" | ")} |`);
70
- lines.push("");
71
- lines.push(
72
- `Totals — tasks: ${report.totals.tasks}, runs: ${report.totals.runs}, skipped: ${report.totals.skipped}`,
73
- );
219
+ return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
220
+ }
221
+
222
/**
 * Render the one-line totals footer (task, run and skipped-record counts).
 * @param {{totals: {tasks: number, runs: number, skipped: number}}} report
 * @returns {string}
 */
function renderTotalsLine(report) {
  const { tasks, runs, skipped } = report.totals;
  return `Totals — tasks: ${tasks}, runs: ${runs}, skipped: ${skipped}`;
}
225
+
226
+ // ---------------------------------------------------------------------------
227
+ // Per-task detail
228
+ // ---------------------------------------------------------------------------
229
+
230
/**
 * Render the detail section for one task: heading, PASS/FAIL line, the runs
 * table, and then whichever of scoring checks, judge commentary and errors
 * produce output. Each part is separated by a blank line.
 *
 * The task is labelled PASS only when every run passed (`c === n`).
 * @param {object} task - Task report; `runs` defaults to an empty list.
 * @returns {string}
 */
function renderTaskDetail(task) {
  const { taskId, c, n } = task;
  const runs = task.runs ?? [];
  const isSingle = runs.length === 1;
  const badge = c === n ? "PASS" : "FAIL";

  const parts = [
    `### ${taskId}`,
    "",
    `**${badge} — ${c}/${n} runs passed**`,
    "",
    renderRunsTable(runs),
  ];

  // Optional sections return null when they have nothing to show.
  const optional = [
    renderScoringChecks(runs, isSingle),
    renderJudgeCommentary(runs, isSingle),
    renderErrors(runs),
  ];
  for (const section of optional) {
    if (section) parts.push("", section);
  }

  return parts.join("\n");
}
76
254
 
255
/**
 * Render the per-run markdown table (run index, verdict, scoring verdict,
 * judge verdict, cost, turns, duration).
 *
 * A run with a preflight error shows "preflight error" in the scoring column
 * and a dash in the judge column, regardless of any scoring or judge data.
 * @param {object[]} runs - Per-run detail objects.
 * @returns {string}
 */
function renderRunsTable(runs) {
  const columns = [
    "Run",
    "Verdict",
    "Scoring",
    "Judge",
    "Cost",
    "Turns",
    "Duration",
  ];
  const toRow = (cells) => `| ${cells.join(" | ")} |`;

  const body = runs.map((run) => {
    let scoringCell = "—";
    let judgeCell = "—";
    if (run.preflightError) {
      scoringCell = "preflight error";
    } else {
      if (run.scoring) scoringCell = run.scoring.verdict;
      if (run.judgeVerdict) judgeCell = run.judgeVerdict.verdict;
    }
    return toRow([
      String(run.runIndex),
      run.verdict.toUpperCase(),
      scoringCell,
      judgeCell,
      formatCost(run.costUsd),
      String(run.turns),
      formatDuration(run.durationMs),
    ]);
  });

  return [toRow(columns), toRow(columns.map(() => "---")), ...body].join("\n");
}
289
+
290
/**
 * Render the "Scoring Checks" table for a task, or null when no run has any
 * scoring detail rows. The "Run" column is omitted for single-run tasks.
 * @param {object[]} runs - Per-run detail objects.
 * @param {boolean} singleRun - True when the task has exactly one run.
 * @returns {string|null}
 */
function renderScoringChecks(runs, singleRun) {
  const checks = collectScoringRows(runs);
  if (checks.length === 0) return null;

  const toRow = (cells) => `| ${cells.join(" | ")} |`;
  const columns = ["Check", "Result", "Message"];
  if (!singleRun) columns.unshift("Run");

  const body = checks.map(({ run, check, result, message }) => {
    const cells = [check, result, message];
    if (!singleRun) cells.unshift(String(run));
    return toRow(cells);
  });

  return [
    "#### Scoring Checks",
    "",
    toRow(columns),
    toRow(columns.map(() => "---")),
    ...body,
  ].join("\n");
}
311
+
312
/**
 * Flatten the scoring details of every run into table rows.
 *
 * Runs without a `scoring.details` array are skipped. Detail entries are
 * untyped, so each field is read defensively: a `null`/`undefined` entry
 * yields an "(unnamed)" FAIL row with an empty message instead of throwing.
 * @param {object[]} runs - Per-run detail objects.
 * @returns {{run: number, check: string, result: "PASS"|"FAIL", message: string}[]}
 */
function collectScoringRows(runs) {
  const rows = [];
  for (const r of runs) {
    const details = r.scoring?.details;
    if (!Array.isArray(details)) continue;
    for (const d of details) {
      rows.push({
        run: r.runIndex,
        check: escapeCell(String(d?.test ?? "(unnamed)")),
        result: d?.pass ? "PASS" : "FAIL",
        message: escapeCell(String(d?.message ?? "")),
      });
    }
  }
  return rows;
}
327
+
328
/**
 * Render judge summaries as a markdown blockquote under a
 * "Judge Commentary" heading, or null when no run has a summary.
 *
 * Multi-line summaries stay inside the quote, entries are separated by an
 * empty quote line, and each entry is prefixed with its run index unless the
 * task has a single run.
 * @param {object[]} runs - Per-run detail objects.
 * @param {boolean} singleRun - True when the task has exactly one run.
 * @returns {string|null}
 */
function renderJudgeCommentary(runs, singleRun) {
  const withSummary = runs.filter((run) => run.judgeVerdict?.summary);
  if (withSummary.length === 0) return null;

  const quoted = withSummary.map((run) => {
    // Re-quote every continuation line so the blockquote is not broken.
    const body = run.judgeVerdict.summary.split("\n").join("\n> ");
    return singleRun ? `> ${body}` : `> **Run ${run.runIndex}:** ${body}`;
  });

  return ["#### Judge Commentary", "", quoted.join("\n>\n")].join("\n");
}
345
+
346
/**
 * Render the "Errors" bullet list for a task, or null when no run recorded
 * an agent or preflight error.
 *
 * Messages are coerced to strings before escaping so that a record with a
 * missing or non-string `message` renders as empty text rather than throwing.
 * @param {object[]} runs - Per-run detail objects.
 * @returns {string|null}
 */
function renderErrors(runs) {
  const lines = [];
  for (const r of runs) {
    if (r.agentError) {
      const message = escapeCell(String(r.agentError.message ?? ""));
      lines.push(
        `- **Run ${r.runIndex}:** Agent error — "${message}" (aborted: ${r.agentError.aborted})`,
      );
    }
    if (r.preflightError) {
      const message = escapeCell(String(r.preflightError.message ?? ""));
      lines.push(
        `- **Run ${r.runIndex}:** Preflight error — "${message}" (exit ${r.preflightError.exitCode})`,
      );
    }
  }
  if (lines.length === 0) return null;
  return ["#### Errors", "", ...lines].join("\n");
}
363
+
364
+ // ---------------------------------------------------------------------------
365
+ // Formatting helpers
366
+ // ---------------------------------------------------------------------------
367
+
77
368
/**
 * Format one pass@k cell: a dash for a missing value, the `error` text for
 * an error object, otherwise the number with four decimal places.
 * @param {number|{error: string}|null|undefined} v
 * @returns {string}
 */
function formatPassAt(v) {
  if (v === null || v === undefined) return "—";
  const isErrorObject = typeof v === "object" && "error" in v;
  return isErrorObject ? v.error : Number(v).toFixed(4);
}
82
373
 
374
/**
 * Format a millisecond duration for display, rounded to the nearest second:
 * `Ns` below one minute, otherwise `Mm Ss` (seconds omitted when zero).
 *
 * Missing, non-numeric, non-finite and non-positive values all render as
 * "0s" so a malformed record cannot put "NaNs" or a negative duration into
 * the report.
 * @param {number|null|undefined} ms - Duration in milliseconds.
 * @returns {string}
 */
function formatDuration(ms) {
  const millis = Number(ms ?? 0);
  if (!Number.isFinite(millis) || millis <= 0) return "0s";
  const totalSeconds = Math.round(millis / 1000);
  if (totalSeconds < 60) return `${totalSeconds}s`;
  const minutes = Math.floor(totalSeconds / 60);
  const seconds = totalSeconds % 60;
  return seconds > 0 ? `${minutes}m ${seconds}s` : `${minutes}m`;
}
382
+
383
/**
 * Format a US-dollar amount with two decimal places.
 *
 * The value is coerced with `Number`, so numeric strings are accepted;
 * missing, non-numeric and non-finite values render as "$0.00" instead of
 * throwing or printing "$NaN".
 * @param {number|null|undefined} usd - Cost in US dollars.
 * @returns {string}
 */
function formatCost(usd) {
  const amount = Number(usd ?? 0);
  if (!Number.isFinite(amount)) return "$0.00";
  return `$${amount.toFixed(2)}`;
}
387
+
388
/**
 * Make a value safe to embed in a single markdown table cell.
 *
 * Line breaks are collapsed to a space because a raw newline would end the
 * table row, and pipes are backslash-escaped so they are not read as column
 * separators. Input is coerced with `String`; `null`/`undefined` become the
 * empty string.
 * @param {unknown} str - Cell content.
 * @returns {string} Single-line text with every `|` escaped.
 */
function escapeCell(str) {
  return String(str ?? "")
    .replace(/\r\n|\r|\n/g, " ")
    .replace(/\|/g, "\\|");
}
391
+
392
/**
 * Median of a list of numbers; 0 for an empty list.
 *
 * For an even number of samples the mean of the two middle values is
 * rounded to the nearest integer. The input array is not modified.
 * @param {number[]} arr
 * @returns {number}
 */
function median(arr) {
  const count = arr.length;
  if (count === 0) return 0;
  const ordered = arr.slice().sort((x, y) => x - y);
  const half = Math.floor(count / 2);
  const upper = ordered[half];
  if (count % 2 === 1) return upper;
  const lower = ordered[half - 1];
  return Math.round((lower + upper) / 2);
}
401
+
402
+ // ---------------------------------------------------------------------------
403
+ // Record loading
404
+ // ---------------------------------------------------------------------------
405
+
83
406
  async function loadRecords(inputDir) {
84
407
  const path = join(inputDir, "results.jsonl");
85
408
  const stream = createReadStream(path);
@@ -142,8 +465,6 @@ function passAtKValue(n, c, k) {
142
465
  if (n - c < k) return 1;
143
466
  const total = binomial(BigInt(n), BigInt(k));
144
467
  const fail = binomial(BigInt(n - c), BigInt(k));
145
- // Compute the ratio as a single division so we avoid `1 - x` which
146
- // accumulates IEEE-754 error (e.g. 1 - 0.6 = 0.39999...).
147
468
  const passing = total - fail;
148
469
  return Number(passing) / Number(total);
149
470
  }
@@ -165,11 +165,22 @@ export class BenchmarkRunner {
165
165
  port: workdir.port,
166
166
  runDir: workdir.runDir,
167
167
  });
168
- const judgeVerdict = await this._runJudgeHook(task, workdir, scoring, {
169
- query: this.query,
170
- model: this.model,
171
- judgeProfile: this.profiles.judge ?? undefined,
172
- });
168
+ const judgeContext = await this.#buildJudgeContext(
169
+ task,
170
+ workdir,
171
+ skillSetHash,
172
+ );
173
+ const judgeVerdict = await this._runJudgeHook(
174
+ task,
175
+ workdir,
176
+ scoring,
177
+ {
178
+ query: this.query,
179
+ model: this.model,
180
+ judgeProfile: this.profiles.judge ?? undefined,
181
+ },
182
+ judgeContext,
183
+ );
173
184
  const record = {
174
185
  taskId: task.id,
175
186
  runIndex,
@@ -276,6 +287,20 @@ export class BenchmarkRunner {
276
287
  return { ...summary, agentError };
277
288
  }
278
289
 
290
+ async #buildJudgeContext(task, workdir, skillSetHash) {
291
+ const agentInstructions = await readFile(task.paths.instructions, "utf8");
292
+ let agentProfile = "";
293
+ if (this.profiles.agent) {
294
+ const profilePath = resolvePath(
295
+ workdir.cwd,
296
+ ".claude/agents",
297
+ `${this.profiles.agent}.md`,
298
+ );
299
+ agentProfile = await readFile(profilePath, "utf8").catch(() => "");
300
+ }
301
+ return { agentInstructions, agentProfile, skillSetHash };
302
+ }
303
+
279
304
  #buildPreflightFailureRecord({
280
305
  task,
281
306
  runIndex,
@@ -3,7 +3,7 @@
3
3
  * <root>/
4
4
  * apm.lock.yaml
5
5
  * .claude/ # pre-staged skills + agents (P1)
6
- * tasks/<task_family_name>/<task_name>/
6
+ * tasks/<task_name>/
7
7
  * instructions.md
8
8
  * supervisor.task.md # preserved for v2; not read in v1
9
9
  * judge.task.md
@@ -122,32 +122,27 @@ function normalizeLf(buf) {
122
122
  async function discoverTasks(rootPath) {
123
123
  const tasksRoot = join(rootPath, "tasks");
124
124
  const tasks = [];
125
- let families;
125
+ let entries;
126
126
  try {
127
- families = await readdir(tasksRoot, { withFileTypes: true });
127
+ entries = await readdir(tasksRoot, { withFileTypes: true });
128
128
  } catch (e) {
129
129
  if (e.code === "ENOENT") return tasks;
130
130
  throw e;
131
131
  }
132
- for (const family of families) {
133
- if (!family.isDirectory()) continue;
134
- const familyDir = join(tasksRoot, family.name);
135
- const entries = await readdir(familyDir, { withFileTypes: true });
136
- for (const entry of entries) {
137
- if (!entry.isDirectory()) continue;
138
- const taskDir = join(familyDir, entry.name);
139
- tasks.push({
140
- id: `${family.name}/${entry.name}`,
141
- paths: {
142
- instructions: join(taskDir, "instructions.md"),
143
- supervisor: join(taskDir, "supervisor.task.md"),
144
- judge: join(taskDir, "judge.task.md"),
145
- specs: join(taskDir, "specs"),
146
- workdir: join(taskDir, "workdir"),
147
- scoring: join(taskDir, "scoring"),
148
- },
149
- });
150
- }
132
+ for (const entry of entries) {
133
+ if (!entry.isDirectory()) continue;
134
+ const taskDir = join(tasksRoot, entry.name);
135
+ tasks.push({
136
+ id: entry.name,
137
+ paths: {
138
+ instructions: join(taskDir, "instructions.md"),
139
+ supervisor: join(taskDir, "supervisor.task.md"),
140
+ judge: join(taskDir, "judge.task.md"),
141
+ specs: join(taskDir, "specs"),
142
+ workdir: join(taskDir, "workdir"),
143
+ scoring: join(taskDir, "scoring"),
144
+ },
145
+ });
151
146
  }
152
147
  tasks.sort((a, b) => (a.id < b.id ? -1 : a.id > b.id ? 1 : 0));
153
148
  return tasks;
@@ -246,7 +241,7 @@ function run(cmd, args) {
246
241
 
247
242
  /**
248
243
  * @typedef {object} Task
249
- * @property {string} id - METR-style "task_family_name/task_name"
244
+ * @property {string} id - Task name (directory name under tasks/)
250
245
  * @property {{instructions: string, supervisor: string, judge: string, specs: string, workdir: string, scoring: string}} paths
251
246
  */
252
247
 
@@ -30,7 +30,11 @@ export async function runBenchmarkReportCommand(values, _args) {
30
30
  throw new Error("--format must be 'json' or 'text'");
31
31
  }
32
32
 
33
- const report = await aggregate({ inputDir: resolve(inputDir), kValues });
33
+ const report = await aggregate({
34
+ inputDir: resolve(inputDir),
35
+ kValues,
36
+ includeRuns: format === "text",
37
+ });
34
38
  if (format === "text") {
35
39
  process.stdout.write(renderTextReport(report, kValues) + "\n");
36
40
  } else {
@@ -6,6 +6,7 @@
6
6
 
7
7
  import { resolve } from "node:path";
8
8
 
9
+ import { createConfig } from "@forwardimpact/libconfig";
9
10
  import { createBenchmarkRunner } from "../benchmark/runner.js";
10
11
 
11
12
  /**
@@ -14,6 +15,8 @@ import { createBenchmarkRunner } from "../benchmark/runner.js";
14
15
  */
15
16
  export async function runBenchmarkRunCommand(values, _args) {
16
17
  const opts = parseRunOptions(values);
18
+ const config = await createConfig("script", "benchmark");
19
+ process.env.ANTHROPIC_API_KEY = await config.anthropicToken();
17
20
  const { query } = await import("@anthropic-ai/claude-agent-sdk");
18
21
  const runner = createBenchmarkRunner({ ...opts, query });
19
22
 
package/src/supervisor.js CHANGED
@@ -536,12 +536,15 @@ export function createSupervisor({
536
536
 
537
537
  const onLine = (line) => supervisor.emitLine(line);
538
538
 
539
+ const perInvocationTurns =
540
+ maxTurns === 0 ? 0 : Math.max(maxTurns ?? 100, 200);
541
+
539
542
  const agentRunner = createAgentRunner({
540
543
  cwd: agentCwd,
541
544
  query,
542
545
  output: devNull,
543
546
  model,
544
- maxTurns: 50,
547
+ maxTurns: perInvocationTurns,
545
548
  allowedTools,
546
549
  onLine,
547
550
  settingSources: ["project"],
@@ -560,7 +563,7 @@ export function createSupervisor({
560
563
  query,
561
564
  output: devNull,
562
565
  model,
563
- maxTurns: 20,
566
+ maxTurns: perInvocationTurns,
564
567
  allowedTools: supervisorAllowedTools ?? [
565
568
  "Bash",
566
569
  "Read",