@forwardimpact/libeval 0.1.60 → 0.1.62

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,152 +2,11 @@
2
2
 
3
3
  import "@forwardimpact/libpreflight/node22";
4
4
 
5
- import { realpathSync } from "node:fs";
6
5
  import { createCli } from "@forwardimpact/libcli";
7
6
  import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
8
7
  import { createLogger } from "@forwardimpact/libtelemetry";
9
8
 
10
- import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
11
- import { runBenchmarkInvariantsCommand } from "../src/commands/benchmark-invariants.js";
12
- import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
13
- import {
14
- BENCHMARK_AGENT_MODEL,
15
- LEAD_MODEL,
16
- } from "@forwardimpact/libutil/models";
17
-
18
- export const definition = {
19
- name: "fit-benchmark",
20
- description:
21
- "Run coding-agent task families, grade hidden tests, and aggregate pass@k across runs.",
22
- commands: [
23
- {
24
- name: "run",
25
- args: [],
26
- handler: runBenchmarkRunCommand,
27
- description:
28
- "Run every task in a family for N runs and emit one result record per (task, runIndex).",
29
- options: {
30
- family: {
31
- type: "string",
32
- description: "Path or git URL to a task family",
33
- },
34
- output: {
35
- type: "string",
36
- description:
37
- "Run-output directory (created if missing, default: benchmark-runs)",
38
- },
39
- runs: {
40
- type: "string",
41
- description: "Runs per task (integer ≥ 1, default: 5)",
42
- },
43
- "agent-model": {
44
- type: "string",
45
- description: `Claude model for the agent-under-test (default: ${BENCHMARK_AGENT_MODEL})`,
46
- },
47
- "lead-model": {
48
- type: "string",
49
- description: `Claude model for the lead role (default: ${LEAD_MODEL})`,
50
- },
51
- "judge-model": {
52
- type: "string",
53
- description: `Claude model for the judge (default: ${LEAD_MODEL})`,
54
- },
55
- "agent-profile": {
56
- type: "string",
57
- description: "Agent-under-test profile name",
58
- },
59
- "judge-profile": {
60
- type: "string",
61
- description: "Judge profile name",
62
- },
63
- "max-turns": {
64
- type: "string",
65
- description:
66
- "Agent-under-test turn budget (default: 50, 0 = unlimited)",
67
- },
68
- "allowed-tools": {
69
- type: "string",
70
- description:
71
- "Comma-separated tool allowlist for the agent-under-test (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)",
72
- },
73
- },
74
- },
75
- {
76
- name: "invariants",
77
- args: [],
78
- handler: runBenchmarkInvariantsCommand,
79
- description:
80
- "Check a single task's invariants against a post-run workdir without invoking an agent.",
81
- options: {
82
- family: {
83
- type: "string",
84
- description: "Path or git URL to a task family",
85
- },
86
- task: {
87
- type: "string",
88
- description: "Task id (directory name under tasks/)",
89
- },
90
- workdir: {
91
- type: "string",
92
- description:
93
- "Post-run directory; <workdir>/cwd/ is the agent CWD invariants run against",
94
- },
95
- output: {
96
- type: "string",
97
- description: "Output file (defaults to stdout; one JSONL line)",
98
- },
99
- },
100
- },
101
- {
102
- name: "report",
103
- args: [],
104
- handler: runBenchmarkReportCommand,
105
- description:
106
- "Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
107
- options: {
108
- input: {
109
- type: "string",
110
- description:
111
- "Run-output directory containing results.jsonl (default: benchmark-runs)",
112
- },
113
- k: {
114
- type: "string",
115
- description: "Comma-separated k values (default: 1,3,5)",
116
- },
117
- format: {
118
- type: "string",
119
- description: "Output format (json|text, default: json)",
120
- },
121
- },
122
- },
123
- ],
124
- globalOptions: {
125
- help: { type: "boolean", short: "h", description: "Show this help" },
126
- version: { type: "boolean", description: "Show version" },
127
- json: { type: "boolean", description: "Output help as JSON" },
128
- },
129
- examples: [
130
- "fit-benchmark run --family=./families/coding",
131
- `fit-benchmark run --family=./families/coding --runs=10 --agent-model=${BENCHMARK_AGENT_MODEL}`,
132
- "fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
133
- "fit-benchmark report --format=text",
134
- "fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
135
- ],
136
- documentation: [
137
- {
138
- title: "Run a Benchmark",
139
- url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/index.md",
140
- description:
141
- "Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
142
- },
143
- {
144
- title: "Automate with GitHub Actions",
145
- url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/ci-workflow/index.md",
146
- description:
147
- "Run benchmarks in CI with the forwardimpact/fit-benchmark action.",
148
- },
149
- ],
150
- };
9
+ import { definition } from "../src/commands/benchmark-definition.js";
151
10
 
152
11
  const runtime = createDefaultRuntime();
153
12
  const logger = createLogger("benchmark", runtime);
@@ -178,12 +37,8 @@ async function main() {
178
37
  runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
179
38
  }
180
39
 
181
- // Run main only when invoked as a CLI. Importing for tests (e.g. parity)
182
- // should not execute the entry point.
183
- if (import.meta.url === `file://${realpathSync(process.argv[1])}`) {
184
- main().catch((error) => {
185
- logger.exception("main", error);
186
- createCli(definition, { runtime }).error(error.message);
187
- process.exit(1);
188
- });
189
- }
40
// Entry point: main() runs unconditionally, so the CLI executes whenever this
// module is evaluated. Any rejection is logged, reported through the CLI's
// error channel, and turned into exit code 1.
main().catch((error) => {
  logger.exception("main", error);
  // A promise can reject with any value (including null/undefined); reading
  // `.message` unguarded would throw inside this handler and hide the cause.
  const message = error?.message ?? String(error);
  createCli(definition, { runtime }).error(message);
  process.exit(1);
});
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.60",
3
+ "version": "0.1.62",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -0,0 +1,147 @@
1
+ /**
2
+ * `fit-benchmark` CLI definition. Lives in `src/` so the bin stays an
3
+ * execute-on-import entry point — launcher packages import the bin to run
4
+ * it — while tests import the definition without running the CLI.
5
+ */
6
+
7
+ import { runBenchmarkRunCommand } from "./benchmark-run.js";
8
+ import { runBenchmarkInvariantsCommand } from "./benchmark-invariants.js";
9
+ import { runBenchmarkReportCommand } from "./benchmark-report.js";
10
+ import {
11
+ BENCHMARK_AGENT_MODEL,
12
+ LEAD_MODEL,
13
+ } from "@forwardimpact/libutil/models";
14
+
15
/**
 * Declarative description of the `fit-benchmark` CLI: its three sub-commands
 * (`run`, `invariants`, `report`) with their handlers and options, the global
 * flags, usage examples, and documentation links.
 *
 * Every command option is declared with `type: "string"`, including numeric
 * ones such as `runs`, `max-turns`, and `k`; the defaults quoted in the
 * descriptions are help text only — nothing in this object applies them.
 * NOTE(review): presumably the handlers parse and default these values —
 * confirm against the command implementations.
 */
export const definition = {
  name: "fit-benchmark",
  description:
    "Run coding-agent task families, grade hidden tests, and aggregate pass@k across runs.",
  commands: [
    {
      name: "run",
      args: [],
      handler: runBenchmarkRunCommand,
      description:
        "Run every task in a family for N runs and emit one result record per (task, runIndex).",
      options: {
        family: {
          type: "string",
          description: "Path or git URL to a task family",
        },
        output: {
          type: "string",
          description:
            "Run-output directory (created if missing, default: benchmark-runs)",
        },
        runs: {
          type: "string",
          description: "Runs per task (integer ≥ 1, default: 5)",
        },
        "agent-model": {
          type: "string",
          description: `Claude model for the agent-under-test (default: ${BENCHMARK_AGENT_MODEL})`,
        },
        "lead-model": {
          type: "string",
          description: `Claude model for the lead role (default: ${LEAD_MODEL})`,
        },
        "judge-model": {
          type: "string",
          description: `Claude model for the judge (default: ${LEAD_MODEL})`,
        },
        "agent-profile": {
          type: "string",
          description: "Agent-under-test profile name",
        },
        "judge-profile": {
          type: "string",
          description: "Judge profile name",
        },
        "max-turns": {
          type: "string",
          description:
            "Agent-under-test turn budget (default: 50, 0 = unlimited)",
        },
        "allowed-tools": {
          type: "string",
          description:
            "Comma-separated tool allowlist for the agent-under-test (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)",
        },
      },
    },
    {
      name: "invariants",
      args: [],
      handler: runBenchmarkInvariantsCommand,
      description:
        "Check a single task's invariants against a post-run workdir without invoking an agent.",
      options: {
        family: {
          type: "string",
          description: "Path or git URL to a task family",
        },
        task: {
          type: "string",
          description: "Task id (directory name under tasks/)",
        },
        workdir: {
          type: "string",
          description:
            "Post-run directory; <workdir>/cwd/ is the agent CWD invariants run against",
        },
        output: {
          type: "string",
          description: "Output file (defaults to stdout; one JSONL line)",
        },
      },
    },
    {
      name: "report",
      args: [],
      handler: runBenchmarkReportCommand,
      description:
        "Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
      options: {
        input: {
          type: "string",
          description:
            "Run-output directory containing results.jsonl (default: benchmark-runs)",
        },
        k: {
          type: "string",
          description: "Comma-separated k values (default: 1,3,5)",
        },
        format: {
          type: "string",
          description: "Output format (json|text, default: json)",
        },
      },
    },
  ],
  globalOptions: {
    help: { type: "boolean", short: "h", description: "Show this help" },
    version: { type: "boolean", description: "Show version" },
    json: { type: "boolean", description: "Output help as JSON" },
  },
  examples: [
    "fit-benchmark run --family=./families/coding",
    `fit-benchmark run --family=./families/coding --runs=10 --agent-model=${BENCHMARK_AGENT_MODEL}`,
    "fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
    "fit-benchmark report --format=text",
    "fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
  ],
  documentation: [
    {
      title: "Run a Benchmark",
      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/index.md",
      description:
        "Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
    },
    {
      title: "Automate with GitHub Actions",
      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/ci-workflow/index.md",
      description:
        "Run benchmarks in CI with the forwardimpact/fit-benchmark action.",
    },
  ],
};
@@ -218,25 +218,24 @@ export class TraceCollector {
218
218
  }
219
219
 
220
220
  /**
221
+ * Accumulate a result event into the running summary. Facilitated and
222
+ * supervised sessions emit one result event per runner invocation, so a
223
+ * single trace can carry several — cost, duration, turn, and token
224
+ * figures sum across all of them. `result` reflects the latest event;
225
+ * `isError` is true once any event errored.
221
226
  * @param {object} event
222
227
  */
223
228
  handleResult(event) {
229
+ const prev = this.result ?? EMPTY_RESULT;
230
+
224
231
  this.result = {
225
232
  result: event.subtype ?? "unknown",
226
- isError: event.is_error ?? false,
227
- totalCostUsd: event.total_cost_usd ?? 0,
228
- durationMs: event.duration_ms ?? 0,
229
- numTurns: event.num_turns ?? 0,
230
- tokenUsage: event.usage
231
- ? {
232
- inputTokens: event.usage.input_tokens ?? 0,
233
- outputTokens: event.usage.output_tokens ?? 0,
234
- cacheReadInputTokens: event.usage.cache_read_input_tokens ?? 0,
235
- cacheCreationInputTokens:
236
- event.usage.cache_creation_input_tokens ?? 0,
237
- }
238
- : null,
239
- modelUsage: event.modelUsage ?? null,
233
+ isError: prev.isError || (event.is_error ?? false),
234
+ totalCostUsd: prev.totalCostUsd + (event.total_cost_usd ?? 0),
235
+ durationMs: prev.durationMs + (event.duration_ms ?? 0),
236
+ numTurns: prev.numTurns + (event.num_turns ?? 0),
237
+ tokenUsage: sumTokenUsage(prev.tokenUsage, normalizeUsage(event.usage)),
238
+ modelUsage: event.modelUsage ?? prev.modelUsage,
240
239
  };
241
240
  }
242
241
 
@@ -303,7 +302,9 @@ export class TraceCollector {
303
302
  * Format the trailing result summary line. When an orchestrator
304
303
  * summary is present (supervised / facilitated mode), the headline word is
305
304
  * the supervisor's verdict ("success" / "failure") rather than the SDK's
306
- * per-runner subtype, so the footer aligns with the CI exit code.
305
+ * per-runner subtype, so the footer aligns with the CI exit code. Turn,
306
+ * cost, and duration figures are the accumulated totals across every
307
+ * result event in the trace, not the last event's.
307
308
  * @returns {string}
308
309
  */
309
310
  #formatResultTail() {
@@ -318,6 +319,50 @@ export class TraceCollector {
318
319
  }
319
320
  }
320
321
 
322
+ /** Identity element for result-event accumulation in handleResult. */
323
+ const EMPTY_RESULT = {
324
+ isError: false,
325
+ totalCostUsd: 0,
326
+ durationMs: 0,
327
+ numTurns: 0,
328
+ tokenUsage: null,
329
+ modelUsage: null,
330
+ };
331
+
332
+ /**
333
+ * Normalize an SDK snake_case usage block to camelCase token fields.
334
+ * @param {object|null|undefined} usage
335
+ * @returns {object|null}
336
+ */
337
+ function normalizeUsage(usage) {
338
+ if (!usage) return null;
339
+ return {
340
+ inputTokens: usage.input_tokens ?? 0,
341
+ outputTokens: usage.output_tokens ?? 0,
342
+ cacheReadInputTokens: usage.cache_read_input_tokens ?? 0,
343
+ cacheCreationInputTokens: usage.cache_creation_input_tokens ?? 0,
344
+ };
345
+ }
346
+
347
+ /**
348
+ * Sum two token-usage records field-by-field. Either side may be null
349
+ * (a result event without usage); the sum is null only when both are.
350
+ * @param {object|null} a
351
+ * @param {object|null} b
352
+ * @returns {object|null}
353
+ */
354
+ function sumTokenUsage(a, b) {
355
+ if (!a) return b;
356
+ if (!b) return a;
357
+ return {
358
+ inputTokens: a.inputTokens + b.inputTokens,
359
+ outputTokens: a.outputTokens + b.outputTokens,
360
+ cacheReadInputTokens: a.cacheReadInputTokens + b.cacheReadInputTokens,
361
+ cacheCreationInputTokens:
362
+ a.cacheCreationInputTokens + b.cacheCreationInputTokens,
363
+ };
364
+ }
365
+
321
366
  /**
322
367
  * Format milliseconds into a human-readable duration.
323
368
  * @param {number} ms - Duration in milliseconds
@@ -278,38 +278,20 @@ export class TraceQuery {
278
278
 
279
279
  /**
280
280
  * Token usage and cost breakdown per assistant turn, plus totals.
281
+ *
282
+ * Token totals prefer the summary's result-event usage — the SDK's
283
+ * authoritative ledger, accumulated across every result event in the
284
+ * trace — over per-turn sums, whose stream-time snapshots double-count
285
+ * re-emitted messages. Traces without a result event (truncated or
286
+ * in-flight) fall back to the per-turn sums.
281
287
  * @returns {object}
282
288
  */
283
289
  stats() {
284
- let totalInput = 0;
285
- let totalOutput = 0;
286
- let totalCacheRead = 0;
287
- let totalCacheCreate = 0;
288
- const perTurn = [];
289
-
290
- for (const turn of this.turns) {
291
- if (turn.role !== "assistant" || !turn.usage) continue;
292
- const u = turn.usage;
293
- totalInput += u.inputTokens ?? 0;
294
- totalOutput += u.outputTokens ?? 0;
295
- totalCacheRead += u.cacheReadInputTokens ?? 0;
296
- totalCacheCreate += u.cacheCreationInputTokens ?? 0;
297
-
298
- perTurn.push({
299
- index: turn.index,
300
- inputTokens: u.inputTokens ?? 0,
301
- outputTokens: u.outputTokens ?? 0,
302
- cacheReadInputTokens: u.cacheReadInputTokens ?? 0,
303
- cacheCreationInputTokens: u.cacheCreationInputTokens ?? 0,
304
- });
305
- }
306
-
290
+ const { perTurn, totals: turnTotals } = perTurnUsage(this.turns);
291
+ const tokenTotals = this.summary.tokenUsage ?? turnTotals;
307
292
  return {
308
293
  totals: {
309
- inputTokens: totalInput,
310
- outputTokens: totalOutput,
311
- cacheReadInputTokens: totalCacheRead,
312
- cacheCreationInputTokens: totalCacheCreate,
294
+ ...tokenTotals,
313
295
  totalCostUsd: this.summary.totalCostUsd ?? 0,
314
296
  durationMs: this.summary.durationMs ?? 0,
315
297
  },
@@ -318,6 +300,38 @@ export class TraceQuery {
318
300
  }
319
301
  }
320
302
 
303
/**
 * Sum per-turn assistant usage and build the per-turn breakdown rows.
 * Only turns with `role === "assistant"` and a truthy `usage` contribute;
 * a counter missing from a turn's usage counts as 0.
 * @param {object[]} turns - Turns carrying `role`, `index`, and optionally
 *   `usage` (camelCase token counters).
 * @returns {{perTurn: object[], totals: object}} One row per contributing
 *   turn (its `index` plus the four counters) and the field-wise totals.
 */
function perTurnUsage(turns) {
  const totals = {
    inputTokens: 0,
    outputTokens: 0,
    cacheReadInputTokens: 0,
    cacheCreationInputTokens: 0,
  };
  const perTurn = [];

  for (const turn of turns) {
    if (turn.role !== "assistant" || !turn.usage) continue;
    const row = {
      index: turn.index,
      inputTokens: turn.usage.inputTokens ?? 0,
      outputTokens: turn.usage.outputTokens ?? 0,
      cacheReadInputTokens: turn.usage.cacheReadInputTokens ?? 0,
      cacheCreationInputTokens: turn.usage.cacheCreationInputTokens ?? 0,
    };
    totals.inputTokens += row.inputTokens;
    totals.outputTokens += row.outputTokens;
    totals.cacheReadInputTokens += row.cacheReadInputTokens;
    totals.cacheCreationInputTokens += row.cacheCreationInputTokens;
    perTurn.push(row);
  }
  return { perTurn, totals };
}
334
+
321
335
  /**
322
336
  * @param {object} turn
323
337
  * @param {string|undefined} role