@forwardimpact/libeval 0.1.61 → 0.1.63

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,152 +2,11 @@
2
2
 
3
3
  import "@forwardimpact/libpreflight/node22";
4
4
 
5
- import { realpathSync } from "node:fs";
6
5
  import { createCli } from "@forwardimpact/libcli";
7
6
  import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
8
7
  import { createLogger } from "@forwardimpact/libtelemetry";
9
8
 
10
- import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
11
- import { runBenchmarkInvariantsCommand } from "../src/commands/benchmark-invariants.js";
12
- import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
13
- import {
14
- BENCHMARK_AGENT_MODEL,
15
- LEAD_MODEL,
16
- } from "@forwardimpact/libutil/models";
17
-
18
- export const definition = {
19
- name: "fit-benchmark",
20
- description:
21
- "Run coding-agent task families, grade hidden tests, and aggregate pass@k across runs.",
22
- commands: [
23
- {
24
- name: "run",
25
- args: [],
26
- handler: runBenchmarkRunCommand,
27
- description:
28
- "Run every task in a family for N runs and emit one result record per (task, runIndex).",
29
- options: {
30
- family: {
31
- type: "string",
32
- description: "Path or git URL to a task family",
33
- },
34
- output: {
35
- type: "string",
36
- description:
37
- "Run-output directory (created if missing, default: benchmark-runs)",
38
- },
39
- runs: {
40
- type: "string",
41
- description: "Runs per task (integer ≥ 1, default: 5)",
42
- },
43
- "agent-model": {
44
- type: "string",
45
- description: `Claude model for the agent-under-test (default: ${BENCHMARK_AGENT_MODEL})`,
46
- },
47
- "lead-model": {
48
- type: "string",
49
- description: `Claude model for the lead role (default: ${LEAD_MODEL})`,
50
- },
51
- "judge-model": {
52
- type: "string",
53
- description: `Claude model for the judge (default: ${LEAD_MODEL})`,
54
- },
55
- "agent-profile": {
56
- type: "string",
57
- description: "Agent-under-test profile name",
58
- },
59
- "judge-profile": {
60
- type: "string",
61
- description: "Judge profile name",
62
- },
63
- "max-turns": {
64
- type: "string",
65
- description:
66
- "Agent-under-test turn budget (default: 50, 0 = unlimited)",
67
- },
68
- "allowed-tools": {
69
- type: "string",
70
- description:
71
- "Comma-separated tool allowlist for the agent-under-test (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)",
72
- },
73
- },
74
- },
75
- {
76
- name: "invariants",
77
- args: [],
78
- handler: runBenchmarkInvariantsCommand,
79
- description:
80
- "Check a single task's invariants against a post-run workdir without invoking an agent.",
81
- options: {
82
- family: {
83
- type: "string",
84
- description: "Path or git URL to a task family",
85
- },
86
- task: {
87
- type: "string",
88
- description: "Task id (directory name under tasks/)",
89
- },
90
- workdir: {
91
- type: "string",
92
- description:
93
- "Post-run directory; <workdir>/cwd/ is the agent CWD invariants run against",
94
- },
95
- output: {
96
- type: "string",
97
- description: "Output file (defaults to stdout; one JSONL line)",
98
- },
99
- },
100
- },
101
- {
102
- name: "report",
103
- args: [],
104
- handler: runBenchmarkReportCommand,
105
- description:
106
- "Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
107
- options: {
108
- input: {
109
- type: "string",
110
- description:
111
- "Run-output directory containing results.jsonl (default: benchmark-runs)",
112
- },
113
- k: {
114
- type: "string",
115
- description: "Comma-separated k values (default: 1,3,5)",
116
- },
117
- format: {
118
- type: "string",
119
- description: "Output format (json|text, default: json)",
120
- },
121
- },
122
- },
123
- ],
124
- globalOptions: {
125
- help: { type: "boolean", short: "h", description: "Show this help" },
126
- version: { type: "boolean", description: "Show version" },
127
- json: { type: "boolean", description: "Output help as JSON" },
128
- },
129
- examples: [
130
- "fit-benchmark run --family=./families/coding",
131
- `fit-benchmark run --family=./families/coding --runs=10 --agent-model=${BENCHMARK_AGENT_MODEL}`,
132
- "fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
133
- "fit-benchmark report --format=text",
134
- "fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
135
- ],
136
- documentation: [
137
- {
138
- title: "Run a Benchmark",
139
- url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/index.md",
140
- description:
141
- "Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
142
- },
143
- {
144
- title: "Automate with GitHub Actions",
145
- url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/ci-workflow/index.md",
146
- description:
147
- "Run benchmarks in CI with the forwardimpact/fit-benchmark action.",
148
- },
149
- ],
150
- };
9
+ import { definition } from "../src/commands/benchmark-definition.js";
151
10
 
152
11
  const runtime = createDefaultRuntime();
153
12
  const logger = createLogger("benchmark", runtime);
@@ -178,12 +37,8 @@ async function main() {
178
37
  runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
179
38
  }
180
39
 
181
- // Run main only when invoked as a CLI. Importing for tests (e.g. parity)
182
- // should not execute the entry point.
183
- if (import.meta.url === `file://${realpathSync(process.argv[1])}`) {
184
- main().catch((error) => {
185
- logger.exception("main", error);
186
- createCli(definition, { runtime }).error(error.message);
187
- process.exit(1);
188
- });
189
- }
40
/**
 * Last-resort handler for a rejected `main()` promise: record the exception
 * with the logger, pass its message to the CLI's `error` method, and exit
 * with status 1.
 * @param {Error} error - The rejection reason from `main()`.
 */
const reportFatal = (error) => {
  logger.exception("main", error);
  const cli = createCli(definition, { runtime });
  cli.error(error.message);
  process.exit(1);
};

// Invoked unconditionally: the entry point runs as soon as this module loads.
main().catch(reportFatal);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.61",
3
+ "version": "0.1.63",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -10,6 +10,32 @@ import { AGENT_MODEL } from "@forwardimpact/libutil/models";
10
10
 
11
11
  const DEFAULT_ALLOWED_TOOLS = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];
12
12
 
13
/**
 * Did the session actually invoke the model? A genuine run always bills
 * tokens (the system prompt alone is thousands of input tokens) and costs
 * more than zero. A `result` message with `subtype: "success"` but zero
 * token usage and zero cost means the model was never reached — the
 * canonical signature of a Claude Code init/auth failure (e.g. an invalid
 * `ANTHROPIC_API_KEY`), which the SDK otherwise reports as a clean success.
 *
 * If the SDK gave us neither a `usage` object nor `total_cost_usd`, don't
 * second-guess the subtype — trust the reported success.
 *
 * Each token counter is coerced to a number on its own, and anything that is
 * not a number counts as zero. A single malformed counter therefore cannot
 * turn the whole sum into NaN (or a concatenated string) and hide real usage
 * reported by the other counters.
 * @param {object|null} result - The SDK `result` message, or null.
 * @returns {boolean} True when there is evidence of model work, or when no
 *   usage/cost evidence was reported at all; false for a missing result or
 *   for zero tokens together with zero cost.
 */
function modelDidWork(result) {
  if (!result) return false;

  const { usage, total_cost_usd: cost } = result;
  // No usage object and no cost: nothing contradicts the reported subtype.
  if (usage == null && cost == null) return true;

  // Coerce per field; a NaN outcome (non-numeric input) contributes nothing.
  const toCount = (value) => {
    const count = Number(value ?? 0);
    return Number.isNaN(count) ? 0 : count;
  };

  const tokens = usage
    ? toCount(usage.input_tokens) +
      toCount(usage.output_tokens) +
      toCount(usage.cache_creation_input_tokens) +
      toCount(usage.cache_read_input_tokens)
    : 0;

  return tokens > 0 || (cost ?? 0) > 0;
}
38
+
13
39
  // fit-eval and kata-action run headless in CI/CD with no human to answer
14
40
  // permission prompts. The SDK is always launched in bypass mode — not
15
41
  // overridable — so a future caller can't accidentally reduce permissions.
@@ -148,6 +174,7 @@ export class AgentRunner {
148
174
  async #consumeQuery(iterator) {
149
175
  let text = "";
150
176
  let stopReason = null;
177
+ let resultMessage = null;
151
178
  let error = null;
152
179
  let aborted = false;
153
180
 
@@ -157,6 +184,7 @@ export class AgentRunner {
157
184
  if (message.type === "result") {
158
185
  text = message.result ?? "";
159
186
  stopReason = message.subtype;
187
+ resultMessage = message;
160
188
  }
161
189
  }
162
190
  } catch (err) {
@@ -167,8 +195,23 @@ export class AgentRunner {
167
195
  }
168
196
  }
169
197
 
198
+ // A "success" subtype is necessary but not sufficient: the SDK reports a
199
+ // failed init (e.g. an invalid API key) as success with zero model work.
200
+ // Require evidence the model actually ran, and surface a clear error when
201
+ // it didn't, so the masked failure can't be reported as a green run.
202
+ const reportedSuccess = stopReason === "success";
203
+ const success =
204
+ reportedSuccess &&
205
+ resultMessage?.is_error !== true &&
206
+ modelDidWork(resultMessage);
207
+ if (reportedSuccess && !success && !error) {
208
+ error = new Error(
209
+ "agent reported success but performed no model work (zero token usage) — likely a Claude Code init or authentication failure",
210
+ );
211
+ }
212
+
170
213
  return {
171
- success: stopReason === "success",
214
+ success,
172
215
  text,
173
216
  sessionId: this.sessionId,
174
217
  error,
@@ -0,0 +1,147 @@
1
+ /**
2
+ * `fit-benchmark` CLI definition. Lives in `src/` so the bin stays an
3
+ * execute-on-import entry point — launcher packages import the bin to run
4
+ * it — while tests import the definition without running the CLI.
5
+ */
6
+
7
+ import { runBenchmarkRunCommand } from "./benchmark-run.js";
8
+ import { runBenchmarkInvariantsCommand } from "./benchmark-invariants.js";
9
+ import { runBenchmarkReportCommand } from "./benchmark-report.js";
10
+ import {
11
+ BENCHMARK_AGENT_MODEL,
12
+ LEAD_MODEL,
13
+ } from "@forwardimpact/libutil/models";
14
+
15
/**
 * Build a string-typed option descriptor (`type` first, then `description`).
 * @param {string} description - Help text shown for the option.
 * @returns {{type: string, description: string}}
 */
const stringOption = (description) => ({ type: "string", description });

// `run` subcommand: options for executing a task family.
const runCommand = {
  name: "run",
  args: [],
  handler: runBenchmarkRunCommand,
  description:
    "Run every task in a family for N runs and emit one result record per (task, runIndex).",
  options: {
    family: stringOption("Path or git URL to a task family"),
    output: stringOption(
      "Run-output directory (created if missing, default: benchmark-runs)",
    ),
    runs: stringOption("Runs per task (integer ≥ 1, default: 5)"),
    "agent-model": stringOption(
      `Claude model for the agent-under-test (default: ${BENCHMARK_AGENT_MODEL})`,
    ),
    "lead-model": stringOption(
      `Claude model for the lead role (default: ${LEAD_MODEL})`,
    ),
    "judge-model": stringOption(
      `Claude model for the judge (default: ${LEAD_MODEL})`,
    ),
    "agent-profile": stringOption("Agent-under-test profile name"),
    "judge-profile": stringOption("Judge profile name"),
    "max-turns": stringOption(
      "Agent-under-test turn budget (default: 50, 0 = unlimited)",
    ),
    "allowed-tools": stringOption(
      "Comma-separated tool allowlist for the agent-under-test (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)",
    ),
  },
};

// `invariants` subcommand: options for checking one task against a workdir.
const invariantsCommand = {
  name: "invariants",
  args: [],
  handler: runBenchmarkInvariantsCommand,
  description:
    "Check a single task's invariants against a post-run workdir without invoking an agent.",
  options: {
    family: stringOption("Path or git URL to a task family"),
    task: stringOption("Task id (directory name under tasks/)"),
    workdir: stringOption(
      "Post-run directory; <workdir>/cwd/ is the agent CWD invariants run against",
    ),
    output: stringOption("Output file (defaults to stdout; one JSONL line)"),
  },
};

// `report` subcommand: options for aggregating result records.
const reportCommand = {
  name: "report",
  args: [],
  handler: runBenchmarkReportCommand,
  description:
    "Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
  options: {
    input: stringOption(
      "Run-output directory containing results.jsonl (default: benchmark-runs)",
    ),
    k: stringOption("Comma-separated k values (default: 1,3,5)"),
    format: stringOption("Output format (json|text, default: json)"),
  },
};

/**
 * Declarative description of the `fit-benchmark` CLI: its subcommands with
 * their handlers and options, the global flags, usage examples, and
 * documentation links.
 */
export const definition = {
  name: "fit-benchmark",
  description:
    "Run coding-agent task families, grade hidden tests, and aggregate pass@k across runs.",
  commands: [runCommand, invariantsCommand, reportCommand],
  globalOptions: {
    help: { type: "boolean", short: "h", description: "Show this help" },
    version: { type: "boolean", description: "Show version" },
    json: { type: "boolean", description: "Output help as JSON" },
  },
  examples: [
    "fit-benchmark run --family=./families/coding",
    `fit-benchmark run --family=./families/coding --runs=10 --agent-model=${BENCHMARK_AGENT_MODEL}`,
    "fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
    "fit-benchmark report --format=text",
    "fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
  ],
  documentation: [
    {
      title: "Run a Benchmark",
      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/index.md",
      description:
        "Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
    },
    {
      title: "Automate with GitHub Actions",
      url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/ci-workflow/index.md",
      description:
        "Run benchmarks in CI with the forwardimpact/fit-benchmark action.",
    },
  ],
};
@@ -140,5 +140,7 @@ export async function runRunCommand(ctx) {
140
140
  await new Promise((r) => fileStream.end(r));
141
141
  }
142
142
 
143
- return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
143
+ return result.success
144
+ ? { ok: true }
145
+ : { ok: false, code: 1, error: result.error?.message ?? "" };
144
146
  }