@forwardimpact/libeval 0.1.36 → 0.1.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -34,15 +34,26 @@ export const definition = {
34
34
  },
35
35
  output: {
36
36
  type: "string",
37
- description: "Run-output directory (created if missing)",
37
+ description:
38
+ "Run-output directory (created if missing, default: benchmark-runs)",
38
39
  },
39
40
  runs: {
40
41
  type: "string",
41
- description: "Runs per task (integer ≥ 1, default 1)",
42
+ description: "Runs per task (integer ≥ 1, default: 5)",
43
+ },
44
+ "agent-model": {
45
+ type: "string",
46
+ description:
47
+ "Claude model for the agent-under-test (default: claude-sonnet-4-6)",
42
48
  },
43
- model: {
49
+ "supervisor-model": {
44
50
  type: "string",
45
- description: "Claude model id (default: claude-opus-4-7[1m])",
51
+ description:
52
+ "Claude model for the supervisor (default: claude-opus-4-7)",
53
+ },
54
+ "judge-model": {
55
+ type: "string",
56
+ description: "Claude model for the judge (default: claude-opus-4-7)",
46
57
  },
47
58
  "agent-profile": {
48
59
  type: "string",
@@ -92,7 +103,8 @@ export const definition = {
92
103
  options: {
93
104
  input: {
94
105
  type: "string",
95
- description: "Run-output directory containing results.jsonl",
106
+ description:
107
+ "Run-output directory containing results.jsonl (default: benchmark-runs)",
96
108
  },
97
109
  k: {
98
110
  type: "string",
@@ -111,8 +123,10 @@ export const definition = {
111
123
  json: { type: "boolean", description: "Output help as JSON" },
112
124
  },
113
125
  examples: [
114
- "fit-benchmark run --family=./families/coding --output=./runs/2026-05-11 --runs=5",
115
- "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./runs/2026-05-11/runs/todo-api/0",
126
+ "fit-benchmark run --family=./families/coding",
127
+ "fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
128
+ "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
129
+ "fit-benchmark report --format=text",
116
130
  "fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
117
131
  ],
118
132
  documentation: [
@@ -122,6 +136,12 @@ export const definition = {
122
136
  description:
123
137
  "Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
124
138
  },
139
+ {
140
+ title: "Automate with GitHub Actions",
141
+ url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/ci-workflow/index.md",
142
+ description:
143
+ "Run benchmarks in CI with the forwardimpact/fit-benchmark action.",
144
+ },
125
145
  ],
126
146
  };
127
147
 
package/bin/fit-eval.js CHANGED
@@ -41,7 +41,11 @@ const definition = {
41
41
  type: "string",
42
42
  description: "Additional text appended to the task",
43
43
  },
44
- model: { type: "string", description: "Claude model (default: opus)" },
44
+ "agent-model": {
45
+ type: "string",
46
+ description:
47
+ "Claude model for the agent (default: claude-opus-4-7[1m])",
48
+ },
45
49
  "max-turns": {
46
50
  type: "string",
47
51
  description: "Max agentic turns (default: 50, 0 = unlimited)",
@@ -84,7 +88,16 @@ const definition = {
84
88
  type: "string",
85
89
  description: "Additional text appended to the task",
86
90
  },
87
- model: { type: "string", description: "Claude model (default: opus)" },
91
+ "agent-model": {
92
+ type: "string",
93
+ description:
94
+ "Claude model for the agent (default: claude-opus-4-7[1m])",
95
+ },
96
+ "supervisor-model": {
97
+ type: "string",
98
+ description:
99
+ "Claude model for the supervisor (default: claude-opus-4-7[1m])",
100
+ },
88
101
  "max-turns": {
89
102
  type: "string",
90
103
  description: "Max agentic turns (default: 20, 0 = unlimited)",
@@ -136,7 +149,15 @@ const definition = {
136
149
  type: "string",
137
150
  description: "Additional text appended to the task",
138
151
  },
139
- model: { type: "string", description: "Claude model (default: opus)" },
152
+ "agent-model": {
153
+ type: "string",
154
+ description: "Claude model for agents (default: claude-opus-4-7[1m])",
155
+ },
156
+ "facilitator-model": {
157
+ type: "string",
158
+ description:
159
+ "Claude model for the facilitator (default: claude-opus-4-7[1m])",
160
+ },
140
161
  "max-turns": {
141
162
  type: "string",
142
163
  description: "Max agentic turns (default: 20, 0 = unlimited)",
package/bin/fit-trace.js CHANGED
@@ -25,6 +25,7 @@ import {
25
25
  runFilterCommand,
26
26
  runSplitCommand,
27
27
  } from "../src/commands/trace.js";
28
+ import { runAssertCommand } from "../src/commands/assert.js";
28
29
 
29
30
  // `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
30
31
  // the readFileSync branch in the compiled binary (which would ENOENT against
@@ -199,6 +200,41 @@ const definition = {
199
200
  },
200
201
  },
201
202
  },
203
+ {
204
+ name: "assert",
205
+ args: "<test-name> <file>",
206
+ description:
207
+ "Shell-friendly assertion — outputs structured JSON for scoring hooks",
208
+ options: {
209
+ grep: {
210
+ type: "string",
211
+ description:
212
+ "Pass if extended regex matches file content (case-insensitive)",
213
+ },
214
+ query: {
215
+ type: "string",
216
+ description:
217
+ "Pass if JMESPath expression against JSON/NDJSON yields a truthy result",
218
+ },
219
+ exists: {
220
+ type: "boolean",
221
+ description: "Pass if file exists",
222
+ },
223
+ "cites-job": {
224
+ type: "string",
225
+ description:
226
+ "Pass if <file> contains the canonical citation from a <job> tag in the given JTBD file",
227
+ },
228
+ not: {
229
+ type: "boolean",
230
+ description: "Invert the assertion",
231
+ },
232
+ message: {
233
+ type: "string",
234
+ description: "Custom failure message",
235
+ },
236
+ },
237
+ },
202
238
  ],
203
239
  globalOptions: {
204
240
  help: { type: "boolean", short: "h", description: "Show this help" },
@@ -220,6 +256,11 @@ const definition = {
220
256
  "fit-trace search structured.json 'error|fail' --context 1",
221
257
  "fit-trace filter structured.json --tool Bash --error",
222
258
  "fit-trace turn structured.json 3",
259
+ "fit-trace assert has-heading --grep '^## Problem' spec.md",
260
+ "fit-trace assert no-leak --not --grep 'password' output.log",
261
+ "fit-trace assert file-present --exists path/to/spec.md",
262
+ "fit-trace assert cites-jtbd --cites-job jtbd-excerpt.md spec.md",
263
+ "fit-trace assert used-edit --query \"[?type=='assistant'].message.content[] | [?name=='Edit']\" trace.ndjson",
223
264
  ],
224
265
  documentation: [
225
266
  {
@@ -265,6 +306,7 @@ const COMMANDS = {
265
306
  turn: runTurnCommand,
266
307
  filter: runFilterCommand,
267
308
  split: runSplitCommand,
309
+ assert: runAssertCommand,
268
310
  };
269
311
 
270
312
  async function main() {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.36",
3
+ "version": "0.1.38",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -53,6 +53,7 @@
53
53
  "@forwardimpact/libcli": "^0.1.0",
54
54
  "@forwardimpact/libconfig": "^0.1.0",
55
55
  "@forwardimpact/libtelemetry": "^0.1.22",
56
+ "jmespath": "^0.16.0",
56
57
  "zod": "^4.4.3"
57
58
  },
58
59
  "devDependencies": {
@@ -1,14 +1,13 @@
1
1
  /**
2
- * ApmInstaller — materialises the family's pre-staged `.claude/` tree into a
3
- * single staging directory, computes the manifest fingerprint, and is invoked
4
- * once per family install. Per-task copy happens later in WorkdirManager.
5
- *
6
- * v1 trusts the family's checked-in `.claude/` (P1); the lockfile is hashed
7
- * verbatim, not interpreted.
2
+ * ApmInstaller — runs `apm install --target claude` in the family root to
3
+ * materialise skills and agents, copies the resulting `.claude/` into a
4
+ * staging directory, and computes the manifest fingerprint from the lockfile.
5
+ * Per-task copy happens later in WorkdirManager.
8
6
  */
9
7
 
8
+ import { spawn } from "node:child_process";
10
9
  import { createHash } from "node:crypto";
11
- import { access, cp, rm } from "node:fs/promises";
10
+ import { access, cp, mkdir, readFile, rm } from "node:fs/promises";
12
11
  import { join } from "node:path";
13
12
 
14
13
  /**
@@ -21,19 +20,66 @@ export async function installApm(family, outputDir) {
21
20
  const stagedClaude = join(stagingDir, ".claude");
22
21
  const sourceClaude = join(family.rootPath, ".claude");
23
22
 
23
+ await runApmInstall(family.rootPath);
24
+
24
25
  try {
25
26
  await access(sourceClaude);
26
27
  } catch {
27
28
  throw new Error(
28
- `task family missing .claude/ at ${sourceClaude}; family must check in a pre-staged skills/agents tree (design decision P1)`,
29
+ `apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
29
30
  );
30
31
  }
31
32
 
32
33
  await rm(stagingDir, { recursive: true, force: true });
33
34
  await cp(sourceClaude, stagedClaude, { recursive: true });
34
35
 
36
+ // Stage the family-local judge profile outside .claude/ so it is available
37
+ // to the judge but never copied into the agent-under-test's CWD.
38
+ const judgeSource = join(family.rootPath, "judge.md");
39
+ const judgeProfilesDir = join(stagingDir, "judge-profiles");
40
+ try {
41
+ await access(judgeSource);
42
+ await mkdir(judgeProfilesDir, { recursive: true });
43
+ await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
44
+ } catch {}
45
+
46
+ const lockPath = join(family.rootPath, "apm.lock.yaml");
47
+ const lockBytes = await readFile(lockPath).catch(() => {
48
+ throw new Error(`apm install did not produce apm.lock.yaml at ${lockPath}`);
49
+ });
35
50
  const skillSetHash =
36
- "sha256:" + createHash("sha256").update(family.apmLockBytes).digest("hex");
51
+ "sha256:" +
52
+ createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
53
+
54
+ return { stagingDir, skillSetHash, judgeProfilesDir };
55
+ }
56
+
57
/**
 * Returns a copy of `buf` in which every CR byte (0x0d) that is immediately
 * followed by an LF byte (0x0a) has been dropped, so CRLF line endings become
 * LF. A CR that is not directly followed by LF is kept as-is.
 *
 * The output is written into a preallocated Buffer instead of a per-byte
 * JavaScript array, so memory stays proportional to the input size.
 *
 * @param {Uint8Array} buf - Raw bytes to normalise (a Buffer in practice).
 * @returns {Buffer} The normalised bytes; `buf` itself is not modified.
 */
function normalizeLf(buf) {
  const CR = 0x0d;
  const LF = 0x0a;
  // The result can never be longer than the input, so one allocation suffices.
  const out = Buffer.alloc(buf.length);
  let written = 0;
  for (let i = 0; i < buf.length; i++) {
    // Reading one past the end yields undefined, which never equals LF, so a
    // trailing CR is preserved.
    if (buf[i] === CR && buf[i + 1] === LF) continue;
    out[written++] = buf[i];
  }
  return out.subarray(0, written);
}
37
65
 
38
- return { stagingDir, skillSetHash };
66
/**
 * Runs `apm install --target claude` with `cwd` as the working directory and
 * settles once the child process has closed.
 *
 * @param {string} cwd - Directory in which to run the command.
 * @returns {Promise<void>} Resolves when the process exits with code 0.
 *   Rejects with an Error when the process cannot be spawned, exits with a
 *   non-zero code, or is terminated by a signal; for the latter two the
 *   message includes the captured stderr text.
 */
function runApmInstall(cwd) {
  return new Promise((res, rej) => {
    const child = spawn("apm", ["install", "--target", "claude"], {
      cwd,
      stdio: ["ignore", "pipe", "pipe"],
    });
    let stderr = "";
    // stdout is consumed and discarded; only stderr is reported on failure.
    child.stdout.on("data", () => {});
    // Decode at the stream level so a multi-byte character split across two
    // chunks is reassembled instead of being corrupted by per-chunk decoding.
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", (e) => {
      // Keep the original error reachable for callers that inspect `cause`.
      rej(new Error(`failed to spawn apm: ${e.message}`, { cause: e }));
    });
    child.on("close", (code, signal) => {
      if (code === 0) {
        res();
        return;
      }
      // `code` is null when the child was ended by a signal; name the signal
      // rather than reporting "exited null".
      const outcome =
        code === null ? `was terminated by ${signal}` : `exited ${code}`;
      rej(new Error(`apm install ${outcome}: ${stderr}`));
    });
  });
}
@@ -6,7 +6,7 @@
6
6
  *
7
7
  * Template variables available in `judge.task.md`:
8
8
  *
9
- * {{AGENT_INSTRUCTIONS}} — contents of instructions.md
9
+ * {{AGENT_INSTRUCTIONS}} — contents of agent.task.md
10
10
  * {{AGENT_PROFILE}} — agent profile body (empty string if none)
11
11
  * {{AGENT_TRACE_PATH}} — path to agent.ndjson
12
12
  * {{SCORING_RESULT}} — JSON scoring object
@@ -37,7 +37,7 @@ import { createRedactor } from "../redaction.js";
37
37
 
38
38
  /**
39
39
  * @typedef {object} JudgeContext
40
- * @property {string} agentInstructions - Contents of instructions.md.
40
+ * @property {string} agentInstructions - Contents of agent.task.md.
41
41
  * @property {string} agentProfile - Agent profile body (empty string if none).
42
42
  * @property {string} skillSetHash - SHA-256 fingerprint from apm.lock.yaml.
43
43
  */
@@ -47,7 +47,7 @@ import { createRedactor } from "../redaction.js";
47
47
  * @param {import("./task-family.js").Task} task
48
48
  * @param {import("./workdir.js").Workdir} workdir
49
49
  * @param {import("./scorer.js").ScoringResult} scoring
50
- * @param {{query: Function, model: string, judgeProfile?: string}} deps
50
+ * @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string}} deps
51
51
  * @param {JudgeContext} [context]
52
52
  * @returns {Promise<JudgeVerdict>}
53
53
  */
@@ -71,6 +71,7 @@ export async function runJudge(task, workdir, scoring, deps, context) {
71
71
  output,
72
72
  model: deps.model,
73
73
  judgeProfile: deps.judgeProfile,
74
+ profilesDir: deps.profilesDir,
74
75
  maxTurns: 25,
75
76
  redactor: createRedactor(),
76
77
  });
@@ -178,24 +178,46 @@ function renderFullReport(report, kValues) {
178
178
  function renderSummary(report) {
179
179
  const { totals } = report;
180
180
  const passing = report.tasks.filter((t) => t.c > 0 && t.c === t.n).length;
181
+ const icon = statusIcon(passing === totals.tasks);
181
182
  const lines = [
182
183
  "# Benchmark Report",
183
184
  "",
184
- `**Result: ${passing}/${totals.tasks} tasks passing** | ${totals.runs} runs${totals.skipped ? ` | ${totals.skipped} skipped` : ""}`,
185
+ `${icon} **${passing}/${totals.tasks} tasks passing** | ${totals.runs} runs${totals.skipped ? ` | ${totals.skipped} skipped` : ""}`,
185
186
  ];
187
+
188
+ const headers = [];
189
+ const values = [];
190
+ if (totals.costUsd != null) {
191
+ headers.push("Cost");
192
+ values.push(formatCost(totals.costUsd));
193
+ }
194
+ if (totals.medianDurationMs != null) {
195
+ headers.push("Median Duration");
196
+ values.push(formatDuration(totals.medianDurationMs));
197
+ }
198
+ if (totals.medianTurns != null) {
199
+ headers.push("Median Turns");
200
+ values.push(String(totals.medianTurns));
201
+ }
202
+ if (headers.length) {
203
+ lines.push("");
204
+ lines.push(`| ${headers.join(" | ")} |`);
205
+ lines.push(`| ${headers.map(() => "---").join(" | ")} |`);
206
+ lines.push(`| ${values.join(" | ")} |`);
207
+ }
208
+
186
209
  const meta = [];
187
- if (totals.model) meta.push(`Model: \`${totals.model}\``);
210
+ if (totals.model) {
211
+ meta.push(`Agent: \`${totals.model.agent}\``);
212
+ meta.push(`Supervisor: \`${totals.model.supervisor}\``);
213
+ meta.push(`Judge: \`${totals.model.judge}\``);
214
+ }
188
215
  if (totals.skillSetHash) meta.push(`Skill set: \`${totals.skillSetHash}\``);
189
216
  if (totals.familyRevision) meta.push(`Family: \`${totals.familyRevision}\``);
190
- if (meta.length) lines.push(meta.join(" | "));
191
-
192
- const stats = [];
193
- if (totals.costUsd != null) stats.push(`Cost: ${formatCost(totals.costUsd)}`);
194
- if (totals.medianDurationMs != null)
195
- stats.push(`Median duration: ${formatDuration(totals.medianDurationMs)}`);
196
- if (totals.medianTurns != null)
197
- stats.push(`Median turns: ${totals.medianTurns}`);
198
- if (stats.length) lines.push(stats.join(" | "));
217
+ if (meta.length) {
218
+ lines.push("");
219
+ lines.push(meta.join(" | "));
220
+ }
199
221
 
200
222
  lines.push("");
201
223
  return lines.join("\n");
@@ -229,13 +251,13 @@ function renderTotalsLine(report) {
229
251
 
230
252
  function renderTaskDetail(task) {
231
253
  const runs = task.runs ?? [];
232
- const status = task.c === task.n ? "PASS" : "FAIL";
254
+ const icon = statusIcon(task.c === task.n);
233
255
  const singleRun = runs.length === 1;
234
256
 
235
257
  const lines = [
236
258
  `### ${task.taskId}`,
237
259
  "",
238
- `**${status} — ${task.c}/${task.n} runs passed**`,
260
+ `${icon} **${task.c}/${task.n} runs passed**`,
239
261
  ];
240
262
 
241
263
  lines.push("", renderRunsTable(runs));
@@ -267,16 +289,16 @@ function renderRunsTable(runs) {
267
289
  const scoringCell = r.preflightError
268
290
  ? "preflight error"
269
291
  : r.scoring
270
- ? r.scoring.verdict
292
+ ? statusIcon(r.scoring.verdict === "pass")
271
293
  : "—";
272
294
  const judgeCell = r.preflightError
273
295
  ? "—"
274
296
  : r.judgeVerdict
275
- ? r.judgeVerdict.verdict
297
+ ? statusIcon(r.judgeVerdict.verdict === "pass")
276
298
  : "—";
277
299
  rows.push([
278
300
  String(r.runIndex),
279
- r.verdict.toUpperCase(),
301
+ statusIcon(r.verdict === "pass"),
280
302
  scoringCell,
281
303
  judgeCell,
282
304
  formatCost(r.costUsd),
@@ -317,7 +339,7 @@ function collectScoringRows(runs) {
317
339
  rows.push({
318
340
  run: r.runIndex,
319
341
  check: escapeCell(String(d.test ?? "(unnamed)")),
320
- result: d.pass ? "PASS" : "FAIL",
342
+ result: statusIcon(d.pass),
321
343
  message: escapeCell(String(d.message ?? "")),
322
344
  });
323
345
  }
@@ -365,6 +387,10 @@ function renderErrors(runs) {
365
387
  // Formatting helpers
366
388
  // ---------------------------------------------------------------------------
367
389
 
390
/**
 * Maps a pass/fail flag to the icon used in the rendered report.
 *
 * @param {*} pass - Treated as a boolean; any truthy value counts as a pass.
 * @returns {string} "✅" when `pass` is truthy, otherwise "❌".
 */
function statusIcon(pass) {
  return pass ? "✅" : "❌";
}
393
+
368
394
  function formatPassAt(v) {
369
395
  if (v == null) return "—";
370
396
  if (typeof v === "object" && "error" in v) return v.error;
@@ -46,7 +46,11 @@ const COMMON_FIELDS = {
46
46
  costUsd: z.number(),
47
47
  turns: z.number().int().min(0),
48
48
  profiles: PROFILES_SHAPE,
49
- model: z.string(),
49
+ model: z.object({
50
+ agent: z.string(),
51
+ supervisor: z.string(),
52
+ judge: z.string(),
53
+ }),
50
54
  skillSetHash: z.string(),
51
55
  familyRevision: z.string(),
52
56
  durationMs: z.number().int().min(0),
@@ -63,6 +67,7 @@ const HAPPY_RECORD = z.object({
63
67
  submission: z.string(),
64
68
  judgeVerdict: JUDGE_VERDICT_SHAPE,
65
69
  agentTracePath: z.string(),
70
+ supervisorTracePath: z.string(),
66
71
  judgeTracePath: z.string(),
67
72
  agentError: AGENT_ERROR_SHAPE.optional(),
68
73
  preflightError: z.undefined().optional(),
@@ -76,6 +81,7 @@ const PREFLIGHT_RECORD = z.object({
76
81
  // them in WorkdirManager.start) so the record is uniform across branches
77
82
  // and downstream consumers can reference them without conditional fields.
78
83
  agentTracePath: z.string(),
84
+ supervisorTracePath: z.string(),
79
85
  judgeTracePath: z.string(),
80
86
  scoring: z.undefined().optional(),
81
87
  submission: z.undefined().optional(),