@forwardimpact/libeval 0.1.35 → 0.1.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +27 -7
- package/bin/fit-eval.js +24 -3
- package/bin/fit-trace.js +42 -0
- package/package.json +2 -1
- package/src/benchmark/apm-installer.js +56 -10
- package/src/benchmark/judge.js +35 -8
- package/src/benchmark/report.js +364 -17
- package/src/benchmark/result.js +7 -1
- package/src/benchmark/runner.js +149 -79
- package/src/benchmark/scorer.js +2 -5
- package/src/benchmark/task-family.js +14 -47
- package/src/benchmark/workdir.js +7 -6
- package/src/commands/assert.js +145 -0
- package/src/commands/benchmark-report.js +6 -3
- package/src/commands/benchmark-run.js +5 -4
- package/src/commands/facilitate.js +4 -2
- package/src/commands/run.js +3 -3
- package/src/commands/supervise.js +5 -2
- package/src/facilitator.js +7 -3
- package/src/supervisor.js +47 -14
package/bin/fit-benchmark.js
CHANGED
|
@@ -34,15 +34,26 @@ export const definition = {
|
|
|
34
34
|
},
|
|
35
35
|
output: {
|
|
36
36
|
type: "string",
|
|
37
|
-
description:
|
|
37
|
+
description:
|
|
38
|
+
"Run-output directory (created if missing, default: benchmark-runs)",
|
|
38
39
|
},
|
|
39
40
|
runs: {
|
|
40
41
|
type: "string",
|
|
41
|
-
description: "Runs per task (integer ≥ 1, default
|
|
42
|
+
description: "Runs per task (integer ≥ 1, default: 5)",
|
|
43
|
+
},
|
|
44
|
+
"agent-model": {
|
|
45
|
+
type: "string",
|
|
46
|
+
description:
|
|
47
|
+
"Claude model for the agent-under-test (default: claude-sonnet-4-6)",
|
|
42
48
|
},
|
|
43
|
-
model: {
|
|
49
|
+
"supervisor-model": {
|
|
44
50
|
type: "string",
|
|
45
|
-
description:
|
|
51
|
+
description:
|
|
52
|
+
"Claude model for the supervisor (default: claude-opus-4-7)",
|
|
53
|
+
},
|
|
54
|
+
"judge-model": {
|
|
55
|
+
type: "string",
|
|
56
|
+
description: "Claude model for the judge (default: claude-opus-4-7)",
|
|
46
57
|
},
|
|
47
58
|
"agent-profile": {
|
|
48
59
|
type: "string",
|
|
@@ -92,7 +103,8 @@ export const definition = {
|
|
|
92
103
|
options: {
|
|
93
104
|
input: {
|
|
94
105
|
type: "string",
|
|
95
|
-
description:
|
|
106
|
+
description:
|
|
107
|
+
"Run-output directory containing results.jsonl (default: benchmark-runs)",
|
|
96
108
|
},
|
|
97
109
|
k: {
|
|
98
110
|
type: "string",
|
|
@@ -111,8 +123,10 @@ export const definition = {
|
|
|
111
123
|
json: { type: "boolean", description: "Output help as JSON" },
|
|
112
124
|
},
|
|
113
125
|
examples: [
|
|
114
|
-
"fit-benchmark run --family=./families/coding
|
|
115
|
-
"fit-benchmark
|
|
126
|
+
"fit-benchmark run --family=./families/coding",
|
|
127
|
+
"fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
|
|
128
|
+
"fit-benchmark score --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
|
|
129
|
+
"fit-benchmark report --format=text",
|
|
116
130
|
"fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
|
|
117
131
|
],
|
|
118
132
|
documentation: [
|
|
@@ -122,6 +136,12 @@ export const definition = {
|
|
|
122
136
|
description:
|
|
123
137
|
"Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
|
|
124
138
|
},
|
|
139
|
+
{
|
|
140
|
+
title: "Automate with GitHub Actions",
|
|
141
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/ci-workflow/index.md",
|
|
142
|
+
description:
|
|
143
|
+
"Run benchmarks in CI with the forwardimpact/fit-benchmark action.",
|
|
144
|
+
},
|
|
125
145
|
],
|
|
126
146
|
};
|
|
127
147
|
|
package/bin/fit-eval.js
CHANGED
|
@@ -41,7 +41,11 @@ const definition = {
|
|
|
41
41
|
type: "string",
|
|
42
42
|
description: "Additional text appended to the task",
|
|
43
43
|
},
|
|
44
|
-
model: {
|
|
44
|
+
"agent-model": {
|
|
45
|
+
type: "string",
|
|
46
|
+
description:
|
|
47
|
+
"Claude model for the agent (default: claude-opus-4-7[1m])",
|
|
48
|
+
},
|
|
45
49
|
"max-turns": {
|
|
46
50
|
type: "string",
|
|
47
51
|
description: "Max agentic turns (default: 50, 0 = unlimited)",
|
|
@@ -84,7 +88,16 @@ const definition = {
|
|
|
84
88
|
type: "string",
|
|
85
89
|
description: "Additional text appended to the task",
|
|
86
90
|
},
|
|
87
|
-
model: {
|
|
91
|
+
"agent-model": {
|
|
92
|
+
type: "string",
|
|
93
|
+
description:
|
|
94
|
+
"Claude model for the agent (default: claude-opus-4-7[1m])",
|
|
95
|
+
},
|
|
96
|
+
"supervisor-model": {
|
|
97
|
+
type: "string",
|
|
98
|
+
description:
|
|
99
|
+
"Claude model for the supervisor (default: claude-opus-4-7[1m])",
|
|
100
|
+
},
|
|
88
101
|
"max-turns": {
|
|
89
102
|
type: "string",
|
|
90
103
|
description: "Max agentic turns (default: 20, 0 = unlimited)",
|
|
@@ -136,7 +149,15 @@ const definition = {
|
|
|
136
149
|
type: "string",
|
|
137
150
|
description: "Additional text appended to the task",
|
|
138
151
|
},
|
|
139
|
-
model: {
|
|
152
|
+
"agent-model": {
|
|
153
|
+
type: "string",
|
|
154
|
+
description: "Claude model for agents (default: claude-opus-4-7[1m])",
|
|
155
|
+
},
|
|
156
|
+
"facilitator-model": {
|
|
157
|
+
type: "string",
|
|
158
|
+
description:
|
|
159
|
+
"Claude model for the facilitator (default: claude-opus-4-7[1m])",
|
|
160
|
+
},
|
|
140
161
|
"max-turns": {
|
|
141
162
|
type: "string",
|
|
142
163
|
description: "Max agentic turns (default: 20, 0 = unlimited)",
|
package/bin/fit-trace.js
CHANGED
|
@@ -25,6 +25,7 @@ import {
|
|
|
25
25
|
runFilterCommand,
|
|
26
26
|
runSplitCommand,
|
|
27
27
|
} from "../src/commands/trace.js";
|
|
28
|
+
import { runAssertCommand } from "../src/commands/assert.js";
|
|
28
29
|
|
|
29
30
|
// `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
|
|
30
31
|
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
@@ -199,6 +200,41 @@ const definition = {
|
|
|
199
200
|
},
|
|
200
201
|
},
|
|
201
202
|
},
|
|
203
|
+
{
|
|
204
|
+
name: "assert",
|
|
205
|
+
args: "<test-name> <file>",
|
|
206
|
+
description:
|
|
207
|
+
"Shell-friendly assertion — outputs structured JSON for scoring hooks",
|
|
208
|
+
options: {
|
|
209
|
+
grep: {
|
|
210
|
+
type: "string",
|
|
211
|
+
description:
|
|
212
|
+
"Pass if extended regex matches file content (case-insensitive)",
|
|
213
|
+
},
|
|
214
|
+
query: {
|
|
215
|
+
type: "string",
|
|
216
|
+
description:
|
|
217
|
+
"Pass if JMESPath expression against JSON/NDJSON yields a truthy result",
|
|
218
|
+
},
|
|
219
|
+
exists: {
|
|
220
|
+
type: "boolean",
|
|
221
|
+
description: "Pass if file exists",
|
|
222
|
+
},
|
|
223
|
+
"cites-job": {
|
|
224
|
+
type: "string",
|
|
225
|
+
description:
|
|
226
|
+
"Pass if <file> contains the canonical citation from a <job> tag in the given JTBD file",
|
|
227
|
+
},
|
|
228
|
+
not: {
|
|
229
|
+
type: "boolean",
|
|
230
|
+
description: "Invert the assertion",
|
|
231
|
+
},
|
|
232
|
+
message: {
|
|
233
|
+
type: "string",
|
|
234
|
+
description: "Custom failure message",
|
|
235
|
+
},
|
|
236
|
+
},
|
|
237
|
+
},
|
|
202
238
|
],
|
|
203
239
|
globalOptions: {
|
|
204
240
|
help: { type: "boolean", short: "h", description: "Show this help" },
|
|
@@ -220,6 +256,11 @@ const definition = {
|
|
|
220
256
|
"fit-trace search structured.json 'error|fail' --context 1",
|
|
221
257
|
"fit-trace filter structured.json --tool Bash --error",
|
|
222
258
|
"fit-trace turn structured.json 3",
|
|
259
|
+
"fit-trace assert has-heading --grep '^## Problem' spec.md",
|
|
260
|
+
"fit-trace assert no-leak --not --grep 'password' output.log",
|
|
261
|
+
"fit-trace assert file-present --exists path/to/spec.md",
|
|
262
|
+
"fit-trace assert cites-jtbd --cites-job jtbd-excerpt.md spec.md",
|
|
263
|
+
"fit-trace assert used-edit --query \"[?type=='assistant'].message.content[] | [?name=='Edit']\" trace.ndjson",
|
|
223
264
|
],
|
|
224
265
|
documentation: [
|
|
225
266
|
{
|
|
@@ -265,6 +306,7 @@ const COMMANDS = {
|
|
|
265
306
|
turn: runTurnCommand,
|
|
266
307
|
filter: runFilterCommand,
|
|
267
308
|
split: runSplitCommand,
|
|
309
|
+
assert: runAssertCommand,
|
|
268
310
|
};
|
|
269
311
|
|
|
270
312
|
async function main() {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@forwardimpact/libeval",
|
|
3
|
-
"version": "0.1.35",
|
|
3
|
+
"version": "0.1.38",
|
|
4
4
|
"description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"eval",
|
|
@@ -53,6 +53,7 @@
|
|
|
53
53
|
"@forwardimpact/libcli": "^0.1.0",
|
|
54
54
|
"@forwardimpact/libconfig": "^0.1.0",
|
|
55
55
|
"@forwardimpact/libtelemetry": "^0.1.22",
|
|
56
|
+
"jmespath": "^0.16.0",
|
|
56
57
|
"zod": "^4.4.3"
|
|
57
58
|
},
|
|
58
59
|
"devDependencies": {
|
|
package/src/benchmark/apm-installer.js
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* ApmInstaller —
|
|
3
|
-
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
* v1 trusts the family's checked-in `.claude/` (P1); the lockfile is hashed
|
|
7
|
-
* verbatim, not interpreted.
|
|
2
|
+
* ApmInstaller — runs `apm install --target claude` in the family root to
|
|
3
|
+
* materialise skills and agents, copies the resulting `.claude/` into a
|
|
4
|
+
* staging directory, and computes the manifest fingerprint from the lockfile.
|
|
5
|
+
* Per-task copy happens later in WorkdirManager.
|
|
8
6
|
*/
|
|
9
7
|
|
|
8
|
+
import { spawn } from "node:child_process";
|
|
10
9
|
import { createHash } from "node:crypto";
|
|
11
|
-
import { access, cp, rm } from "node:fs/promises";
|
|
10
|
+
import { access, cp, mkdir, readFile, rm } from "node:fs/promises";
|
|
12
11
|
import { join } from "node:path";
|
|
13
12
|
|
|
14
13
|
/**
|
|
@@ -21,19 +20,66 @@ export async function installApm(family, outputDir) {
|
|
|
21
20
|
const stagedClaude = join(stagingDir, ".claude");
|
|
22
21
|
const sourceClaude = join(family.rootPath, ".claude");
|
|
23
22
|
|
|
23
|
+
await runApmInstall(family.rootPath);
|
|
24
|
+
|
|
24
25
|
try {
|
|
25
26
|
await access(sourceClaude);
|
|
26
27
|
} catch {
|
|
27
28
|
throw new Error(
|
|
28
|
-
`
|
|
29
|
+
`apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
|
|
29
30
|
);
|
|
30
31
|
}
|
|
31
32
|
|
|
32
33
|
await rm(stagingDir, { recursive: true, force: true });
|
|
33
34
|
await cp(sourceClaude, stagedClaude, { recursive: true });
|
|
34
35
|
|
|
36
|
+
// Stage the family-local judge profile outside .claude/ so it is available
|
|
37
|
+
// to the judge but never copied into the agent-under-test's CWD.
|
|
38
|
+
const judgeSource = join(family.rootPath, "judge.md");
|
|
39
|
+
const judgeProfilesDir = join(stagingDir, "judge-profiles");
|
|
40
|
+
try {
|
|
41
|
+
await access(judgeSource);
|
|
42
|
+
await mkdir(judgeProfilesDir, { recursive: true });
|
|
43
|
+
await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
|
|
44
|
+
} catch {}
|
|
45
|
+
|
|
46
|
+
const lockPath = join(family.rootPath, "apm.lock.yaml");
|
|
47
|
+
const lockBytes = await readFile(lockPath).catch(() => {
|
|
48
|
+
throw new Error(`apm install did not produce apm.lock.yaml at ${lockPath}`);
|
|
49
|
+
});
|
|
35
50
|
const skillSetHash =
|
|
36
|
-
"sha256:" +
|
|
51
|
+
"sha256:" +
|
|
52
|
+
createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
|
|
53
|
+
|
|
54
|
+
return { stagingDir, skillSetHash, judgeProfilesDir };
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
function normalizeLf(buf) {
|
|
58
|
+
const out = [];
|
|
59
|
+
for (let i = 0; i < buf.length; i++) {
|
|
60
|
+
if (buf[i] === 0x0d && i + 1 < buf.length && buf[i + 1] === 0x0a) continue;
|
|
61
|
+
out.push(buf[i]);
|
|
62
|
+
}
|
|
63
|
+
return Buffer.from(out);
|
|
64
|
+
}
|
|
37
65
|
|
|
38
|
-
|
|
66
|
+
function runApmInstall(cwd) {
|
|
67
|
+
return new Promise((res, rej) => {
|
|
68
|
+
const child = spawn("apm", ["install", "--target", "claude"], {
|
|
69
|
+
cwd,
|
|
70
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
71
|
+
});
|
|
72
|
+
let stderr = "";
|
|
73
|
+
child.stdout.on("data", () => {});
|
|
74
|
+
child.stderr.on("data", (d) => {
|
|
75
|
+
stderr += d.toString();
|
|
76
|
+
});
|
|
77
|
+
child.on("error", (e) => {
|
|
78
|
+
rej(new Error(`failed to spawn apm: ${e.message}`));
|
|
79
|
+
});
|
|
80
|
+
child.on("close", (code) => {
|
|
81
|
+
if (code === 0) res();
|
|
82
|
+
else rej(new Error(`apm install exited ${code}: ${stderr}`));
|
|
83
|
+
});
|
|
84
|
+
});
|
|
39
85
|
}
|
package/src/benchmark/judge.js
CHANGED
|
@@ -1,9 +1,20 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Benchmark adapter for the libeval `Judge`. Templates the family's
|
|
3
|
-
* `judge.task.md`
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
3
|
+
* `judge.task.md` with structured context variables, runs the judge against
|
|
4
|
+
* the post-run agent CWD, and returns the verdict in the benchmark's
|
|
5
|
+
* `pass`/`fail` vocabulary (mapped from libeval's `success`/`failure`).
|
|
6
|
+
*
|
|
7
|
+
* Template variables available in `judge.task.md`:
|
|
8
|
+
*
|
|
9
|
+
* {{AGENT_INSTRUCTIONS}} — contents of agent.task.md
|
|
10
|
+
* {{AGENT_PROFILE}} — agent profile body (empty string if none)
|
|
11
|
+
* {{AGENT_TRACE_PATH}} — path to agent.ndjson
|
|
12
|
+
* {{SCORING_RESULT}} — JSON scoring object
|
|
13
|
+
* {{SKILL_SET_HASH}} — SHA-256 from apm.lock.yaml
|
|
14
|
+
* {{TASK_ID}} — task name (directory under tasks/)
|
|
15
|
+
* {{TASK_DIR}} — agent working directory path
|
|
16
|
+
*
|
|
17
|
+
* Legacy alias: {{SCORING}} is accepted as an alias for {{SCORING_RESULT}}.
|
|
7
18
|
*
|
|
8
19
|
* The judge verdict is captured from the orchestration context's
|
|
9
20
|
* `concluded` flag directly — no trace parsing on the happy path.
|
|
@@ -24,19 +35,34 @@ import { createRedactor } from "../redaction.js";
|
|
|
24
35
|
* @property {string} summary
|
|
25
36
|
*/
|
|
26
37
|
|
|
38
|
+
/**
|
|
39
|
+
* @typedef {object} JudgeContext
|
|
40
|
+
* @property {string} agentInstructions - Contents of agent.task.md.
|
|
41
|
+
* @property {string} agentProfile - Agent profile body (empty string if none).
|
|
42
|
+
* @property {string} skillSetHash - SHA-256 fingerprint from apm.lock.yaml.
|
|
43
|
+
*/
|
|
44
|
+
|
|
27
45
|
/**
|
|
28
46
|
* Run the judge over a completed task run.
|
|
29
47
|
* @param {import("./task-family.js").Task} task
|
|
30
48
|
* @param {import("./workdir.js").Workdir} workdir
|
|
31
49
|
* @param {import("./scorer.js").ScoringResult} scoring
|
|
32
|
-
* @param {{query: Function, model: string, judgeProfile?: string}} deps
|
|
50
|
+
* @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string}} deps
|
|
51
|
+
* @param {JudgeContext} [context]
|
|
33
52
|
* @returns {Promise<JudgeVerdict>}
|
|
34
53
|
*/
|
|
35
|
-
export async function runJudge(task, workdir, scoring, deps) {
|
|
54
|
+
export async function runJudge(task, workdir, scoring, deps, context) {
|
|
36
55
|
const template = await readFile(task.paths.judge, "utf8");
|
|
56
|
+
const scoringJson = JSON.stringify(scoring, null, 2);
|
|
37
57
|
const taskText = template
|
|
38
|
-
.replaceAll("{{
|
|
39
|
-
.replaceAll("{{
|
|
58
|
+
.replaceAll("{{SCORING_RESULT}}", scoringJson)
|
|
59
|
+
.replaceAll("{{SCORING}}", scoringJson)
|
|
60
|
+
.replaceAll("{{AGENT_TRACE_PATH}}", workdir.agentTracePath)
|
|
61
|
+
.replaceAll("{{AGENT_INSTRUCTIONS}}", context?.agentInstructions ?? "")
|
|
62
|
+
.replaceAll("{{AGENT_PROFILE}}", context?.agentProfile ?? "")
|
|
63
|
+
.replaceAll("{{SKILL_SET_HASH}}", context?.skillSetHash ?? "")
|
|
64
|
+
.replaceAll("{{TASK_ID}}", task.id)
|
|
65
|
+
.replaceAll("{{TASK_DIR}}", workdir.cwd);
|
|
40
66
|
|
|
41
67
|
const output = createWriteStream(workdir.judgeTracePath);
|
|
42
68
|
const judge = createJudge({
|
|
@@ -45,6 +71,7 @@ export async function runJudge(task, workdir, scoring, deps) {
|
|
|
45
71
|
output,
|
|
46
72
|
model: deps.model,
|
|
47
73
|
judgeProfile: deps.judgeProfile,
|
|
74
|
+
profilesDir: deps.profilesDir,
|
|
48
75
|
maxTurns: 25,
|
|
49
76
|
redactor: createRedactor(),
|
|
50
77
|
});
|