@forwardimpact/libeval 0.1.36 → 0.1.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +32 -7
- package/bin/fit-eval.js +24 -3
- package/bin/fit-trace.js +42 -0
- package/package.json +2 -1
- package/src/benchmark/apm-installer.js +78 -16
- package/src/benchmark/env-loader.js +146 -0
- package/src/benchmark/judge.js +4 -3
- package/src/benchmark/report.js +43 -17
- package/src/benchmark/result.js +9 -3
- package/src/benchmark/runner.js +164 -117
- package/src/benchmark/scorer.js +5 -5
- package/src/benchmark/task-family.js +43 -50
- package/src/benchmark/workdir.js +21 -8
- package/src/commands/assert.js +145 -0
- package/src/commands/benchmark-report.js +1 -2
- package/src/commands/benchmark-run.js +11 -4
- package/src/commands/facilitate.js +4 -2
- package/src/commands/run.js +3 -3
- package/src/commands/supervise.js +5 -2
- package/src/facilitator.js +7 -3
- package/src/supervisor.js +42 -12
package/bin/fit-benchmark.js
CHANGED
|
@@ -34,15 +34,26 @@ export const definition = {
|
|
|
34
34
|
},
|
|
35
35
|
output: {
|
|
36
36
|
type: "string",
|
|
37
|
-
description:
|
|
37
|
+
description:
|
|
38
|
+
"Run-output directory (created if missing, default: benchmark-runs)",
|
|
38
39
|
},
|
|
39
40
|
runs: {
|
|
40
41
|
type: "string",
|
|
41
|
-
description: "Runs per task (integer ≥ 1, default
|
|
42
|
+
description: "Runs per task (integer ≥ 1, default: 5)",
|
|
43
|
+
},
|
|
44
|
+
"agent-model": {
|
|
45
|
+
type: "string",
|
|
46
|
+
description:
|
|
47
|
+
"Claude model for the agent-under-test (default: claude-sonnet-4-6)",
|
|
48
|
+
},
|
|
49
|
+
"supervisor-model": {
|
|
50
|
+
type: "string",
|
|
51
|
+
description:
|
|
52
|
+
"Claude model for the supervisor (default: claude-opus-4-7)",
|
|
42
53
|
},
|
|
43
|
-
model: {
|
|
54
|
+
"judge-model": {
|
|
44
55
|
type: "string",
|
|
45
|
-
description: "Claude model
|
|
56
|
+
description: "Claude model for the judge (default: claude-opus-4-7)",
|
|
46
57
|
},
|
|
47
58
|
"agent-profile": {
|
|
48
59
|
type: "string",
|
|
@@ -57,6 +68,11 @@ export const definition = {
|
|
|
57
68
|
description:
|
|
58
69
|
"Agent-under-test turn budget (default: 50, 0 = unlimited)",
|
|
59
70
|
},
|
|
71
|
+
"allowed-tools": {
|
|
72
|
+
type: "string",
|
|
73
|
+
description:
|
|
74
|
+
"Comma-separated tool allowlist for the agent-under-test (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)",
|
|
75
|
+
},
|
|
60
76
|
},
|
|
61
77
|
},
|
|
62
78
|
{
|
|
@@ -92,7 +108,8 @@ export const definition = {
|
|
|
92
108
|
options: {
|
|
93
109
|
input: {
|
|
94
110
|
type: "string",
|
|
95
|
-
description:
|
|
111
|
+
description:
|
|
112
|
+
"Run-output directory containing results.jsonl (default: benchmark-runs)",
|
|
96
113
|
},
|
|
97
114
|
k: {
|
|
98
115
|
type: "string",
|
|
@@ -111,8 +128,10 @@ export const definition = {
|
|
|
111
128
|
json: { type: "boolean", description: "Output help as JSON" },
|
|
112
129
|
},
|
|
113
130
|
examples: [
|
|
114
|
-
"fit-benchmark run --family=./families/coding
|
|
115
|
-
"fit-benchmark
|
|
131
|
+
"fit-benchmark run --family=./families/coding",
|
|
132
|
+
"fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
|
|
133
|
+
"fit-benchmark score --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
|
|
134
|
+
"fit-benchmark report --format=text",
|
|
116
135
|
"fit-benchmark report --input=./runs/2026-05-11 --k=1,3,5 --format=text",
|
|
117
136
|
],
|
|
118
137
|
documentation: [
|
|
@@ -122,6 +141,12 @@ export const definition = {
|
|
|
122
141
|
description:
|
|
123
142
|
"Author a coding-task family, run a benchmark across multiple runs, and read the pass@k report.",
|
|
124
143
|
},
|
|
144
|
+
{
|
|
145
|
+
title: "Automate with GitHub Actions",
|
|
146
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-benchmark/ci-workflow/index.md",
|
|
147
|
+
description:
|
|
148
|
+
"Run benchmarks in CI with the forwardimpact/fit-benchmark action.",
|
|
149
|
+
},
|
|
125
150
|
],
|
|
126
151
|
};
|
|
127
152
|
|
package/bin/fit-eval.js
CHANGED
|
@@ -41,7 +41,11 @@ const definition = {
|
|
|
41
41
|
type: "string",
|
|
42
42
|
description: "Additional text appended to the task",
|
|
43
43
|
},
|
|
44
|
-
model: {
|
|
44
|
+
"agent-model": {
|
|
45
|
+
type: "string",
|
|
46
|
+
description:
|
|
47
|
+
"Claude model for the agent (default: claude-opus-4-7[1m])",
|
|
48
|
+
},
|
|
45
49
|
"max-turns": {
|
|
46
50
|
type: "string",
|
|
47
51
|
description: "Max agentic turns (default: 50, 0 = unlimited)",
|
|
@@ -84,7 +88,16 @@ const definition = {
|
|
|
84
88
|
type: "string",
|
|
85
89
|
description: "Additional text appended to the task",
|
|
86
90
|
},
|
|
87
|
-
model: {
|
|
91
|
+
"agent-model": {
|
|
92
|
+
type: "string",
|
|
93
|
+
description:
|
|
94
|
+
"Claude model for the agent (default: claude-opus-4-7[1m])",
|
|
95
|
+
},
|
|
96
|
+
"supervisor-model": {
|
|
97
|
+
type: "string",
|
|
98
|
+
description:
|
|
99
|
+
"Claude model for the supervisor (default: claude-opus-4-7[1m])",
|
|
100
|
+
},
|
|
88
101
|
"max-turns": {
|
|
89
102
|
type: "string",
|
|
90
103
|
description: "Max agentic turns (default: 20, 0 = unlimited)",
|
|
@@ -136,7 +149,15 @@ const definition = {
|
|
|
136
149
|
type: "string",
|
|
137
150
|
description: "Additional text appended to the task",
|
|
138
151
|
},
|
|
139
|
-
model: {
|
|
152
|
+
"agent-model": {
|
|
153
|
+
type: "string",
|
|
154
|
+
description: "Claude model for agents (default: claude-opus-4-7[1m])",
|
|
155
|
+
},
|
|
156
|
+
"facilitator-model": {
|
|
157
|
+
type: "string",
|
|
158
|
+
description:
|
|
159
|
+
"Claude model for the facilitator (default: claude-opus-4-7[1m])",
|
|
160
|
+
},
|
|
140
161
|
"max-turns": {
|
|
141
162
|
type: "string",
|
|
142
163
|
description: "Max agentic turns (default: 20, 0 = unlimited)",
|
package/bin/fit-trace.js
CHANGED
|
@@ -25,6 +25,7 @@ import {
|
|
|
25
25
|
runFilterCommand,
|
|
26
26
|
runSplitCommand,
|
|
27
27
|
} from "../src/commands/trace.js";
|
|
28
|
+
import { runAssertCommand } from "../src/commands/assert.js";
|
|
28
29
|
|
|
29
30
|
// `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
|
|
30
31
|
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
@@ -199,6 +200,41 @@ const definition = {
|
|
|
199
200
|
},
|
|
200
201
|
},
|
|
201
202
|
},
|
|
203
|
+
{
|
|
204
|
+
name: "assert",
|
|
205
|
+
args: "<test-name> <file>",
|
|
206
|
+
description:
|
|
207
|
+
"Shell-friendly assertion — outputs structured JSON for scoring hooks",
|
|
208
|
+
options: {
|
|
209
|
+
grep: {
|
|
210
|
+
type: "string",
|
|
211
|
+
description:
|
|
212
|
+
"Pass if extended regex matches file content (case-insensitive)",
|
|
213
|
+
},
|
|
214
|
+
query: {
|
|
215
|
+
type: "string",
|
|
216
|
+
description:
|
|
217
|
+
"Pass if JMESPath expression against JSON/NDJSON yields a truthy result",
|
|
218
|
+
},
|
|
219
|
+
exists: {
|
|
220
|
+
type: "boolean",
|
|
221
|
+
description: "Pass if file exists",
|
|
222
|
+
},
|
|
223
|
+
"cites-job": {
|
|
224
|
+
type: "string",
|
|
225
|
+
description:
|
|
226
|
+
"Pass if <file> contains the canonical citation from a <job> tag in the given JTBD file",
|
|
227
|
+
},
|
|
228
|
+
not: {
|
|
229
|
+
type: "boolean",
|
|
230
|
+
description: "Invert the assertion",
|
|
231
|
+
},
|
|
232
|
+
message: {
|
|
233
|
+
type: "string",
|
|
234
|
+
description: "Custom failure message",
|
|
235
|
+
},
|
|
236
|
+
},
|
|
237
|
+
},
|
|
202
238
|
],
|
|
203
239
|
globalOptions: {
|
|
204
240
|
help: { type: "boolean", short: "h", description: "Show this help" },
|
|
@@ -220,6 +256,11 @@ const definition = {
|
|
|
220
256
|
"fit-trace search structured.json 'error|fail' --context 1",
|
|
221
257
|
"fit-trace filter structured.json --tool Bash --error",
|
|
222
258
|
"fit-trace turn structured.json 3",
|
|
259
|
+
"fit-trace assert has-heading --grep '^## Problem' spec.md",
|
|
260
|
+
"fit-trace assert no-leak --not --grep 'password' output.log",
|
|
261
|
+
"fit-trace assert file-present --exists path/to/spec.md",
|
|
262
|
+
"fit-trace assert cites-jtbd --cites-job jtbd-excerpt.md spec.md",
|
|
263
|
+
"fit-trace assert used-edit --query \"[?type=='assistant'].message.content[] | [?name=='Edit']\" trace.ndjson",
|
|
223
264
|
],
|
|
224
265
|
documentation: [
|
|
225
266
|
{
|
|
@@ -265,6 +306,7 @@ const COMMANDS = {
|
|
|
265
306
|
turn: runTurnCommand,
|
|
266
307
|
filter: runFilterCommand,
|
|
267
308
|
split: runSplitCommand,
|
|
309
|
+
assert: runAssertCommand,
|
|
268
310
|
};
|
|
269
311
|
|
|
270
312
|
async function main() {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@forwardimpact/libeval",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.39",
|
|
4
4
|
"description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"eval",
|
|
@@ -53,6 +53,7 @@
|
|
|
53
53
|
"@forwardimpact/libcli": "^0.1.0",
|
|
54
54
|
"@forwardimpact/libconfig": "^0.1.0",
|
|
55
55
|
"@forwardimpact/libtelemetry": "^0.1.22",
|
|
56
|
+
"jmespath": "^0.16.0",
|
|
56
57
|
"zod": "^4.4.3"
|
|
57
58
|
},
|
|
58
59
|
"devDependencies": {
|
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* ApmInstaller —
|
|
3
|
-
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
* v1 trusts the family's checked-in `.claude/` (P1); the lockfile is hashed
|
|
7
|
-
* verbatim, not interpreted.
|
|
2
|
+
* ApmInstaller — runs `apm install --target claude` in the family root to
|
|
3
|
+
* materialise skills and agents, copies the resulting `.claude/` into a
|
|
4
|
+
* staging directory, and computes the manifest fingerprint from the lockfile.
|
|
5
|
+
* Per-task copy happens later in WorkdirManager.
|
|
8
6
|
*/
|
|
9
7
|
|
|
8
|
+
import { spawn } from "node:child_process";
|
|
10
9
|
import { createHash } from "node:crypto";
|
|
11
|
-
import { access, cp, rm } from "node:fs/promises";
|
|
10
|
+
import { access, cp, mkdir, readFile, rm } from "node:fs/promises";
|
|
12
11
|
import { join } from "node:path";
|
|
13
12
|
|
|
14
13
|
/**
|
|
@@ -20,20 +19,83 @@ export async function installApm(family, outputDir) {
|
|
|
20
19
|
const stagingDir = join(outputDir, ".apm-staging");
|
|
21
20
|
const stagedClaude = join(stagingDir, ".claude");
|
|
22
21
|
const sourceClaude = join(family.rootPath, ".claude");
|
|
22
|
+
const apmYml = join(family.rootPath, "apm.yml");
|
|
23
23
|
|
|
24
|
+
const hasApm = await access(apmYml)
|
|
25
|
+
.then(() => true)
|
|
26
|
+
.catch(() => false);
|
|
27
|
+
|
|
28
|
+
if (hasApm) {
|
|
29
|
+
await runApmInstall(family.rootPath);
|
|
30
|
+
try {
|
|
31
|
+
await access(sourceClaude);
|
|
32
|
+
} catch {
|
|
33
|
+
throw new Error(
|
|
34
|
+
`apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
|
|
35
|
+
);
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
await rm(stagingDir, { recursive: true, force: true });
|
|
40
|
+
const hasClaudeDir = await access(sourceClaude)
|
|
41
|
+
.then(() => true)
|
|
42
|
+
.catch(() => false);
|
|
43
|
+
if (hasClaudeDir) {
|
|
44
|
+
await cp(sourceClaude, stagedClaude, { recursive: true });
|
|
45
|
+
} else {
|
|
46
|
+
await mkdir(stagedClaude, { recursive: true });
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
// Stage the family-local judge profile outside .claude/ so it is available
|
|
50
|
+
// to the judge but never copied into the agent-under-test's CWD.
|
|
51
|
+
const judgeSource = join(family.rootPath, "judge.md");
|
|
52
|
+
const judgeProfilesDir = join(stagingDir, "judge-profiles");
|
|
24
53
|
try {
|
|
25
|
-
await access(
|
|
54
|
+
await access(judgeSource);
|
|
55
|
+
await mkdir(judgeProfilesDir, { recursive: true });
|
|
56
|
+
await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
|
|
57
|
+
} catch {}
|
|
58
|
+
|
|
59
|
+
const lockPath = join(family.rootPath, "apm.lock.yaml");
|
|
60
|
+
let skillSetHash = "";
|
|
61
|
+
try {
|
|
62
|
+
const lockBytes = await readFile(lockPath);
|
|
63
|
+
skillSetHash =
|
|
64
|
+
"sha256:" +
|
|
65
|
+
createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
|
|
26
66
|
} catch {
|
|
27
|
-
|
|
28
|
-
`task family missing .claude/ at ${sourceClaude}; family must check in a pre-staged skills/agents tree (design decision P1)`,
|
|
29
|
-
);
|
|
67
|
+
// No lockfile — family doesn't use skill packs.
|
|
30
68
|
}
|
|
31
69
|
|
|
32
|
-
|
|
33
|
-
|
|
70
|
+
return { stagingDir, skillSetHash, judgeProfilesDir };
|
|
71
|
+
}
|
|
34
72
|
|
|
35
|
-
|
|
36
|
-
|
|
73
|
+
/**
 * Collapse every CRLF byte pair to a bare LF. A CR that is not immediately
 * followed by LF is kept unchanged. The input is not modified.
 *
 * @param {Uint8Array} buf - Raw bytes, e.g. a Buffer returned by readFile.
 * @returns {Buffer} Buffer holding the normalised bytes.
 */
function normalizeLf(buf) {
  const CR = 0x0d;
  const LF = 0x0a;
  // Normalisation can only shrink the data, so a single allocation of the
  // input size is enough; this avoids boxing each byte into a JS array.
  const out = Buffer.alloc(buf.length);
  let written = 0;
  for (let i = 0; i < buf.length; i++) {
    // Drop the CR of a CRLF pair; the LF is copied on the next iteration.
    if (buf[i] === CR && i + 1 < buf.length && buf[i + 1] === LF) continue;
    out[written++] = buf[i];
  }
  return out.subarray(0, written);
}
|
|
37
81
|
|
|
38
|
-
|
|
82
|
+
/**
 * Run `apm install --target claude` in the given directory.
 *
 * stdout is discarded; stderr is captured and included in the failure
 * message.
 *
 * @param {string} cwd - Directory in which to run the command.
 * @returns {Promise<void>} Resolves when the command exits with code 0.
 *   Rejects with an Error when the process cannot be spawned, exits with a
 *   non-zero code, or is terminated by a signal.
 */
function runApmInstall(cwd) {
  return new Promise((resolve, reject) => {
    const child = spawn("apm", ["install", "--target", "claude"], {
      cwd,
      stdio: ["ignore", "pipe", "pipe"],
    });

    // Decode on the stream so a multi-byte character split across two chunks
    // is reassembled instead of being corrupted by per-chunk toString().
    child.stderr.setEncoding("utf8");
    let stderr = "";
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });

    // stdout is unused, but it must be drained so the child cannot stall on a
    // full pipe.
    child.stdout.resume();

    child.on("error", (err) => {
      reject(new Error(`failed to spawn apm: ${err.message}`, { cause: err }));
    });

    child.on("close", (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }
      // `code` is null when the child was killed by a signal; report the
      // signal rather than the misleading "exited null".
      const outcome =
        code === null ? `was terminated by ${signal}` : `exited ${code}`;
      reject(new Error(`apm install ${outcome}: ${stderr}`));
    });
  });
}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Env-loader — auto-discover `.env` / `.env.local` files in a task family
|
|
3
|
+
* and its tasks, load them into `process.env`, and render the merged result
|
|
4
|
+
* into each agent CWD.
|
|
5
|
+
*
|
|
6
|
+
* Discovery paths (loaded in this order, first value per key wins):
|
|
7
|
+
* 1. process.env (CI secrets, shell env — never overwritten)
|
|
8
|
+
* 2. <family>/.env.local
|
|
9
|
+
* 3. <family>/.env
|
|
10
|
+
* 4. tasks/<id>/.env.local
|
|
11
|
+
* 5. tasks/<id>/.env
|
|
12
|
+
*
|
|
13
|
+
* Every discovered env file — family or task — is loaded into process.env
|
|
14
|
+
* AND rendered (with resolved values) into the agent working directory.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { readFile, writeFile } from "node:fs/promises";
|
|
18
|
+
import { join } from "node:path";
|
|
19
|
+
|
|
20
|
+
const ENV_FILES = [".env.local", ".env"];
|
|
21
|
+
|
|
22
|
+
/**
 * Parse the text of a `.env` file into an ordered list of {key, value} pairs.
 *
 * Recognised syntax:
 *   - `KEY=VALUE` (only the first `=` separates key from value)
 *   - `export KEY=VALUE` (the leading `export` is dropped)
 *   - blank lines and lines starting with `#` are ignored
 *   - a value wrapped in a matching pair of single or double quotes has the
 *     quotes removed; no escape sequences are interpreted
 *
 * Lines without `=` or with an empty key are skipped. Duplicate keys are
 * returned as-is, in file order.
 *
 * @param {string} content
 * @returns {Array<{key: string, value: string}>}
 */
export function parseEnvFile(content) {
  const entries = [];
  for (const raw of content.split("\n")) {
    // trim() also removes the trailing \r of CRLF-terminated lines.
    const trimmed = raw.trim();
    if (!trimmed || trimmed.startsWith("#")) continue;
    const line = trimmed.replace(/^export\s+/, "");
    const eq = line.indexOf("=");
    if (eq === -1) continue;
    const key = line.slice(0, eq).trim();
    if (!key) continue;
    let value = line.slice(eq + 1).trim();
    const quote = value[0];
    // Require at least two characters: a lone quote character both starts and
    // ends the string, but it is a literal value, not an empty quoted one.
    if (
      value.length >= 2 &&
      (quote === '"' || quote === "'") &&
      value.endsWith(quote)
    ) {
      value = value.slice(1, -1);
    }
    entries.push({ key, value });
  }
  return entries;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Load an env file from disk and parse it with parseEnvFile.
 * A missing file (ENOENT) is not an error and yields an empty list; any
 * other read failure is rethrown unchanged.
 * @param {string} filePath
 * @returns {Promise<Array<{key: string, value: string}>>}
 */
async function readEnvFile(filePath) {
  let text;
  try {
    text = await readFile(filePath, "utf8");
  } catch (err) {
    if (err.code !== "ENOENT") throw err;
    return [];
  }
  return parseEnvFile(text);
}
|
|
63
|
+
|
|
64
|
+
/**
 * Copy entries into process.env without overwriting any key that is already
 * defined there.
 * @param {Array<{key: string, value: string}>} entries
 * @returns {string[]} The key of every entry, in input order, including keys
 *   that were already set and therefore left untouched.
 */
function applyToProcessEnv(entries) {
  const seen = [];
  for (const entry of entries) {
    const alreadySet = process.env[entry.key] !== undefined;
    if (!alreadySet) {
      process.env[entry.key] = entry.value;
    }
    seen.push(entry.key);
  }
  return seen;
}
|
|
79
|
+
|
|
80
|
+
/**
 * Process a single env file: read `<dir>/<file>`, apply its entries to
 * process.env, add every key to `names`, and register every key under the
 * file's name in `merged`. A missing or entry-less file changes nothing.
 * @param {string} dir
 * @param {string} file
 * @param {Set<string>} names
 * @param {Map<string, Map<string, true>>} merged
 */
async function loadOneEnvFile(dir, file, names, merged) {
  const parsed = await readEnvFile(join(dir, file));
  if (parsed.length === 0) return;

  applyToProcessEnv(parsed).forEach((name) => names.add(name));

  let keysForFile = merged.get(file);
  if (keysForFile === undefined) {
    keysForFile = new Map();
    merged.set(file, keysForFile);
  }
  // Re-setting an existing key keeps its original insertion position.
  for (const entry of parsed) {
    keysForFile.set(entry.key, true);
  }
}
|
|
97
|
+
|
|
98
|
+
/**
 * Visit every (directory, env filename) combination, directory by directory
 * and in ENV_FILES order within each directory, loading each file into
 * process.env and accumulating the discovered keys.
 * @param {string[]} dirs
 * @returns {Promise<{names: Set<string>, merged: Map<string, Map<string, true>>}>}
 */
async function collectEnvEntries(dirs) {
  const collected = { names: new Set(), merged: new Map() };
  const targets = Array.from(dirs).flatMap((dir) =>
    ENV_FILES.map((file) => [dir, file]),
  );
  // Files are loaded strictly one after another: existing process.env keys
  // are never overwritten, so load order decides which value is kept.
  for (const [dir, file] of targets) {
    await loadOneEnvFile(dir, file, collected.names, collected.merged);
  }
  return collected;
}
|
|
114
|
+
|
|
115
|
+
/**
 * For each filename in `merged`, write a `KEY=value` file of that name into
 * the agent working directory, taking each value from the current
 * process.env (empty string when unset). After writing, emit one warning on
 * stderr per file listing the keys whose value is unset or empty.
 * @param {Map<string, Map<string, true>>} merged
 * @param {string} agentCwd
 */
async function renderEnvFiles(merged, agentCwd) {
  const renderLine = (key) => `${key}=${process.env[key] ?? ""}`;
  const isBlank = (key) => !process.env[key];

  for (const [file, keyMap] of merged) {
    const keys = Array.from(keyMap.keys());
    const body = keys.map(renderLine).join("\n") + "\n";
    await writeFile(join(agentCwd, file), body);

    const blank = keys.filter(isBlank);
    if (blank.length === 0) continue;
    process.stderr.write(
      `libeval: env warning: ${file} declares vars with no value: ${blank.join(", ")}\n`,
    );
  }
}
|
|
133
|
+
|
|
134
|
+
/**
 * Entry point of the env loader: scan the given directories for `.env` /
 * `.env.local`, load what is found into process.env, then write the
 * resolved files into the agent working directory.
 *
 * @param {string[]} dirs - Directories to scan (family root, task dir, etc.)
 * @param {string} agentCwd - Agent working directory to render into.
 * @returns {Promise<string[]>} All var names discovered (for redaction).
 */
export async function loadEnv(dirs, agentCwd) {
  const collected = await collectEnvEntries(dirs);
  await renderEnvFiles(collected.merged, agentCwd);
  return Array.from(collected.names);
}
|
package/src/benchmark/judge.js
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
*
|
|
7
7
|
* Template variables available in `judge.task.md`:
|
|
8
8
|
*
|
|
9
|
-
* {{AGENT_INSTRUCTIONS}} — contents of
|
|
9
|
+
* {{AGENT_INSTRUCTIONS}} — contents of agent.task.md
|
|
10
10
|
* {{AGENT_PROFILE}} — agent profile body (empty string if none)
|
|
11
11
|
* {{AGENT_TRACE_PATH}} — path to agent.ndjson
|
|
12
12
|
* {{SCORING_RESULT}} — JSON scoring object
|
|
@@ -37,7 +37,7 @@ import { createRedactor } from "../redaction.js";
|
|
|
37
37
|
|
|
38
38
|
/**
|
|
39
39
|
* @typedef {object} JudgeContext
|
|
40
|
-
* @property {string} agentInstructions - Contents of
|
|
40
|
+
* @property {string} agentInstructions - Contents of agent.task.md.
|
|
41
41
|
* @property {string} agentProfile - Agent profile body (empty string if none).
|
|
42
42
|
* @property {string} skillSetHash - SHA-256 fingerprint from apm.lock.yaml.
|
|
43
43
|
*/
|
|
@@ -47,7 +47,7 @@ import { createRedactor } from "../redaction.js";
|
|
|
47
47
|
* @param {import("./task-family.js").Task} task
|
|
48
48
|
* @param {import("./workdir.js").Workdir} workdir
|
|
49
49
|
* @param {import("./scorer.js").ScoringResult} scoring
|
|
50
|
-
* @param {{query: Function, model: string, judgeProfile?: string}} deps
|
|
50
|
+
* @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string}} deps
|
|
51
51
|
* @param {JudgeContext} [context]
|
|
52
52
|
* @returns {Promise<JudgeVerdict>}
|
|
53
53
|
*/
|
|
@@ -71,6 +71,7 @@ export async function runJudge(task, workdir, scoring, deps, context) {
|
|
|
71
71
|
output,
|
|
72
72
|
model: deps.model,
|
|
73
73
|
judgeProfile: deps.judgeProfile,
|
|
74
|
+
profilesDir: deps.profilesDir,
|
|
74
75
|
maxTurns: 25,
|
|
75
76
|
redactor: createRedactor(),
|
|
76
77
|
});
|
package/src/benchmark/report.js
CHANGED
|
@@ -178,24 +178,46 @@ function renderFullReport(report, kValues) {
|
|
|
178
178
|
function renderSummary(report) {
|
|
179
179
|
const { totals } = report;
|
|
180
180
|
const passing = report.tasks.filter((t) => t.c > 0 && t.c === t.n).length;
|
|
181
|
+
const icon = statusIcon(passing === totals.tasks);
|
|
181
182
|
const lines = [
|
|
182
183
|
"# Benchmark Report",
|
|
183
184
|
"",
|
|
184
|
-
|
|
185
|
+
`${icon} **${passing}/${totals.tasks} tasks passing** | ${totals.runs} runs${totals.skipped ? ` | ${totals.skipped} skipped` : ""}`,
|
|
185
186
|
];
|
|
187
|
+
|
|
188
|
+
const headers = [];
|
|
189
|
+
const values = [];
|
|
190
|
+
if (totals.costUsd != null) {
|
|
191
|
+
headers.push("Cost");
|
|
192
|
+
values.push(formatCost(totals.costUsd));
|
|
193
|
+
}
|
|
194
|
+
if (totals.medianDurationMs != null) {
|
|
195
|
+
headers.push("Median Duration");
|
|
196
|
+
values.push(formatDuration(totals.medianDurationMs));
|
|
197
|
+
}
|
|
198
|
+
if (totals.medianTurns != null) {
|
|
199
|
+
headers.push("Median Turns");
|
|
200
|
+
values.push(String(totals.medianTurns));
|
|
201
|
+
}
|
|
202
|
+
if (headers.length) {
|
|
203
|
+
lines.push("");
|
|
204
|
+
lines.push(`| ${headers.join(" | ")} |`);
|
|
205
|
+
lines.push(`| ${headers.map(() => "---").join(" | ")} |`);
|
|
206
|
+
lines.push(`| ${values.join(" | ")} |`);
|
|
207
|
+
}
|
|
208
|
+
|
|
186
209
|
const meta = [];
|
|
187
|
-
if (totals.model)
|
|
210
|
+
if (totals.model) {
|
|
211
|
+
meta.push(`Agent: \`${totals.model.agent}\``);
|
|
212
|
+
meta.push(`Supervisor: \`${totals.model.supervisor}\``);
|
|
213
|
+
meta.push(`Judge: \`${totals.model.judge}\``);
|
|
214
|
+
}
|
|
188
215
|
if (totals.skillSetHash) meta.push(`Skill set: \`${totals.skillSetHash}\``);
|
|
189
216
|
if (totals.familyRevision) meta.push(`Family: \`${totals.familyRevision}\``);
|
|
190
|
-
if (meta.length)
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
if (totals.medianDurationMs != null)
|
|
195
|
-
stats.push(`Median duration: ${formatDuration(totals.medianDurationMs)}`);
|
|
196
|
-
if (totals.medianTurns != null)
|
|
197
|
-
stats.push(`Median turns: ${totals.medianTurns}`);
|
|
198
|
-
if (stats.length) lines.push(stats.join(" | "));
|
|
217
|
+
if (meta.length) {
|
|
218
|
+
lines.push("");
|
|
219
|
+
lines.push(meta.join(" | "));
|
|
220
|
+
}
|
|
199
221
|
|
|
200
222
|
lines.push("");
|
|
201
223
|
return lines.join("\n");
|
|
@@ -229,13 +251,13 @@ function renderTotalsLine(report) {
|
|
|
229
251
|
|
|
230
252
|
function renderTaskDetail(task) {
|
|
231
253
|
const runs = task.runs ?? [];
|
|
232
|
-
const
|
|
254
|
+
const icon = statusIcon(task.c === task.n);
|
|
233
255
|
const singleRun = runs.length === 1;
|
|
234
256
|
|
|
235
257
|
const lines = [
|
|
236
258
|
`### ${task.taskId}`,
|
|
237
259
|
"",
|
|
238
|
-
|
|
260
|
+
`${icon} **${task.c}/${task.n} runs passed**`,
|
|
239
261
|
];
|
|
240
262
|
|
|
241
263
|
lines.push("", renderRunsTable(runs));
|
|
@@ -267,16 +289,16 @@ function renderRunsTable(runs) {
|
|
|
267
289
|
const scoringCell = r.preflightError
|
|
268
290
|
? "preflight error"
|
|
269
291
|
: r.scoring
|
|
270
|
-
? r.scoring.verdict
|
|
292
|
+
? statusIcon(r.scoring.verdict === "pass")
|
|
271
293
|
: "—";
|
|
272
294
|
const judgeCell = r.preflightError
|
|
273
295
|
? "—"
|
|
274
296
|
: r.judgeVerdict
|
|
275
|
-
? r.judgeVerdict.verdict
|
|
297
|
+
? statusIcon(r.judgeVerdict.verdict === "pass")
|
|
276
298
|
: "—";
|
|
277
299
|
rows.push([
|
|
278
300
|
String(r.runIndex),
|
|
279
|
-
r.verdict
|
|
301
|
+
statusIcon(r.verdict === "pass"),
|
|
280
302
|
scoringCell,
|
|
281
303
|
judgeCell,
|
|
282
304
|
formatCost(r.costUsd),
|
|
@@ -317,7 +339,7 @@ function collectScoringRows(runs) {
|
|
|
317
339
|
rows.push({
|
|
318
340
|
run: r.runIndex,
|
|
319
341
|
check: escapeCell(String(d.test ?? "(unnamed)")),
|
|
320
|
-
result: d.pass
|
|
342
|
+
result: statusIcon(d.pass),
|
|
321
343
|
message: escapeCell(String(d.message ?? "")),
|
|
322
344
|
});
|
|
323
345
|
}
|
|
@@ -365,6 +387,10 @@ function renderErrors(runs) {
|
|
|
365
387
|
// Formatting helpers
|
|
366
388
|
// ---------------------------------------------------------------------------
|
|
367
389
|
|
|
390
|
+
/**
 * Map a pass/fail flag to the emoji shown in rendered reports.
 * @param {*} pass - Any truthy value counts as a pass.
 * @returns {string} "✅" for a pass, "❌" otherwise.
 */
function statusIcon(pass) {
  if (pass) {
    return "✅";
  }
  return "❌";
}
|
|
393
|
+
|
|
368
394
|
function formatPassAt(v) {
|
|
369
395
|
if (v == null) return "—";
|
|
370
396
|
if (typeof v === "object" && "error" in v) return v.error;
|
package/src/benchmark/result.js
CHANGED
|
@@ -29,7 +29,7 @@ const JUDGE_VERDICT_SHAPE = z.object({
|
|
|
29
29
|
|
|
30
30
|
const PROFILES_SHAPE = z.object({
|
|
31
31
|
agent: z.union([z.string(), z.null()]),
|
|
32
|
-
supervisor: z.null(),
|
|
32
|
+
supervisor: z.union([z.string(), z.null()]),
|
|
33
33
|
judge: z.union([z.string(), z.null()]),
|
|
34
34
|
});
|
|
35
35
|
|
|
@@ -46,7 +46,11 @@ const COMMON_FIELDS = {
|
|
|
46
46
|
costUsd: z.number(),
|
|
47
47
|
turns: z.number().int().min(0),
|
|
48
48
|
profiles: PROFILES_SHAPE,
|
|
49
|
-
model: z.
|
|
49
|
+
model: z.object({
|
|
50
|
+
agent: z.string(),
|
|
51
|
+
supervisor: z.string().optional(),
|
|
52
|
+
judge: z.string().optional(),
|
|
53
|
+
}),
|
|
50
54
|
skillSetHash: z.string(),
|
|
51
55
|
familyRevision: z.string(),
|
|
52
56
|
durationMs: z.number().int().min(0),
|
|
@@ -61,8 +65,9 @@ const HAPPY_RECORD = z.object({
|
|
|
61
65
|
...COMMON_FIELDS,
|
|
62
66
|
scoring: SCORING_SHAPE,
|
|
63
67
|
submission: z.string(),
|
|
64
|
-
judgeVerdict: JUDGE_VERDICT_SHAPE,
|
|
68
|
+
judgeVerdict: JUDGE_VERDICT_SHAPE.optional(),
|
|
65
69
|
agentTracePath: z.string(),
|
|
70
|
+
supervisorTracePath: z.string(),
|
|
66
71
|
judgeTracePath: z.string(),
|
|
67
72
|
agentError: AGENT_ERROR_SHAPE.optional(),
|
|
68
73
|
preflightError: z.undefined().optional(),
|
|
@@ -76,6 +81,7 @@ const PREFLIGHT_RECORD = z.object({
|
|
|
76
81
|
// them in WorkdirManager.start) so the record is uniform across branches
|
|
77
82
|
// and downstream consumers can reference them without conditional fields.
|
|
78
83
|
agentTracePath: z.string(),
|
|
84
|
+
supervisorTracePath: z.string(),
|
|
79
85
|
judgeTracePath: z.string(),
|
|
80
86
|
scoring: z.undefined().optional(),
|
|
81
87
|
submission: z.undefined().optional(),
|