@forwardimpact/libeval 0.1.49 → 0.1.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -8
- package/bin/fit-benchmark.js +26 -27
- package/bin/fit-eval.js +76 -78
- package/bin/fit-trace.js +83 -57
- package/package.json +2 -2
- package/src/agent-runner.js +23 -13
- package/src/benchmark/env-loader.js +35 -23
- package/src/benchmark/{scorer.js → invariants.js} +14 -12
- package/src/benchmark/judge.js +5 -8
- package/src/benchmark/npm-installer.js +87 -0
- package/src/benchmark/report.js +15 -15
- package/src/benchmark/result.js +11 -11
- package/src/benchmark/runner.js +17 -11
- package/src/benchmark/task-family.js +6 -4
- package/src/benchmark/workdir.js +23 -3
- package/src/commands/assert.js +30 -22
- package/src/commands/benchmark-invariants.js +74 -0
- package/src/commands/benchmark-report.js +23 -15
- package/src/commands/benchmark-run.js +22 -7
- package/src/commands/by-discussion.js +29 -18
- package/src/commands/callback.js +20 -11
- package/src/commands/discuss.js +30 -21
- package/src/commands/facilitate.js +20 -21
- package/src/commands/output.js +11 -12
- package/src/commands/run.js +24 -21
- package/src/commands/supervise.js +27 -27
- package/src/commands/task-input.js +54 -0
- package/src/commands/trace.js +174 -97
- package/src/discuss-tools.js +48 -2
- package/src/discusser.js +49 -2
- package/src/events/github.js +155 -0
- package/src/inbox-poller.js +84 -0
- package/src/index.js +10 -0
- package/src/judge.js +1 -1
- package/src/message-bus.js +6 -0
- package/src/orchestration-loop.js +19 -5
- package/src/orchestration-toolkit.js +14 -0
- package/src/redaction.js +31 -9
- package/src/reply-emitter.js +47 -0
- package/src/commands/benchmark-score.js +0 -68
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@forwardimpact/libeval",
|
|
3
|
-
"version": "0.1.49",
|
|
3
|
+
"version": "0.1.51",
|
|
4
4
|
"description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"eval",
|
|
@@ -62,7 +62,7 @@
|
|
|
62
62
|
"zod": "^4.4.3"
|
|
63
63
|
},
|
|
64
64
|
"devDependencies": {
|
|
65
|
-
"@forwardimpact/
|
|
65
|
+
"@forwardimpact/libmock": "^0.1.0"
|
|
66
66
|
},
|
|
67
67
|
"engines": {
|
|
68
68
|
"bun": ">=1.2.0",
|
package/src/agent-runner.js
CHANGED
|
@@ -29,12 +29,16 @@ export class AgentRunner {
|
|
|
29
29
|
* @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
|
|
30
30
|
* @param {Record<string, object>} [deps.mcpServers] - MCP server configs to pass to the SDK query
|
|
31
31
|
* @param {object} deps.redactor
|
|
32
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} [deps.runtime] -
|
|
33
|
+
* Ambient collaborators. Only `proc.env` is read (to record Skill
|
|
34
|
+
* invocations into `LIBEVAL_SKILL`); when absent the write is skipped.
|
|
32
35
|
*/
|
|
33
36
|
constructor(deps) {
|
|
34
37
|
if (!deps.cwd) throw new Error("cwd is required");
|
|
35
38
|
if (!deps.query) throw new Error("query is required");
|
|
36
39
|
if (!deps.output) throw new Error("output is required");
|
|
37
40
|
if (!deps.redactor) throw new Error("redactor is required");
|
|
41
|
+
this.runtime = deps.runtime ?? null;
|
|
38
42
|
this.cwd = deps.cwd;
|
|
39
43
|
this.query = deps.query;
|
|
40
44
|
this.output = deps.output;
|
|
@@ -62,7 +66,9 @@ export class AgentRunner {
|
|
|
62
66
|
const abortController = new AbortController();
|
|
63
67
|
this.currentAbortController = abortController;
|
|
64
68
|
const effectiveTask = this.taskAmend
|
|
65
|
-
?
|
|
69
|
+
? task
|
|
70
|
+
? `${task}\n\n${this.taskAmend}`
|
|
71
|
+
: this.taskAmend
|
|
66
72
|
: task;
|
|
67
73
|
try {
|
|
68
74
|
const iterator = this.query({
|
|
@@ -177,20 +183,24 @@ export class AgentRunner {
|
|
|
177
183
|
if (message.type === "system" && message.subtype === "init") {
|
|
178
184
|
this.sessionId = message.session_id;
|
|
179
185
|
}
|
|
180
|
-
if (message.type === "assistant") trackSkillInvocation(message);
|
|
186
|
+
if (message.type === "assistant") this.#trackSkillInvocation(message);
|
|
181
187
|
}
|
|
182
|
-
}
|
|
183
188
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
189
|
+
#trackSkillInvocation(message) {
|
|
190
|
+
const content = message.message?.content ?? message.content;
|
|
191
|
+
if (!Array.isArray(content)) return;
|
|
192
|
+
// Skill metric is recorded into the env map; without a runtime there is
|
|
193
|
+
// no env surface to write to, so the side-effect is simply skipped.
|
|
194
|
+
const env = this.runtime?.proc?.env ?? null;
|
|
195
|
+
if (!env) return;
|
|
196
|
+
for (const block of content) {
|
|
197
|
+
if (
|
|
198
|
+
block.type === "tool_use" &&
|
|
199
|
+
block.name === "Skill" &&
|
|
200
|
+
block.input?.skill
|
|
201
|
+
) {
|
|
202
|
+
env.LIBEVAL_SKILL = block.input.skill;
|
|
203
|
+
}
|
|
194
204
|
}
|
|
195
205
|
}
|
|
196
206
|
}
|
package/src/benchmark/env-loader.js
CHANGED
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
* AND rendered (with resolved values) into the agent working directory.
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
|
-
import { readFile, writeFile } from "node:fs/promises";
|
|
18
17
|
import { join } from "node:path";
|
|
19
18
|
|
|
20
19
|
const ENV_FILES = [".env.local", ".env"];
|
|
@@ -48,12 +47,13 @@ export function parseEnvFile(content) {
|
|
|
48
47
|
|
|
49
48
|
/**
|
|
50
49
|
* Read and parse an env file, returning [] if the file does not exist.
|
|
50
|
+
* @param {object} fs - Async filesystem surface (`runtime.fs`).
|
|
51
51
|
* @param {string} filePath
|
|
52
52
|
* @returns {Promise<Array<{key: string, value: string}>>}
|
|
53
53
|
*/
|
|
54
|
-
async function readEnvFile(filePath) {
|
|
54
|
+
async function readEnvFile(fs, filePath) {
|
|
55
55
|
try {
|
|
56
|
-
const content = await readFile(filePath, "utf8");
|
|
56
|
+
const content = await fs.readFile(filePath, "utf8");
|
|
57
57
|
return parseEnvFile(content);
|
|
58
58
|
} catch (e) {
|
|
59
59
|
if (e.code === "ENOENT") return [];
|
|
@@ -62,32 +62,36 @@ async function readEnvFile(filePath) {
|
|
|
62
62
|
}
|
|
63
63
|
|
|
64
64
|
/**
|
|
65
|
-
* Load entries into process
|
|
65
|
+
* Load entries into the process env map. Existing keys are never overwritten.
|
|
66
|
+
* @param {Record<string, string|undefined>} env - The `runtime.proc.env` map.
|
|
66
67
|
* @param {Array<{key: string, value: string}>} entries
|
|
67
68
|
* @returns {string[]} var names that were loaded
|
|
68
69
|
*/
|
|
69
|
-
function applyToProcessEnv(entries) {
|
|
70
|
+
function applyToProcessEnv(env, entries) {
|
|
70
71
|
const names = [];
|
|
71
72
|
for (const { key, value } of entries) {
|
|
72
73
|
names.push(key);
|
|
73
|
-
if (
|
|
74
|
-
|
|
74
|
+
if (env[key] === undefined) {
|
|
75
|
+
env[key] = value;
|
|
75
76
|
}
|
|
76
77
|
}
|
|
77
78
|
return names;
|
|
78
79
|
}
|
|
79
80
|
|
|
80
81
|
/**
|
|
81
|
-
* Load one env file: apply to
|
|
82
|
+
* Load one env file: apply to the env map, record keys in the merged map.
|
|
83
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
82
84
|
* @param {string} dir
|
|
83
85
|
* @param {string} file
|
|
84
86
|
* @param {Set<string>} names
|
|
85
87
|
* @param {Map<string, Map<string, true>>} merged
|
|
86
88
|
*/
|
|
87
|
-
async function loadOneEnvFile(dir, file, names, merged) {
|
|
88
|
-
const entries = await readEnvFile(join(dir, file));
|
|
89
|
+
async function loadOneEnvFile(runtime, dir, file, names, merged) {
|
|
90
|
+
const entries = await readEnvFile(runtime.fs, join(dir, file));
|
|
89
91
|
if (entries.length === 0) return;
|
|
90
|
-
for (const name of applyToProcessEnv(entries))
|
|
92
|
+
for (const name of applyToProcessEnv(runtime.proc.env, entries)) {
|
|
93
|
+
names.add(name);
|
|
94
|
+
}
|
|
91
95
|
if (!merged.has(file)) merged.set(file, new Map());
|
|
92
96
|
const fileMap = merged.get(file);
|
|
93
97
|
for (const { key } of entries) {
|
|
@@ -96,17 +100,18 @@ async function loadOneEnvFile(dir, file, names, merged) {
|
|
|
96
100
|
}
|
|
97
101
|
|
|
98
102
|
/**
|
|
99
|
-
* Scan directories for env files, load into
|
|
103
|
+
* Scan directories for env files, load into the env map, and collect
|
|
100
104
|
* a merged key manifest per filename.
|
|
105
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
101
106
|
* @param {string[]} dirs
|
|
102
107
|
* @returns {Promise<{names: Set<string>, merged: Map<string, Map<string, true>>}>}
|
|
103
108
|
*/
|
|
104
|
-
async function collectEnvEntries(dirs) {
|
|
109
|
+
async function collectEnvEntries(runtime, dirs) {
|
|
105
110
|
const names = new Set();
|
|
106
111
|
const merged = new Map();
|
|
107
112
|
for (const dir of dirs) {
|
|
108
113
|
for (const file of ENV_FILES) {
|
|
109
|
-
await loadOneEnvFile(dir, file, names, merged);
|
|
114
|
+
await loadOneEnvFile(runtime, dir, file, names, merged);
|
|
110
115
|
}
|
|
111
116
|
}
|
|
112
117
|
return { names, merged };
|
|
@@ -114,17 +119,22 @@ async function collectEnvEntries(dirs) {
|
|
|
114
119
|
|
|
115
120
|
/**
|
|
116
121
|
* Write resolved env files into the agent CWD and warn about empty values.
|
|
122
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
117
123
|
* @param {Map<string, Map<string, true>>} merged
|
|
118
124
|
* @param {string} agentCwd
|
|
119
125
|
*/
|
|
120
|
-
async function renderEnvFiles(merged, agentCwd) {
|
|
126
|
+
async function renderEnvFiles(runtime, merged, agentCwd) {
|
|
127
|
+
const env = runtime.proc.env;
|
|
121
128
|
for (const [file, keyMap] of merged) {
|
|
122
129
|
const keys = [...keyMap.keys()];
|
|
123
|
-
const resolved = keys.map((key) => `${key}=${
|
|
124
|
-
await writeFile(
|
|
125
|
-
|
|
130
|
+
const resolved = keys.map((key) => `${key}=${env[key] ?? ""}`);
|
|
131
|
+
await runtime.fs.writeFile(
|
|
132
|
+
join(agentCwd, file),
|
|
133
|
+
resolved.join("\n") + "\n",
|
|
134
|
+
);
|
|
135
|
+
const empty = keys.filter((key) => !env[key]);
|
|
126
136
|
if (empty.length > 0) {
|
|
127
|
-
|
|
137
|
+
runtime.proc.stderr.write(
|
|
128
138
|
`libeval: env warning: ${file} declares vars with no value: ${empty.join(", ")}\n`,
|
|
129
139
|
);
|
|
130
140
|
}
|
|
@@ -133,14 +143,16 @@ async function renderEnvFiles(merged, agentCwd) {
|
|
|
133
143
|
|
|
134
144
|
/**
|
|
135
145
|
* Discover `.env` / `.env.local` in one or more directories, load them
|
|
136
|
-
* into process
|
|
146
|
+
* into the process env map, and render the resolved values into the agent CWD.
|
|
137
147
|
*
|
|
138
148
|
* @param {string[]} dirs - Directories to scan (family root, task dir, etc.)
|
|
139
149
|
* @param {string} agentCwd - Agent working directory to render into.
|
|
150
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime - Ambient
|
|
151
|
+
* collaborators; uses `fs` (async read/write), `proc.env`, `proc.stderr`.
|
|
140
152
|
* @returns {Promise<string[]>} All var names discovered (for redaction).
|
|
141
153
|
*/
|
|
142
|
-
export async function loadEnv(dirs, agentCwd) {
|
|
143
|
-
const { names, merged } = await collectEnvEntries(dirs);
|
|
144
|
-
await renderEnvFiles(merged, agentCwd);
|
|
154
|
+
export async function loadEnv(dirs, agentCwd, runtime) {
|
|
155
|
+
const { names, merged } = await collectEnvEntries(runtime, dirs);
|
|
156
|
+
await renderEnvFiles(runtime, merged, agentCwd);
|
|
145
157
|
return [...names];
|
|
146
158
|
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
* the post-run agent CWD. The exit code is authoritative for the
|
|
4
|
-
* structured per-
|
|
2
|
+
* Invariants — runs `<task.paths.hooks>/invariants.sh` from the template path
|
|
3
|
+
* against the post-run agent CWD. The exit code is authoritative for the
|
|
4
|
+
* verdict; structured per-check rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
7
|
import { spawn } from "node:child_process";
|
|
@@ -15,31 +15,33 @@ import {
|
|
|
15
15
|
import { join } from "node:path";
|
|
16
16
|
|
|
17
17
|
/**
|
|
18
|
-
* @typedef {object}
|
|
18
|
+
* @typedef {object} InvariantsResult
|
|
19
19
|
* @property {"pass" | "fail"} verdict
|
|
20
20
|
* @property {Array<object>} details
|
|
21
21
|
* @property {number} exitCode
|
|
22
22
|
*/
|
|
23
23
|
|
|
24
24
|
/**
|
|
25
|
-
* Run the task's
|
|
25
|
+
* Run the task's invariants script.
|
|
26
26
|
* @param {import("./task-family.js").Task} task
|
|
27
27
|
* @param {{cwd: string, port: number, runDir: string}} ctx
|
|
28
|
-
* @returns {Promise<
|
|
28
|
+
* @returns {Promise<InvariantsResult>}
|
|
29
29
|
*/
|
|
30
|
-
export function runScoring(task, ctx) {
|
|
31
|
-
if (!task.paths.
|
|
30
|
+
export function runInvariants(task, ctx) {
|
|
31
|
+
if (!task.paths.invariants) {
|
|
32
32
|
return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
|
|
33
33
|
}
|
|
34
34
|
return new Promise((res, rej) => {
|
|
35
|
-
const script = task.paths.
|
|
36
|
-
const stderrLog = createWriteStream(
|
|
35
|
+
const script = task.paths.invariants;
|
|
36
|
+
const stderrLog = createWriteStream(
|
|
37
|
+
join(ctx.runDir, "invariants.stderr.log"),
|
|
38
|
+
);
|
|
37
39
|
|
|
38
40
|
// Bun's child_process pipe setup for fd >= 3 is racy under load (it
|
|
39
41
|
// creates a unix socket pair and the connect() can return ENOENT). Use
|
|
40
42
|
// a temp file as the fd-3 backing store instead — the script still
|
|
41
43
|
// writes via `$RESULTS_FD`, but we hand it a real file descriptor.
|
|
42
|
-
const fd3Path = join(ctx.runDir, "
|
|
44
|
+
const fd3Path = join(ctx.runDir, "invariants.fd3.ndjson");
|
|
43
45
|
let fd3File;
|
|
44
46
|
try {
|
|
45
47
|
fd3File = openSync(fd3Path, "w+");
|
|
@@ -63,7 +65,7 @@ export function runScoring(task, ctx) {
|
|
|
63
65
|
} catch {
|
|
64
66
|
// already closed
|
|
65
67
|
}
|
|
66
|
-
rej(new Error(`failed to spawn
|
|
68
|
+
rej(new Error(`failed to spawn invariants script: ${script}`));
|
|
67
69
|
return;
|
|
68
70
|
}
|
|
69
71
|
|
package/src/benchmark/judge.js
CHANGED
|
@@ -9,13 +9,11 @@
|
|
|
9
9
|
* {{AGENT_INSTRUCTIONS}} — contents of agent.task.md
|
|
10
10
|
* {{AGENT_PROFILE}} — agent profile body (empty string if none)
|
|
11
11
|
* {{AGENT_TRACE_PATH}} — path to agent.ndjson
|
|
12
|
-
* {{
|
|
12
|
+
* {{INVARIANTS_RESULT}} — JSON invariants object
|
|
13
13
|
* {{SKILL_SET_HASH}} — SHA-256 from apm.lock.yaml
|
|
14
14
|
* {{TASK_ID}} — task name (directory under tasks/)
|
|
15
15
|
* {{TASK_DIR}} — agent working directory path
|
|
16
16
|
*
|
|
17
|
-
* Legacy alias: {{SCORING}} is accepted as an alias for {{SCORING_RESULT}}.
|
|
18
|
-
*
|
|
19
17
|
* The judge verdict is captured from the orchestration context's
|
|
20
18
|
* `concluded` flag directly — no trace parsing on the happy path.
|
|
21
19
|
* `parseConcludeFromTrace` is preserved for offline analysis and as a
|
|
@@ -46,17 +44,16 @@ import { createRedactor } from "../redaction.js";
|
|
|
46
44
|
* Run the judge over a completed task run.
|
|
47
45
|
* @param {import("./task-family.js").Task} task
|
|
48
46
|
* @param {import("./workdir.js").Workdir} workdir
|
|
49
|
-
* @param {import("./
|
|
47
|
+
* @param {import("./invariants.js").InvariantsResult} invariants
|
|
50
48
|
* @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string}} deps
|
|
51
49
|
* @param {JudgeContext} [context]
|
|
52
50
|
* @returns {Promise<JudgeVerdict>}
|
|
53
51
|
*/
|
|
54
|
-
export async function runJudge(task, workdir,
|
|
52
|
+
export async function runJudge(task, workdir, invariants, deps, context) {
|
|
55
53
|
const template = await readFile(task.paths.judge, "utf8");
|
|
56
|
-
const
|
|
54
|
+
const invariantsJson = JSON.stringify(invariants, null, 2);
|
|
57
55
|
const taskText = template
|
|
58
|
-
.replaceAll("{{
|
|
59
|
-
.replaceAll("{{SCORING}}", scoringJson)
|
|
56
|
+
.replaceAll("{{INVARIANTS_RESULT}}", invariantsJson)
|
|
60
57
|
.replaceAll("{{AGENT_TRACE_PATH}}", workdir.agentTracePath)
|
|
61
58
|
.replaceAll("{{AGENT_INSTRUCTIONS}}", context?.agentInstructions ?? "")
|
|
62
59
|
.replaceAll("{{AGENT_PROFILE}}", context?.agentProfile ?? "")
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* NpmInstaller — runs `bun install` in the family root when a package.json
|
|
3
|
+
* is present, then copies the resulting `node_modules/` into the staging
|
|
4
|
+
* directory so WorkdirManager can seed each per-task CWD.
|
|
5
|
+
*
|
|
6
|
+
* Symmetric to ApmInstaller: constructor injection of `spawn` for testability,
|
|
7
|
+
* factory function, and a free-function shorthand.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { spawn as nodeSpawn } from "node:child_process";
|
|
11
|
+
import { access, cp } from "node:fs/promises";
|
|
12
|
+
import { join } from "node:path";
|
|
13
|
+
|
|
14
|
+
/** Run `bun install` in the family root and stage node_modules/ for per-task CWDs. */
|
|
15
|
+
export class NpmInstaller {
|
|
16
|
+
/**
|
|
17
|
+
* @param {object} [deps]
|
|
18
|
+
* @param {typeof nodeSpawn} [deps.spawn] - Spawn seam (defaults to
|
|
19
|
+
* `node:child_process` spawn). Tests inject a fake to avoid shelling out.
|
|
20
|
+
*/
|
|
21
|
+
constructor({ spawn } = {}) {
|
|
22
|
+
this.spawn = spawn ?? nodeSpawn;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* @param {import("./task-family.js").TaskFamily} family
|
|
27
|
+
* @param {string} stagingDir - The staging directory (created by ApmInstaller).
|
|
28
|
+
* @returns {Promise<void>}
|
|
29
|
+
*/
|
|
30
|
+
async install(family, stagingDir) {
|
|
31
|
+
const pkgJson = join(family.rootPath, "package.json");
|
|
32
|
+
const hasPkg = await access(pkgJson)
|
|
33
|
+
.then(() => true)
|
|
34
|
+
.catch(() => false);
|
|
35
|
+
if (!hasPkg) return;
|
|
36
|
+
|
|
37
|
+
await this.#runBunInstall(family.rootPath);
|
|
38
|
+
|
|
39
|
+
const sourceModules = join(family.rootPath, "node_modules");
|
|
40
|
+
try {
|
|
41
|
+
await access(sourceModules);
|
|
42
|
+
} catch {
|
|
43
|
+
throw new Error(
|
|
44
|
+
`bun install did not produce node_modules/ at ${sourceModules}; check the family's package.json`,
|
|
45
|
+
);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
await cp(sourceModules, join(stagingDir, "node_modules"), {
|
|
49
|
+
recursive: true,
|
|
50
|
+
});
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
#runBunInstall(cwd) {
|
|
54
|
+
return new Promise((res, rej) => {
|
|
55
|
+
const child = this.spawn("bun", ["install"], {
|
|
56
|
+
cwd,
|
|
57
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
58
|
+
});
|
|
59
|
+
let stderr = "";
|
|
60
|
+
child.stdout.on("data", () => {});
|
|
61
|
+
child.stderr.on("data", (d) => {
|
|
62
|
+
stderr += d.toString();
|
|
63
|
+
});
|
|
64
|
+
child.on("error", (e) => {
|
|
65
|
+
rej(new Error(`failed to spawn bun: ${e.message}`));
|
|
66
|
+
});
|
|
67
|
+
child.on("close", (code) => {
|
|
68
|
+
if (code === 0) res();
|
|
69
|
+
else rej(new Error(`bun install exited ${code}: ${stderr}`));
|
|
70
|
+
});
|
|
71
|
+
});
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/** Factory function — wires real dependencies. */
|
|
76
|
+
export function createNpmInstaller(deps) {
|
|
77
|
+
return new NpmInstaller(deps);
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/**
|
|
81
|
+
* Free-function shorthand for callers that don't need to inject a spawn seam.
|
|
82
|
+
* @param {import("./task-family.js").TaskFamily} family
|
|
83
|
+
* @param {string} stagingDir
|
|
84
|
+
*/
|
|
85
|
+
export function installNpm(family, stagingDir) {
|
|
86
|
+
return new NpmInstaller().install(family, stagingDir);
|
|
87
|
+
}
|
package/src/benchmark/report.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* records by `taskId`, and compute pass@k via the OpenAI HumanEval
|
|
4
4
|
* unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
|
|
5
5
|
*
|
|
6
|
-
* When `includeRuns` is true, each task carries per-run detail (
|
|
6
|
+
* When `includeRuns` is true, each task carries per-run detail (invariant
|
|
7
7
|
* checks, judge commentary, cost, duration) and the text renderer produces
|
|
8
8
|
* a full markdown report instead of just the pass@k table.
|
|
9
9
|
*
|
|
@@ -22,7 +22,7 @@ import { validateResultRecord } from "./result.js";
|
|
|
22
22
|
* @typedef {object} RunDetail
|
|
23
23
|
* @property {number} runIndex
|
|
24
24
|
* @property {"pass"|"fail"} verdict
|
|
25
|
-
* @property {{verdict: string, details: unknown[], exitCode: number}} [
|
|
25
|
+
* @property {{verdict: string, details: unknown[], exitCode: number}} [invariants]
|
|
26
26
|
* @property {{verdict: string, summary: string}} [judgeVerdict]
|
|
27
27
|
* @property {number} costUsd
|
|
28
28
|
* @property {number} turns
|
|
@@ -112,7 +112,7 @@ function buildRunDetail(r, acc) {
|
|
|
112
112
|
return {
|
|
113
113
|
runIndex: r.runIndex,
|
|
114
114
|
verdict: r.verdict,
|
|
115
|
-
...(r.
|
|
115
|
+
...(r.invariants && { invariants: r.invariants }),
|
|
116
116
|
...(r.judgeVerdict && { judgeVerdict: r.judgeVerdict }),
|
|
117
117
|
costUsd: r.costUsd ?? 0,
|
|
118
118
|
turns: r.turns ?? 0,
|
|
@@ -262,7 +262,7 @@ function renderTaskDetail(task) {
|
|
|
262
262
|
|
|
263
263
|
lines.push("", renderRunsTable(runs));
|
|
264
264
|
|
|
265
|
-
const checks =
|
|
265
|
+
const checks = renderInvariantChecks(runs, singleRun);
|
|
266
266
|
if (checks) lines.push("", checks);
|
|
267
267
|
|
|
268
268
|
const commentary = renderJudgeCommentary(runs, singleRun);
|
|
@@ -278,7 +278,7 @@ function renderRunsTable(runs) {
|
|
|
278
278
|
const header = [
|
|
279
279
|
"Run",
|
|
280
280
|
"Verdict",
|
|
281
|
-
"
|
|
281
|
+
"Invariants",
|
|
282
282
|
"Judge",
|
|
283
283
|
"Cost",
|
|
284
284
|
"Turns",
|
|
@@ -286,10 +286,10 @@ function renderRunsTable(runs) {
|
|
|
286
286
|
];
|
|
287
287
|
const rows = [header, header.map(() => "---")];
|
|
288
288
|
for (const r of runs) {
|
|
289
|
-
const
|
|
289
|
+
const invariantsCell = r.preflightError
|
|
290
290
|
? "preflight error"
|
|
291
|
-
: r.
|
|
292
|
-
? statusIcon(r.
|
|
291
|
+
: r.invariants
|
|
292
|
+
? statusIcon(r.invariants.verdict === "pass")
|
|
293
293
|
: "—";
|
|
294
294
|
const judgeCell = r.preflightError
|
|
295
295
|
? "—"
|
|
@@ -299,7 +299,7 @@ function renderRunsTable(runs) {
|
|
|
299
299
|
rows.push([
|
|
300
300
|
String(r.runIndex),
|
|
301
301
|
statusIcon(r.verdict === "pass"),
|
|
302
|
-
|
|
302
|
+
invariantsCell,
|
|
303
303
|
judgeCell,
|
|
304
304
|
formatCost(r.costUsd),
|
|
305
305
|
String(r.turns),
|
|
@@ -309,15 +309,15 @@ function renderRunsTable(runs) {
|
|
|
309
309
|
return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
|
|
310
310
|
}
|
|
311
311
|
|
|
312
|
-
function renderScoringChecks(runs, singleRun) {
|
|
313
|
-
const rows =
|
|
312
|
+
function renderInvariantChecks(runs, singleRun) {
|
|
313
|
+
const rows = collectInvariantRows(runs);
|
|
314
314
|
if (!rows.length) return null;
|
|
315
315
|
|
|
316
316
|
const header = singleRun
|
|
317
317
|
? ["Check", "Result", "Message"]
|
|
318
318
|
: ["Run", "Check", "Result", "Message"];
|
|
319
319
|
const lines = [
|
|
320
|
-
"####
|
|
320
|
+
"#### Invariant Checks",
|
|
321
321
|
"",
|
|
322
322
|
`| ${header.join(" | ")} |`,
|
|
323
323
|
`| ${header.map(() => "---").join(" | ")} |`,
|
|
@@ -331,11 +331,11 @@ function renderScoringChecks(runs, singleRun) {
|
|
|
331
331
|
return lines.join("\n");
|
|
332
332
|
}
|
|
333
333
|
|
|
334
|
-
function
|
|
334
|
+
function collectInvariantRows(runs) {
|
|
335
335
|
const rows = [];
|
|
336
336
|
for (const r of runs) {
|
|
337
|
-
if (!r.
|
|
338
|
-
for (const d of r.
|
|
337
|
+
if (!r.invariants?.details?.length) continue;
|
|
338
|
+
for (const d of r.invariants.details) {
|
|
339
339
|
rows.push({
|
|
340
340
|
run: r.runIndex,
|
|
341
341
|
check: escapeCell(String(d.test ?? "(unnamed)")),
|
package/src/benchmark/result.js
CHANGED
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Two schemas live here:
|
|
5
5
|
* - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
|
|
6
|
-
* benchmark run. Has a happy branch (
|
|
7
|
-
* pre-flight-failure branch (
|
|
8
|
-
* -
|
|
9
|
-
* ad-hoc grading without a full lifecycle.
|
|
6
|
+
* benchmark run. Has a happy branch (invariants + judge present) and a
|
|
7
|
+
* pre-flight-failure branch (invariants/judgeVerdict/submission absent).
|
|
8
|
+
* - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`
|
|
9
|
+
* (P7): ad-hoc grading without a full lifecycle.
|
|
10
10
|
*
|
|
11
11
|
* Validation is throw-on-mismatch so the runner can wrap every JSONL append
|
|
12
12
|
* in a guard and reject schema drift at write time.
|
|
@@ -16,7 +16,7 @@ import { z } from "zod";
|
|
|
16
16
|
|
|
17
17
|
const VERDICT_ENUM = z.enum(["pass", "fail"]);
|
|
18
18
|
|
|
19
|
-
const
|
|
19
|
+
const INVARIANTS_SHAPE = z.object({
|
|
20
20
|
verdict: VERDICT_ENUM,
|
|
21
21
|
details: z.array(z.unknown()),
|
|
22
22
|
exitCode: z.number().int(),
|
|
@@ -63,7 +63,7 @@ const AGENT_ERROR_SHAPE = z.object({
|
|
|
63
63
|
|
|
64
64
|
const HAPPY_RECORD = z.object({
|
|
65
65
|
...COMMON_FIELDS,
|
|
66
|
-
|
|
66
|
+
invariants: INVARIANTS_SHAPE,
|
|
67
67
|
submission: z.string(),
|
|
68
68
|
judgeVerdict: JUDGE_VERDICT_SHAPE.optional(),
|
|
69
69
|
agentTracePath: z.string(),
|
|
@@ -83,7 +83,7 @@ const PREFLIGHT_RECORD = z.object({
|
|
|
83
83
|
agentTracePath: z.string(),
|
|
84
84
|
supervisorTracePath: z.string(),
|
|
85
85
|
judgeTracePath: z.string(),
|
|
86
|
-
|
|
86
|
+
invariants: z.undefined().optional(),
|
|
87
87
|
submission: z.undefined().optional(),
|
|
88
88
|
judgeVerdict: z.undefined().optional(),
|
|
89
89
|
agentError: z.undefined().optional(),
|
|
@@ -91,9 +91,9 @@ const PREFLIGHT_RECORD = z.object({
|
|
|
91
91
|
|
|
92
92
|
export const RESULT_RECORD_SCHEMA = z.union([HAPPY_RECORD, PREFLIGHT_RECORD]);
|
|
93
93
|
|
|
94
|
-
export const
|
|
94
|
+
export const INVARIANTS_RECORD_SCHEMA = z.object({
|
|
95
95
|
taskId: z.string().min(1),
|
|
96
|
-
|
|
96
|
+
invariants: INVARIANTS_SHAPE,
|
|
97
97
|
exitCode: z.number().int(),
|
|
98
98
|
});
|
|
99
99
|
|
|
@@ -109,6 +109,6 @@ export function validateResultRecord(record) {
|
|
|
109
109
|
* Throw on schema mismatch.
|
|
110
110
|
* @param {object} record
|
|
111
111
|
*/
|
|
112
|
-
export function
|
|
113
|
-
|
|
112
|
+
export function validateInvariantsRecord(record) {
|
|
113
|
+
INVARIANTS_RECORD_SCHEMA.parse(record);
|
|
114
114
|
}
|