@forwardimpact/libeval 0.1.50 → 0.1.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -8
- package/bin/fit-benchmark.js +26 -27
- package/bin/fit-eval.js +49 -30
- package/bin/fit-trace.js +83 -57
- package/package.json +1 -1
- package/src/agent-runner.js +20 -12
- package/src/benchmark/env-loader.js +35 -23
- package/src/benchmark/{scorer.js → invariants.js} +14 -12
- package/src/benchmark/judge.js +5 -8
- package/src/benchmark/report.js +15 -15
- package/src/benchmark/result.js +11 -11
- package/src/benchmark/runner.js +11 -11
- package/src/benchmark/task-family.js +6 -4
- package/src/benchmark/workdir.js +18 -3
- package/src/commands/assert.js +30 -22
- package/src/commands/benchmark-invariants.js +74 -0
- package/src/commands/benchmark-report.js +23 -15
- package/src/commands/benchmark-run.js +15 -8
- package/src/commands/by-discussion.js +29 -18
- package/src/commands/callback.js +20 -11
- package/src/commands/discuss.js +28 -11
- package/src/commands/facilitate.js +18 -12
- package/src/commands/output.js +11 -12
- package/src/commands/run.js +22 -12
- package/src/commands/supervise.js +27 -18
- package/src/commands/task-input.js +10 -5
- package/src/commands/trace.js +174 -97
- package/src/discuss-tools.js +48 -2
- package/src/discusser.js +49 -2
- package/src/events/github.js +27 -5
- package/src/inbox-poller.js +84 -0
- package/src/judge.js +1 -1
- package/src/message-bus.js +6 -0
- package/src/orchestration-loop.js +14 -4
- package/src/orchestration-toolkit.js +14 -0
- package/src/redaction.js +31 -9
- package/src/reply-emitter.js +47 -0
- package/src/commands/benchmark-score.js +0 -68
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
* AND rendered (with resolved values) into the agent working directory.
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
|
-
import { readFile, writeFile } from "node:fs/promises";
|
|
18
17
|
import { join } from "node:path";
|
|
19
18
|
|
|
20
19
|
const ENV_FILES = [".env.local", ".env"];
|
|
@@ -48,12 +47,13 @@ export function parseEnvFile(content) {
|
|
|
48
47
|
|
|
49
48
|
/**
|
|
50
49
|
* Read and parse an env file, returning [] if the file does not exist.
|
|
50
|
+
* @param {object} fs - Async filesystem surface (`runtime.fs`).
|
|
51
51
|
* @param {string} filePath
|
|
52
52
|
* @returns {Promise<Array<{key: string, value: string}>>}
|
|
53
53
|
*/
|
|
54
|
-
async function readEnvFile(filePath) {
|
|
54
|
+
async function readEnvFile(fs, filePath) {
|
|
55
55
|
try {
|
|
56
|
-
const content = await readFile(filePath, "utf8");
|
|
56
|
+
const content = await fs.readFile(filePath, "utf8");
|
|
57
57
|
return parseEnvFile(content);
|
|
58
58
|
} catch (e) {
|
|
59
59
|
if (e.code === "ENOENT") return [];
|
|
@@ -62,32 +62,36 @@ async function readEnvFile(filePath) {
|
|
|
62
62
|
}
|
|
63
63
|
|
|
64
64
|
/**
|
|
65
|
-
* Load entries into process
|
|
65
|
+
* Load entries into the process env map. Existing keys are never overwritten.
|
|
66
|
+
* @param {Record<string, string|undefined>} env - The `runtime.proc.env` map.
|
|
66
67
|
* @param {Array<{key: string, value: string}>} entries
|
|
67
68
|
* @returns {string[]} var names that were loaded
|
|
68
69
|
*/
|
|
69
|
-
function applyToProcessEnv(entries) {
|
|
70
|
+
function applyToProcessEnv(env, entries) {
|
|
70
71
|
const names = [];
|
|
71
72
|
for (const { key, value } of entries) {
|
|
72
73
|
names.push(key);
|
|
73
|
-
if (
|
|
74
|
-
|
|
74
|
+
if (env[key] === undefined) {
|
|
75
|
+
env[key] = value;
|
|
75
76
|
}
|
|
76
77
|
}
|
|
77
78
|
return names;
|
|
78
79
|
}
|
|
79
80
|
|
|
80
81
|
/**
|
|
81
|
-
* Load one env file: apply to
|
|
82
|
+
* Load one env file: apply to the env map, record keys in the merged map.
|
|
83
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
82
84
|
* @param {string} dir
|
|
83
85
|
* @param {string} file
|
|
84
86
|
* @param {Set<string>} names
|
|
85
87
|
* @param {Map<string, Map<string, true>>} merged
|
|
86
88
|
*/
|
|
87
|
-
async function loadOneEnvFile(dir, file, names, merged) {
|
|
88
|
-
const entries = await readEnvFile(join(dir, file));
|
|
89
|
+
async function loadOneEnvFile(runtime, dir, file, names, merged) {
|
|
90
|
+
const entries = await readEnvFile(runtime.fs, join(dir, file));
|
|
89
91
|
if (entries.length === 0) return;
|
|
90
|
-
for (const name of applyToProcessEnv(entries))
|
|
92
|
+
for (const name of applyToProcessEnv(runtime.proc.env, entries)) {
|
|
93
|
+
names.add(name);
|
|
94
|
+
}
|
|
91
95
|
if (!merged.has(file)) merged.set(file, new Map());
|
|
92
96
|
const fileMap = merged.get(file);
|
|
93
97
|
for (const { key } of entries) {
|
|
@@ -96,17 +100,18 @@ async function loadOneEnvFile(dir, file, names, merged) {
|
|
|
96
100
|
}
|
|
97
101
|
|
|
98
102
|
/**
|
|
99
|
-
* Scan directories for env files, load into
|
|
103
|
+
* Scan directories for env files, load into the env map, and collect
|
|
100
104
|
* a merged key manifest per filename.
|
|
105
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
101
106
|
* @param {string[]} dirs
|
|
102
107
|
* @returns {Promise<{names: Set<string>, merged: Map<string, Map<string, true>>}>}
|
|
103
108
|
*/
|
|
104
|
-
async function collectEnvEntries(dirs) {
|
|
109
|
+
async function collectEnvEntries(runtime, dirs) {
|
|
105
110
|
const names = new Set();
|
|
106
111
|
const merged = new Map();
|
|
107
112
|
for (const dir of dirs) {
|
|
108
113
|
for (const file of ENV_FILES) {
|
|
109
|
-
await loadOneEnvFile(dir, file, names, merged);
|
|
114
|
+
await loadOneEnvFile(runtime, dir, file, names, merged);
|
|
110
115
|
}
|
|
111
116
|
}
|
|
112
117
|
return { names, merged };
|
|
@@ -114,17 +119,22 @@ async function collectEnvEntries(dirs) {
|
|
|
114
119
|
|
|
115
120
|
/**
|
|
116
121
|
* Write resolved env files into the agent CWD and warn about empty values.
|
|
122
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
117
123
|
* @param {Map<string, Map<string, true>>} merged
|
|
118
124
|
* @param {string} agentCwd
|
|
119
125
|
*/
|
|
120
|
-
async function renderEnvFiles(merged, agentCwd) {
|
|
126
|
+
async function renderEnvFiles(runtime, merged, agentCwd) {
|
|
127
|
+
const env = runtime.proc.env;
|
|
121
128
|
for (const [file, keyMap] of merged) {
|
|
122
129
|
const keys = [...keyMap.keys()];
|
|
123
|
-
const resolved = keys.map((key) => `${key}=${
|
|
124
|
-
await writeFile(
|
|
125
|
-
|
|
130
|
+
const resolved = keys.map((key) => `${key}=${env[key] ?? ""}`);
|
|
131
|
+
await runtime.fs.writeFile(
|
|
132
|
+
join(agentCwd, file),
|
|
133
|
+
resolved.join("\n") + "\n",
|
|
134
|
+
);
|
|
135
|
+
const empty = keys.filter((key) => !env[key]);
|
|
126
136
|
if (empty.length > 0) {
|
|
127
|
-
|
|
137
|
+
runtime.proc.stderr.write(
|
|
128
138
|
`libeval: env warning: ${file} declares vars with no value: ${empty.join(", ")}\n`,
|
|
129
139
|
);
|
|
130
140
|
}
|
|
@@ -133,14 +143,16 @@ async function renderEnvFiles(merged, agentCwd) {
|
|
|
133
143
|
|
|
134
144
|
/**
|
|
135
145
|
* Discover `.env` / `.env.local` in one or more directories, load them
|
|
136
|
-
* into process
|
|
146
|
+
* into the process env map, and render the resolved values into the agent CWD.
|
|
137
147
|
*
|
|
138
148
|
* @param {string[]} dirs - Directories to scan (family root, task dir, etc.)
|
|
139
149
|
* @param {string} agentCwd - Agent working directory to render into.
|
|
150
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime - Ambient
|
|
151
|
+
* collaborators; uses `fs` (async read/write), `proc.env`, `proc.stderr`.
|
|
140
152
|
* @returns {Promise<string[]>} All var names discovered (for redaction).
|
|
141
153
|
*/
|
|
142
|
-
export async function loadEnv(dirs, agentCwd) {
|
|
143
|
-
const { names, merged } = await collectEnvEntries(dirs);
|
|
144
|
-
await renderEnvFiles(merged, agentCwd);
|
|
154
|
+
export async function loadEnv(dirs, agentCwd, runtime) {
|
|
155
|
+
const { names, merged } = await collectEnvEntries(runtime, dirs);
|
|
156
|
+
await renderEnvFiles(runtime, merged, agentCwd);
|
|
145
157
|
return [...names];
|
|
146
158
|
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
* the post-run agent CWD. The exit code is authoritative for the
|
|
4
|
-
* structured per-
|
|
2
|
+
* Invariants — runs `<task.paths.hooks>/invariants.sh` from the template path
|
|
3
|
+
* against the post-run agent CWD. The exit code is authoritative for the
|
|
4
|
+
* verdict; structured per-check rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
7
|
import { spawn } from "node:child_process";
|
|
@@ -15,31 +15,33 @@ import {
|
|
|
15
15
|
import { join } from "node:path";
|
|
16
16
|
|
|
17
17
|
/**
|
|
18
|
-
* @typedef {object}
|
|
18
|
+
* @typedef {object} InvariantsResult
|
|
19
19
|
* @property {"pass" | "fail"} verdict
|
|
20
20
|
* @property {Array<object>} details
|
|
21
21
|
* @property {number} exitCode
|
|
22
22
|
*/
|
|
23
23
|
|
|
24
24
|
/**
|
|
25
|
-
* Run the task's
|
|
25
|
+
* Run the task's invariants script.
|
|
26
26
|
* @param {import("./task-family.js").Task} task
|
|
27
27
|
* @param {{cwd: string, port: number, runDir: string}} ctx
|
|
28
|
-
* @returns {Promise<
|
|
28
|
+
* @returns {Promise<InvariantsResult>}
|
|
29
29
|
*/
|
|
30
|
-
export function
|
|
31
|
-
if (!task.paths.
|
|
30
|
+
export function runInvariants(task, ctx) {
|
|
31
|
+
if (!task.paths.invariants) {
|
|
32
32
|
return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
|
|
33
33
|
}
|
|
34
34
|
return new Promise((res, rej) => {
|
|
35
|
-
const script = task.paths.
|
|
36
|
-
const stderrLog = createWriteStream(
|
|
35
|
+
const script = task.paths.invariants;
|
|
36
|
+
const stderrLog = createWriteStream(
|
|
37
|
+
join(ctx.runDir, "invariants.stderr.log"),
|
|
38
|
+
);
|
|
37
39
|
|
|
38
40
|
// Bun's child_process pipe setup for fd >= 3 is racy under load (it
|
|
39
41
|
// creates a unix socket pair and the connect() can return ENOENT). Use
|
|
40
42
|
// a temp file as the fd-3 backing store instead — the script still
|
|
41
43
|
// writes via `$RESULTS_FD`, but we hand it a real file descriptor.
|
|
42
|
-
const fd3Path = join(ctx.runDir, "
|
|
44
|
+
const fd3Path = join(ctx.runDir, "invariants.fd3.ndjson");
|
|
43
45
|
let fd3File;
|
|
44
46
|
try {
|
|
45
47
|
fd3File = openSync(fd3Path, "w+");
|
|
@@ -63,7 +65,7 @@ export function runScoring(task, ctx) {
|
|
|
63
65
|
} catch {
|
|
64
66
|
// already closed
|
|
65
67
|
}
|
|
66
|
-
rej(new Error(`failed to spawn
|
|
68
|
+
rej(new Error(`failed to spawn invariants script: ${script}`));
|
|
67
69
|
return;
|
|
68
70
|
}
|
|
69
71
|
|
package/src/benchmark/judge.js
CHANGED
|
@@ -9,13 +9,11 @@
|
|
|
9
9
|
* {{AGENT_INSTRUCTIONS}} — contents of agent.task.md
|
|
10
10
|
* {{AGENT_PROFILE}} — agent profile body (empty string if none)
|
|
11
11
|
* {{AGENT_TRACE_PATH}} — path to agent.ndjson
|
|
12
|
-
* {{
|
|
12
|
+
* {{INVARIANTS_RESULT}} — JSON invariants object
|
|
13
13
|
* {{SKILL_SET_HASH}} — SHA-256 from apm.lock.yaml
|
|
14
14
|
* {{TASK_ID}} — task name (directory under tasks/)
|
|
15
15
|
* {{TASK_DIR}} — agent working directory path
|
|
16
16
|
*
|
|
17
|
-
* Legacy alias: {{SCORING}} is accepted as an alias for {{SCORING_RESULT}}.
|
|
18
|
-
*
|
|
19
17
|
* The judge verdict is captured from the orchestration context's
|
|
20
18
|
* `concluded` flag directly — no trace parsing on the happy path.
|
|
21
19
|
* `parseConcludeFromTrace` is preserved for offline analysis and as a
|
|
@@ -46,17 +44,16 @@ import { createRedactor } from "../redaction.js";
|
|
|
46
44
|
* Run the judge over a completed task run.
|
|
47
45
|
* @param {import("./task-family.js").Task} task
|
|
48
46
|
* @param {import("./workdir.js").Workdir} workdir
|
|
49
|
-
* @param {import("./
|
|
47
|
+
* @param {import("./invariants.js").InvariantsResult} invariants
|
|
50
48
|
* @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string}} deps
|
|
51
49
|
* @param {JudgeContext} [context]
|
|
52
50
|
* @returns {Promise<JudgeVerdict>}
|
|
53
51
|
*/
|
|
54
|
-
export async function runJudge(task, workdir,
|
|
52
|
+
export async function runJudge(task, workdir, invariants, deps, context) {
|
|
55
53
|
const template = await readFile(task.paths.judge, "utf8");
|
|
56
|
-
const
|
|
54
|
+
const invariantsJson = JSON.stringify(invariants, null, 2);
|
|
57
55
|
const taskText = template
|
|
58
|
-
.replaceAll("{{
|
|
59
|
-
.replaceAll("{{SCORING}}", scoringJson)
|
|
56
|
+
.replaceAll("{{INVARIANTS_RESULT}}", invariantsJson)
|
|
60
57
|
.replaceAll("{{AGENT_TRACE_PATH}}", workdir.agentTracePath)
|
|
61
58
|
.replaceAll("{{AGENT_INSTRUCTIONS}}", context?.agentInstructions ?? "")
|
|
62
59
|
.replaceAll("{{AGENT_PROFILE}}", context?.agentProfile ?? "")
|
package/src/benchmark/report.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* records by `taskId`, and compute pass@k via the OpenAI HumanEval
|
|
4
4
|
* unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
|
|
5
5
|
*
|
|
6
|
-
* When `includeRuns` is true, each task carries per-run detail (
|
|
6
|
+
* When `includeRuns` is true, each task carries per-run detail (invariant
|
|
7
7
|
* checks, judge commentary, cost, duration) and the text renderer produces
|
|
8
8
|
* a full markdown report instead of just the pass@k table.
|
|
9
9
|
*
|
|
@@ -22,7 +22,7 @@ import { validateResultRecord } from "./result.js";
|
|
|
22
22
|
* @typedef {object} RunDetail
|
|
23
23
|
* @property {number} runIndex
|
|
24
24
|
* @property {"pass"|"fail"} verdict
|
|
25
|
-
* @property {{verdict: string, details: unknown[], exitCode: number}} [
|
|
25
|
+
* @property {{verdict: string, details: unknown[], exitCode: number}} [invariants]
|
|
26
26
|
* @property {{verdict: string, summary: string}} [judgeVerdict]
|
|
27
27
|
* @property {number} costUsd
|
|
28
28
|
* @property {number} turns
|
|
@@ -112,7 +112,7 @@ function buildRunDetail(r, acc) {
|
|
|
112
112
|
return {
|
|
113
113
|
runIndex: r.runIndex,
|
|
114
114
|
verdict: r.verdict,
|
|
115
|
-
...(r.
|
|
115
|
+
...(r.invariants && { invariants: r.invariants }),
|
|
116
116
|
...(r.judgeVerdict && { judgeVerdict: r.judgeVerdict }),
|
|
117
117
|
costUsd: r.costUsd ?? 0,
|
|
118
118
|
turns: r.turns ?? 0,
|
|
@@ -262,7 +262,7 @@ function renderTaskDetail(task) {
|
|
|
262
262
|
|
|
263
263
|
lines.push("", renderRunsTable(runs));
|
|
264
264
|
|
|
265
|
-
const checks =
|
|
265
|
+
const checks = renderInvariantChecks(runs, singleRun);
|
|
266
266
|
if (checks) lines.push("", checks);
|
|
267
267
|
|
|
268
268
|
const commentary = renderJudgeCommentary(runs, singleRun);
|
|
@@ -278,7 +278,7 @@ function renderRunsTable(runs) {
|
|
|
278
278
|
const header = [
|
|
279
279
|
"Run",
|
|
280
280
|
"Verdict",
|
|
281
|
-
"
|
|
281
|
+
"Invariants",
|
|
282
282
|
"Judge",
|
|
283
283
|
"Cost",
|
|
284
284
|
"Turns",
|
|
@@ -286,10 +286,10 @@ function renderRunsTable(runs) {
|
|
|
286
286
|
];
|
|
287
287
|
const rows = [header, header.map(() => "---")];
|
|
288
288
|
for (const r of runs) {
|
|
289
|
-
const
|
|
289
|
+
const invariantsCell = r.preflightError
|
|
290
290
|
? "preflight error"
|
|
291
|
-
: r.
|
|
292
|
-
? statusIcon(r.
|
|
291
|
+
: r.invariants
|
|
292
|
+
? statusIcon(r.invariants.verdict === "pass")
|
|
293
293
|
: "—";
|
|
294
294
|
const judgeCell = r.preflightError
|
|
295
295
|
? "—"
|
|
@@ -299,7 +299,7 @@ function renderRunsTable(runs) {
|
|
|
299
299
|
rows.push([
|
|
300
300
|
String(r.runIndex),
|
|
301
301
|
statusIcon(r.verdict === "pass"),
|
|
302
|
-
|
|
302
|
+
invariantsCell,
|
|
303
303
|
judgeCell,
|
|
304
304
|
formatCost(r.costUsd),
|
|
305
305
|
String(r.turns),
|
|
@@ -309,15 +309,15 @@ function renderRunsTable(runs) {
|
|
|
309
309
|
return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
|
|
310
310
|
}
|
|
311
311
|
|
|
312
|
-
function
|
|
313
|
-
const rows =
|
|
312
|
+
function renderInvariantChecks(runs, singleRun) {
|
|
313
|
+
const rows = collectInvariantRows(runs);
|
|
314
314
|
if (!rows.length) return null;
|
|
315
315
|
|
|
316
316
|
const header = singleRun
|
|
317
317
|
? ["Check", "Result", "Message"]
|
|
318
318
|
: ["Run", "Check", "Result", "Message"];
|
|
319
319
|
const lines = [
|
|
320
|
-
"####
|
|
320
|
+
"#### Invariant Checks",
|
|
321
321
|
"",
|
|
322
322
|
`| ${header.join(" | ")} |`,
|
|
323
323
|
`| ${header.map(() => "---").join(" | ")} |`,
|
|
@@ -331,11 +331,11 @@ function renderScoringChecks(runs, singleRun) {
|
|
|
331
331
|
return lines.join("\n");
|
|
332
332
|
}
|
|
333
333
|
|
|
334
|
-
function
|
|
334
|
+
function collectInvariantRows(runs) {
|
|
335
335
|
const rows = [];
|
|
336
336
|
for (const r of runs) {
|
|
337
|
-
if (!r.
|
|
338
|
-
for (const d of r.
|
|
337
|
+
if (!r.invariants?.details?.length) continue;
|
|
338
|
+
for (const d of r.invariants.details) {
|
|
339
339
|
rows.push({
|
|
340
340
|
run: r.runIndex,
|
|
341
341
|
check: escapeCell(String(d.test ?? "(unnamed)")),
|
package/src/benchmark/result.js
CHANGED
|
@@ -3,10 +3,10 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Two schemas live here:
|
|
5
5
|
* - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
|
|
6
|
-
* benchmark run. Has a happy branch (
|
|
7
|
-
* pre-flight-failure branch (
|
|
8
|
-
* -
|
|
9
|
-
* ad-hoc grading without a full lifecycle.
|
|
6
|
+
* benchmark run. Has a happy branch (invariants + judge present) and a
|
|
7
|
+
* pre-flight-failure branch (invariants/judgeVerdict/submission absent).
|
|
8
|
+
* - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`
|
|
9
|
+
* (P7): ad-hoc grading without a full lifecycle.
|
|
10
10
|
*
|
|
11
11
|
* Validation is throw-on-mismatch so the runner can wrap every JSONL append
|
|
12
12
|
* in a guard and reject schema drift at write time.
|
|
@@ -16,7 +16,7 @@ import { z } from "zod";
|
|
|
16
16
|
|
|
17
17
|
const VERDICT_ENUM = z.enum(["pass", "fail"]);
|
|
18
18
|
|
|
19
|
-
const
|
|
19
|
+
const INVARIANTS_SHAPE = z.object({
|
|
20
20
|
verdict: VERDICT_ENUM,
|
|
21
21
|
details: z.array(z.unknown()),
|
|
22
22
|
exitCode: z.number().int(),
|
|
@@ -63,7 +63,7 @@ const AGENT_ERROR_SHAPE = z.object({
|
|
|
63
63
|
|
|
64
64
|
const HAPPY_RECORD = z.object({
|
|
65
65
|
...COMMON_FIELDS,
|
|
66
|
-
|
|
66
|
+
invariants: INVARIANTS_SHAPE,
|
|
67
67
|
submission: z.string(),
|
|
68
68
|
judgeVerdict: JUDGE_VERDICT_SHAPE.optional(),
|
|
69
69
|
agentTracePath: z.string(),
|
|
@@ -83,7 +83,7 @@ const PREFLIGHT_RECORD = z.object({
|
|
|
83
83
|
agentTracePath: z.string(),
|
|
84
84
|
supervisorTracePath: z.string(),
|
|
85
85
|
judgeTracePath: z.string(),
|
|
86
|
-
|
|
86
|
+
invariants: z.undefined().optional(),
|
|
87
87
|
submission: z.undefined().optional(),
|
|
88
88
|
judgeVerdict: z.undefined().optional(),
|
|
89
89
|
agentError: z.undefined().optional(),
|
|
@@ -91,9 +91,9 @@ const PREFLIGHT_RECORD = z.object({
|
|
|
91
91
|
|
|
92
92
|
export const RESULT_RECORD_SCHEMA = z.union([HAPPY_RECORD, PREFLIGHT_RECORD]);
|
|
93
93
|
|
|
94
|
-
export const
|
|
94
|
+
export const INVARIANTS_RECORD_SCHEMA = z.object({
|
|
95
95
|
taskId: z.string().min(1),
|
|
96
|
-
|
|
96
|
+
invariants: INVARIANTS_SHAPE,
|
|
97
97
|
exitCode: z.number().int(),
|
|
98
98
|
});
|
|
99
99
|
|
|
@@ -109,6 +109,6 @@ export function validateResultRecord(record) {
|
|
|
109
109
|
* Throw on schema mismatch.
|
|
110
110
|
* @param {object} record
|
|
111
111
|
*/
|
|
112
|
-
export function
|
|
113
|
-
|
|
112
|
+
export function validateInvariantsRecord(record) {
|
|
113
|
+
INVARIANTS_RECORD_SCHEMA.parse(record);
|
|
114
114
|
}
|
package/src/benchmark/runner.js
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Phases per (task, runIndex):
|
|
5
5
|
* 1. WorkdirManager.start → seed CWD + run pre-flight probe
|
|
6
6
|
* 2. Supervisor session (agent + supervisor) → produce traces + submission
|
|
7
|
-
* 3.
|
|
7
|
+
* 3. Invariants.runInvariants → exit-code-driven verdict via fd-3 NDJSON
|
|
8
8
|
* 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
|
|
9
9
|
* 5. WorkdirManager.teardown → process-group cleanup
|
|
10
10
|
*
|
|
@@ -25,7 +25,7 @@ import { installApm as defaultInstallApm } from "./apm-installer.js";
|
|
|
25
25
|
import { installNpm as defaultInstallNpm } from "./npm-installer.js";
|
|
26
26
|
import { runJudge } from "./judge.js";
|
|
27
27
|
import { validateResultRecord } from "./result.js";
|
|
28
|
-
import {
|
|
28
|
+
import { runInvariants } from "./invariants.js";
|
|
29
29
|
import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
|
|
30
30
|
import { createWorkdirManager } from "./workdir.js";
|
|
31
31
|
|
|
@@ -60,10 +60,10 @@ export class BenchmarkRunner {
|
|
|
60
60
|
* write a valid NDJSON trace to `workdir.agentTracePath`. Default uses
|
|
61
61
|
* `createAgentRunner` with the harness `BASE_TOOLS` allowlist. Internal
|
|
62
62
|
* testing only — not part of the public API.
|
|
63
|
-
* @param {Function} [opts.
|
|
64
|
-
* Same contract as `
|
|
63
|
+
* @param {Function} [opts.runInvariants] - Test seam: replaces `runInvariants`.
|
|
64
|
+
* Same contract as `runInvariants(task, ctx)`. Internal testing only.
|
|
65
65
|
* @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
|
|
66
|
-
* contract as `runJudge(task, workdir,
|
|
66
|
+
* contract as `runJudge(task, workdir, invariants, deps)`. Internal testing
|
|
67
67
|
* only.
|
|
68
68
|
* @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
|
|
69
69
|
* Same contract as `installApm(family, outputDir)`. Lets tests inject a
|
|
@@ -86,7 +86,7 @@ export class BenchmarkRunner {
|
|
|
86
86
|
termGraceMs,
|
|
87
87
|
// Test seams — default to the real implementations.
|
|
88
88
|
runAgent,
|
|
89
|
-
|
|
89
|
+
runInvariants: runInvariantsHook,
|
|
90
90
|
runJudge: runJudgeHook,
|
|
91
91
|
installApm: installApmHook,
|
|
92
92
|
installNpm: installNpmHook,
|
|
@@ -112,7 +112,7 @@ export class BenchmarkRunner {
|
|
|
112
112
|
this.maxTurns = maxTurns;
|
|
113
113
|
this.termGraceMs = termGraceMs;
|
|
114
114
|
this._runAgentHook = runAgent ?? null;
|
|
115
|
-
this.
|
|
115
|
+
this._runInvariantsHook = runInvariantsHook ?? runInvariants;
|
|
116
116
|
this._runJudgeHook = runJudgeHook ?? runJudge;
|
|
117
117
|
this._installApmHook = installApmHook ?? defaultInstallApm;
|
|
118
118
|
this._installNpmHook = installNpmHook ?? defaultInstallNpm;
|
|
@@ -191,7 +191,7 @@ export class BenchmarkRunner {
|
|
|
191
191
|
}
|
|
192
192
|
const { costUsd, turns, submission, agentError } =
|
|
193
193
|
await this.#runAgentSafe(task, workdir);
|
|
194
|
-
const
|
|
194
|
+
const invariants = await this._runInvariantsHook(task, {
|
|
195
195
|
cwd: workdir.cwd,
|
|
196
196
|
port: workdir.port,
|
|
197
197
|
runDir: workdir.runDir,
|
|
@@ -206,7 +206,7 @@ export class BenchmarkRunner {
|
|
|
206
206
|
judgeVerdict = await this._runJudgeHook(
|
|
207
207
|
task,
|
|
208
208
|
workdir,
|
|
209
|
-
|
|
209
|
+
invariants,
|
|
210
210
|
{
|
|
211
211
|
query: this.query,
|
|
212
212
|
model: this.judgeModel,
|
|
@@ -217,7 +217,7 @@ export class BenchmarkRunner {
|
|
|
217
217
|
);
|
|
218
218
|
}
|
|
219
219
|
const verdict =
|
|
220
|
-
|
|
220
|
+
invariants.verdict === "pass" &&
|
|
221
221
|
(judgeVerdict === null || judgeVerdict.verdict === "pass")
|
|
222
222
|
? "pass"
|
|
223
223
|
: "fail";
|
|
@@ -225,7 +225,7 @@ export class BenchmarkRunner {
|
|
|
225
225
|
taskId: task.id,
|
|
226
226
|
runIndex,
|
|
227
227
|
verdict,
|
|
228
|
-
|
|
228
|
+
invariants,
|
|
229
229
|
submission,
|
|
230
230
|
...(judgeVerdict && { judgeVerdict }),
|
|
231
231
|
costUsd,
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
* judge.task.md
|
|
10
10
|
* hooks/ # harness-only; never copied to agent CWD
|
|
11
11
|
* preflight.sh
|
|
12
|
-
*
|
|
12
|
+
* invariants.sh
|
|
13
13
|
* specs/ # copied into agent CWD
|
|
14
14
|
* workdir/ # copied into agent CWD
|
|
15
15
|
*
|
|
@@ -104,7 +104,7 @@ async function discoverTasks(rootPath) {
|
|
|
104
104
|
const supervisorPath = join(taskDir, "supervisor.task.md");
|
|
105
105
|
const judgePath = join(taskDir, "judge.task.md");
|
|
106
106
|
const preflightPath = join(taskDir, "hooks", "preflight.sh");
|
|
107
|
-
const
|
|
107
|
+
const invariantsPath = join(taskDir, "hooks", "invariants.sh");
|
|
108
108
|
tasks.push({
|
|
109
109
|
id: entry.name,
|
|
110
110
|
paths: {
|
|
@@ -114,7 +114,9 @@ async function discoverTasks(rootPath) {
|
|
|
114
114
|
judge: (await fileExists(judgePath)) ? judgePath : null,
|
|
115
115
|
hooks: join(taskDir, "hooks"),
|
|
116
116
|
preflight: (await fileExecutable(preflightPath)) ? preflightPath : null,
|
|
117
|
-
|
|
117
|
+
invariants: (await fileExecutable(invariantsPath))
|
|
118
|
+
? invariantsPath
|
|
119
|
+
: null,
|
|
118
120
|
specs: join(taskDir, "specs"),
|
|
119
121
|
workdir: join(taskDir, "workdir"),
|
|
120
122
|
},
|
|
@@ -236,7 +238,7 @@ function run(cmd, args) {
|
|
|
236
238
|
/**
|
|
237
239
|
* @typedef {object} Task
|
|
238
240
|
* @property {string} id - Task name (directory name under tasks/)
|
|
239
|
-
* @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null,
|
|
241
|
+
* @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null, invariants: string|null, specs: string, workdir: string}} paths
|
|
240
242
|
*/
|
|
241
243
|
|
|
242
244
|
/**
|
package/src/benchmark/workdir.js
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* the pre-flight smoke probe, and tear down the process group at end of run.
|
|
5
5
|
*
|
|
6
6
|
* The Workdir handle threads `cwd`, `port`, `pgid`, and trace paths through
|
|
7
|
-
* runAgent →
|
|
7
|
+
* runAgent → invariants → judge → teardown.
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
10
|
import { spawn } from "node:child_process";
|
|
@@ -13,6 +13,8 @@ import { createServer } from "node:net";
|
|
|
13
13
|
import { connect } from "node:net";
|
|
14
14
|
import { join } from "node:path";
|
|
15
15
|
|
|
16
|
+
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
17
|
+
|
|
16
18
|
import { loadEnv } from "./env-loader.js";
|
|
17
19
|
|
|
18
20
|
const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
@@ -38,13 +40,23 @@ export class WorkdirManager {
|
|
|
38
40
|
* @param {string} deps.stagingDir - Output of `installApm(...)`.
|
|
39
41
|
* @param {string} deps.runOutputDir - Root run-output directory (parent of `runs/`).
|
|
40
42
|
*/
|
|
41
|
-
constructor({
|
|
43
|
+
constructor({
|
|
44
|
+
stagingDir,
|
|
45
|
+
runOutputDir,
|
|
46
|
+
termGraceMs,
|
|
47
|
+
familyRootPath,
|
|
48
|
+
runtime,
|
|
49
|
+
}) {
|
|
42
50
|
if (!stagingDir) throw new Error("stagingDir is required");
|
|
43
51
|
if (!runOutputDir) throw new Error("runOutputDir is required");
|
|
44
52
|
this.stagingDir = stagingDir;
|
|
45
53
|
this.runOutputDir = runOutputDir;
|
|
46
54
|
this.termGraceMs = termGraceMs ?? DEFAULT_TERM_GRACE_MS;
|
|
47
55
|
this.familyRootPath = familyRootPath ?? null;
|
|
56
|
+
// `loadEnv` is the only collaborator routed through the runtime today; the
|
|
57
|
+
// rest of this manager still uses raw streaming/net/process-group APIs the
|
|
58
|
+
// runtime surface does not yet cover.
|
|
59
|
+
this.runtime = runtime ?? null;
|
|
48
60
|
}
|
|
49
61
|
|
|
50
62
|
/**
|
|
@@ -80,7 +92,10 @@ export class WorkdirManager {
|
|
|
80
92
|
...(this.familyRootPath ? [this.familyRootPath] : []),
|
|
81
93
|
...(task.paths.taskDir ? [task.paths.taskDir] : []),
|
|
82
94
|
];
|
|
83
|
-
const envNames =
|
|
95
|
+
const envNames =
|
|
96
|
+
envDirs.length > 0
|
|
97
|
+
? await loadEnv(envDirs, cwd, this.runtime ?? createDefaultRuntime())
|
|
98
|
+
: [];
|
|
84
99
|
|
|
85
100
|
const port = await allocatePort();
|
|
86
101
|
const agentTracePath = join(runDir, "agent.ndjson");
|