@forwardimpact/libeval 0.1.31 → 0.1.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -0
- package/bin/fit-benchmark.js +167 -0
- package/package.json +5 -3
- package/src/agent-runner.js +7 -1
- package/src/benchmark/apm-installer.js +39 -0
- package/src/benchmark/judge.js +146 -0
- package/src/benchmark/report.js +161 -0
- package/src/benchmark/result.js +108 -0
- package/src/benchmark/runner.js +396 -0
- package/src/benchmark/scorer.js +138 -0
- package/src/benchmark/task-family.js +259 -0
- package/src/benchmark/workdir.js +248 -0
- package/src/commands/benchmark-report.js +39 -0
- package/src/commands/benchmark-run.js +53 -0
- package/src/commands/benchmark-score.js +68 -0
- package/src/commands/facilitate.js +7 -0
- package/src/commands/run.js +9 -3
- package/src/commands/supervise.js +7 -0
- package/src/facilitator.js +35 -21
- package/src/index.js +9 -0
- package/src/judge.js +211 -0
- package/src/orchestration-toolkit.js +25 -0
- package/src/redaction.js +163 -0
- package/src/supervisor.js +29 -17
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Task-family loader. A task family is a directory tree laid out as:
|
|
3
|
+
* <root>/
|
|
4
|
+
* apm.lock.yaml
|
|
5
|
+
* .claude/ # pre-staged skills + agents (P1)
|
|
6
|
+
* tasks/<task_family_name>/<task_name>/
|
|
7
|
+
* instructions.md
|
|
8
|
+
* supervisor.task.md # preserved for v2; not read in v1
|
|
9
|
+
* judge.task.md
|
|
10
|
+
* specs/ # copied into agent CWD
|
|
11
|
+
* workdir/ # copied into agent CWD (excludes scripts/)
|
|
12
|
+
* scripts/preflight.sh
|
|
13
|
+
* scoring/ # template-only; never copied
|
|
14
|
+
*
|
|
15
|
+
* Local paths or git URLs are both accepted; git URLs are shallow-cloned into
|
|
16
|
+
* a temp dir and `familyRevision` becomes `git:<sha>` of HEAD at clone time.
|
|
17
|
+
* Local paths use the canonical-tree algorithm from design § Family revision
|
|
18
|
+
* algorithm so the result is stable across operating systems.
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
import { spawn } from "node:child_process";
|
|
22
|
+
import { createHash } from "node:crypto";
|
|
23
|
+
import {
|
|
24
|
+
access,
|
|
25
|
+
lstat,
|
|
26
|
+
mkdtemp,
|
|
27
|
+
readdir,
|
|
28
|
+
readFile,
|
|
29
|
+
realpath,
|
|
30
|
+
} from "node:fs/promises";
|
|
31
|
+
import { tmpdir } from "node:os";
|
|
32
|
+
import { join, posix, relative, resolve, sep } from "node:path";
|
|
33
|
+
|
|
34
|
+
const GIT_URL_RE = /^(git@|https?:\/\/|ssh:\/\/|git:\/\/)/;
|
|
35
|
+
const SKIP_DIRS = new Set([".git", "node_modules"]);
|
|
36
|
+
|
|
37
|
+
/**
 * Resolve a task family from either a local directory or a git remote.
 *
 * Inputs matching `GIT_URL_RE` are cloned into a fresh temp directory and
 * tagged `git:<HEAD sha>`. Anything else is resolved as a local path and
 * tagged `sha256:<canonical tree hash>`.
 *
 * @param {string} rootPathOrGitUrl
 * @returns {Promise<TaskFamily>}
 */
export async function loadTaskFamily(rootPathOrGitUrl) {
  let rootPath;
  let familyRevision;

  if (!GIT_URL_RE.test(rootPathOrGitUrl)) {
    rootPath = resolve(rootPathOrGitUrl);
    const treeHash = await canonicalTreeHash(rootPath);
    familyRevision = "sha256:" + treeHash;
  } else {
    // The clone target doubles as the family root.
    rootPath = await mkdtemp(join(tmpdir(), "fit-benchmark-family-"));
    await gitClone(rootPathOrGitUrl, rootPath);
    const headSha = await gitHead(rootPath);
    familyRevision = "git:" + headSha;
  }

  // Lockfile is read before task discovery, so a missing lockfile fails first.
  const apmLockBytes = await readApmLockBytes(rootPath);
  const discovered = await discoverTasks(rootPath);

  return {
    rootPath,
    familyRevision,
    apmLockBytes,
    tasks: () => discovered,
  };
}
|
|
68
|
+
|
|
69
|
+
/**
 * Verify that the judge profile file
 * `<stagingDir>/.claude/agents/<judgeProfile>.md` is accessible, and throw a
 * descriptive error when it is not.
 *
 * @param {TaskFamily} _family - Unused; kept for the call signature.
 * @param {string} stagingDir
 * @param {string} judgeProfile
 * @returns {Promise<void>}
 * @throws {Error} When the profile file cannot be accessed.
 */
export async function assertJudgeProfileStaged(
  _family,
  stagingDir,
  judgeProfile,
) {
  const profilePath = join(
    stagingDir,
    ".claude",
    "agents",
    `${judgeProfile}.md`,
  );
  const staged = await access(profilePath).then(
    () => true,
    () => false,
  );
  if (staged) return;
  throw new Error(
    `judge profile not staged: ${profilePath} (createSupervisor resolves profiles relative to <supervisorCwd>/.claude/agents)`,
  );
}
|
|
92
|
+
|
|
93
|
+
/**
 * Read `<rootPath>/apm.lock.yaml` and return its bytes with CRLF collapsed
 * to LF.
 *
 * @param {string} rootPath
 * @returns {Promise<Buffer>}
 * @throws {Error} A descriptive error when the lockfile does not exist; any
 *   other read failure is rethrown unchanged.
 */
async function readApmLockBytes(rootPath) {
  const lockPath = join(rootPath, "apm.lock.yaml");
  let raw;
  try {
    raw = await readFile(lockPath);
  } catch (err) {
    if (err.code !== "ENOENT") throw err;
    throw new Error(
      `task family missing apm.lock.yaml at ${lockPath} (matches libpack stager.js:126; .yml is not accepted)`,
    );
  }
  return normalizeLf(raw);
}
|
|
107
|
+
|
|
108
|
+
/**
 * Drop every CR byte that is immediately followed by LF, so lockfiles
 * authored with CRLF line endings produce the same bytes as LF-only ones.
 * A lone CR (not followed by LF) is kept.
 *
 * @param {Buffer} buf
 * @returns {Buffer} A new buffer; the input is not modified.
 */
function normalizeLf(buf) {
  const CR = 0x0d;
  const LF = 0x0a;
  const kept = Array.from(buf).filter(
    (byte, idx, all) => !(byte === CR && all[idx + 1] === LF),
  );
  return Buffer.from(kept);
}
|
|
121
|
+
|
|
122
|
+
/**
 * Enumerate tasks under `<rootPath>/tasks/<family>/<task>/`.
 *
 * Only directory entries are considered at both levels. A missing `tasks/`
 * directory yields an empty list. The result is sorted by task id.
 *
 * @param {string} rootPath
 * @returns {Promise<Task[]>}
 */
async function discoverTasks(rootPath) {
  const tasksRoot = join(rootPath, "tasks");
  const found = [];

  let familyEntries;
  try {
    familyEntries = await readdir(tasksRoot, { withFileTypes: true });
  } catch (err) {
    if (err.code === "ENOENT") return found;
    throw err;
  }

  const familyDirs = familyEntries.filter((dirent) => dirent.isDirectory());
  for (const { name: familyName } of familyDirs) {
    const familyDir = join(tasksRoot, familyName);
    const children = await readdir(familyDir, { withFileTypes: true });
    for (const child of children) {
      if (!child.isDirectory()) continue;
      const taskDir = join(familyDir, child.name);
      const inTask = (leaf) => join(taskDir, leaf);
      found.push({
        id: `${familyName}/${child.name}`,
        paths: {
          instructions: inTask("instructions.md"),
          supervisor: inTask("supervisor.task.md"),
          judge: inTask("judge.task.md"),
          specs: inTask("specs"),
          workdir: inTask("workdir"),
          scoring: inTask("scoring"),
        },
      });
    }
  }

  found.sort((left, right) => {
    if (left.id < right.id) return -1;
    if (left.id > right.id) return 1;
    return 0;
  });
  return found;
}
|
|
155
|
+
|
|
156
|
+
/**
 * Compute the canonical-tree hash of a directory:
 *   1. enumerate files via `walkFiles` from the real path of `rootPath`
 *   2. for each file, take its root-relative path (POSIX separators,
 *      NFC-normalised) and the sha256 of the bytes at its resolved real path
 *   3. sort rows by relative path
 *   4. hash the concatenation of `<rel-path>\0<hex-sha256>\n` rows
 *
 * @param {string} rootPath
 * @returns {Promise<string>} hex digest
 */
async function canonicalTreeHash(rootPath) {
  const root = await realpath(rootPath);

  const rows = [];
  for await (const file of walkFiles(root)) {
    const rel = toPosix(relative(root, file)).normalize("NFC");
    // Symlinked files contribute the content of their target.
    const contents = await readFile(await realpath(file));
    const hex = createHash("sha256").update(contents).digest("hex");
    rows.push({ rel, hex });
  }

  rows.sort((left, right) => {
    if (left.rel < right.rel) return -1;
    if (left.rel > right.rel) return 1;
    return 0;
  });

  const digest = createHash("sha256");
  rows.forEach(({ rel, hex }) => {
    digest.update(`${rel}\0${hex}\n`, "utf8");
  });
  return digest.digest("hex");
}
|
|
181
|
+
|
|
182
|
+
/**
 * Recursively yield the paths of files beneath `dir`.
 *
 * Directories named in `SKIP_DIRS` are not descended into. A symlink is
 * yielded (by its own path, not the target's) only when
 * `resolveSymlinkToFile` returns a non-null result for it.
 *
 * @param {string} dir
 * @returns {AsyncGenerator<string>}
 */
async function* walkFiles(dir) {
  for (const entry of await readdir(dir, { withFileTypes: true })) {
    const full = join(dir, entry.name);

    if (entry.isDirectory()) {
      if (!SKIP_DIRS.has(entry.name)) yield* walkFiles(full);
      continue;
    }

    if (entry.isSymbolicLink()) {
      const target = await resolveSymlinkToFile(full);
      if (target) yield full;
      continue;
    }

    if (entry.isFile()) yield full;
  }
}
|
|
197
|
+
|
|
198
|
+
/**
 * Resolve `linkPath` when it is a symlink whose final target is a regular
 * file.
 *
 * @param {string} linkPath
 * @returns {Promise<string|null>} The real path of the target, or null when
 *   `linkPath` is not a symlink, the link cannot be resolved, or the target
 *   is not a regular file.
 */
async function resolveSymlinkToFile(linkPath) {
  const linkStat = await lstat(linkPath);
  if (!linkStat.isSymbolicLink()) return null;

  let target;
  try {
    target = await realpath(linkPath);
    const targetStat = await lstat(target);
    if (!targetStat.isFile()) return null;
  } catch {
    // Unresolvable link (e.g. dangling).
    return null;
  }
  return target;
}
|
|
213
|
+
|
|
214
|
+
/**
 * Convert a platform path to forward-slash form. A no-op when the platform
 * separator already is the POSIX one.
 *
 * @param {string} p
 * @returns {string}
 */
function toPosix(p) {
  return sep === posix.sep ? p : p.split(sep).join(posix.sep);
}
|
|
218
|
+
|
|
219
|
+
/**
 * Shallow-clone (`--depth 1`) `url` into `dir` using the `git` CLI.
 *
 * @param {string} url
 * @param {string} dir
 * @returns {Promise<void>}
 */
async function gitClone(url, dir) {
  const cloneArgs = ["clone", "--depth", "1", url, dir];
  await run("git", cloneArgs);
}
|
|
222
|
+
|
|
223
|
+
/**
 * Return the trimmed output of `git -C <dir> rev-parse HEAD`.
 *
 * @param {string} dir
 * @returns {Promise<string>}
 */
async function gitHead(dir) {
  const revParseOutput = await run("git", ["-C", dir, "rev-parse", "HEAD"]);
  return revParseOutput.trim();
}
|
|
227
|
+
|
|
228
|
+
/**
 * Run a command to completion and collect its stdout.
 *
 * @param {string} cmd - Executable to spawn.
 * @param {string[]} args - Arguments passed verbatim (no shell).
 * @returns {Promise<string>} Resolves with stdout when the exit code is 0.
 *   Rejects with the spawn error if the process cannot be started, or with
 *   an Error carrying the exit code (or terminating signal) and stderr
 *   otherwise.
 */
function run(cmd, args) {
  return new Promise((res, rej) => {
    const child = spawn(cmd, args, { stdio: ["ignore", "pipe", "pipe"] });
    let stdout = "";
    let stderr = "";
    // Decode at the stream level: per-chunk Buffer#toString() corrupts a
    // multi-byte UTF-8 character that straddles two chunks.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (text) => {
      stdout += text;
    });
    child.stderr.on("data", (text) => {
      stderr += text;
    });
    child.on("error", rej);
    child.on("close", (code, signal) => {
      if (code === 0) {
        res(stdout);
        return;
      }
      // `code` is null when the child was killed by a signal.
      const status = code ?? `with signal ${signal}`;
      rej(new Error(`${cmd} ${args.join(" ")} exited ${status}: ${stderr}`));
    });
  });
}
|
|
246
|
+
|
|
247
|
+
/**
|
|
248
|
+
* @typedef {object} Task
|
|
249
|
+
* @property {string} id - METR-style "task_family_name/task_name"
|
|
250
|
+
* @property {{instructions: string, supervisor: string, judge: string, specs: string, workdir: string, scoring: string}} paths
|
|
251
|
+
*/
|
|
252
|
+
|
|
253
|
+
/**
|
|
254
|
+
* @typedef {object} TaskFamily
|
|
255
|
+
* @property {string} rootPath
|
|
256
|
+
* @property {string} familyRevision - `git:<sha>` or `sha256:<hex>`
|
|
257
|
+
* @property {Buffer} apmLockBytes - LF-normalised
|
|
258
|
+
* @property {() => Task[]} tasks
|
|
259
|
+
*/
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WorkdirManager — per-task lifecycle: create the agent CWD, seed it from the
|
|
3
|
+
* task's workdir + specs + staged .claude/, allocate a free TCP port, run
|
|
4
|
+
* the pre-flight smoke probe, and tear down the process group at end of run.
|
|
5
|
+
*
|
|
6
|
+
* The Workdir handle threads `cwd`, `port`, `pgid`, and trace paths through
|
|
7
|
+
* runAgent → score → judge → teardown.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { spawn } from "node:child_process";
|
|
11
|
+
import { cp, mkdir } from "node:fs/promises";
|
|
12
|
+
import { createServer } from "node:net";
|
|
13
|
+
import { connect } from "node:net";
|
|
14
|
+
import { join, sep } from "node:path";
|
|
15
|
+
|
|
16
|
+
const PREFLIGHT_REL = join("workdir", "scripts");
|
|
17
|
+
const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
18
|
+
|
|
19
|
+
/**
|
|
20
|
+
* @typedef {object} Workdir
|
|
21
|
+
* @property {string} cwd - Agent CWD (per-task copy).
|
|
22
|
+
* @property {string} runDir - Parent of `cwd`; holds trace/log siblings.
|
|
23
|
+
* @property {number} port - Allocated TCP port for the agent.
|
|
24
|
+
* @property {number} pgid - Process-group id captured from the preflight child.
|
|
25
|
+
* @property {*} scaffold - Reserved per design § Components; v1 sets null.
|
|
26
|
+
* @property {string} agentTracePath
|
|
27
|
+
* @property {string} judgeTracePath
|
|
28
|
+
* @property {{phase: string, message: string, exitCode: number}} [preflightError]
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
/** Per-task workdir lifecycle: seed → preflight → teardown. */
export class WorkdirManager {
  /**
   * @param {object} deps
   * @param {string} deps.stagingDir - Output of `installApm(...)`.
   * @param {string} deps.runOutputDir - Root run-output directory (parent of `runs/`).
   * @param {number} [deps.termGraceMs] - Milliseconds to wait between SIGTERM
   *   and SIGKILL during teardown. Defaults to `DEFAULT_TERM_GRACE_MS`.
   */
  constructor({ stagingDir, runOutputDir, termGraceMs }) {
    if (!stagingDir) throw new Error("stagingDir is required");
    if (!runOutputDir) throw new Error("runOutputDir is required");
    this.stagingDir = stagingDir;
    this.runOutputDir = runOutputDir;
    this.termGraceMs = termGraceMs ?? DEFAULT_TERM_GRACE_MS;
  }

  /**
   * Create the per-task working directory and run the pre-flight probe.
   *
   * Layout: `<runOutputDir>/runs/<slug>/<runIndex>/cwd`, seeded from the
   * task's `workdir/` (minus `scripts/`), its `specs/` (optional), and the
   * staged `.claude/` directory.
   *
   * @param {import("./task-family.js").Task} task
   * @param {number} runIndex
   * @returns {Promise<Workdir>} A preflight failure is reported on the
   *   returned handle as `preflightError`, not thrown.
   */
  async start(task, runIndex) {
    // Flatten every separator so the id always maps to one path segment.
    const slug = task.id.replaceAll("/", "__");
    const runDir = join(this.runOutputDir, "runs", slug, String(runIndex));
    const cwd = join(runDir, "cwd");
    await mkdir(cwd, { recursive: true });

    await cp(task.paths.workdir, cwd, {
      recursive: true,
      // Rejecting the directory itself keeps its contents out of the copy.
      filter: (src) => !src.endsWith(sep + PREFLIGHT_REL),
    });
    // specs/ is optional: a missing directory is tolerated.
    await cp(task.paths.specs, join(cwd, "specs"), {
      recursive: true,
    }).catch((e) => {
      if (e.code !== "ENOENT") throw e;
    });
    await cp(join(this.stagingDir, ".claude"), join(cwd, ".claude"), {
      recursive: true,
    });

    const port = await allocatePort();
    const agentTracePath = join(runDir, "agent.ndjson");
    const judgeTracePath = join(runDir, "judge.ndjson");

    // The script runs from the task template, not from the copied cwd.
    const preflightScript = join(task.paths.workdir, "scripts", "preflight.sh");
    const preflight = await runPreflight(preflightScript, cwd, port);

    return {
      cwd,
      runDir,
      port,
      pgid: preflight.pgid,
      scaffold: null,
      agentTracePath,
      judgeTracePath,
      ...(preflight.error && { preflightError: preflight.error }),
    };
  }

  /**
   * Tear down the per-task process group: SIGTERM, wait, SIGKILL, then probe.
   *
   * When the SIGTERM cannot be delivered (typically because the group is
   * already gone), the grace wait and SIGKILL are skipped — there is nothing
   * left to wait for.
   *
   * @param {Workdir} workdir
   * @returns {Promise<{portFree: boolean, descendants: number}>}
   */
  async teardown(workdir) {
    const { pgid, port } = workdir;
    if (pgid && pgid > 0) {
      let signalled = true;
      try {
        // A negative pid targets the whole process group.
        process.kill(-pgid, "SIGTERM");
      } catch {
        // Process group already gone (or not signalable) — fine.
        signalled = false;
      }
      if (signalled) {
        await sleep(this.termGraceMs);
        try {
          process.kill(-pgid, "SIGKILL");
        } catch {
          // Already exited.
        }
        // Poll briefly until the process group is empty — SIGKILL returns
        // before the kernel finishes reaping descendants.
        await waitFor(async () => (await countDescendants(pgid)) === 0, 2_000);
      }
    }
    const portFree = await isPortFree(port);
    const descendants = await countDescendants(pgid);
    return { portFree, descendants };
  }
}
|
|
120
|
+
|
|
121
|
+
/**
 * Spawn preflight. Stays detached so we can SIGTERM the whole process group.
 *
 * Resolves once the script itself exits: `{ pgid }` on exit code 0, otherwise
 * `{ pgid, error }` describing the failure. Rejects only when the script
 * could not be spawned at all.
 *
 * @param {string} script
 * @param {string} cwd - Agent CWD passed via $WORKDIR.
 * @param {number} port - Free TCP port passed via $PORT.
 * @returns {Promise<{pgid: number, error?: {phase: string, message: string, exitCode: number}}>}
 */
function runPreflight(script, cwd, port) {
  return new Promise((res, rej) => {
    const child = spawn(script, [], {
      cwd,
      env: { ...process.env, WORKDIR: cwd, PORT: String(port) },
      detached: true,
      stdio: ["ignore", "pipe", "pipe"],
    });

    if (child.pid === undefined) {
      // A failed spawn is reported through an asynchronous 'error' event.
      // It must have a listener, otherwise Node throws it as an uncaught
      // exception and takes the whole process down.
      child.once("error", (e) => {
        rej(new Error(`failed to spawn preflight: ${script}`, { cause: e }));
      });
      return;
    }

    // The child's pid is used as the process-group id for teardown.
    const pgid = child.pid;

    // stdout is piped but unused: drain it so a chatty script cannot block
    // on a full pipe buffer and hang the preflight forever.
    child.stdout.resume();

    let stderr = "";
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (text) => {
      stderr += text;
    });

    child.on("error", (e) => {
      res({
        pgid,
        error: {
          phase: "preflight",
          message: `preflight failed to spawn: ${e.message}`,
          exitCode: -1,
        },
      });
    });

    // 'exit' (not 'close') is used, so this settles when the script itself
    // terminates even if something else still holds the stdio pipes open.
    child.on("exit", (code, signal) => {
      if (code === 0) {
        res({ pgid });
        return;
      }
      const message = stderr.trim() || `preflight exited with signal ${signal}`;
      res({
        pgid,
        error: {
          phase: "preflight",
          message,
          exitCode: typeof code === "number" ? code : -1,
        },
      });
    });
  });
}
|
|
172
|
+
|
|
173
|
+
/**
 * Ask the OS for a free TCP port on 127.0.0.1 by binding an ephemeral
 * listener, reading back its port, and closing it again.
 *
 * @returns {Promise<number>} The port number, delivered once the listener
 *   has closed.
 */
function allocatePort() {
  return new Promise((resolvePort, rejectPort) => {
    const probe = createServer();
    probe.unref();
    probe.on("error", rejectPort);

    const onListening = () => {
      const address = probe.address();
      const hasPort = Boolean(address) && typeof address !== "string";
      if (hasPort) {
        const { port } = address;
        probe.close(() => resolvePort(port));
        return;
      }
      probe.close();
      rejectPort(new Error("failed to allocate port"));
    };

    probe.listen(0, "127.0.0.1", onListening);
  });
}
|
|
190
|
+
|
|
191
|
+
/**
 * Probe whether anything accepts TCP connections on 127.0.0.1:`port`.
 *
 * @param {number} port
 * @returns {Promise<boolean>} `false` when a connection succeeds; `true` when
 *   `port` is falsy, the socket errors, or no connection is made within
 *   500 ms.
 */
function isPortFree(port) {
  if (!port) return Promise.resolve(true);

  return new Promise((settle) => {
    const onConnected = () => {
      probe.destroy();
      settle(false);
    };
    const onTimedOut = () => {
      probe.destroy();
      settle(true);
    };

    const probe = connect({ port, host: "127.0.0.1" }, onConnected);
    probe.on("error", () => settle(true));
    probe.setTimeout(500, onTimedOut);
  });
}
|
|
205
|
+
|
|
206
|
+
/**
 * Count the pids printed by `ps -o pid= -g <pgid>`, excluding this process.
 *
 * @param {number} pgid
 * @returns {Promise<number>} 0 when `pgid` is falsy or non-positive, or when
 *   `ps` cannot be spawned.
 */
function countDescendants(pgid) {
  if (!pgid || pgid <= 0) return Promise.resolve(0);

  return new Promise((done) => {
    const ps = spawn("ps", ["-o", "pid=", "-g", String(pgid)], {
      stdio: ["ignore", "pipe", "ignore"],
    });

    let listing = "";
    ps.stdout.on("data", (chunk) => {
      listing += chunk.toString();
    });

    ps.on("error", () => done(0));
    ps.on("close", () => {
      let count = 0;
      for (const line of listing.split("\n")) {
        const pidText = line.trim();
        if (!pidText) continue;
        if (Number(pidText) === process.pid) continue;
        count += 1;
      }
      done(count);
    });
  });
}
|
|
227
|
+
|
|
228
|
+
/**
 * Resolve after `ms` milliseconds.
 *
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
231
|
+
|
|
232
|
+
/**
 * Poll `predicate` every 50 ms until it returns a truthy value or
 * `timeoutMs` elapses.
 *
 * @param {() => (boolean|Promise<boolean>)} predicate
 * @param {number} timeoutMs
 * @returns {Promise<boolean>} `true` if the predicate passed in time,
 *   `false` on timeout. With a non-positive timeout the predicate is never
 *   called.
 */
async function waitFor(predicate, timeoutMs) {
  const deadline = Date.now() + timeoutMs;
  for (;;) {
    if (Date.now() >= deadline) return false;
    const satisfied = await predicate();
    if (satisfied) return true;
    await sleep(50);
  }
}
|
|
240
|
+
|
|
241
|
+
/**
 * Build a `WorkdirManager` from the given dependencies.
 *
 * @param {ConstructorParameters<typeof WorkdirManager>[0]} deps
 * @returns {WorkdirManager}
 */
export function createWorkdirManager(deps) {
  const manager = new WorkdirManager(deps);
  return manager;
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `fit-benchmark report` — aggregate `results.jsonl` into pass@k via the
|
|
3
|
+
* OpenAI HumanEval estimator. Output is JSON by default; pass --format=text
|
|
4
|
+
* to render a markdown table.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { resolve } from "node:path";
|
|
8
|
+
|
|
9
|
+
import { aggregate, renderTextReport } from "../benchmark/report.js";
|
|
10
|
+
|
|
11
|
+
/**
 * Aggregate a run-output directory into a pass@k report and print it to
 * stdout as pretty JSON (default) or text.
 *
 * @param {object} values - Parsed CLI flags.
 * @param {string} values.input - Directory passed to `aggregate` (required).
 * @param {string} [values.k] - Comma-separated positive integers; defaults to "1,3,5".
 * @param {string} [values.format] - "json" (default) or "text".
 * @param {string[]} _args - Unused positional arguments.
 * @returns {Promise<void>}
 * @throws {Error} When a flag is missing or malformed.
 */
export async function runBenchmarkReportCommand(values, _args) {
  const inputDir = values.input;
  if (!inputDir) throw new Error("--input is required");

  const kRaw = values.k ?? "1,3,5";
  const kValues = kRaw.split(",").map((token) => {
    const text = token.trim();
    // Require the whole token to be digits: parseInt alone would silently
    // accept "2x" as 2 and "1.5" as 1.
    const n = /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
    if (!Number.isSafeInteger(n) || n < 1) {
      throw new Error(
        "--k must be a comma-separated list of positive integers",
      );
    }
    return n;
  });

  const format = values.format ?? "json";
  if (format !== "json" && format !== "text") {
    throw new Error("--format must be 'json' or 'text'");
  }

  const report = await aggregate({ inputDir: resolve(inputDir), kValues });
  const rendered =
    format === "text"
      ? renderTextReport(report, kValues)
      : JSON.stringify(report, null, 2);
  process.stdout.write(rendered + "\n");
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `fit-benchmark run` — run every task in a family for N runs, stream each
|
|
3
|
+
* ResultRecord to stdout (one JSON line per record), and append to the
|
|
4
|
+
* canonical `<output>/results.jsonl` for the report subcommand.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { resolve } from "node:path";
|
|
8
|
+
|
|
9
|
+
import { createBenchmarkRunner } from "../benchmark/runner.js";
|
|
10
|
+
|
|
11
|
+
/**
 * Run the benchmark described by the CLI flags, writing each yielded record
 * to stdout as one JSON line, then exit the process: 0 when every record's
 * verdict is "pass", 1 otherwise.
 *
 * @param {object} values - Parsed CLI flags (see `parseRunOptions`).
 * @param {string[]} _args - Unused positional arguments.
 */
export async function runBenchmarkRunCommand(values, _args) {
  const opts = parseRunOptions(values);
  // Loaded lazily, after option parsing has succeeded.
  const sdk = await import("@anthropic-ai/claude-agent-sdk");
  const runner = createBenchmarkRunner({ ...opts, query: sdk.query });

  let exitCode = 0;
  for await (const record of runner.run()) {
    process.stdout.write(JSON.stringify(record) + "\n");
    if (record.verdict !== "pass") {
      exitCode = 1;
    }
  }
  process.exit(exitCode);
}
|
|
27
|
+
|
|
28
|
+
/**
 * Validate the `run` subcommand flags and shape them into runner options.
 *
 * @param {object} values - Parsed CLI flags.
 * @returns {{family: string, runs: number, output: string, model: string,
 *   profiles: {agent: (string|null), judge: (string|null)},
 *   maxTurns: (number|undefined)}}
 * @throws {Error} When `--family` or `--output` is missing, or `--runs` is
 *   not a positive integer.
 */
function parseRunOptions(values) {
  const { family, output } = values;
  if (!family) {
    throw new Error("--family is required");
  }
  if (!output) {
    throw new Error("--output is required");
  }

  const runs = Number.parseInt(values.runs ?? "1", 10);
  const runsValid = Number.isFinite(runs) && runs >= 1;
  if (!runsValid) {
    throw new Error("--runs must be a positive integer");
  }

  const profiles = {
    agent: values["agent-profile"] ?? null,
    judge: values["judge-profile"] ?? null,
  };

  return {
    family,
    runs,
    output: resolve(output),
    model: values.model ?? "claude-opus-4-7[1m]",
    profiles,
    maxTurns: parseMaxTurns(values["max-turns"]),
  };
}
|
|
48
|
+
|
|
49
|
+
/**
 * Parse the `--max-turns` flag.
 *
 * @param {string|undefined} raw - Flag value as given on the command line.
 * @returns {number|undefined} `undefined` when the flag was not supplied,
 *   otherwise the parsed integer (including 0).
 * @throws {Error} When the value does not start with an integer; previously
 *   this case silently produced `NaN`.
 */
function parseMaxTurns(raw) {
  if (raw === undefined) return undefined;
  const turns = Number.parseInt(raw, 10);
  if (Number.isNaN(turns)) {
    throw new Error("--max-turns must be an integer");
  }
  return turns;
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `fit-benchmark score` — score a single task against a post-run workdir
|
|
3
|
+
* directory without invoking an agent (P6/P7). Useful for re-scoring an
|
|
4
|
+
* agent's output against revised grading material.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { writeFileSync } from "node:fs";
|
|
8
|
+
import { join, resolve } from "node:path";
|
|
9
|
+
import { createServer } from "node:net";
|
|
10
|
+
|
|
11
|
+
import { validateScoringRecord } from "../benchmark/result.js";
|
|
12
|
+
import { runScoring } from "../benchmark/scorer.js";
|
|
13
|
+
import { loadTaskFamily } from "../benchmark/task-family.js";
|
|
14
|
+
|
|
15
|
+
/**
 * Score one task against an existing run directory, emit the resulting
 * record as a single JSON line (to `--output` if given, else stdout), and
 * exit the process: 0 when the scoring verdict is "pass", 1 otherwise.
 *
 * @param {object} values - Parsed CLI flags: `family`, `task`, `workdir`
 *   (all required) and optional `output`.
 * @param {string[]} _args - Unused positional arguments.
 * @throws {Error} When a required flag is missing or the task id is not in
 *   the family.
 */
export async function runBenchmarkScoreCommand(values, _args) {
  const { family: familyInput, task: taskId, workdir: workdirArg } = values;
  if (!familyInput) throw new Error("--family is required");
  if (!taskId) throw new Error("--task is required");
  if (!workdirArg) throw new Error("--workdir is required");

  const family = await loadTaskFamily(familyInput);
  const task = family.tasks().find(({ id }) => id === taskId);
  if (!task) {
    throw new Error(`task not found in family: ${taskId}`);
  }

  // `--workdir` names the run directory; the agent CWD is its `cwd/` child.
  const runDir = resolve(workdirArg);
  const cwd = join(runDir, "cwd");
  const port = await allocatePort();
  const scoring = await runScoring(task, { cwd, port, runDir });

  const record = { taskId: task.id, scoring, exitCode: scoring.exitCode };
  validateScoringRecord(record);

  const line = JSON.stringify(record) + "\n";
  if (!values.output) {
    process.stdout.write(line);
  } else {
    writeFileSync(resolve(values.output), line);
  }

  const passed = scoring.verdict === "pass";
  process.exit(passed ? 0 : 1);
}
|
|
51
|
+
|
|
52
|
+
/**
 * Obtain a free TCP port on 127.0.0.1: bind an ephemeral listener, record
 * the port the OS assigned, and close the listener before resolving.
 *
 * @returns {Promise<number>}
 */
function allocatePort() {
  return new Promise((fulfil, fail) => {
    const listener = createServer();
    listener.unref();
    listener.on("error", fail);
    listener.listen(0, "127.0.0.1", () => {
      const bound = listener.address();
      const unusable = !bound || typeof bound === "string";
      if (unusable) {
        listener.close();
        fail(new Error("failed to allocate port"));
      } else {
        const assigned = bound.port;
        listener.close(() => fulfil(assigned));
      }
    });
  });
}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { readFileSync, createWriteStream } from "node:fs";
|
|
2
2
|
import { resolve } from "node:path";
|
|
3
3
|
import { createFacilitator } from "../facilitator.js";
|
|
4
|
+
import { createRedactor } from "../redaction.js";
|
|
4
5
|
import { createTeeWriter } from "../tee-writer.js";
|
|
5
6
|
|
|
6
7
|
/**
|
|
@@ -62,6 +63,11 @@ function parseFacilitateOptions(values) {
|
|
|
62
63
|
export async function runFacilitateCommand(values, _args) {
|
|
63
64
|
const opts = parseFacilitateOptions(values);
|
|
64
65
|
|
|
66
|
+
// Build the redactor as the first observable side-effect after option
|
|
67
|
+
// parsing — the env snapshot must freeze BEFORE any in-process
|
|
68
|
+
// process.env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
|
|
69
|
+
const redactor = createRedactor();
|
|
70
|
+
|
|
65
71
|
const fileStream = opts.outputPath
|
|
66
72
|
? createWriteStream(opts.outputPath)
|
|
67
73
|
: null;
|
|
@@ -87,6 +93,7 @@ export async function runFacilitateCommand(values, _args) {
|
|
|
87
93
|
maxTurns: opts.maxTurns,
|
|
88
94
|
facilitatorProfile: opts.facilitatorProfile,
|
|
89
95
|
taskAmend: opts.taskAmend,
|
|
96
|
+
redactor,
|
|
90
97
|
});
|
|
91
98
|
|
|
92
99
|
const result = await facilitator.run(opts.taskContent);
|