@forwardimpact/libeval 0.1.38 → 0.1.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +5 -0
- package/package.json +1 -1
- package/src/benchmark/apm-installer.js +30 -14
- package/src/benchmark/env-loader.js +146 -0
- package/src/benchmark/result.js +4 -4
- package/src/benchmark/runner.js +51 -49
- package/src/benchmark/scorer.js +4 -1
- package/src/benchmark/task-family.js +30 -4
- package/src/benchmark/workdir.js +15 -3
- package/src/commands/benchmark-run.js +6 -0
package/bin/fit-benchmark.js
CHANGED
|
@@ -68,6 +68,11 @@ export const definition = {
|
|
|
68
68
|
description:
|
|
69
69
|
"Agent-under-test turn budget (default: 50, 0 = unlimited)",
|
|
70
70
|
},
|
|
71
|
+
"allowed-tools": {
|
|
72
|
+
type: "string",
|
|
73
|
+
description:
|
|
74
|
+
"Comma-separated tool allowlist for the agent-under-test (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)",
|
|
75
|
+
},
|
|
71
76
|
},
|
|
72
77
|
},
|
|
73
78
|
{
|
package/package.json
CHANGED
|
@@ -19,19 +19,32 @@ export async function installApm(family, outputDir) {
|
|
|
19
19
|
const stagingDir = join(outputDir, ".apm-staging");
|
|
20
20
|
const stagedClaude = join(stagingDir, ".claude");
|
|
21
21
|
const sourceClaude = join(family.rootPath, ".claude");
|
|
22
|
+
const apmYml = join(family.rootPath, "apm.yml");
|
|
22
23
|
|
|
23
|
-
await
|
|
24
|
+
const hasApm = await access(apmYml)
|
|
25
|
+
.then(() => true)
|
|
26
|
+
.catch(() => false);
|
|
24
27
|
|
|
25
|
-
|
|
26
|
-
await
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
28
|
+
if (hasApm) {
|
|
29
|
+
await runApmInstall(family.rootPath);
|
|
30
|
+
try {
|
|
31
|
+
await access(sourceClaude);
|
|
32
|
+
} catch {
|
|
33
|
+
throw new Error(
|
|
34
|
+
`apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
|
|
35
|
+
);
|
|
36
|
+
}
|
|
31
37
|
}
|
|
32
38
|
|
|
33
39
|
await rm(stagingDir, { recursive: true, force: true });
|
|
34
|
-
await
|
|
40
|
+
const hasClaudeDir = await access(sourceClaude)
|
|
41
|
+
.then(() => true)
|
|
42
|
+
.catch(() => false);
|
|
43
|
+
if (hasClaudeDir) {
|
|
44
|
+
await cp(sourceClaude, stagedClaude, { recursive: true });
|
|
45
|
+
} else {
|
|
46
|
+
await mkdir(stagedClaude, { recursive: true });
|
|
47
|
+
}
|
|
35
48
|
|
|
36
49
|
// Stage the family-local judge profile outside .claude/ so it is available
|
|
37
50
|
// to the judge but never copied into the agent-under-test's CWD.
|
|
@@ -44,12 +57,15 @@ export async function installApm(family, outputDir) {
|
|
|
44
57
|
} catch {}
|
|
45
58
|
|
|
46
59
|
const lockPath = join(family.rootPath, "apm.lock.yaml");
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
60
|
+
let skillSetHash = "";
|
|
61
|
+
try {
|
|
62
|
+
const lockBytes = await readFile(lockPath);
|
|
63
|
+
skillSetHash =
|
|
64
|
+
"sha256:" +
|
|
65
|
+
createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
|
|
66
|
+
} catch {
|
|
67
|
+
// No lockfile — family doesn't use skill packs.
|
|
68
|
+
}
|
|
53
69
|
|
|
54
70
|
return { stagingDir, skillSetHash, judgeProfilesDir };
|
|
55
71
|
}
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Env-loader — auto-discover `.env` / `.env.local` files in a task family
|
|
3
|
+
* and its tasks, load them into `process.env`, and render the merged result
|
|
4
|
+
* into each agent CWD.
|
|
5
|
+
*
|
|
6
|
+
* Discovery paths (loaded in this order, first value per key wins):
|
|
7
|
+
* 1. process.env (CI secrets, shell env — never overwritten)
|
|
8
|
+
* 2. <family>/.env.local
|
|
9
|
+
* 3. <family>/.env
|
|
10
|
+
* 4. tasks/<id>/.env.local
|
|
11
|
+
* 5. tasks/<id>/.env
|
|
12
|
+
*
|
|
13
|
+
* Every discovered env file — family or task — is loaded into process.env
|
|
14
|
+
* AND rendered (with resolved values) into the agent working directory.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { readFile, writeFile } from "node:fs/promises";
|
|
18
|
+
import { join } from "node:path";
|
|
19
|
+
|
|
20
|
+
// Env filenames probed in each scanned directory, in lookup order.
// `.env.local` is listed first, so it is read before `.env`; because keys
// already present in process.env are never overwritten, the first file to
// declare a key supplies its value.
const ENV_FILES = [".env.local", ".env"];
|
|
21
|
+
|
|
22
|
+
/**
 * Parse the text of a `.env` file into an ordered list of {key, value} pairs.
 *
 * Recognizes `KEY=VALUE` lines (optionally written as `export KEY=VALUE`),
 * skips blank lines and lines starting with `#`, and strips one layer of
 * matching single or double quotes from a value. Lines without `=` or with
 * an empty key are ignored. Duplicate keys are kept, in file order.
 *
 * @param {string} content - Raw file text.
 * @returns {Array<{key: string, value: string}>}
 */
export function parseEnvFile(content) {
  const entries = [];
  for (const raw of content.split("\n")) {
    // trim() also drops the trailing "\r" of CRLF line endings.
    const line = raw.trim();
    if (!line || line.startsWith("#")) continue;
    const eq = line.indexOf("=");
    if (eq === -1) continue;
    // Shell-style env files write `export KEY=VALUE`; the prefix is not part
    // of the variable name.
    const key = line
      .slice(0, eq)
      .trim()
      .replace(/^export\s+/, "");
    if (!key) continue;
    entries.push({ key, value: stripMatchingQuotes(line.slice(eq + 1).trim()) });
  }
  return entries;
}

/**
 * Remove one pair of surrounding quotes when the value both starts and ends
 * with the same quote character. A lone quote character is not a quoted
 * value and is returned unchanged.
 *
 * @param {string} value
 * @returns {string}
 */
function stripMatchingQuotes(value) {
  if (value.length < 2) return value;
  const first = value[0];
  const isQuoted = (first === '"' || first === "'") && value.endsWith(first);
  return isQuoted ? value.slice(1, -1) : value;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Load one env file from disk and parse it.
 *
 * A missing file (ENOENT) is treated as "no entries"; every other read
 * failure is rethrown to the caller.
 *
 * @param {string} filePath - Path of the env file to read.
 * @returns {Promise<Array<{key: string, value: string}>>}
 */
async function readEnvFile(filePath) {
  let text;
  try {
    text = await readFile(filePath, "utf8");
  } catch (err) {
    if (err.code !== "ENOENT") throw err;
    return [];
  }
  return parseEnvFile(text);
}
|
|
63
|
+
|
|
64
|
+
/**
 * Copy entries into process.env without clobbering anything already set.
 *
 * A key is assigned only when process.env has no value for it, so the
 * first value seen for a key is the one that sticks.
 *
 * @param {Array<{key: string, value: string}>} entries
 * @returns {string[]} Every key in `entries`, in order, whether or not its
 *   value was actually applied.
 */
function applyToProcessEnv(entries) {
  entries.forEach(({ key, value }) => {
    process.env[key] ??= value;
  });
  return entries.map(({ key }) => key);
}
|
|
79
|
+
|
|
80
|
+
/**
 * Process a single `<dir>/<file>` env file.
 *
 * Reads and parses the file, applies its entries to process.env, adds every
 * declared key to `names`, and records the keys under `file` in `merged`.
 * A file that is missing or yields no entries leaves both collections
 * untouched.
 *
 * @param {string} dir - Directory containing the env file.
 * @param {string} file - Env filename (also the key used in `merged`).
 * @param {Set<string>} names - Accumulator of all declared var names.
 * @param {Map<string, Map<string, true>>} merged - Per-filename key manifest.
 * @returns {Promise<void>}
 */
async function loadOneEnvFile(dir, file, names, merged) {
  const parsed = await readEnvFile(join(dir, file));
  if (parsed.length === 0) return;

  applyToProcessEnv(parsed).forEach((name) => names.add(name));

  let declared = merged.get(file);
  if (declared === undefined) {
    declared = new Map();
    merged.set(file, declared);
  }
  for (const entry of parsed) {
    if (!declared.has(entry.key)) declared.set(entry.key, true);
  }
}
|
|
97
|
+
|
|
98
|
+
/**
 * Walk the given directories and load every env file found in them.
 *
 * Directories are visited in the order given and, within each directory,
 * filenames follow the order of ENV_FILES. Files are loaded one at a time.
 *
 * @param {string[]} dirs - Directories to scan.
 * @returns {Promise<{names: Set<string>, merged: Map<string, Map<string, true>>}>}
 *   `names` holds every declared var name; `merged` maps each env filename
 *   to the keys declared under that filename.
 */
async function collectEnvEntries(dirs) {
  const collected = { names: new Set(), merged: new Map() };
  const targets = Array.from(dirs).flatMap((dir) =>
    ENV_FILES.map((file) => [dir, file]),
  );
  for (const [dir, file] of targets) {
    await loadOneEnvFile(dir, file, collected.names, collected.merged);
  }
  return collected;
}
|
|
114
|
+
|
|
115
|
+
/**
 * Materialize each collected env file inside the agent working directory.
 *
 * For every filename in `merged`, writes `<agentCwd>/<file>` with one
 * `KEY=value` line per declared key, using the value process.env currently
 * holds for that key (empty string when unset). After the write, a single
 * warning listing the keys whose value is unset or empty goes to stderr.
 *
 * @param {Map<string, Map<string, true>>} merged - Per-filename key manifest.
 * @param {string} agentCwd - Directory the files are written into.
 * @returns {Promise<void>}
 */
async function renderEnvFiles(merged, agentCwd) {
  for (const [file, keyMap] of merged) {
    const lines = [];
    for (const name of keyMap.keys()) {
      lines.push(`${name}=${process.env[name] ?? ""}`);
    }
    await writeFile(join(agentCwd, file), `${lines.join("\n")}\n`);

    const missing = [];
    for (const name of keyMap.keys()) {
      if (!process.env[name]) missing.push(name);
    }
    if (missing.length === 0) continue;
    process.stderr.write(
      `libeval: env warning: ${file} declares vars with no value: ${missing.join(", ")}\n`,
    );
  }
}
|
|
133
|
+
|
|
134
|
+
/**
 * Entry point of the env loader: scan directories for `.env` / `.env.local`,
 * apply what is found to process.env, then write the resolved files into
 * the agent working directory.
 *
 * @param {string[]} dirs - Directories to scan (family root, task dir, etc.)
 * @param {string} agentCwd - Agent working directory to render into.
 * @returns {Promise<string[]>} All var names discovered (for redaction).
 */
export async function loadEnv(dirs, agentCwd) {
  const collected = await collectEnvEntries(dirs);
  await renderEnvFiles(collected.merged, agentCwd);
  return Array.from(collected.names);
}
|
package/src/benchmark/result.js
CHANGED
|
@@ -29,7 +29,7 @@ const JUDGE_VERDICT_SHAPE = z.object({
|
|
|
29
29
|
|
|
30
30
|
const PROFILES_SHAPE = z.object({
|
|
31
31
|
agent: z.union([z.string(), z.null()]),
|
|
32
|
-
supervisor: z.null(),
|
|
32
|
+
supervisor: z.union([z.string(), z.null()]),
|
|
33
33
|
judge: z.union([z.string(), z.null()]),
|
|
34
34
|
});
|
|
35
35
|
|
|
@@ -48,8 +48,8 @@ const COMMON_FIELDS = {
|
|
|
48
48
|
profiles: PROFILES_SHAPE,
|
|
49
49
|
model: z.object({
|
|
50
50
|
agent: z.string(),
|
|
51
|
-
supervisor: z.string(),
|
|
52
|
-
judge: z.string(),
|
|
51
|
+
supervisor: z.string().optional(),
|
|
52
|
+
judge: z.string().optional(),
|
|
53
53
|
}),
|
|
54
54
|
skillSetHash: z.string(),
|
|
55
55
|
familyRevision: z.string(),
|
|
@@ -65,7 +65,7 @@ const HAPPY_RECORD = z.object({
|
|
|
65
65
|
...COMMON_FIELDS,
|
|
66
66
|
scoring: SCORING_SHAPE,
|
|
67
67
|
submission: z.string(),
|
|
68
|
-
judgeVerdict: JUDGE_VERDICT_SHAPE,
|
|
68
|
+
judgeVerdict: JUDGE_VERDICT_SHAPE.optional(),
|
|
69
69
|
agentTracePath: z.string(),
|
|
70
70
|
supervisorTracePath: z.string(),
|
|
71
71
|
judgeTracePath: z.string(),
|
package/src/benchmark/runner.js
CHANGED
|
@@ -15,11 +15,11 @@
|
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
17
|
import { createReadStream, createWriteStream } from "node:fs";
|
|
18
|
-
import {
|
|
18
|
+
import { mkdir, readFile, unlink } from "node:fs/promises";
|
|
19
19
|
import { createInterface } from "node:readline";
|
|
20
20
|
import { join, resolve as resolvePath } from "node:path";
|
|
21
21
|
|
|
22
|
-
import { createRedactor } from "../redaction.js";
|
|
22
|
+
import { DEFAULT_ENV_ALLOWLIST, createRedactor } from "../redaction.js";
|
|
23
23
|
import { createSupervisor } from "../supervisor.js";
|
|
24
24
|
import { installApm } from "./apm-installer.js";
|
|
25
25
|
import { runJudge } from "./judge.js";
|
|
@@ -28,7 +28,16 @@ import { runScoring } from "./scorer.js";
|
|
|
28
28
|
import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
|
|
29
29
|
import { createWorkdirManager } from "./workdir.js";
|
|
30
30
|
|
|
31
|
-
const BASE_TOOLS = [
|
|
31
|
+
const BASE_TOOLS = [
|
|
32
|
+
"Bash",
|
|
33
|
+
"Read",
|
|
34
|
+
"Glob",
|
|
35
|
+
"Grep",
|
|
36
|
+
"Write",
|
|
37
|
+
"Edit",
|
|
38
|
+
"Agent",
|
|
39
|
+
"TodoWrite",
|
|
40
|
+
];
|
|
32
41
|
|
|
33
42
|
/** Sole orchestrator for a task-family benchmark run. */
|
|
34
43
|
export class BenchmarkRunner {
|
|
@@ -42,6 +51,7 @@ export class BenchmarkRunner {
|
|
|
42
51
|
* @param {string} opts.judgeModel
|
|
43
52
|
* @param {{agent?: string, judge?: string}} [opts.profiles]
|
|
44
53
|
* @param {Function} opts.query - SDK query (injected for testability).
|
|
54
|
+
* @param {string[]} [opts.allowedTools] - Agent tool allowlist (default: BASE_TOOLS).
|
|
45
55
|
* @param {number} [opts.maxTurns] - Agent-under-test turn budget.
|
|
46
56
|
* @param {number} [opts.termGraceMs] - SIGTERM→SIGKILL grace (ms) for the per-task process group.
|
|
47
57
|
* @param {Function} [opts.runAgent] - Test seam: replaces the agent-under-test
|
|
@@ -64,6 +74,7 @@ export class BenchmarkRunner {
|
|
|
64
74
|
judgeModel,
|
|
65
75
|
profiles,
|
|
66
76
|
query,
|
|
77
|
+
allowedTools,
|
|
67
78
|
maxTurns,
|
|
68
79
|
termGraceMs,
|
|
69
80
|
// Test seams — default to the real implementations.
|
|
@@ -76,8 +87,6 @@ export class BenchmarkRunner {
|
|
|
76
87
|
throw new Error("runs must be an integer ≥ 1");
|
|
77
88
|
if (!output) throw new Error("output is required");
|
|
78
89
|
if (!agentModel) throw new Error("agentModel is required");
|
|
79
|
-
if (!supervisorModel) throw new Error("supervisorModel is required");
|
|
80
|
-
if (!judgeModel) throw new Error("judgeModel is required");
|
|
81
90
|
if (!query) throw new Error("query is required");
|
|
82
91
|
this.familyInput = family;
|
|
83
92
|
this.runs = runs;
|
|
@@ -85,6 +94,7 @@ export class BenchmarkRunner {
|
|
|
85
94
|
this.agentModel = agentModel;
|
|
86
95
|
this.supervisorModel = supervisorModel;
|
|
87
96
|
this.judgeModel = judgeModel;
|
|
97
|
+
this.allowedTools = allowedTools ?? BASE_TOOLS;
|
|
88
98
|
this.profiles = {
|
|
89
99
|
agent: profiles?.agent ?? null,
|
|
90
100
|
judge: profiles?.judge ?? null,
|
|
@@ -114,9 +124,6 @@ export class BenchmarkRunner {
|
|
|
114
124
|
);
|
|
115
125
|
|
|
116
126
|
const tasks = family.tasks();
|
|
117
|
-
for (const task of tasks) {
|
|
118
|
-
await assertPreflightExecutable(task);
|
|
119
|
-
}
|
|
120
127
|
if (this.profiles.judge) {
|
|
121
128
|
await assertJudgeProfileStaged(
|
|
122
129
|
family,
|
|
@@ -129,6 +136,7 @@ export class BenchmarkRunner {
|
|
|
129
136
|
stagingDir,
|
|
130
137
|
runOutputDir: this.output,
|
|
131
138
|
termGraceMs: this.termGraceMs,
|
|
139
|
+
familyRootPath: family.rootPath,
|
|
132
140
|
});
|
|
133
141
|
|
|
134
142
|
const resultsPath = join(this.output, "results.jsonl");
|
|
@@ -178,33 +186,38 @@ export class BenchmarkRunner {
|
|
|
178
186
|
port: workdir.port,
|
|
179
187
|
runDir: workdir.runDir,
|
|
180
188
|
});
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
189
|
+
let judgeVerdict = null;
|
|
190
|
+
if (task.paths.judge) {
|
|
191
|
+
const judgeContext = await this.#buildJudgeContext(
|
|
192
|
+
task,
|
|
193
|
+
workdir,
|
|
194
|
+
skillSetHash,
|
|
195
|
+
);
|
|
196
|
+
judgeVerdict = await this._runJudgeHook(
|
|
197
|
+
task,
|
|
198
|
+
workdir,
|
|
199
|
+
scoring,
|
|
200
|
+
{
|
|
201
|
+
query: this.query,
|
|
202
|
+
model: this.judgeModel,
|
|
203
|
+
judgeProfile: this.profiles.judge ?? undefined,
|
|
204
|
+
profilesDir: judgeProfilesDir,
|
|
205
|
+
},
|
|
206
|
+
judgeContext,
|
|
207
|
+
);
|
|
208
|
+
}
|
|
209
|
+
const verdict =
|
|
210
|
+
scoring.verdict === "pass" &&
|
|
211
|
+
(judgeVerdict === null || judgeVerdict.verdict === "pass")
|
|
212
|
+
? "pass"
|
|
213
|
+
: "fail";
|
|
198
214
|
const record = {
|
|
199
215
|
taskId: task.id,
|
|
200
216
|
runIndex,
|
|
201
|
-
verdict
|
|
202
|
-
scoring.verdict === "pass" && judgeVerdict.verdict === "pass"
|
|
203
|
-
? "pass"
|
|
204
|
-
: "fail",
|
|
217
|
+
verdict,
|
|
205
218
|
scoring,
|
|
206
219
|
submission,
|
|
207
|
-
judgeVerdict,
|
|
220
|
+
...(judgeVerdict && { judgeVerdict }),
|
|
208
221
|
costUsd,
|
|
209
222
|
turns,
|
|
210
223
|
agentTracePath: workdir.agentTracePath,
|
|
@@ -262,6 +275,9 @@ export class BenchmarkRunner {
|
|
|
262
275
|
async #runAgent(task, workdir) {
|
|
263
276
|
const combinedPath = join(workdir.runDir, ".combined.ndjson");
|
|
264
277
|
const combinedStream = createWriteStream(combinedPath);
|
|
278
|
+
const supervisorInstructions = task.paths.supervisor
|
|
279
|
+
? await readFile(task.paths.supervisor, "utf8").catch(() => null)
|
|
280
|
+
: null;
|
|
265
281
|
const supervisor = createSupervisor({
|
|
266
282
|
supervisorCwd: workdir.cwd,
|
|
267
283
|
agentCwd: workdir.cwd,
|
|
@@ -270,9 +286,12 @@ export class BenchmarkRunner {
|
|
|
270
286
|
agentModel: this.agentModel,
|
|
271
287
|
supervisorModel: this.supervisorModel,
|
|
272
288
|
maxTurns: this.maxTurns ?? 50,
|
|
273
|
-
allowedTools:
|
|
289
|
+
allowedTools: this.allowedTools,
|
|
274
290
|
...(this.profiles.agent && { agentProfile: this.profiles.agent }),
|
|
275
|
-
|
|
291
|
+
...(supervisorInstructions && { taskAmend: supervisorInstructions }),
|
|
292
|
+
redactor: createRedactor({
|
|
293
|
+
allowlist: [...DEFAULT_ENV_ALLOWLIST, ...(workdir.envNames ?? [])],
|
|
294
|
+
}),
|
|
276
295
|
});
|
|
277
296
|
const instructions = await readFile(task.paths.instructions, "utf8");
|
|
278
297
|
let agentError = null;
|
|
@@ -372,23 +391,6 @@ async function writeRecord(stream, record) {
|
|
|
372
391
|
});
|
|
373
392
|
}
|
|
374
393
|
|
|
375
|
-
/**
|
|
376
|
-
* Pre-flight install gate. Throws synchronously if any task's preflight
|
|
377
|
-
* script is missing or not executable — design § Pre-flight contract:
|
|
378
|
-
* "The harness fails the family at install if any task's preflight script
|
|
379
|
-
* is missing or non-executable, before any agent session starts."
|
|
380
|
-
*/
|
|
381
|
-
async function assertPreflightExecutable(task) {
|
|
382
|
-
const path = join(task.paths.hooks, "preflight.sh");
|
|
383
|
-
try {
|
|
384
|
-
await access(path, constants.X_OK);
|
|
385
|
-
} catch (e) {
|
|
386
|
-
throw new Error(
|
|
387
|
-
`task ${task.id}: preflight script not executable at ${path} (${e.code ?? e.message})`,
|
|
388
|
-
);
|
|
389
|
-
}
|
|
390
|
-
}
|
|
391
|
-
|
|
392
394
|
/**
|
|
393
395
|
* Split the combined supervisor trace into agent and supervisor files, and
|
|
394
396
|
* extract cost, turn count, and submission in a single pass. Agent-source
|
package/src/benchmark/scorer.js
CHANGED
|
@@ -28,8 +28,11 @@ import { join } from "node:path";
|
|
|
28
28
|
* @returns {Promise<ScoringResult>}
|
|
29
29
|
*/
|
|
30
30
|
export function runScoring(task, ctx) {
|
|
31
|
+
if (!task.paths.score) {
|
|
32
|
+
return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
|
|
33
|
+
}
|
|
31
34
|
return new Promise((res, rej) => {
|
|
32
|
-
const script =
|
|
35
|
+
const script = task.paths.score;
|
|
33
36
|
const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
|
|
34
37
|
|
|
35
38
|
// Bun's child_process pipe setup for fd >= 3 is racy under load (it
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* .claude/ # pre-staged skills + agents (P1)
|
|
6
6
|
* tasks/<task_name>/
|
|
7
7
|
* agent.task.md
|
|
8
|
-
* supervisor.task.md #
|
|
8
|
+
* supervisor.task.md # optional; appended to the task as supervisor context
|
|
9
9
|
* judge.task.md
|
|
10
10
|
* hooks/ # harness-only; never copied to agent CWD
|
|
11
11
|
* preflight.sh
|
|
@@ -23,6 +23,7 @@ import { spawn } from "node:child_process";
|
|
|
23
23
|
import { createHash } from "node:crypto";
|
|
24
24
|
import {
|
|
25
25
|
access,
|
|
26
|
+
constants,
|
|
26
27
|
lstat,
|
|
27
28
|
mkdtemp,
|
|
28
29
|
readdir,
|
|
@@ -100,13 +101,20 @@ async function discoverTasks(rootPath) {
|
|
|
100
101
|
for (const entry of entries) {
|
|
101
102
|
if (!entry.isDirectory()) continue;
|
|
102
103
|
const taskDir = join(tasksRoot, entry.name);
|
|
104
|
+
const supervisorPath = join(taskDir, "supervisor.task.md");
|
|
105
|
+
const judgePath = join(taskDir, "judge.task.md");
|
|
106
|
+
const preflightPath = join(taskDir, "hooks", "preflight.sh");
|
|
107
|
+
const scorePath = join(taskDir, "hooks", "score.sh");
|
|
103
108
|
tasks.push({
|
|
104
109
|
id: entry.name,
|
|
105
110
|
paths: {
|
|
111
|
+
taskDir,
|
|
106
112
|
instructions: join(taskDir, "agent.task.md"),
|
|
107
|
-
supervisor:
|
|
108
|
-
judge:
|
|
113
|
+
supervisor: (await fileExists(supervisorPath)) ? supervisorPath : null,
|
|
114
|
+
judge: (await fileExists(judgePath)) ? judgePath : null,
|
|
109
115
|
hooks: join(taskDir, "hooks"),
|
|
116
|
+
preflight: (await fileExecutable(preflightPath)) ? preflightPath : null,
|
|
117
|
+
score: (await fileExecutable(scorePath)) ? scorePath : null,
|
|
110
118
|
specs: join(taskDir, "specs"),
|
|
111
119
|
workdir: join(taskDir, "workdir"),
|
|
112
120
|
},
|
|
@@ -116,6 +124,24 @@ async function discoverTasks(rootPath) {
|
|
|
116
124
|
return tasks;
|
|
117
125
|
}
|
|
118
126
|
|
|
127
|
+
/**
 * Report whether `path` can be accessed by this process.
 *
 * @param {string} path
 * @returns {Promise<boolean>} true when `access` succeeds, false on any failure.
 */
async function fileExists(path) {
  return access(path).then(
    () => true,
    () => false,
  );
}
|
|
135
|
+
|
|
136
|
+
/**
 * Report whether `path` passes an execute-permission (X_OK) access check.
 *
 * @param {string} path
 * @returns {Promise<boolean>} true when the check succeeds, false on any
 *   failure (including a missing file).
 */
async function fileExecutable(path) {
  const check = access(path, constants.X_OK);
  return check.then(
    () => true,
    () => false,
  );
}
|
|
144
|
+
|
|
119
145
|
/**
|
|
120
146
|
* Canonical-tree hash per design § Family revision algorithm:
|
|
121
147
|
* list regular files (excluding .git/, node_modules/)
|
|
@@ -210,7 +236,7 @@ function run(cmd, args) {
|
|
|
210
236
|
/**
|
|
211
237
|
* @typedef {object} Task
|
|
212
238
|
* @property {string} id - Task name (directory name under tasks/)
|
|
213
|
-
* @property {{instructions: string, supervisor: string, judge: string, hooks: string, specs: string, workdir: string}} paths
|
|
239
|
+
* @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null, score: string|null, specs: string, workdir: string}} paths
|
|
214
240
|
*/
|
|
215
241
|
|
|
216
242
|
/**
|
package/src/benchmark/workdir.js
CHANGED
|
@@ -13,6 +13,8 @@ import { createServer } from "node:net";
|
|
|
13
13
|
import { connect } from "node:net";
|
|
14
14
|
import { join } from "node:path";
|
|
15
15
|
|
|
16
|
+
import { loadEnv } from "./env-loader.js";
|
|
17
|
+
|
|
16
18
|
const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
17
19
|
|
|
18
20
|
/**
|
|
@@ -25,6 +27,7 @@ const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
|
25
27
|
* @property {string} agentTracePath
|
|
26
28
|
* @property {string} supervisorTracePath
|
|
27
29
|
* @property {string} judgeTracePath
|
|
30
|
+
* @property {string[]} [envNames] - Env var names loaded from .env files.
|
|
28
31
|
* @property {{phase: string, message: string, exitCode: number}} [preflightError]
|
|
29
32
|
*/
|
|
30
33
|
|
|
@@ -35,12 +38,13 @@ export class WorkdirManager {
|
|
|
35
38
|
* @param {string} deps.stagingDir - Output of `installApm(...)`.
|
|
36
39
|
* @param {string} deps.runOutputDir - Root run-output directory (parent of `runs/`).
|
|
37
40
|
*/
|
|
38
|
-
constructor({ stagingDir, runOutputDir, termGraceMs }) {
|
|
41
|
+
constructor({ stagingDir, runOutputDir, termGraceMs, familyRootPath }) {
|
|
39
42
|
if (!stagingDir) throw new Error("stagingDir is required");
|
|
40
43
|
if (!runOutputDir) throw new Error("runOutputDir is required");
|
|
41
44
|
this.stagingDir = stagingDir;
|
|
42
45
|
this.runOutputDir = runOutputDir;
|
|
43
46
|
this.termGraceMs = termGraceMs ?? DEFAULT_TERM_GRACE_MS;
|
|
47
|
+
this.familyRootPath = familyRootPath ?? null;
|
|
44
48
|
}
|
|
45
49
|
|
|
46
50
|
/**
|
|
@@ -67,13 +71,20 @@ export class WorkdirManager {
|
|
|
67
71
|
recursive: true,
|
|
68
72
|
});
|
|
69
73
|
|
|
74
|
+
const envDirs = [
|
|
75
|
+
...(this.familyRootPath ? [this.familyRootPath] : []),
|
|
76
|
+
...(task.paths.taskDir ? [task.paths.taskDir] : []),
|
|
77
|
+
];
|
|
78
|
+
const envNames = envDirs.length > 0 ? await loadEnv(envDirs, cwd) : [];
|
|
79
|
+
|
|
70
80
|
const port = await allocatePort();
|
|
71
81
|
const agentTracePath = join(runDir, "agent.ndjson");
|
|
72
82
|
const supervisorTracePath = join(runDir, "supervisor.ndjson");
|
|
73
83
|
const judgeTracePath = join(runDir, "judge.ndjson");
|
|
74
84
|
|
|
75
|
-
const
|
|
76
|
-
|
|
85
|
+
const preflight = task.paths.preflight
|
|
86
|
+
? await runPreflight(task.paths.preflight, cwd, port)
|
|
87
|
+
: { pgid: 0 };
|
|
77
88
|
|
|
78
89
|
return {
|
|
79
90
|
cwd,
|
|
@@ -84,6 +95,7 @@ export class WorkdirManager {
|
|
|
84
95
|
agentTracePath,
|
|
85
96
|
supervisorTracePath,
|
|
86
97
|
judgeTracePath,
|
|
98
|
+
envNames,
|
|
87
99
|
...(preflight.error && { preflightError: preflight.error }),
|
|
88
100
|
};
|
|
89
101
|
}
|
|
@@ -47,6 +47,12 @@ function parseRunOptions(values) {
|
|
|
47
47
|
judge: values["judge-profile"] ?? null,
|
|
48
48
|
},
|
|
49
49
|
maxTurns: parseMaxTurns(values["max-turns"]),
|
|
50
|
+
allowedTools: values["allowed-tools"]
|
|
51
|
+
? values["allowed-tools"]
|
|
52
|
+
.split(",")
|
|
53
|
+
.map((s) => s.trim())
|
|
54
|
+
.filter(Boolean)
|
|
55
|
+
: undefined,
|
|
50
56
|
};
|
|
51
57
|
}
|
|
52
58
|
|