@forwardimpact/libeval 0.1.38 → 0.1.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -68,6 +68,11 @@ export const definition = {
68
68
  description:
69
69
  "Agent-under-test turn budget (default: 50, 0 = unlimited)",
70
70
  },
71
+ "allowed-tools": {
72
+ type: "string",
73
+ description:
74
+ "Comma-separated tool allowlist for the agent-under-test (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)",
75
+ },
71
76
  },
72
77
  },
73
78
  {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.38",
3
+ "version": "0.1.41",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -77,22 +77,7 @@ export class AgentRunner {
77
77
  try {
78
78
  const iterator = this.query({
79
79
  prompt: effectiveTask,
80
- options: {
81
- cwd: this.cwd,
82
- allowedTools: this.allowedTools,
83
- maxTurns:
84
- this.maxTurns === 0 ? Number.MAX_SAFE_INTEGER : this.maxTurns,
85
- model: this.model,
86
- permissionMode: PERMISSION_MODE,
87
- allowDangerouslySkipPermissions: true,
88
- settingSources: this.settingSources,
89
- abortController,
90
- ...(this.disallowedTools.length > 0 && {
91
- disallowedTools: this.disallowedTools,
92
- }),
93
- ...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
94
- ...(this.mcpServers && { mcpServers: this.mcpServers }),
95
- },
80
+ options: this.#callOptions(abortController),
96
81
  });
97
82
  return await this.#consumeQuery(iterator);
98
83
  } finally {
@@ -112,12 +97,8 @@ export class AgentRunner {
112
97
  const iterator = this.query({
113
98
  prompt,
114
99
  options: {
100
+ ...this.#callOptions(abortController),
115
101
  resume: this.sessionId,
116
- model: this.model,
117
- permissionMode: PERMISSION_MODE,
118
- allowDangerouslySkipPermissions: true,
119
- abortController,
120
- ...(this.mcpServers && { mcpServers: this.mcpServers }),
121
102
  },
122
103
  });
123
104
  return await this.#consumeQuery(iterator);
@@ -126,6 +107,37 @@ export class AgentRunner {
126
107
  }
127
108
  }
128
109
 
110
+ /**
111
+ * Build the options passed to every SDK query() call. Shared by run() and
112
+ * resume() so the agent's configuration — cwd, tools, prompt, setting
113
+ * sources, turn budget — is identical across the session's lifetime. Only
114
+ * resume() layers `resume: this.sessionId` on top.
115
+ *
116
+ * SDK options are call-attached, not session-attached: the resumed call
117
+ * loads the prior conversation but otherwise uses whatever options this
118
+ * call passes. Omitting tool/prompt/setting options on resume causes the
119
+ * agent to silently lose its restrictions and persona between turns.
120
+ * @param {AbortController} abortController
121
+ * @returns {object}
122
+ */
123
+ #callOptions(abortController) {
124
+ return {
125
+ cwd: this.cwd,
126
+ allowedTools: this.allowedTools,
127
+ maxTurns: this.maxTurns === 0 ? Number.MAX_SAFE_INTEGER : this.maxTurns,
128
+ model: this.model,
129
+ permissionMode: PERMISSION_MODE,
130
+ allowDangerouslySkipPermissions: true,
131
+ settingSources: this.settingSources,
132
+ abortController,
133
+ ...(this.disallowedTools.length > 0 && {
134
+ disallowedTools: this.disallowedTools,
135
+ }),
136
+ ...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
137
+ ...(this.mcpServers && { mcpServers: this.mcpServers }),
138
+ };
139
+ }
140
+
129
141
  /**
130
142
  * Shared consumer for both `run()` and `resume()`. Iterates the SDK query
131
143
  * iterator, mirroring every line to the output stream / buffer / onLine
@@ -3,55 +3,109 @@
3
3
  * materialise skills and agents, copies the resulting `.claude/` into a
4
4
  * staging directory, and computes the manifest fingerprint from the lockfile.
5
5
  * Per-task copy happens later in WorkdirManager.
6
+ *
7
+ * The class takes a `spawn` seam so tests can substitute a fake child process
8
+ * without ever shelling out to a real `apm` binary. See `createApmInstaller`
9
+ * for the real-dependency wiring; `installApm` is a thin free-function wrapper
10
+ * for callers that don't need to inject anything.
6
11
  */
7
12
 
8
- import { spawn } from "node:child_process";
13
+ import { spawn as nodeSpawn } from "node:child_process";
9
14
  import { createHash } from "node:crypto";
10
15
  import { access, cp, mkdir, readFile, rm } from "node:fs/promises";
11
16
  import { join } from "node:path";
12
17
 
13
- /**
14
- * @param {import("./task-family.js").TaskFamily} family
15
- * @param {string} outputDir - The benchmark run's output directory.
16
- * @returns {Promise<{stagingDir: string, skillSetHash: string}>}
17
- */
18
- export async function installApm(family, outputDir) {
19
- const stagingDir = join(outputDir, ".apm-staging");
20
- const stagedClaude = join(stagingDir, ".claude");
21
- const sourceClaude = join(family.rootPath, ".claude");
18
+ /** Installs apm and stages `.claude/` for a task family. */
19
+ export class ApmInstaller {
20
+ /**
21
+ * @param {object} [deps]
22
+ * @param {typeof nodeSpawn} [deps.spawn] - Spawn seam (defaults to
23
+ * `node:child_process` spawn). Tests inject a fake to avoid shelling out.
24
+ */
25
+ constructor({ spawn } = {}) {
26
+ this.spawn = spawn ?? nodeSpawn;
27
+ }
22
28
 
23
- await runApmInstall(family.rootPath);
29
+ /**
30
+ * @param {import("./task-family.js").TaskFamily} family
31
+ * @param {string} outputDir - The benchmark run's output directory.
32
+ * @returns {Promise<{stagingDir: string, skillSetHash: string, judgeProfilesDir: string}>}
33
+ */
34
+ async install(family, outputDir) {
35
+ const stagingDir = join(outputDir, ".apm-staging");
36
+ const stagedClaude = join(stagingDir, ".claude");
37
+ const sourceClaude = join(family.rootPath, ".claude");
38
+ const apmYml = join(family.rootPath, "apm.yml");
24
39
 
25
- try {
26
- await access(sourceClaude);
27
- } catch {
28
- throw new Error(
29
- `apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
30
- );
31
- }
40
+ const hasApm = await access(apmYml)
41
+ .then(() => true)
42
+ .catch(() => false);
43
+
44
+ if (hasApm) {
45
+ await this.#runApmInstall(family.rootPath);
46
+ try {
47
+ await access(sourceClaude);
48
+ } catch {
49
+ throw new Error(
50
+ `apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
51
+ );
52
+ }
53
+ }
32
54
 
33
- await rm(stagingDir, { recursive: true, force: true });
34
- await cp(sourceClaude, stagedClaude, { recursive: true });
55
+ await rm(stagingDir, { recursive: true, force: true });
56
+ const hasClaudeDir = await access(sourceClaude)
57
+ .then(() => true)
58
+ .catch(() => false);
59
+ if (hasClaudeDir) {
60
+ await cp(sourceClaude, stagedClaude, { recursive: true });
61
+ } else {
62
+ await mkdir(stagedClaude, { recursive: true });
63
+ }
35
64
 
36
- // Stage the family-local judge profile outside .claude/ so it is available
37
- // to the judge but never copied into the agent-under-test's CWD.
38
- const judgeSource = join(family.rootPath, "judge.md");
39
- const judgeProfilesDir = join(stagingDir, "judge-profiles");
40
- try {
41
- await access(judgeSource);
42
- await mkdir(judgeProfilesDir, { recursive: true });
43
- await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
44
- } catch {}
65
+ // Stage the family-local judge profile outside .claude/ so it is available
66
+ // to the judge but never copied into the agent-under-test's CWD.
67
+ const judgeSource = join(family.rootPath, "judge.md");
68
+ const judgeProfilesDir = join(stagingDir, "judge-profiles");
69
+ try {
70
+ await access(judgeSource);
71
+ await mkdir(judgeProfilesDir, { recursive: true });
72
+ await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
73
+ } catch {}
45
74
 
46
- const lockPath = join(family.rootPath, "apm.lock.yaml");
47
- const lockBytes = await readFile(lockPath).catch(() => {
48
- throw new Error(`apm install did not produce apm.lock.yaml at ${lockPath}`);
49
- });
50
- const skillSetHash =
51
- "sha256:" +
52
- createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
75
+ const lockPath = join(family.rootPath, "apm.lock.yaml");
76
+ let skillSetHash = "";
77
+ try {
78
+ const lockBytes = await readFile(lockPath);
79
+ skillSetHash =
80
+ "sha256:" +
81
+ createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
82
+ } catch {
83
+ // No lockfile — family doesn't use skill packs.
84
+ }
53
85
 
54
- return { stagingDir, skillSetHash, judgeProfilesDir };
86
+ return { stagingDir, skillSetHash, judgeProfilesDir };
87
+ }
88
+
89
+ #runApmInstall(cwd) {
90
+ return new Promise((res, rej) => {
91
+ const child = this.spawn("apm", ["install", "--target", "claude"], {
92
+ cwd,
93
+ stdio: ["ignore", "pipe", "pipe"],
94
+ });
95
+ let stderr = "";
96
+ child.stdout.on("data", () => {});
97
+ child.stderr.on("data", (d) => {
98
+ stderr += d.toString();
99
+ });
100
+ child.on("error", (e) => {
101
+ rej(new Error(`failed to spawn apm: ${e.message}`));
102
+ });
103
+ child.on("close", (code) => {
104
+ if (code === 0) res();
105
+ else rej(new Error(`apm install exited ${code}: ${stderr}`));
106
+ });
107
+ });
108
+ }
55
109
  }
56
110
 
57
111
  function normalizeLf(buf) {
@@ -63,23 +117,20 @@ function normalizeLf(buf) {
63
117
  return Buffer.from(out);
64
118
  }
65
119
 
66
- function runApmInstall(cwd) {
67
- return new Promise((res, rej) => {
68
- const child = spawn("apm", ["install", "--target", "claude"], {
69
- cwd,
70
- stdio: ["ignore", "pipe", "pipe"],
71
- });
72
- let stderr = "";
73
- child.stdout.on("data", () => {});
74
- child.stderr.on("data", (d) => {
75
- stderr += d.toString();
76
- });
77
- child.on("error", (e) => {
78
- rej(new Error(`failed to spawn apm: ${e.message}`));
79
- });
80
- child.on("close", (code) => {
81
- if (code === 0) res();
82
- else rej(new Error(`apm install exited ${code}: ${stderr}`));
83
- });
84
- });
120
+ /**
121
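+ * Factory: constructs an ApmInstaller from optional injected deps; the constructor falls back to the real `node:child_process` spawn when no `spawn` seam is supplied.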
122
+ * @param {ConstructorParameters<typeof ApmInstaller>[0]} [deps]
123
+ * @returns {ApmInstaller}
124
+ */
125
+ export function createApmInstaller(deps) {
126
+ return new ApmInstaller(deps);
127
+ }
128
+
129
+ /**
130
+ * Free-function shorthand for callers that don't need to inject a spawn seam.
131
+ * @param {import("./task-family.js").TaskFamily} family
132
+ * @param {string} outputDir
133
+ */
134
+ export function installApm(family, outputDir) {
135
+ return new ApmInstaller().install(family, outputDir);
85
136
  }
@@ -0,0 +1,146 @@
1
+ /**
2
+ * Env-loader — auto-discover `.env` / `.env.local` files in a task family
3
+ * and its tasks, load them into `process.env`, and render the merged result
4
+ * into each agent CWD.
5
+ *
6
+ * Discovery paths (loaded in this order, first value per key wins):
7
+ * 1. process.env (CI secrets, shell env — never overwritten)
8
+ * 2. <family>/.env.local
9
+ * 3. <family>/.env
10
+ * 4. tasks/<id>/.env.local
11
+ * 5. tasks/<id>/.env
12
+ *
13
+ * Every discovered env file — family or task — is loaded into process.env
14
+ * AND rendered (with resolved values) into the agent working directory.
15
+ */
16
+
17
+ import { readFile, writeFile } from "node:fs/promises";
18
+ import { join } from "node:path";
19
+
20
+ const ENV_FILES = [".env.local", ".env"];
21
+
22
+ /**
23
+ * Parse a `.env` file into an array of {key, value} pairs.
24
+ * Handles KEY=VALUE, # comments, blank lines, and single/double-quoted values.
25
+ * @param {string} content
26
+ * @returns {Array<{key: string, value: string}>}
27
+ */
28
+ export function parseEnvFile(content) {
29
+ const entries = [];
30
+ for (const raw of content.split("\n")) {
31
+ const line = raw.trim();
32
+ if (!line || line.startsWith("#")) continue;
33
+ const eq = line.indexOf("=");
34
+ if (eq === -1) continue;
35
+ const key = line.slice(0, eq).trim();
36
+ if (!key) continue;
37
+ let value = line.slice(eq + 1).trim();
38
+ if (
39
+ (value.startsWith('"') && value.endsWith('"')) ||
40
+ (value.startsWith("'") && value.endsWith("'"))
41
+ ) {
42
+ value = value.slice(1, -1);
43
+ }
44
+ entries.push({ key, value });
45
+ }
46
+ return entries;
47
+ }
48
+
49
+ /**
50
+ * Read and parse an env file, returning [] if the file does not exist.
51
+ * @param {string} filePath
52
+ * @returns {Promise<Array<{key: string, value: string}>>}
53
+ */
54
+ async function readEnvFile(filePath) {
55
+ try {
56
+ const content = await readFile(filePath, "utf8");
57
+ return parseEnvFile(content);
58
+ } catch (e) {
59
+ if (e.code === "ENOENT") return [];
60
+ throw e;
61
+ }
62
+ }
63
+
64
+ /**
65
+ * Load entries into process.env. Existing keys are never overwritten.
66
+ * @param {Array<{key: string, value: string}>} entries
67
+ * @returns {string[]} every var name declared in `entries`, including names already set in process.env (those keep their existing value)
68
+ */
69
+ function applyToProcessEnv(entries) {
70
+ const names = [];
71
+ for (const { key, value } of entries) {
72
+ names.push(key);
73
+ if (process.env[key] === undefined) {
74
+ process.env[key] = value;
75
+ }
76
+ }
77
+ return names;
78
+ }
79
+
80
+ /**
81
+ * Load one env file: apply to process.env, record keys in the merged map.
82
+ * @param {string} dir
83
+ * @param {string} file
84
+ * @param {Set<string>} names
85
+ * @param {Map<string, Map<string, true>>} merged
86
+ */
87
+ async function loadOneEnvFile(dir, file, names, merged) {
88
+ const entries = await readEnvFile(join(dir, file));
89
+ if (entries.length === 0) return;
90
+ for (const name of applyToProcessEnv(entries)) names.add(name);
91
+ if (!merged.has(file)) merged.set(file, new Map());
92
+ const fileMap = merged.get(file);
93
+ for (const { key } of entries) {
94
+ if (!fileMap.has(key)) fileMap.set(key, true);
95
+ }
96
+ }
97
+
98
+ /**
99
+ * Scan directories for env files, load into process.env, and collect
100
+ * a merged key manifest per filename.
101
+ * @param {string[]} dirs
102
+ * @returns {Promise<{names: Set<string>, merged: Map<string, Map<string, true>>}>}
103
+ */
104
+ async function collectEnvEntries(dirs) {
105
+ const names = new Set();
106
+ const merged = new Map();
107
+ for (const dir of dirs) {
108
+ for (const file of ENV_FILES) {
109
+ await loadOneEnvFile(dir, file, names, merged);
110
+ }
111
+ }
112
+ return { names, merged };
113
+ }
114
+
115
+ /**
116
+ * Write resolved env files into the agent CWD and warn about empty values.
117
+ * @param {Map<string, Map<string, true>>} merged
118
+ * @param {string} agentCwd
119
+ */
120
+ async function renderEnvFiles(merged, agentCwd) {
121
+ for (const [file, keyMap] of merged) {
122
+ const keys = [...keyMap.keys()];
123
+ const resolved = keys.map((key) => `${key}=${process.env[key] ?? ""}`);
124
+ await writeFile(join(agentCwd, file), resolved.join("\n") + "\n");
125
+ const empty = keys.filter((key) => !process.env[key]);
126
+ if (empty.length > 0) {
127
+ process.stderr.write(
128
+ `libeval: env warning: ${file} declares vars with no value: ${empty.join(", ")}\n`,
129
+ );
130
+ }
131
+ }
132
+ }
133
+
134
+ /**
135
+ * Discover `.env` / `.env.local` in one or more directories, load them
136
+ * into process.env, and render the resolved values into the agent CWD.
137
+ *
138
+ * @param {string[]} dirs - Directories to scan (family root, task dir, etc.)
139
+ * @param {string} agentCwd - Agent working directory to render into.
140
+ * @returns {Promise<string[]>} All var names discovered (for redaction).
141
+ */
142
+ export async function loadEnv(dirs, agentCwd) {
143
+ const { names, merged } = await collectEnvEntries(dirs);
144
+ await renderEnvFiles(merged, agentCwd);
145
+ return [...names];
146
+ }
@@ -29,7 +29,7 @@ const JUDGE_VERDICT_SHAPE = z.object({
29
29
 
30
30
  const PROFILES_SHAPE = z.object({
31
31
  agent: z.union([z.string(), z.null()]),
32
- supervisor: z.null(),
32
+ supervisor: z.union([z.string(), z.null()]),
33
33
  judge: z.union([z.string(), z.null()]),
34
34
  });
35
35
 
@@ -48,8 +48,8 @@ const COMMON_FIELDS = {
48
48
  profiles: PROFILES_SHAPE,
49
49
  model: z.object({
50
50
  agent: z.string(),
51
- supervisor: z.string(),
52
- judge: z.string(),
51
+ supervisor: z.string().optional(),
52
+ judge: z.string().optional(),
53
53
  }),
54
54
  skillSetHash: z.string(),
55
55
  familyRevision: z.string(),
@@ -65,7 +65,7 @@ const HAPPY_RECORD = z.object({
65
65
  ...COMMON_FIELDS,
66
66
  scoring: SCORING_SHAPE,
67
67
  submission: z.string(),
68
- judgeVerdict: JUDGE_VERDICT_SHAPE,
68
+ judgeVerdict: JUDGE_VERDICT_SHAPE.optional(),
69
69
  agentTracePath: z.string(),
70
70
  supervisorTracePath: z.string(),
71
71
  judgeTracePath: z.string(),
@@ -15,20 +15,29 @@
15
15
  */
16
16
 
17
17
  import { createReadStream, createWriteStream } from "node:fs";
18
- import { access, constants, mkdir, readFile, unlink } from "node:fs/promises";
18
+ import { mkdir, readFile, unlink } from "node:fs/promises";
19
19
  import { createInterface } from "node:readline";
20
20
  import { join, resolve as resolvePath } from "node:path";
21
21
 
22
- import { createRedactor } from "../redaction.js";
22
+ import { DEFAULT_ENV_ALLOWLIST, createRedactor } from "../redaction.js";
23
23
  import { createSupervisor } from "../supervisor.js";
24
- import { installApm } from "./apm-installer.js";
24
+ import { installApm as defaultInstallApm } from "./apm-installer.js";
25
25
  import { runJudge } from "./judge.js";
26
26
  import { validateResultRecord } from "./result.js";
27
27
  import { runScoring } from "./scorer.js";
28
28
  import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
29
29
  import { createWorkdirManager } from "./workdir.js";
30
30
 
31
- const BASE_TOOLS = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];
31
+ const BASE_TOOLS = [
32
+ "Bash",
33
+ "Read",
34
+ "Glob",
35
+ "Grep",
36
+ "Write",
37
+ "Edit",
38
+ "Agent",
39
+ "TodoWrite",
40
+ ];
32
41
 
33
42
  /** Sole orchestrator for a task-family benchmark run. */
34
43
  export class BenchmarkRunner {
@@ -42,6 +51,7 @@ export class BenchmarkRunner {
42
51
  * @param {string} opts.judgeModel
43
52
  * @param {{agent?: string, judge?: string}} [opts.profiles]
44
53
  * @param {Function} opts.query - SDK query (injected for testability).
54
+ * @param {string[]} [opts.allowedTools] - Agent tool allowlist (default: BASE_TOOLS).
45
55
  * @param {number} [opts.maxTurns] - Agent-under-test turn budget.
46
56
  * @param {number} [opts.termGraceMs] - SIGTERM→SIGKILL grace (ms) for the per-task process group.
47
57
  * @param {Function} [opts.runAgent] - Test seam: replaces the agent-under-test
@@ -54,6 +64,10 @@ export class BenchmarkRunner {
54
64
  * @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
55
65
  * contract as `runJudge(task, workdir, scoring, deps)`. Internal testing
56
66
  * only.
67
+ * @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
68
+ * Same contract as `installApm(family, outputDir)`. Lets tests inject a
69
+ * fake `apm` spawn (or skip the install entirely) so the suite never
70
+ * shells out to a real `apm` binary. Internal testing only.
57
71
  */
58
72
  constructor({
59
73
  family,
@@ -64,20 +78,20 @@ export class BenchmarkRunner {
64
78
  judgeModel,
65
79
  profiles,
66
80
  query,
81
+ allowedTools,
67
82
  maxTurns,
68
83
  termGraceMs,
69
84
  // Test seams — default to the real implementations.
70
85
  runAgent,
71
86
  runScoring: runScoringHook,
72
87
  runJudge: runJudgeHook,
88
+ installApm: installApmHook,
73
89
  }) {
74
90
  if (!family) throw new Error("family is required");
75
91
  if (!Number.isInteger(runs) || runs < 1)
76
92
  throw new Error("runs must be an integer ≥ 1");
77
93
  if (!output) throw new Error("output is required");
78
94
  if (!agentModel) throw new Error("agentModel is required");
79
- if (!supervisorModel) throw new Error("supervisorModel is required");
80
- if (!judgeModel) throw new Error("judgeModel is required");
81
95
  if (!query) throw new Error("query is required");
82
96
  this.familyInput = family;
83
97
  this.runs = runs;
@@ -85,6 +99,7 @@ export class BenchmarkRunner {
85
99
  this.agentModel = agentModel;
86
100
  this.supervisorModel = supervisorModel;
87
101
  this.judgeModel = judgeModel;
102
+ this.allowedTools = allowedTools ?? BASE_TOOLS;
88
103
  this.profiles = {
89
104
  agent: profiles?.agent ?? null,
90
105
  judge: profiles?.judge ?? null,
@@ -95,6 +110,7 @@ export class BenchmarkRunner {
95
110
  this._runAgentHook = runAgent ?? null;
96
111
  this._runScoringHook = runScoringHook ?? runScoring;
97
112
  this._runJudgeHook = runJudgeHook ?? runJudge;
113
+ this._installApmHook = installApmHook ?? defaultInstallApm;
98
114
  }
99
115
 
100
116
  /**
@@ -108,15 +124,10 @@ export class BenchmarkRunner {
108
124
  : this.familyInput;
109
125
 
110
126
  await mkdir(this.output, { recursive: true });
111
- const { stagingDir, skillSetHash, judgeProfilesDir } = await installApm(
112
- family,
113
- this.output,
114
- );
127
+ const { stagingDir, skillSetHash, judgeProfilesDir } =
128
+ await this._installApmHook(family, this.output);
115
129
 
116
130
  const tasks = family.tasks();
117
- for (const task of tasks) {
118
- await assertPreflightExecutable(task);
119
- }
120
131
  if (this.profiles.judge) {
121
132
  await assertJudgeProfileStaged(
122
133
  family,
@@ -129,6 +140,7 @@ export class BenchmarkRunner {
129
140
  stagingDir,
130
141
  runOutputDir: this.output,
131
142
  termGraceMs: this.termGraceMs,
143
+ familyRootPath: family.rootPath,
132
144
  });
133
145
 
134
146
  const resultsPath = join(this.output, "results.jsonl");
@@ -178,33 +190,38 @@ export class BenchmarkRunner {
178
190
  port: workdir.port,
179
191
  runDir: workdir.runDir,
180
192
  });
181
- const judgeContext = await this.#buildJudgeContext(
182
- task,
183
- workdir,
184
- skillSetHash,
185
- );
186
- const judgeVerdict = await this._runJudgeHook(
187
- task,
188
- workdir,
189
- scoring,
190
- {
191
- query: this.query,
192
- model: this.judgeModel,
193
- judgeProfile: this.profiles.judge ?? undefined,
194
- profilesDir: judgeProfilesDir,
195
- },
196
- judgeContext,
197
- );
193
+ let judgeVerdict = null;
194
+ if (task.paths.judge) {
195
+ const judgeContext = await this.#buildJudgeContext(
196
+ task,
197
+ workdir,
198
+ skillSetHash,
199
+ );
200
+ judgeVerdict = await this._runJudgeHook(
201
+ task,
202
+ workdir,
203
+ scoring,
204
+ {
205
+ query: this.query,
206
+ model: this.judgeModel,
207
+ judgeProfile: this.profiles.judge ?? undefined,
208
+ profilesDir: judgeProfilesDir,
209
+ },
210
+ judgeContext,
211
+ );
212
+ }
213
+ const verdict =
214
+ scoring.verdict === "pass" &&
215
+ (judgeVerdict === null || judgeVerdict.verdict === "pass")
216
+ ? "pass"
217
+ : "fail";
198
218
  const record = {
199
219
  taskId: task.id,
200
220
  runIndex,
201
- verdict:
202
- scoring.verdict === "pass" && judgeVerdict.verdict === "pass"
203
- ? "pass"
204
- : "fail",
221
+ verdict,
205
222
  scoring,
206
223
  submission,
207
- judgeVerdict,
224
+ ...(judgeVerdict && { judgeVerdict }),
208
225
  costUsd,
209
226
  turns,
210
227
  agentTracePath: workdir.agentTracePath,
@@ -262,6 +279,9 @@ export class BenchmarkRunner {
262
279
  async #runAgent(task, workdir) {
263
280
  const combinedPath = join(workdir.runDir, ".combined.ndjson");
264
281
  const combinedStream = createWriteStream(combinedPath);
282
+ const supervisorInstructions = task.paths.supervisor
283
+ ? await readFile(task.paths.supervisor, "utf8").catch(() => null)
284
+ : null;
265
285
  const supervisor = createSupervisor({
266
286
  supervisorCwd: workdir.cwd,
267
287
  agentCwd: workdir.cwd,
@@ -270,9 +290,12 @@ export class BenchmarkRunner {
270
290
  agentModel: this.agentModel,
271
291
  supervisorModel: this.supervisorModel,
272
292
  maxTurns: this.maxTurns ?? 50,
273
- allowedTools: BASE_TOOLS,
293
+ allowedTools: this.allowedTools,
274
294
  ...(this.profiles.agent && { agentProfile: this.profiles.agent }),
275
- redactor: createRedactor(),
295
+ ...(supervisorInstructions && { taskAmend: supervisorInstructions }),
296
+ redactor: createRedactor({
297
+ allowlist: [...DEFAULT_ENV_ALLOWLIST, ...(workdir.envNames ?? [])],
298
+ }),
276
299
  });
277
300
  const instructions = await readFile(task.paths.instructions, "utf8");
278
301
  let agentError = null;
@@ -372,23 +395,6 @@ async function writeRecord(stream, record) {
372
395
  });
373
396
  }
374
397
 
375
- /**
376
- * Pre-flight install gate. Throws synchronously if any task's preflight
377
- * script is missing or not executable — design § Pre-flight contract:
378
- * "The harness fails the family at install if any task's preflight script
379
- * is missing or non-executable, before any agent session starts."
380
- */
381
- async function assertPreflightExecutable(task) {
382
- const path = join(task.paths.hooks, "preflight.sh");
383
- try {
384
- await access(path, constants.X_OK);
385
- } catch (e) {
386
- throw new Error(
387
- `task ${task.id}: preflight script not executable at ${path} (${e.code ?? e.message})`,
388
- );
389
- }
390
- }
391
-
392
398
  /**
393
399
  * Split the combined supervisor trace into agent and supervisor files, and
394
400
  * extract cost, turn count, and submission in a single pass. Agent-source
@@ -28,8 +28,11 @@ import { join } from "node:path";
28
28
  * @returns {Promise<ScoringResult>}
29
29
  */
30
30
  export function runScoring(task, ctx) {
31
+ if (!task.paths.score) {
32
+ return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
33
+ }
31
34
  return new Promise((res, rej) => {
32
- const script = join(task.paths.hooks, "score.sh");
35
+ const script = task.paths.score;
33
36
  const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
34
37
 
35
38
  // Bun's child_process pipe setup for fd >= 3 is racy under load (it
@@ -5,7 +5,7 @@
5
5
  * .claude/ # pre-staged skills + agents (P1)
6
6
  * tasks/<task_name>/
7
7
  * agent.task.md
8
- * supervisor.task.md # preserved for v2; not read in v1
8
+ * supervisor.task.md # optional; appended to the task as supervisor context
9
9
  * judge.task.md
10
10
  * hooks/ # harness-only; never copied to agent CWD
11
11
  * preflight.sh
@@ -23,6 +23,7 @@ import { spawn } from "node:child_process";
23
23
  import { createHash } from "node:crypto";
24
24
  import {
25
25
  access,
26
+ constants,
26
27
  lstat,
27
28
  mkdtemp,
28
29
  readdir,
@@ -100,13 +101,20 @@ async function discoverTasks(rootPath) {
100
101
  for (const entry of entries) {
101
102
  if (!entry.isDirectory()) continue;
102
103
  const taskDir = join(tasksRoot, entry.name);
104
+ const supervisorPath = join(taskDir, "supervisor.task.md");
105
+ const judgePath = join(taskDir, "judge.task.md");
106
+ const preflightPath = join(taskDir, "hooks", "preflight.sh");
107
+ const scorePath = join(taskDir, "hooks", "score.sh");
103
108
  tasks.push({
104
109
  id: entry.name,
105
110
  paths: {
111
+ taskDir,
106
112
  instructions: join(taskDir, "agent.task.md"),
107
- supervisor: join(taskDir, "supervisor.task.md"),
108
- judge: join(taskDir, "judge.task.md"),
113
+ supervisor: (await fileExists(supervisorPath)) ? supervisorPath : null,
114
+ judge: (await fileExists(judgePath)) ? judgePath : null,
109
115
  hooks: join(taskDir, "hooks"),
116
+ preflight: (await fileExecutable(preflightPath)) ? preflightPath : null,
117
+ score: (await fileExecutable(scorePath)) ? scorePath : null,
110
118
  specs: join(taskDir, "specs"),
111
119
  workdir: join(taskDir, "workdir"),
112
120
  },
@@ -116,6 +124,24 @@ async function discoverTasks(rootPath) {
116
124
  return tasks;
117
125
  }
118
126
 
127
+ async function fileExists(path) {
128
+ try {
129
+ await access(path);
130
+ return true;
131
+ } catch {
132
+ return false;
133
+ }
134
+ }
135
+
136
+ async function fileExecutable(path) {
137
+ try {
138
+ await access(path, constants.X_OK);
139
+ return true;
140
+ } catch {
141
+ return false;
142
+ }
143
+ }
144
+
119
145
  /**
120
146
  * Canonical-tree hash per design § Family revision algorithm:
121
147
  * list regular files (excluding .git/, node_modules/)
@@ -210,7 +236,7 @@ function run(cmd, args) {
210
236
  /**
211
237
  * @typedef {object} Task
212
238
  * @property {string} id - Task name (directory name under tasks/)
213
- * @property {{instructions: string, supervisor: string, judge: string, hooks: string, specs: string, workdir: string}} paths
239
+ * @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null, score: string|null, specs: string, workdir: string}} paths
214
240
  */
215
241
 
216
242
  /**
@@ -13,6 +13,8 @@ import { createServer } from "node:net";
13
13
  import { connect } from "node:net";
14
14
  import { join } from "node:path";
15
15
 
16
+ import { loadEnv } from "./env-loader.js";
17
+
16
18
  const DEFAULT_TERM_GRACE_MS = 5_000;
17
19
 
18
20
  /**
@@ -25,6 +27,7 @@ const DEFAULT_TERM_GRACE_MS = 5_000;
25
27
  * @property {string} agentTracePath
26
28
  * @property {string} supervisorTracePath
27
29
  * @property {string} judgeTracePath
30
+ * @property {string[]} [envNames] - Env var names loaded from .env files.
28
31
  * @property {{phase: string, message: string, exitCode: number}} [preflightError]
29
32
  */
30
33
 
@@ -35,12 +38,13 @@ export class WorkdirManager {
35
38
  * @param {string} deps.stagingDir - Output of `installApm(...)`.
36
39
  * @param {string} deps.runOutputDir - Root run-output directory (parent of `runs/`).
37
40
  */
38
- constructor({ stagingDir, runOutputDir, termGraceMs }) {
41
+ constructor({ stagingDir, runOutputDir, termGraceMs, familyRootPath }) {
39
42
  if (!stagingDir) throw new Error("stagingDir is required");
40
43
  if (!runOutputDir) throw new Error("runOutputDir is required");
41
44
  this.stagingDir = stagingDir;
42
45
  this.runOutputDir = runOutputDir;
43
46
  this.termGraceMs = termGraceMs ?? DEFAULT_TERM_GRACE_MS;
47
+ this.familyRootPath = familyRootPath ?? null;
44
48
  }
45
49
 
46
50
  /**
@@ -67,13 +71,20 @@ export class WorkdirManager {
67
71
  recursive: true,
68
72
  });
69
73
 
74
+ const envDirs = [
75
+ ...(this.familyRootPath ? [this.familyRootPath] : []),
76
+ ...(task.paths.taskDir ? [task.paths.taskDir] : []),
77
+ ];
78
+ const envNames = envDirs.length > 0 ? await loadEnv(envDirs, cwd) : [];
79
+
70
80
  const port = await allocatePort();
71
81
  const agentTracePath = join(runDir, "agent.ndjson");
72
82
  const supervisorTracePath = join(runDir, "supervisor.ndjson");
73
83
  const judgeTracePath = join(runDir, "judge.ndjson");
74
84
 
75
- const preflightScript = join(task.paths.hooks, "preflight.sh");
76
- const preflight = await runPreflight(preflightScript, cwd, port);
85
+ const preflight = task.paths.preflight
86
+ ? await runPreflight(task.paths.preflight, cwd, port)
87
+ : { pgid: 0 };
77
88
 
78
89
  return {
79
90
  cwd,
@@ -84,6 +95,7 @@ export class WorkdirManager {
84
95
  agentTracePath,
85
96
  supervisorTracePath,
86
97
  judgeTracePath,
98
+ envNames,
87
99
  ...(preflight.error && { preflightError: preflight.error }),
88
100
  };
89
101
  }
@@ -47,6 +47,12 @@ function parseRunOptions(values) {
47
47
  judge: values["judge-profile"] ?? null,
48
48
  },
49
49
  maxTurns: parseMaxTurns(values["max-turns"]),
50
+ allowedTools: values["allowed-tools"]
51
+ ? values["allowed-tools"]
52
+ .split(",")
53
+ .map((s) => s.trim())
54
+ .filter(Boolean)
55
+ : undefined,
50
56
  };
51
57
  }
52
58
 
package/src/supervisor.js CHANGED
@@ -104,7 +104,6 @@ export class Supervisor {
104
104
  */
105
105
  async run(task) {
106
106
  const initialTask = this.taskAmend ? `${task}\n\n${this.taskAmend}` : task;
107
- this.taskContext = initialTask;
108
107
  this.currentSource = "supervisor";
109
108
  this.currentTurn = 0;
110
109
  let supervisorResult = await this.supervisorRunner.run(initialTask);
@@ -252,22 +251,6 @@ export class Supervisor {
252
251
  return { type: "continue" };
253
252
  }
254
253
 
255
- /**
256
- * Resume the supervisor runner, falling back to a fresh session when the
257
- * SDK reports that the conversation no longer exists (e.g. session GC'd
258
- * while the agent was running). The fresh session includes the original
259
- * task context so the supervisor can still evaluate the agent's work.
260
- * @param {string} prompt
261
- * @returns {Promise<object>}
262
- */
263
- async #resumeSupervisor(prompt) {
264
- const result = await this.supervisorRunner.resume(prompt);
265
- if (result.error && isSessionNotFound(result.error)) {
266
- return this.supervisorRunner.run(`${this.taskContext}\n\n${prompt}`);
267
- }
268
- return result;
269
- }
270
-
271
254
  /**
272
255
  * If the agent has an unanswered ask, drain reminders and return a
273
256
  * formatted relay string. Returns null when no relay is needed.
@@ -295,7 +278,7 @@ export class Supervisor {
295
278
  this.currentSource = "supervisor";
296
279
  this.ctx.redirect = null;
297
280
 
298
- await this.#resumeSupervisor(
281
+ await this.supervisorRunner.resume(
299
282
  `The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` +
300
283
  `Review and use your tools if action is needed.`,
301
284
  );
@@ -333,7 +316,7 @@ export class Supervisor {
333
316
  `Review and decide how to proceed.`
334
317
  : `The agent reported:\n\n${agentTranscript}\n\nReview the agent's work and decide how to proceed.`;
335
318
 
336
- let supervisorResult = await this.#resumeSupervisor(reviewPrompt);
319
+ let supervisorResult = await this.supervisorRunner.resume(reviewPrompt);
337
320
 
338
321
  if (supervisorResult.error) {
339
322
  this.emitSummary({ success: false, turns: turn });
@@ -354,7 +337,7 @@ export class Supervisor {
354
337
  if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
355
338
  const reminders = this.messageBus.drain("supervisor");
356
339
  if (reminders.length > 0) {
357
- supervisorResult = await this.#resumeSupervisor(
340
+ supervisorResult = await this.supervisorRunner.resume(
358
341
  formatMessages(reminders),
359
342
  );
360
343
  if (this.ctx.concluded) {
@@ -617,8 +600,3 @@ export function createSupervisor({
617
600
  });
618
601
  return supervisor;
619
602
  }
620
-
621
- function isSessionNotFound(error) {
622
- const msg = error?.message ?? String(error);
623
- return msg.includes("No conversation found with session ID");
624
- }