npm - @forwardimpact/libeval - Versions diffs - 0.1.38 → 0.1.41 - Mend

@forwardimpact/libeval 0.1.38 → 0.1.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/bin/fit-benchmark.js +5 -0
package/package.json +1 -1
package/src/agent-runner.js +33 -21
package/src/benchmark/apm-installer.js +107 -56
package/src/benchmark/env-loader.js +146 -0
package/src/benchmark/result.js +4 -4
package/src/benchmark/runner.js +60 -54
package/src/benchmark/scorer.js +4 -1
package/src/benchmark/task-family.js +30 -4
package/src/benchmark/workdir.js +15 -3
package/src/commands/benchmark-run.js +6 -0
package/src/supervisor.js +3 -25

package/bin/fit-benchmark.js CHANGED Viewed

@@ -68,6 +68,11 @@ export const definition = {
           description:
             "Agent-under-test turn budget (default: 50, 0 = unlimited)",
         },
+        "allowed-tools": {
+          type: "string",
+          description:
+            "Comma-separated tool allowlist for the agent-under-test (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)",
+        },
       },
     },
     {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.38",
+  "version": "0.1.41",
   "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
   "keywords": [
     "eval",

package/src/agent-runner.js CHANGED Viewed

@@ -77,22 +77,7 @@ export class AgentRunner {
     try {
       const iterator = this.query({
         prompt: effectiveTask,
-        options: {
-          cwd: this.cwd,
-          allowedTools: this.allowedTools,
-          maxTurns:
-            this.maxTurns === 0 ? Number.MAX_SAFE_INTEGER : this.maxTurns,
-          model: this.model,
-          permissionMode: PERMISSION_MODE,
-          allowDangerouslySkipPermissions: true,
-          settingSources: this.settingSources,
-          abortController,
-          ...(this.disallowedTools.length > 0 && {
-            disallowedTools: this.disallowedTools,
-          }),
-          ...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
-          ...(this.mcpServers && { mcpServers: this.mcpServers }),
-        },
+        options: this.#callOptions(abortController),
       });
       return await this.#consumeQuery(iterator);
     } finally {
@@ -112,12 +97,8 @@ export class AgentRunner {
       const iterator = this.query({
         prompt,
         options: {
+          ...this.#callOptions(abortController),
           resume: this.sessionId,
-          model: this.model,
-          permissionMode: PERMISSION_MODE,
-          allowDangerouslySkipPermissions: true,
-          abortController,
-          ...(this.mcpServers && { mcpServers: this.mcpServers }),
         },
       });
       return await this.#consumeQuery(iterator);
@@ -126,6 +107,37 @@ export class AgentRunner {
     }
   }
+  /**
+   * Build the options passed to every SDK query() call. Shared by run() and
+   * resume() so the agent's configuration — cwd, tools, prompt, setting
+   * sources, turn budget — is identical across the session's lifetime. Only
+   * resume() layers `resume: this.sessionId` on top.
+   *
+   * SDK options are call-attached, not session-attached: the resumed call
+   * loads the prior conversation but otherwise uses whatever options this
+   * call passes. Omitting tool/prompt/setting options on resume causes the
+   * agent to silently lose its restrictions and persona between turns.
+   * @param {AbortController} abortController
+   * @returns {object}
+   */
+  #callOptions(abortController) {
+    return {
+      cwd: this.cwd,
+      allowedTools: this.allowedTools,
+      maxTurns: this.maxTurns === 0 ? Number.MAX_SAFE_INTEGER : this.maxTurns,
+      model: this.model,
+      permissionMode: PERMISSION_MODE,
+      allowDangerouslySkipPermissions: true,
+      settingSources: this.settingSources,
+      abortController,
+      ...(this.disallowedTools.length > 0 && {
+        disallowedTools: this.disallowedTools,
+      }),
+      ...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
+      ...(this.mcpServers && { mcpServers: this.mcpServers }),
+    };
+  }
   /**
    * Shared consumer for both `run()` and `resume()`. Iterates the SDK query
    * iterator, mirroring every line to the output stream / buffer / onLine

package/src/benchmark/apm-installer.js CHANGED Viewed

@@ -3,55 +3,109 @@
  * materialise skills and agents, copies the resulting `.claude/` into a
  * staging directory, and computes the manifest fingerprint from the lockfile.
  * Per-task copy happens later in WorkdirManager.
+ *
+ * The class takes a `spawn` seam so tests can substitute a fake child process
+ * without ever shelling out to a real `apm` binary. See `createApmInstaller`
+ * for the real-dependency wiring; `installApm` is a thin free-function wrapper
+ * for callers that don't need to inject anything.
  */
-import { spawn } from "node:child_process";
+import { spawn as nodeSpawn } from "node:child_process";
 import { createHash } from "node:crypto";
 import { access, cp, mkdir, readFile, rm } from "node:fs/promises";
 import { join } from "node:path";
-/**
- * @param {import("./task-family.js").TaskFamily} family
- * @param {string} outputDir - The benchmark run's output directory.
- * @returns {Promise<{stagingDir: string, skillSetHash: string}>}
- */
-export async function installApm(family, outputDir) {
-  const stagingDir = join(outputDir, ".apm-staging");
-  const stagedClaude = join(stagingDir, ".claude");
-  const sourceClaude = join(family.rootPath, ".claude");
+/** Runs `apm install` (only when the family has an `apm.yml`) and stages `.claude/` for a task family. */
+export class ApmInstaller {
+  /**
+   * @param {object} [deps]
+   * @param {typeof nodeSpawn} [deps.spawn] - Spawn seam (defaults to
+   *   `node:child_process` spawn). Tests inject a fake to avoid shelling out.
+   */
+  constructor({ spawn } = {}) {
+    this.spawn = spawn ?? nodeSpawn;
+  }
-  await runApmInstall(family.rootPath);
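+  /**
+   * Stage the family's `.claude/` directory (and `judge.md`, when present)
+   * under `<outputDir>/.apm-staging`. `apm install` runs only when the family
+   * has an `apm.yml`; a family without `.claude/` gets an empty staged one.
+   * @param {import("./task-family.js").TaskFamily} family
+   * @param {string} outputDir - The benchmark run's output directory.
+   * @returns {Promise<{stagingDir: string, skillSetHash: string, judgeProfilesDir: string}>}
+   *   `skillSetHash` is `""` when `apm.lock.yaml` cannot be read.
+   */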
+  async install(family, outputDir) {
+    const stagingDir = join(outputDir, ".apm-staging");
+    const stagedClaude = join(stagingDir, ".claude");
+    const sourceClaude = join(family.rootPath, ".claude");
+    const apmYml = join(family.rootPath, "apm.yml");
-  try {
-    await access(sourceClaude);
-  } catch {
-    throw new Error(
-      `apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
-    );
-  }
+    const hasApm = await access(apmYml)
+      .then(() => true)
+      .catch(() => false);
+    if (hasApm) {
+      await this.#runApmInstall(family.rootPath);
+      try {
+        await access(sourceClaude);
+      } catch {
+        throw new Error(
+          `apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
+        );
+      }
+    }
-  await rm(stagingDir, { recursive: true, force: true });
-  await cp(sourceClaude, stagedClaude, { recursive: true });
+    await rm(stagingDir, { recursive: true, force: true });
+    const hasClaudeDir = await access(sourceClaude)
+      .then(() => true)
+      .catch(() => false);
+    if (hasClaudeDir) {
+      await cp(sourceClaude, stagedClaude, { recursive: true });
+    } else {
+      await mkdir(stagedClaude, { recursive: true });
+    }
-  // Stage the family-local judge profile outside .claude/ so it is available
-  // to the judge but never copied into the agent-under-test's CWD.
-  const judgeSource = join(family.rootPath, "judge.md");
-  const judgeProfilesDir = join(stagingDir, "judge-profiles");
-  try {
-    await access(judgeSource);
-    await mkdir(judgeProfilesDir, { recursive: true });
-    await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
-  } catch {}
+    // Stage the family-local judge profile outside .claude/ so it is available
+    // to the judge but never copied into the agent-under-test's CWD.
+    const judgeSource = join(family.rootPath, "judge.md");
+    const judgeProfilesDir = join(stagingDir, "judge-profiles");
+    try {
+      await access(judgeSource);
+      await mkdir(judgeProfilesDir, { recursive: true });
+      await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
+    } catch {}
-  const lockPath = join(family.rootPath, "apm.lock.yaml");
-  const lockBytes = await readFile(lockPath).catch(() => {
-    throw new Error(`apm install did not produce apm.lock.yaml at ${lockPath}`);
-  });
-  const skillSetHash =
-    "sha256:" +
-    createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
+    const lockPath = join(family.rootPath, "apm.lock.yaml");
+    let skillSetHash = "";
+    try {
+      const lockBytes = await readFile(lockPath);
+      skillSetHash =
+        "sha256:" +
+        createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
+    } catch {
+      // Lockfile missing or unreadable — treat the family as not using skill packs.
+    }
-  return { stagingDir, skillSetHash, judgeProfilesDir };
+    return { stagingDir, skillSetHash, judgeProfilesDir };
+  }
+  #runApmInstall(cwd) {
+    return new Promise((res, rej) => {
+      const child = this.spawn("apm", ["install", "--target", "claude"], {
+        cwd,
+        stdio: ["ignore", "pipe", "pipe"],
+      });
+      let stderr = "";
+      child.stdout.on("data", () => {});
+      child.stderr.on("data", (d) => {
+        stderr += d.toString();
+      });
+      child.on("error", (e) => {
+        rej(new Error(`failed to spawn apm: ${e.message}`));
+      });
+      child.on("close", (code) => {
+        if (code === 0) res();
+        else rej(new Error(`apm install exited ${code}: ${stderr}`));
+      });
+    });
+  }
 }
 function normalizeLf(buf) {
@@ -63,23 +117,20 @@ function normalizeLf(buf) {
   return Buffer.from(out);
 }
-function runApmInstall(cwd) {
-  return new Promise((res, rej) => {
-    const child = spawn("apm", ["install", "--target", "claude"], {
-      cwd,
-      stdio: ["ignore", "pipe", "pipe"],
-    });
-    let stderr = "";
-    child.stdout.on("data", () => {});
-    child.stderr.on("data", (d) => {
-      stderr += d.toString();
-    });
-    child.on("error", (e) => {
-      rej(new Error(`failed to spawn apm: ${e.message}`));
-    });
-    child.on("close", (code) => {
-      if (code === 0) res();
-      else rej(new Error(`apm install exited ${code}: ${stderr}`));
-    });
-  });
+/**
+ * Factory function — builds an `ApmInstaller`; any dependency not supplied in `deps` falls back to its real implementation.
+ * @param {ConstructorParameters<typeof ApmInstaller>[0]} [deps]
+ * @returns {ApmInstaller}
+ */
+export function createApmInstaller(deps) {
+  return new ApmInstaller(deps);
+}
+/**
+ * Free-function shorthand for callers that don't need to inject a spawn seam.
+ * @param {import("./task-family.js").TaskFamily} family
+ * @param {string} outputDir
+ */
+export function installApm(family, outputDir) {
+  return new ApmInstaller().install(family, outputDir);
 }

package/src/benchmark/env-loader.js ADDED Viewed

@@ -0,0 +1,146 @@
+/**
+ * Env-loader — auto-discover `.env` / `.env.local` files in a task family
+ * and its tasks, load them into `process.env`, and render the merged result
+ * into each agent CWD.
+ *
+ * Discovery paths (loaded in this order, first value per key wins):
+ *   1. process.env  (CI secrets, shell env — never overwritten)
+ *   2. <family>/.env.local
+ *   3. <family>/.env
+ *   4. tasks/<id>/.env.local
+ *   5. tasks/<id>/.env
+ *
+ * Every discovered env file — family or task — is loaded into process.env
+ * AND rendered (with resolved values) into the agent working directory.
+ */
+import { readFile, writeFile } from "node:fs/promises";
+import { join } from "node:path";
+const ENV_FILES = [".env.local", ".env"];
+/**
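+ * Parse a `.env` file into an array of {key, value} pairs.
+ * Handles KEY=VALUE lines, full-line # comments, blank lines, and values
+ * wrapped in matching single or double quotes (the quotes are stripped).
+ * Lines without `=` or with an empty key are skipped; inline comments and
+ * escape sequences are not interpreted.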
+ * @param {string} content
+ * @returns {Array<{key: string, value: string}>}
+ */
+export function parseEnvFile(content) {
+  const entries = [];
+  for (const raw of content.split("\n")) {
+    const line = raw.trim();
+    if (!line || line.startsWith("#")) continue;
+    const eq = line.indexOf("=");
+    if (eq === -1) continue;
+    const key = line.slice(0, eq).trim();
+    if (!key) continue;
+    let value = line.slice(eq + 1).trim();
+    if (
+      (value.startsWith('"') && value.endsWith('"')) ||
+      (value.startsWith("'") && value.endsWith("'"))
+    ) {
+      value = value.slice(1, -1);
+    }
+    entries.push({ key, value });
+  }
+  return entries;
+}
+/**
+ * Read and parse an env file, returning [] if the file does not exist.
+ * @param {string} filePath
+ * @returns {Promise<Array<{key: string, value: string}>>}
+ */
+async function readEnvFile(filePath) {
+  try {
+    const content = await readFile(filePath, "utf8");
+    return parseEnvFile(content);
+  } catch (e) {
+    if (e.code === "ENOENT") return [];
+    throw e;
+  }
+}
+/**
+ * Load entries into process.env. Existing keys are never overwritten.
+ * @param {Array<{key: string, value: string}>} entries
+ * @returns {string[]} every var name in `entries`, including names left untouched because process.env already defined them
+ */
+function applyToProcessEnv(entries) {
+  const names = [];
+  for (const { key, value } of entries) {
+    names.push(key);
+    if (process.env[key] === undefined) {
+      process.env[key] = value;
+    }
+  }
+  return names;
+}
+/**
+ * Load one env file: apply to process.env, record keys in the merged map.
+ * @param {string} dir
+ * @param {string} file
+ * @param {Set<string>} names
+ * @param {Map<string, Map<string, true>>} merged
+ */
+async function loadOneEnvFile(dir, file, names, merged) {
+  const entries = await readEnvFile(join(dir, file));
+  if (entries.length === 0) return;
+  for (const name of applyToProcessEnv(entries)) names.add(name);
+  if (!merged.has(file)) merged.set(file, new Map());
+  const fileMap = merged.get(file);
+  for (const { key } of entries) {
+    if (!fileMap.has(key)) fileMap.set(key, true);
+  }
+}
+/**
+ * Scan directories for env files, load into process.env, and collect
+ * a merged key manifest per filename.
+ * @param {string[]} dirs
+ * @returns {Promise<{names: Set<string>, merged: Map<string, Map<string, true>>}>}
+ */
+async function collectEnvEntries(dirs) {
+  const names = new Set();
+  const merged = new Map();
+  for (const dir of dirs) {
+    for (const file of ENV_FILES) {
+      await loadOneEnvFile(dir, file, names, merged);
+    }
+  }
+  return { names, merged };
+}
+/**
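+ * Write one resolved env file per discovered filename into the agent CWD
+ * (values are read back from process.env) and warn on stderr about any
+ * declared var that is unset or empty.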
+ * @param {Map<string, Map<string, true>>} merged
+ * @param {string} agentCwd
+ */
+async function renderEnvFiles(merged, agentCwd) {
+  for (const [file, keyMap] of merged) {
+    const keys = [...keyMap.keys()];
+    const resolved = keys.map((key) => `${key}=${process.env[key] ?? ""}`);
+    await writeFile(join(agentCwd, file), resolved.join("\n") + "\n");
+    const empty = keys.filter((key) => !process.env[key]);
+    if (empty.length > 0) {
+      process.stderr.write(
+        `libeval: env warning: ${file} declares vars with no value: ${empty.join(", ")}\n`,
+      );
+    }
+  }
+}
+/**
+ * Discover `.env` / `.env.local` in one or more directories, load them
+ * into process.env, and render the resolved values into the agent CWD.
+ *
+ * @param {string[]} dirs - Directories to scan (family root, task dir, etc.)
+ * @param {string} agentCwd - Agent working directory to render into.
+ * @returns {Promise<string[]>} All var names discovered (for redaction).
+ */
+export async function loadEnv(dirs, agentCwd) {
+  const { names, merged } = await collectEnvEntries(dirs);
+  await renderEnvFiles(merged, agentCwd);
+  return [...names];
+}

package/src/benchmark/result.js CHANGED Viewed

@@ -29,7 +29,7 @@ const JUDGE_VERDICT_SHAPE = z.object({
 const PROFILES_SHAPE = z.object({
   agent: z.union([z.string(), z.null()]),
-  supervisor: z.null(),
+  supervisor: z.union([z.string(), z.null()]),
   judge: z.union([z.string(), z.null()]),
 });
@@ -48,8 +48,8 @@ const COMMON_FIELDS = {
   profiles: PROFILES_SHAPE,
   model: z.object({
     agent: z.string(),
-    supervisor: z.string(),
-    judge: z.string(),
+    supervisor: z.string().optional(),
+    judge: z.string().optional(),
   }),
   skillSetHash: z.string(),
   familyRevision: z.string(),
@@ -65,7 +65,7 @@ const HAPPY_RECORD = z.object({
   ...COMMON_FIELDS,
   scoring: SCORING_SHAPE,
   submission: z.string(),
-  judgeVerdict: JUDGE_VERDICT_SHAPE,
+  judgeVerdict: JUDGE_VERDICT_SHAPE.optional(),
   agentTracePath: z.string(),
   supervisorTracePath: z.string(),
   judgeTracePath: z.string(),

package/src/benchmark/runner.js CHANGED Viewed

@@ -15,20 +15,29 @@
  */
 import { createReadStream, createWriteStream } from "node:fs";
-import { access, constants, mkdir, readFile, unlink } from "node:fs/promises";
+import { mkdir, readFile, unlink } from "node:fs/promises";
 import { createInterface } from "node:readline";
 import { join, resolve as resolvePath } from "node:path";
-import { createRedactor } from "../redaction.js";
+import { DEFAULT_ENV_ALLOWLIST, createRedactor } from "../redaction.js";
 import { createSupervisor } from "../supervisor.js";
-import { installApm } from "./apm-installer.js";
+import { installApm as defaultInstallApm } from "./apm-installer.js";
 import { runJudge } from "./judge.js";
 import { validateResultRecord } from "./result.js";
 import { runScoring } from "./scorer.js";
 import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
 import { createWorkdirManager } from "./workdir.js";
-const BASE_TOOLS = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];
+const BASE_TOOLS = [
+  "Bash",
+  "Read",
+  "Glob",
+  "Grep",
+  "Write",
+  "Edit",
+  "Agent",
+  "TodoWrite",
+];
 /** Sole orchestrator for a task-family benchmark run. */
 export class BenchmarkRunner {
@@ -42,6 +51,7 @@ export class BenchmarkRunner {
    * @param {string} opts.judgeModel
    * @param {{agent?: string, judge?: string}} [opts.profiles]
    * @param {Function} opts.query - SDK query (injected for testability).
+   * @param {string[]} [opts.allowedTools] - Agent tool allowlist (default: BASE_TOOLS).
    * @param {number} [opts.maxTurns] - Agent-under-test turn budget.
    * @param {number} [opts.termGraceMs] - SIGTERM→SIGKILL grace (ms) for the per-task process group.
    * @param {Function} [opts.runAgent] - Test seam: replaces the agent-under-test
@@ -54,6 +64,10 @@ export class BenchmarkRunner {
    * @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
    *   contract as `runJudge(task, workdir, scoring, deps)`. Internal testing
    *   only.
+   * @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
+   *   Same contract as `installApm(family, outputDir)`. Lets tests inject a
+   *   fake `apm` spawn (or skip the install entirely) so the suite never
+   *   shells out to a real `apm` binary. Internal testing only.
    */
   constructor({
     family,
@@ -64,20 +78,20 @@ export class BenchmarkRunner {
     judgeModel,
     profiles,
     query,
+    allowedTools,
     maxTurns,
     termGraceMs,
     // Test seams — default to the real implementations.
     runAgent,
     runScoring: runScoringHook,
     runJudge: runJudgeHook,
+    installApm: installApmHook,
   }) {
     if (!family) throw new Error("family is required");
     if (!Number.isInteger(runs) || runs < 1)
       throw new Error("runs must be an integer ≥ 1");
     if (!output) throw new Error("output is required");
     if (!agentModel) throw new Error("agentModel is required");
-    if (!supervisorModel) throw new Error("supervisorModel is required");
-    if (!judgeModel) throw new Error("judgeModel is required");
     if (!query) throw new Error("query is required");
     this.familyInput = family;
     this.runs = runs;
@@ -85,6 +99,7 @@ export class BenchmarkRunner {
     this.agentModel = agentModel;
     this.supervisorModel = supervisorModel;
     this.judgeModel = judgeModel;
+    this.allowedTools = allowedTools ?? BASE_TOOLS;
     this.profiles = {
       agent: profiles?.agent ?? null,
       judge: profiles?.judge ?? null,
@@ -95,6 +110,7 @@ export class BenchmarkRunner {
     this._runAgentHook = runAgent ?? null;
     this._runScoringHook = runScoringHook ?? runScoring;
     this._runJudgeHook = runJudgeHook ?? runJudge;
+    this._installApmHook = installApmHook ?? defaultInstallApm;
   }
   /**
@@ -108,15 +124,10 @@ export class BenchmarkRunner {
         : this.familyInput;
     await mkdir(this.output, { recursive: true });
-    const { stagingDir, skillSetHash, judgeProfilesDir } = await installApm(
-      family,
-      this.output,
-    );
+    const { stagingDir, skillSetHash, judgeProfilesDir } =
+      await this._installApmHook(family, this.output);
     const tasks = family.tasks();
-    for (const task of tasks) {
-      await assertPreflightExecutable(task);
-    }
     if (this.profiles.judge) {
       await assertJudgeProfileStaged(
         family,
@@ -129,6 +140,7 @@ export class BenchmarkRunner {
       stagingDir,
       runOutputDir: this.output,
       termGraceMs: this.termGraceMs,
+      familyRootPath: family.rootPath,
     });
     const resultsPath = join(this.output, "results.jsonl");
@@ -178,33 +190,38 @@ export class BenchmarkRunner {
         port: workdir.port,
         runDir: workdir.runDir,
       });
-      const judgeContext = await this.#buildJudgeContext(
-        task,
-        workdir,
-        skillSetHash,
-      );
-      const judgeVerdict = await this._runJudgeHook(
-        task,
-        workdir,
-        scoring,
-        {
-          query: this.query,
-          model: this.judgeModel,
-          judgeProfile: this.profiles.judge ?? undefined,
-          profilesDir: judgeProfilesDir,
-        },
-        judgeContext,
-      );
+      let judgeVerdict = null;
+      if (task.paths.judge) {
+        const judgeContext = await this.#buildJudgeContext(
+          task,
+          workdir,
+          skillSetHash,
+        );
+        judgeVerdict = await this._runJudgeHook(
+          task,
+          workdir,
+          scoring,
+          {
+            query: this.query,
+            model: this.judgeModel,
+            judgeProfile: this.profiles.judge ?? undefined,
+            profilesDir: judgeProfilesDir,
+          },
+          judgeContext,
+        );
+      }
+      const verdict =
+        scoring.verdict === "pass" &&
+        (judgeVerdict === null || judgeVerdict.verdict === "pass")
+          ? "pass"
+          : "fail";
       const record = {
         taskId: task.id,
         runIndex,
-        verdict:
-          scoring.verdict === "pass" && judgeVerdict.verdict === "pass"
-            ? "pass"
-            : "fail",
+        verdict,
         scoring,
         submission,
-        judgeVerdict,
+        ...(judgeVerdict && { judgeVerdict }),
         costUsd,
         turns,
         agentTracePath: workdir.agentTracePath,
@@ -262,6 +279,9 @@ export class BenchmarkRunner {
   async #runAgent(task, workdir) {
     const combinedPath = join(workdir.runDir, ".combined.ndjson");
     const combinedStream = createWriteStream(combinedPath);
+    const supervisorInstructions = task.paths.supervisor
+      ? await readFile(task.paths.supervisor, "utf8").catch(() => null)
+      : null;
     const supervisor = createSupervisor({
       supervisorCwd: workdir.cwd,
       agentCwd: workdir.cwd,
@@ -270,9 +290,12 @@ export class BenchmarkRunner {
       agentModel: this.agentModel,
       supervisorModel: this.supervisorModel,
       maxTurns: this.maxTurns ?? 50,
-      allowedTools: BASE_TOOLS,
+      allowedTools: this.allowedTools,
       ...(this.profiles.agent && { agentProfile: this.profiles.agent }),
-      redactor: createRedactor(),
+      ...(supervisorInstructions && { taskAmend: supervisorInstructions }),
+      redactor: createRedactor({
+        allowlist: [...DEFAULT_ENV_ALLOWLIST, ...(workdir.envNames ?? [])],
+      }),
     });
     const instructions = await readFile(task.paths.instructions, "utf8");
     let agentError = null;
@@ -372,23 +395,6 @@ async function writeRecord(stream, record) {
   });
 }
-/**
- * Pre-flight install gate. Throws synchronously if any task's preflight
- * script is missing or not executable — design § Pre-flight contract:
- * "The harness fails the family at install if any task's preflight script
- * is missing or non-executable, before any agent session starts."
- */
-async function assertPreflightExecutable(task) {
-  const path = join(task.paths.hooks, "preflight.sh");
-  try {
-    await access(path, constants.X_OK);
-  } catch (e) {
-    throw new Error(
-      `task ${task.id}: preflight script not executable at ${path} (${e.code ?? e.message})`,
-    );
-  }
-}
 /**
  * Split the combined supervisor trace into agent and supervisor files, and
  * extract cost, turn count, and submission in a single pass. Agent-source

package/src/benchmark/scorer.js CHANGED Viewed

@@ -28,8 +28,11 @@ import { join } from "node:path";
  * @returns {Promise<ScoringResult>}
  */
 export function runScoring(task, ctx) {
+  if (!task.paths.score) {
+    return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
+  }
   return new Promise((res, rej) => {
-    const script = join(task.paths.hooks, "score.sh");
+    const script = task.paths.score;
     const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
     // Bun's child_process pipe setup for fd >= 3 is racy under load (it

package/src/benchmark/task-family.js CHANGED Viewed

@@ -5,7 +5,7 @@
  *     .claude/                # pre-staged skills + agents (P1)
  *     tasks/<task_name>/
  *       agent.task.md
- *       supervisor.task.md    # preserved for v2; not read in v1
+ *       supervisor.task.md    # optional; appended to the task as supervisor context
  *       judge.task.md
  *       hooks/                # harness-only; never copied to agent CWD
  *         preflight.sh
@@ -23,6 +23,7 @@ import { spawn } from "node:child_process";
 import { createHash } from "node:crypto";
 import {
   access,
+  constants,
   lstat,
   mkdtemp,
   readdir,
@@ -100,13 +101,20 @@ async function discoverTasks(rootPath) {
   for (const entry of entries) {
     if (!entry.isDirectory()) continue;
     const taskDir = join(tasksRoot, entry.name);
+    const supervisorPath = join(taskDir, "supervisor.task.md");
+    const judgePath = join(taskDir, "judge.task.md");
+    const preflightPath = join(taskDir, "hooks", "preflight.sh");
+    const scorePath = join(taskDir, "hooks", "score.sh");
     tasks.push({
       id: entry.name,
       paths: {
+        taskDir,
         instructions: join(taskDir, "agent.task.md"),
-        supervisor: join(taskDir, "supervisor.task.md"),
-        judge: join(taskDir, "judge.task.md"),
+        supervisor: (await fileExists(supervisorPath)) ? supervisorPath : null,
+        judge: (await fileExists(judgePath)) ? judgePath : null,
         hooks: join(taskDir, "hooks"),
+        preflight: (await fileExecutable(preflightPath)) ? preflightPath : null,
+        score: (await fileExecutable(scorePath)) ? scorePath : null,
         specs: join(taskDir, "specs"),
         workdir: join(taskDir, "workdir"),
       },
@@ -116,6 +124,24 @@ async function discoverTasks(rootPath) {
   return tasks;
 }
+async function fileExists(path) {
+  try {
+    await access(path);
+    return true;
+  } catch {
+    return false;
+  }
+}
+async function fileExecutable(path) {
+  try {
+    await access(path, constants.X_OK);
+    return true;
+  } catch {
+    return false;
+  }
+}
 /**
  * Canonical-tree hash per design § Family revision algorithm:
  *   list regular files (excluding .git/, node_modules/)
@@ -210,7 +236,7 @@ function run(cmd, args) {
 /**
  * @typedef {object} Task
  * @property {string} id - Task name (directory name under tasks/)
- * @property {{instructions: string, supervisor: string, judge: string, hooks: string, specs: string, workdir: string}} paths
+ * @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null, score: string|null, specs: string, workdir: string}} paths
  */
 /**

package/src/benchmark/workdir.js CHANGED Viewed

@@ -13,6 +13,8 @@ import { createServer } from "node:net";
 import { connect } from "node:net";
 import { join } from "node:path";
+import { loadEnv } from "./env-loader.js";
 const DEFAULT_TERM_GRACE_MS = 5_000;
 /**
@@ -25,6 +27,7 @@ const DEFAULT_TERM_GRACE_MS = 5_000;
  * @property {string} agentTracePath
  * @property {string} supervisorTracePath
  * @property {string} judgeTracePath
+ * @property {string[]} [envNames] - Env var names loaded from .env files.
  * @property {{phase: string, message: string, exitCode: number}} [preflightError]
  */
@@ -35,12 +38,13 @@ export class WorkdirManager {
    * @param {string} deps.stagingDir - Output of `installApm(...)`.
    * @param {string} deps.runOutputDir - Root run-output directory (parent of `runs/`).
    */
-  constructor({ stagingDir, runOutputDir, termGraceMs }) {
+  constructor({ stagingDir, runOutputDir, termGraceMs, familyRootPath }) {
     if (!stagingDir) throw new Error("stagingDir is required");
     if (!runOutputDir) throw new Error("runOutputDir is required");
     this.stagingDir = stagingDir;
     this.runOutputDir = runOutputDir;
     this.termGraceMs = termGraceMs ?? DEFAULT_TERM_GRACE_MS;
+    this.familyRootPath = familyRootPath ?? null;
   }
   /**
@@ -67,13 +71,20 @@ export class WorkdirManager {
       recursive: true,
     });
+    const envDirs = [
+      ...(this.familyRootPath ? [this.familyRootPath] : []),
+      ...(task.paths.taskDir ? [task.paths.taskDir] : []),
+    ];
+    const envNames = envDirs.length > 0 ? await loadEnv(envDirs, cwd) : [];
     const port = await allocatePort();
     const agentTracePath = join(runDir, "agent.ndjson");
     const supervisorTracePath = join(runDir, "supervisor.ndjson");
     const judgeTracePath = join(runDir, "judge.ndjson");
-    const preflightScript = join(task.paths.hooks, "preflight.sh");
-    const preflight = await runPreflight(preflightScript, cwd, port);
+    const preflight = task.paths.preflight
+      ? await runPreflight(task.paths.preflight, cwd, port)
+      : { pgid: 0 };
     return {
       cwd,
@@ -84,6 +95,7 @@ export class WorkdirManager {
       agentTracePath,
       supervisorTracePath,
       judgeTracePath,
+      envNames,
       ...(preflight.error && { preflightError: preflight.error }),
     };
   }

package/src/commands/benchmark-run.js CHANGED Viewed

@@ -47,6 +47,12 @@ function parseRunOptions(values) {
       judge: values["judge-profile"] ?? null,
     },
     maxTurns: parseMaxTurns(values["max-turns"]),
+    allowedTools: values["allowed-tools"]
+      ? values["allowed-tools"]
+          .split(",")
+          .map((s) => s.trim())
+          .filter(Boolean)
+      : undefined,
   };
 }

package/src/supervisor.js CHANGED Viewed

@@ -104,7 +104,6 @@ export class Supervisor {
    */
   async run(task) {
     const initialTask = this.taskAmend ? `${task}\n\n${this.taskAmend}` : task;
-    this.taskContext = initialTask;
     this.currentSource = "supervisor";
     this.currentTurn = 0;
     let supervisorResult = await this.supervisorRunner.run(initialTask);
@@ -252,22 +251,6 @@ export class Supervisor {
     return { type: "continue" };
   }
-  /**
-   * Resume the supervisor runner, falling back to a fresh session when the
-   * SDK reports that the conversation no longer exists (e.g. session GC'd
-   * while the agent was running). The fresh session includes the original
-   * task context so the supervisor can still evaluate the agent's work.
-   * @param {string} prompt
-   * @returns {Promise<object>}
-   */
-  async #resumeSupervisor(prompt) {
-    const result = await this.supervisorRunner.resume(prompt);
-    if (result.error && isSessionNotFound(result.error)) {
-      return this.supervisorRunner.run(`${this.taskContext}\n\n${prompt}`);
-    }
-    return result;
-  }
   /**
    * If the agent has an unanswered ask, drain reminders and return a
    * formatted relay string. Returns null when no relay is needed.
@@ -295,7 +278,7 @@ export class Supervisor {
     this.currentSource = "supervisor";
     this.ctx.redirect = null;
-    await this.#resumeSupervisor(
+    await this.supervisorRunner.resume(
       `The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` +
         `Review and use your tools if action is needed.`,
     );
@@ -333,7 +316,7 @@ export class Supervisor {
           `Review and decide how to proceed.`
         : `The agent reported:\n\n${agentTranscript}\n\nReview the agent's work and decide how to proceed.`;
-    let supervisorResult = await this.#resumeSupervisor(reviewPrompt);
+    let supervisorResult = await this.supervisorRunner.resume(reviewPrompt);
     if (supervisorResult.error) {
       this.emitSummary({ success: false, turns: turn });
@@ -354,7 +337,7 @@ export class Supervisor {
     if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
       const reminders = this.messageBus.drain("supervisor");
       if (reminders.length > 0) {
-        supervisorResult = await this.#resumeSupervisor(
+        supervisorResult = await this.supervisorRunner.resume(
           formatMessages(reminders),
         );
         if (this.ctx.concluded) {
@@ -617,8 +600,3 @@ export function createSupervisor({
   });
   return supervisor;
 }
-function isSessionNotFound(error) {
-  const msg = error?.message ?? String(error);
-  return msg.includes("No conversation found with session ID");
-}