npm - @forwardimpact/libeval - Versions diffs - 0.1.36 → 0.1.39 - Mend

@forwardimpact/libeval 0.1.36 → 0.1.39

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/bin/fit-benchmark.js +32 -7
package/bin/fit-eval.js +24 -3
package/bin/fit-trace.js +42 -0
package/package.json +2 -1
package/src/benchmark/apm-installer.js +78 -16
package/src/benchmark/env-loader.js +146 -0
package/src/benchmark/judge.js +4 -3
package/src/benchmark/report.js +43 -17
package/src/benchmark/result.js +9 -3
package/src/benchmark/runner.js +164 -117
package/src/benchmark/scorer.js +5 -5
package/src/benchmark/task-family.js +43 -50
package/src/benchmark/workdir.js +21 -8
package/src/commands/assert.js +145 -0
package/src/commands/benchmark-report.js +1 -2
package/src/commands/benchmark-run.js +11 -4
package/src/commands/facilitate.js +4 -2
package/src/commands/run.js +3 -3
package/src/commands/supervise.js +5 -2
package/src/facilitator.js +7 -3
package/src/supervisor.js +42 -12

package/src/benchmark/runner.js CHANGED Viewed

@@ -3,7 +3,7 @@
  *
  * Phases per (task, runIndex):
  *   1. WorkdirManager.start → seed CWD + run pre-flight probe
- *   2. AgentRunner (bare; design Decision 14) → produce trace + submission
+ *   2. Supervisor relay (agent + supervisor) → produce traces + submission
  *   3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
  *   4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
  *   5. WorkdirManager.teardown → process-group cleanup
@@ -15,15 +15,12 @@
  */
 import { createReadStream, createWriteStream } from "node:fs";
-import { access, constants, mkdir, readFile } from "node:fs/promises";
+import { mkdir, readFile, unlink } from "node:fs/promises";
 import { createInterface } from "node:readline";
 import { join, resolve as resolvePath } from "node:path";
-import { createAgentRunner } from "../agent-runner.js";
-import { composeProfilePrompt } from "../profile-prompt.js";
-import { createRedactor } from "../redaction.js";
-import { AGENT_SYSTEM_PROMPT } from "../supervisor.js";
-import { createTraceCollector } from "../trace-collector.js";
+import { DEFAULT_ENV_ALLOWLIST, createRedactor } from "../redaction.js";
+import { createSupervisor } from "../supervisor.js";
 import { installApm } from "./apm-installer.js";
 import { runJudge } from "./judge.js";
 import { validateResultRecord } from "./result.js";
@@ -31,7 +28,16 @@ import { runScoring } from "./scorer.js";
 import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
 import { createWorkdirManager } from "./workdir.js";
-const BASE_TOOLS = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];
+const BASE_TOOLS = [
+  "Bash",
+  "Read",
+  "Glob",
+  "Grep",
+  "Write",
+  "Edit",
+  "Agent",
+  "TodoWrite",
+];
 /** Sole orchestrator for a task-family benchmark run. */
 export class BenchmarkRunner {
@@ -40,9 +46,12 @@ export class BenchmarkRunner {
    * @param {import("./task-family.js").TaskFamily | string} opts.family
    * @param {number} opts.runs - Runs per task (≥ 1).
    * @param {string} opts.output - Run-output directory.
-   * @param {string} opts.model
+   * @param {string} opts.agentModel
+   * @param {string} opts.supervisorModel
+   * @param {string} opts.judgeModel
    * @param {{agent?: string, judge?: string}} [opts.profiles]
    * @param {Function} opts.query - SDK query (injected for testability).
+   * @param {string[]} [opts.allowedTools] - Agent tool allowlist (default: BASE_TOOLS).
    * @param {number} [opts.maxTurns] - Agent-under-test turn budget.
    * @param {number} [opts.termGraceMs] - SIGTERM→SIGKILL grace (ms) for the per-task process group.
    * @param {Function} [opts.runAgent] - Test seam: replaces the agent-under-test
@@ -60,9 +69,12 @@ export class BenchmarkRunner {
     family,
     runs,
     output,
-    model,
+    agentModel,
+    supervisorModel,
+    judgeModel,
     profiles,
     query,
+    allowedTools,
     maxTurns,
     termGraceMs,
     // Test seams — default to the real implementations.
@@ -74,12 +86,15 @@ export class BenchmarkRunner {
     if (!Number.isInteger(runs) || runs < 1)
       throw new Error("runs must be an integer ≥ 1");
     if (!output) throw new Error("output is required");
-    if (!model) throw new Error("model is required");
+    if (!agentModel) throw new Error("agentModel is required");
     if (!query) throw new Error("query is required");
     this.familyInput = family;
     this.runs = runs;
     this.output = output;
-    this.model = model;
+    this.agentModel = agentModel;
+    this.supervisorModel = supervisorModel;
+    this.judgeModel = judgeModel;
+    this.allowedTools = allowedTools ?? BASE_TOOLS;
     this.profiles = {
       agent: profiles?.agent ?? null,
       judge: profiles?.judge ?? null,
@@ -103,20 +118,25 @@ export class BenchmarkRunner {
         : this.familyInput;
     await mkdir(this.output, { recursive: true });
-    const { stagingDir, skillSetHash } = await installApm(family, this.output);
+    const { stagingDir, skillSetHash, judgeProfilesDir } = await installApm(
+      family,
+      this.output,
+    );
     const tasks = family.tasks();
-    for (const task of tasks) {
-      await assertPreflightExecutable(task);
-    }
     if (this.profiles.judge) {
-      await assertJudgeProfileStaged(family, stagingDir, this.profiles.judge);
+      await assertJudgeProfileStaged(
+        family,
+        judgeProfilesDir,
+        this.profiles.judge,
+      );
     }
     const wm = createWorkdirManager({
       stagingDir,
       runOutputDir: this.output,
       termGraceMs: this.termGraceMs,
+      familyRootPath: family.rootPath,
     });
     const resultsPath = join(this.output, "results.jsonl");
@@ -130,6 +150,7 @@ export class BenchmarkRunner {
             task,
             runIndex,
             skillSetHash,
+            judgeProfilesDir,
           );
           await writeRecord(resultsStream, record);
           yield record;
@@ -140,7 +161,7 @@ export class BenchmarkRunner {
     }
   }
-  async #runOne(family, wm, task, runIndex, skillSetHash) {
+  async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
     const t0 = Date.now();
     const workdir = await wm.start(task, runIndex);
     try {
@@ -165,42 +186,53 @@ export class BenchmarkRunner {
         port: workdir.port,
         runDir: workdir.runDir,
       });
-      const judgeContext = await this.#buildJudgeContext(
-        task,
-        workdir,
-        skillSetHash,
-      );
-      const judgeVerdict = await this._runJudgeHook(
-        task,
-        workdir,
-        scoring,
-        {
-          query: this.query,
-          model: this.model,
-          judgeProfile: this.profiles.judge ?? undefined,
-        },
-        judgeContext,
-      );
+      let judgeVerdict = null;
+      if (task.paths.judge) {
+        const judgeContext = await this.#buildJudgeContext(
+          task,
+          workdir,
+          skillSetHash,
+        );
+        judgeVerdict = await this._runJudgeHook(
+          task,
+          workdir,
+          scoring,
+          {
+            query: this.query,
+            model: this.judgeModel,
+            judgeProfile: this.profiles.judge ?? undefined,
+            profilesDir: judgeProfilesDir,
+          },
+          judgeContext,
+        );
+      }
+      const verdict =
+        scoring.verdict === "pass" &&
+        (judgeVerdict === null || judgeVerdict.verdict === "pass")
+          ? "pass"
+          : "fail";
       const record = {
         taskId: task.id,
         runIndex,
-        verdict:
-          scoring.verdict === "pass" && judgeVerdict.verdict === "pass"
-            ? "pass"
-            : "fail",
+        verdict,
         scoring,
         submission,
-        judgeVerdict,
+        ...(judgeVerdict && { judgeVerdict }),
         costUsd,
         turns,
         agentTracePath: workdir.agentTracePath,
+        supervisorTracePath: workdir.supervisorTracePath,
         judgeTracePath: workdir.judgeTracePath,
         profiles: {
           agent: this.profiles.agent,
           supervisor: null,
           judge: this.profiles.judge,
         },
-        model: this.model,
+        model: {
+          agent: this.agentModel,
+          supervisor: this.supervisorModel,
+          judge: this.judgeModel,
+        },
         skillSetHash,
         familyRevision: family.familyRevision,
         durationMs: Date.now() - t0,
@@ -236,54 +268,49 @@ export class BenchmarkRunner {
   }
   /**
-   * Run the agent-under-test as a bare AgentRunner (design Decision 14).
-   * Recover cost/turns/submission from the trace by replaying it into a
-   * fresh TraceCollector — the bare runner writes a single NDJSON stream
-   * with one terminal `result` event.
-   *
-   * Inspects both thrown errors AND the resolved `{success, aborted, error}`
-   * shape returned by `AgentRunner.run()` (agent-runner.js:69, 166–194):
-   * the SDK iterator catches its own errors and resolves with `success:
-   * false`, so a try/catch alone would silently treat a failed session as
-   * a successful one (plan Step 8.5.c).
+   * Run the agent-under-test via a Supervisor relay. The supervisor writes
+   * a combined tagged NDJSON trace; after the session we split it into
+   * agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
    */
   async #runAgent(task, workdir) {
-    const agentTraceStream = createWriteStream(workdir.agentTracePath);
-    const systemPrompt = this.profiles.agent
-      ? composeProfilePrompt(this.profiles.agent, {
-          profilesDir: resolvePath(workdir.cwd, ".claude/agents"),
-          trailer: AGENT_SYSTEM_PROMPT,
-        })
-      : undefined;
-    const runner = createAgentRunner({
-      cwd: workdir.cwd,
+    const combinedPath = join(workdir.runDir, ".combined.ndjson");
+    const combinedStream = createWriteStream(combinedPath);
+    const supervisorInstructions = task.paths.supervisor
+      ? await readFile(task.paths.supervisor, "utf8").catch(() => null)
+      : null;
+    const supervisor = createSupervisor({
+      supervisorCwd: workdir.cwd,
+      agentCwd: workdir.cwd,
       query: this.query,
-      output: agentTraceStream,
-      model: this.model,
+      output: combinedStream,
+      agentModel: this.agentModel,
+      supervisorModel: this.supervisorModel,
       maxTurns: this.maxTurns ?? 50,
-      allowedTools: BASE_TOOLS,
-      settingSources: ["project"],
-      systemPrompt,
-      redactor: createRedactor(),
+      allowedTools: this.allowedTools,
+      ...(this.profiles.agent && { agentProfile: this.profiles.agent }),
+      ...(supervisorInstructions && { taskAmend: supervisorInstructions }),
+      redactor: createRedactor({
+        allowlist: [...DEFAULT_ENV_ALLOWLIST, ...(workdir.envNames ?? [])],
+      }),
     });
     const instructions = await readFile(task.paths.instructions, "utf8");
     let agentError = null;
     try {
-      const result = await runner.run(instructions);
-      if (!result.success) {
-        agentError = {
-          message:
-            result.error?.message ??
-            (result.aborted ? "aborted" : "agent did not succeed"),
-          aborted: result.aborted ?? false,
-        };
+      const result = await supervisor.run(instructions);
+      if (!result.success && !result.concluded) {
+        agentError = { message: "supervisor did not succeed", aborted: false };
       }
     } catch (e) {
       agentError = { message: e.message ?? String(e), aborted: false };
     } finally {
-      await new Promise((r) => agentTraceStream.end(r));
+      await new Promise((r) => combinedStream.end(r));
     }
-    const summary = await readAgentSummary(workdir.agentTracePath);
+    const summary = await splitAndSummarize(
+      combinedPath,
+      workdir.agentTracePath,
+      workdir.supervisorTracePath,
+    );
+    await unlink(combinedPath).catch(() => {});
     return { ...summary, agentError };
   }
@@ -321,11 +348,16 @@ export class BenchmarkRunner {
         supervisor: null,
         judge: this.profiles.judge,
       },
-      model: this.model,
+      model: {
+        agent: this.agentModel,
+        supervisor: this.supervisorModel,
+        judge: this.judgeModel,
+      },
       skillSetHash,
       familyRevision,
       durationMs,
       agentTracePath: workdir.agentTracePath,
+      supervisorTracePath: workdir.supervisorTracePath,
       judgeTracePath: workdir.judgeTracePath,
     };
   }
@@ -360,52 +392,67 @@ async function writeRecord(stream, record) {
 }
 /**
- * Pre-flight install gate. Throws synchronously if any task's preflight
- * script is missing or not executable — design § Pre-flight contract:
- * "The harness fails the family at install if any task's preflight script
- * is missing or non-executable, before any agent session starts."
+ * Split the combined supervisor trace into agent and supervisor files, and
+ * extract cost, turn count, and submission in a single pass. Agent-source
+ * events go to `agentPath`; supervisor and orchestrator events go to
+ * `supervisorPath`.
  */
-async function assertPreflightExecutable(task) {
-  const path = join(task.paths.workdir, "scripts", "preflight.sh");
-  try {
-    await access(path, constants.X_OK);
-  } catch (e) {
-    throw new Error(
-      `task ${task.id}: preflight script not executable at ${path} (${e.code ?? e.message})`,
-    );
+// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
+async function splitAndSummarize(combinedPath, agentPath, supervisorPath) {
+  const agentStream = createWriteStream(agentPath);
+  const supStream = createWriteStream(supervisorPath);
+  const rl = createInterface({
+    input: createReadStream(combinedPath),
+    crlfDelay: Infinity,
+  });
+  let agentCost = 0;
+  let supervisorCost = 0;
+  let turns = 0;
+  let submission = "";
+  for await (const line of rl) {
+    if (!line.trim()) continue;
+    let event;
+    try {
+      event = JSON.parse(line);
+    } catch {
+      continue;
+    }
+    const target = event.source === "agent" ? agentStream : supStream;
+    target.write(line + "\n");
+    const inner = event.event;
+    if (!inner) continue;
+    if (event.source === "agent") {
+      if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
+        agentCost = inner.total_cost_usd;
+      }
+      if (inner.type === "assistant") {
+        const text = extractText(inner);
+        if (text) submission = text;
+      }
+    }
+    if (event.source === "supervisor") {
+      if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
+        supervisorCost = inner.total_cost_usd;
+      }
+    }
+    if (event.source === "orchestrator" && inner.type === "summary") {
+      turns = inner.turns ?? 0;
+    }
   }
+  await Promise.all([
+    new Promise((r) => agentStream.end(r)),
+    new Promise((r) => supStream.end(r)),
+  ]);
+  return { costUsd: agentCost + supervisorCost, turns, submission };
 }
-/**
- * Replay the bare AgentRunner trace into a fresh TraceCollector to recover
- * cost, turn count, and the final assistant text block (the submission).
- */
-async function readAgentSummary(tracePath) {
-  const collector = createTraceCollector();
-  const stream = createReadStream(tracePath);
-  const rl = createInterface({ input: stream, crlfDelay: Infinity });
-  for await (const line of rl) collector.addLine(line);
-  const json = collector.toJSON();
-  const summary = json.summary ?? {};
-  return {
-    costUsd:
-      typeof summary.totalCostUsd === "number" ? summary.totalCostUsd : 0,
-    turns: typeof summary.numTurns === "number" ? summary.numTurns : 0,
-    submission: lastAssistantText(json),
-  };
-}
-function lastAssistantText(json) {
-  const turns = json.turns ?? [];
-  for (let i = turns.length - 1; i >= 0; i--) {
-    const turn = turns[i];
-    if (turn.role !== "assistant") continue;
-    const content = turn.content ?? [];
-    for (let j = content.length - 1; j >= 0; j--) {
-      if (content[j].type === "text" && content[j].text) return content[j].text;
-    }
+function extractText(inner) {
+  const content = inner.message?.content ?? inner.content;
+  if (!Array.isArray(content)) return null;
+  for (let i = content.length - 1; i >= 0; i--) {
+    if (content[i].type === "text" && content[i].text) return content[i].text;
   }
-  return "";
+  return null;
 }
 /**

package/src/benchmark/scorer.js CHANGED Viewed

@@ -1,10 +1,7 @@
 /**
- * Scorer — runs `<task.paths.scoring>/run.sh` from the template path against
+ * Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
  * the post-run agent CWD. The exit code is authoritative for the verdict;
  * structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
- *
- * Scoring scripts are never copied into the agent CWD — they live only in the
- * task template (design Decision 3).
  */
 import { spawn } from "node:child_process";
@@ -31,8 +28,11 @@ import { join } from "node:path";
  * @returns {Promise<ScoringResult>}
  */
 export function runScoring(task, ctx) {
+  if (!task.paths.score) {
+    return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
+  }
   return new Promise((res, rej) => {
-    const script = join(task.paths.scoring, "run.sh");
+    const script = task.paths.score;
     const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
     // Bun's child_process pipe setup for fd >= 3 is racy under load (it

package/src/benchmark/task-family.js CHANGED Viewed

@@ -4,13 +4,14 @@
  *     apm.lock.yaml
  *     .claude/                # pre-staged skills + agents (P1)
  *     tasks/<task_name>/
- *       instructions.md
- *       supervisor.task.md    # preserved for v2; not read in v1
+ *       agent.task.md
+ *       supervisor.task.md    # optional; appended to the task as supervisor context
  *       judge.task.md
+ *       hooks/                # harness-only; never copied to agent CWD
+ *         preflight.sh
+ *         score.sh
  *       specs/                # copied into agent CWD
- *       workdir/              # copied into agent CWD (excludes scripts/)
- *         scripts/preflight.sh
- *       scoring/              # template-only; never copied
+ *       workdir/              # copied into agent CWD
  *
  * Local paths or git URLs are both accepted; git URLs are shallow-cloned into
  * a temp dir and `familyRevision` becomes `git:<sha>` of HEAD at clone time.
@@ -22,6 +23,7 @@ import { spawn } from "node:child_process";
 import { createHash } from "node:crypto";
 import {
   access,
+  constants,
   lstat,
   mkdtemp,
   readdir,
@@ -53,13 +55,11 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
     familyRevision = "sha256:" + (await canonicalTreeHash(rootPath));
   }
-  const apmLockBytes = await readApmLockBytes(rootPath);
   const tasks = await discoverTasks(rootPath);
   return {
     rootPath,
     familyRevision,
-    apmLockBytes,
     tasks() {
       return tasks;
     },
@@ -67,58 +67,27 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
 }
 /**
- * Assert that `<stagingDir>/.claude/agents/<judgeProfile>.md` exists. Called
- * from `BenchmarkRunner.run()` so a missing judge profile fails the family
+ * Assert that `<judgeProfilesDir>/<judgeProfile>.md` exists. Called from
+ * `BenchmarkRunner.run()` so a missing judge profile fails the family
  * install before any agent session starts.
  * @param {TaskFamily} _family
- * @param {string} stagingDir
+ * @param {string} judgeProfilesDir
  * @param {string} judgeProfile
  * @returns {Promise<void>}
  */
 export async function assertJudgeProfileStaged(
   _family,
-  stagingDir,
+  judgeProfilesDir,
   judgeProfile,
 ) {
-  const candidate = join(stagingDir, ".claude", "agents", `${judgeProfile}.md`);
+  const candidate = join(judgeProfilesDir, `${judgeProfile}.md`);
   try {
     await access(candidate);
   } catch {
-    throw new Error(
-      `judge profile not staged: ${candidate} (createSupervisor resolves profiles relative to <supervisorCwd>/.claude/agents)`,
-    );
+    throw new Error(`judge profile not staged: ${candidate}`);
   }
 }
-async function readApmLockBytes(rootPath) {
-  const lockPath = join(rootPath, "apm.lock.yaml");
-  try {
-    const raw = await readFile(lockPath);
-    return normalizeLf(raw);
-  } catch (e) {
-    if (e.code === "ENOENT") {
-      throw new Error(
-        `task family missing apm.lock.yaml at ${lockPath} (matches libpack stager.js:126; .yml is not accepted)`,
-      );
-    }
-    throw e;
-  }
-}
-/**
- * Replace CRLF with LF so cross-OS authored lockfiles hash identically.
- * @param {Buffer} buf
- * @returns {Buffer}
- */
-function normalizeLf(buf) {
-  const out = [];
-  for (let i = 0; i < buf.length; i++) {
-    if (buf[i] === 0x0d && i + 1 < buf.length && buf[i + 1] === 0x0a) continue;
-    out.push(buf[i]);
-  }
-  return Buffer.from(out);
-}
 async function discoverTasks(rootPath) {
   const tasksRoot = join(rootPath, "tasks");
   const tasks = [];
@@ -132,15 +101,22 @@ async function discoverTasks(rootPath) {
   for (const entry of entries) {
     if (!entry.isDirectory()) continue;
     const taskDir = join(tasksRoot, entry.name);
+    const supervisorPath = join(taskDir, "supervisor.task.md");
+    const judgePath = join(taskDir, "judge.task.md");
+    const preflightPath = join(taskDir, "hooks", "preflight.sh");
+    const scorePath = join(taskDir, "hooks", "score.sh");
     tasks.push({
       id: entry.name,
       paths: {
-        instructions: join(taskDir, "instructions.md"),
-        supervisor: join(taskDir, "supervisor.task.md"),
-        judge: join(taskDir, "judge.task.md"),
+        taskDir,
+        instructions: join(taskDir, "agent.task.md"),
+        supervisor: (await fileExists(supervisorPath)) ? supervisorPath : null,
+        judge: (await fileExists(judgePath)) ? judgePath : null,
+        hooks: join(taskDir, "hooks"),
+        preflight: (await fileExecutable(preflightPath)) ? preflightPath : null,
+        score: (await fileExecutable(scorePath)) ? scorePath : null,
         specs: join(taskDir, "specs"),
         workdir: join(taskDir, "workdir"),
-        scoring: join(taskDir, "scoring"),
       },
     });
   }
@@ -148,6 +124,24 @@ async function discoverTasks(rootPath) {
   return tasks;
 }
+async function fileExists(path) {
+  try {
+    await access(path);
+    return true;
+  } catch {
+    return false;
+  }
+}
+async function fileExecutable(path) {
+  try {
+    await access(path, constants.X_OK);
+    return true;
+  } catch {
+    return false;
+  }
+}
 /**
  * Canonical-tree hash per design § Family revision algorithm:
  *   list regular files (excluding .git/, node_modules/)
@@ -242,13 +236,12 @@ function run(cmd, args) {
 /**
  * @typedef {object} Task
  * @property {string} id - Task name (directory name under tasks/)
- * @property {{instructions: string, supervisor: string, judge: string, specs: string, workdir: string, scoring: string}} paths
+ * @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null, score: string|null, specs: string, workdir: string}} paths
  */
 /**
  * @typedef {object} TaskFamily
  * @property {string} rootPath
  * @property {string} familyRevision - `git:<sha>` or `sha256:<hex>`
- * @property {Buffer} apmLockBytes - LF-normalised
  * @property {() => Task[]} tasks
  */