npm - @forwardimpact/libeval - Versions diffs - 0.1.35 → 0.1.38 - Mend

@forwardimpact/libeval 0.1.35 → 0.1.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/bin/fit-benchmark.js +27 -7
package/bin/fit-eval.js +24 -3
package/bin/fit-trace.js +42 -0
package/package.json +2 -1
package/src/benchmark/apm-installer.js +56 -10
package/src/benchmark/judge.js +35 -8
package/src/benchmark/report.js +364 -17
package/src/benchmark/result.js +7 -1
package/src/benchmark/runner.js +149 -79
package/src/benchmark/scorer.js +2 -5
package/src/benchmark/task-family.js +14 -47
package/src/benchmark/workdir.js +7 -6
package/src/commands/assert.js +145 -0
package/src/commands/benchmark-report.js +6 -3
package/src/commands/benchmark-run.js +5 -4
package/src/commands/facilitate.js +4 -2
package/src/commands/run.js +3 -3
package/src/commands/supervise.js +5 -2
package/src/facilitator.js +7 -3
package/src/supervisor.js +47 -14

package/src/benchmark/runner.js CHANGED

@@ -3,7 +3,7 @@
  *
  * Phases per (task, runIndex):
  *   1. WorkdirManager.start → seed CWD + run pre-flight probe
- *   2. AgentRunner (bare; design Decision 14) → produce trace + submission
+ *   2. Supervisor relay (agent + supervisor) → produce traces + submission
  *   3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
  *   4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
  *   5. WorkdirManager.teardown → process-group cleanup
@@ -15,15 +15,12 @@
  */
 import { createReadStream, createWriteStream } from "node:fs";
-import { access, constants, mkdir, readFile } from "node:fs/promises";
+import { access, constants, mkdir, readFile, unlink } from "node:fs/promises";
 import { createInterface } from "node:readline";
 import { join, resolve as resolvePath } from "node:path";
-import { createAgentRunner } from "../agent-runner.js";
-import { composeProfilePrompt } from "../profile-prompt.js";
 import { createRedactor } from "../redaction.js";
-import { AGENT_SYSTEM_PROMPT } from "../supervisor.js";
-import { createTraceCollector } from "../trace-collector.js";
+import { createSupervisor } from "../supervisor.js";
 import { installApm } from "./apm-installer.js";
 import { runJudge } from "./judge.js";
 import { validateResultRecord } from "./result.js";
@@ -40,7 +37,9 @@ export class BenchmarkRunner {
    * @param {import("./task-family.js").TaskFamily | string} opts.family
    * @param {number} opts.runs - Runs per task (≥ 1).
    * @param {string} opts.output - Run-output directory.
-   * @param {string} opts.model
+   * @param {string} opts.agentModel
+   * @param {string} opts.supervisorModel
+   * @param {string} opts.judgeModel
    * @param {{agent?: string, judge?: string}} [opts.profiles]
    * @param {Function} opts.query - SDK query (injected for testability).
    * @param {number} [opts.maxTurns] - Agent-under-test turn budget.
@@ -60,7 +59,9 @@ export class BenchmarkRunner {
     family,
     runs,
     output,
-    model,
+    agentModel,
+    supervisorModel,
+    judgeModel,
     profiles,
     query,
     maxTurns,
@@ -74,12 +75,16 @@ export class BenchmarkRunner {
     if (!Number.isInteger(runs) || runs < 1)
       throw new Error("runs must be an integer ≥ 1");
     if (!output) throw new Error("output is required");
-    if (!model) throw new Error("model is required");
+    if (!agentModel) throw new Error("agentModel is required");
+    if (!supervisorModel) throw new Error("supervisorModel is required");
+    if (!judgeModel) throw new Error("judgeModel is required");
     if (!query) throw new Error("query is required");
     this.familyInput = family;
     this.runs = runs;
     this.output = output;
-    this.model = model;
+    this.agentModel = agentModel;
+    this.supervisorModel = supervisorModel;
+    this.judgeModel = judgeModel;
     this.profiles = {
       agent: profiles?.agent ?? null,
       judge: profiles?.judge ?? null,
@@ -103,14 +108,21 @@ export class BenchmarkRunner {
         : this.familyInput;
     await mkdir(this.output, { recursive: true });
-    const { stagingDir, skillSetHash } = await installApm(family, this.output);
+    const { stagingDir, skillSetHash, judgeProfilesDir } = await installApm(
+      family,
+      this.output,
+    );
     const tasks = family.tasks();
     for (const task of tasks) {
       await assertPreflightExecutable(task);
     }
     if (this.profiles.judge) {
-      await assertJudgeProfileStaged(family, stagingDir, this.profiles.judge);
+      await assertJudgeProfileStaged(
+        family,
+        judgeProfilesDir,
+        this.profiles.judge,
+      );
     }
     const wm = createWorkdirManager({
@@ -130,6 +142,7 @@ export class BenchmarkRunner {
             task,
             runIndex,
             skillSetHash,
+            judgeProfilesDir,
           );
           await writeRecord(resultsStream, record);
           yield record;
@@ -140,7 +153,7 @@ export class BenchmarkRunner {
     }
   }
-  async #runOne(family, wm, task, runIndex, skillSetHash) {
+  async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
     const t0 = Date.now();
     const workdir = await wm.start(task, runIndex);
     try {
@@ -165,11 +178,23 @@ export class BenchmarkRunner {
         port: workdir.port,
         runDir: workdir.runDir,
       });
-      const judgeVerdict = await this._runJudgeHook(task, workdir, scoring, {
-        query: this.query,
-        model: this.model,
-        judgeProfile: this.profiles.judge ?? undefined,
-      });
+      const judgeContext = await this.#buildJudgeContext(
+        task,
+        workdir,
+        skillSetHash,
+      );
+      const judgeVerdict = await this._runJudgeHook(
+        task,
+        workdir,
+        scoring,
+        {
+          query: this.query,
+          model: this.judgeModel,
+          judgeProfile: this.profiles.judge ?? undefined,
+          profilesDir: judgeProfilesDir,
+        },
+        judgeContext,
+      );
       const record = {
         taskId: task.id,
         runIndex,
@@ -183,13 +208,18 @@ export class BenchmarkRunner {
         costUsd,
         turns,
         agentTracePath: workdir.agentTracePath,
+        supervisorTracePath: workdir.supervisorTracePath,
         judgeTracePath: workdir.judgeTracePath,
         profiles: {
           agent: this.profiles.agent,
           supervisor: null,
           judge: this.profiles.judge,
         },
-        model: this.model,
+        model: {
+          agent: this.agentModel,
+          supervisor: this.supervisorModel,
+          judge: this.judgeModel,
+        },
         skillSetHash,
         familyRevision: family.familyRevision,
         durationMs: Date.now() - t0,
@@ -225,57 +255,60 @@ export class BenchmarkRunner {
   }
   /**
-   * Run the agent-under-test as a bare AgentRunner (design Decision 14).
-   * Recover cost/turns/submission from the trace by replaying it into a
-   * fresh TraceCollector — the bare runner writes a single NDJSON stream
-   * with one terminal `result` event.
-   *
-   * Inspects both thrown errors AND the resolved `{success, aborted, error}`
-   * shape returned by `AgentRunner.run()` (agent-runner.js:69, 166–194):
-   * the SDK iterator catches its own errors and resolves with `success:
-   * false`, so a try/catch alone would silently treat a failed session as
-   * a successful one (plan Step 8.5.c).
+   * Run the agent-under-test via a Supervisor relay. The supervisor writes
+   * a combined tagged NDJSON trace; after the session we split it into
+   * agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
    */
   async #runAgent(task, workdir) {
-    const agentTraceStream = createWriteStream(workdir.agentTracePath);
-    const systemPrompt = this.profiles.agent
-      ? composeProfilePrompt(this.profiles.agent, {
-          profilesDir: resolvePath(workdir.cwd, ".claude/agents"),
-          trailer: AGENT_SYSTEM_PROMPT,
-        })
-      : undefined;
-    const runner = createAgentRunner({
-      cwd: workdir.cwd,
+    const combinedPath = join(workdir.runDir, ".combined.ndjson");
+    const combinedStream = createWriteStream(combinedPath);
+    const supervisor = createSupervisor({
+      supervisorCwd: workdir.cwd,
+      agentCwd: workdir.cwd,
       query: this.query,
-      output: agentTraceStream,
-      model: this.model,
+      output: combinedStream,
+      agentModel: this.agentModel,
+      supervisorModel: this.supervisorModel,
       maxTurns: this.maxTurns ?? 50,
       allowedTools: BASE_TOOLS,
-      settingSources: ["project"],
-      systemPrompt,
+      ...(this.profiles.agent && { agentProfile: this.profiles.agent }),
       redactor: createRedactor(),
     });
     const instructions = await readFile(task.paths.instructions, "utf8");
     let agentError = null;
     try {
-      const result = await runner.run(instructions);
-      if (!result.success) {
-        agentError = {
-          message:
-            result.error?.message ??
-            (result.aborted ? "aborted" : "agent did not succeed"),
-          aborted: result.aborted ?? false,
-        };
+      const result = await supervisor.run(instructions);
+      if (!result.success && !result.concluded) {
+        agentError = { message: "supervisor did not succeed", aborted: false };
       }
     } catch (e) {
       agentError = { message: e.message ?? String(e), aborted: false };
     } finally {
-      await new Promise((r) => agentTraceStream.end(r));
+      await new Promise((r) => combinedStream.end(r));
     }
-    const summary = await readAgentSummary(workdir.agentTracePath);
+    const summary = await splitAndSummarize(
+      combinedPath,
+      workdir.agentTracePath,
+      workdir.supervisorTracePath,
+    );
+    await unlink(combinedPath).catch(() => {});
     return { ...summary, agentError };
   }
+  async #buildJudgeContext(task, workdir, skillSetHash) {
+    const agentInstructions = await readFile(task.paths.instructions, "utf8");
+    let agentProfile = "";
+    if (this.profiles.agent) {
+      const profilePath = resolvePath(
+        workdir.cwd,
+        ".claude/agents",
+        `${this.profiles.agent}.md`,
+      );
+      agentProfile = await readFile(profilePath, "utf8").catch(() => "");
+    }
+    return { agentInstructions, agentProfile, skillSetHash };
+  }
   #buildPreflightFailureRecord({
     task,
     runIndex,
@@ -296,11 +329,16 @@ export class BenchmarkRunner {
         supervisor: null,
         judge: this.profiles.judge,
       },
-      model: this.model,
+      model: {
+        agent: this.agentModel,
+        supervisor: this.supervisorModel,
+        judge: this.judgeModel,
+      },
       skillSetHash,
       familyRevision,
       durationMs,
       agentTracePath: workdir.agentTracePath,
+      supervisorTracePath: workdir.supervisorTracePath,
       judgeTracePath: workdir.judgeTracePath,
     };
   }
@@ -341,7 +379,7 @@ async function writeRecord(stream, record) {
  * is missing or non-executable, before any agent session starts."
  */
 async function assertPreflightExecutable(task) {
-  const path = join(task.paths.workdir, "scripts", "preflight.sh");
+  const path = join(task.paths.hooks, "preflight.sh");
   try {
     await access(path, constants.X_OK);
   } catch (e) {
@@ -352,35 +390,67 @@ async function assertPreflightExecutable(task) {
 }
 /**
- * Replay the bare AgentRunner trace into a fresh TraceCollector to recover
- * cost, turn count, and the final assistant text block (the submission).
+ * Split the combined supervisor trace into agent and supervisor files, and
+ * extract cost, turn count, and submission in a single pass. Agent-source
+ * events go to `agentPath`; supervisor and orchestrator events go to
+ * `supervisorPath`.
  */
-async function readAgentSummary(tracePath) {
-  const collector = createTraceCollector();
-  const stream = createReadStream(tracePath);
-  const rl = createInterface({ input: stream, crlfDelay: Infinity });
-  for await (const line of rl) collector.addLine(line);
-  const json = collector.toJSON();
-  const summary = json.summary ?? {};
-  return {
-    costUsd:
-      typeof summary.totalCostUsd === "number" ? summary.totalCostUsd : 0,
-    turns: typeof summary.numTurns === "number" ? summary.numTurns : 0,
-    submission: lastAssistantText(json),
-  };
+// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
+async function splitAndSummarize(combinedPath, agentPath, supervisorPath) {
+  const agentStream = createWriteStream(agentPath);
+  const supStream = createWriteStream(supervisorPath);
+  const rl = createInterface({
+    input: createReadStream(combinedPath),
+    crlfDelay: Infinity,
+  });
+  let agentCost = 0;
+  let supervisorCost = 0;
+  let turns = 0;
+  let submission = "";
+  for await (const line of rl) {
+    if (!line.trim()) continue;
+    let event;
+    try {
+      event = JSON.parse(line);
+    } catch {
+      continue;
+    }
+    const target = event.source === "agent" ? agentStream : supStream;
+    target.write(line + "\n");
+    const inner = event.event;
+    if (!inner) continue;
+    if (event.source === "agent") {
+      if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
+        agentCost = inner.total_cost_usd;
+      }
+      if (inner.type === "assistant") {
+        const text = extractText(inner);
+        if (text) submission = text;
+      }
+    }
+    if (event.source === "supervisor") {
+      if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
+        supervisorCost = inner.total_cost_usd;
+      }
+    }
+    if (event.source === "orchestrator" && inner.type === "summary") {
+      turns = inner.turns ?? 0;
+    }
+  }
+  await Promise.all([
+    new Promise((r) => agentStream.end(r)),
+    new Promise((r) => supStream.end(r)),
+  ]);
+  return { costUsd: agentCost + supervisorCost, turns, submission };
 }
-function lastAssistantText(json) {
-  const turns = json.turns ?? [];
-  for (let i = turns.length - 1; i >= 0; i--) {
-    const turn = turns[i];
-    if (turn.role !== "assistant") continue;
-    const content = turn.content ?? [];
-    for (let j = content.length - 1; j >= 0; j--) {
-      if (content[j].type === "text" && content[j].text) return content[j].text;
-    }
+function extractText(inner) {
+  const content = inner.message?.content ?? inner.content;
+  if (!Array.isArray(content)) return null;
+  for (let i = content.length - 1; i >= 0; i--) {
+    if (content[i].type === "text" && content[i].text) return content[i].text;
   }
-  return "";
+  return null;
 }
 /**

package/src/benchmark/scorer.js CHANGED

@@ -1,10 +1,7 @@
 /**
- * Scorer — runs `<task.paths.scoring>/run.sh` from the template path against
+ * Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
  * the post-run agent CWD. The exit code is authoritative for the verdict;
  * structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
- *
- * Scoring scripts are never copied into the agent CWD — they live only in the
- * task template (design Decision 3).
  */
 import { spawn } from "node:child_process";
@@ -32,7 +29,7 @@ import { join } from "node:path";
  */
 export function runScoring(task, ctx) {
   return new Promise((res, rej) => {
-    const script = join(task.paths.scoring, "run.sh");
+    const script = join(task.paths.hooks, "score.sh");
     const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
     // Bun's child_process pipe setup for fd >= 3 is racy under load (it

package/src/benchmark/task-family.js CHANGED

@@ -4,13 +4,14 @@
  *     apm.lock.yaml
  *     .claude/                # pre-staged skills + agents (P1)
  *     tasks/<task_name>/
- *       instructions.md
+ *       agent.task.md
  *       supervisor.task.md    # preserved for v2; not read in v1
  *       judge.task.md
+ *       hooks/                # harness-only; never copied to agent CWD
+ *         preflight.sh
+ *         score.sh
  *       specs/                # copied into agent CWD
- *       workdir/              # copied into agent CWD (excludes scripts/)
- *         scripts/preflight.sh
- *       scoring/              # template-only; never copied
+ *       workdir/              # copied into agent CWD
  *
  * Local paths or git URLs are both accepted; git URLs are shallow-cloned into
  * a temp dir and `familyRevision` becomes `git:<sha>` of HEAD at clone time.
@@ -53,13 +54,11 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
     familyRevision = "sha256:" + (await canonicalTreeHash(rootPath));
   }
-  const apmLockBytes = await readApmLockBytes(rootPath);
   const tasks = await discoverTasks(rootPath);
   return {
     rootPath,
     familyRevision,
-    apmLockBytes,
     tasks() {
       return tasks;
     },
@@ -67,58 +66,27 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
 }
 /**
- * Assert that `<stagingDir>/.claude/agents/<judgeProfile>.md` exists. Called
- * from `BenchmarkRunner.run()` so a missing judge profile fails the family
+ * Assert that `<judgeProfilesDir>/<judgeProfile>.md` exists. Called from
+ * `BenchmarkRunner.run()` so a missing judge profile fails the family
  * install before any agent session starts.
  * @param {TaskFamily} _family
- * @param {string} stagingDir
+ * @param {string} judgeProfilesDir
  * @param {string} judgeProfile
  * @returns {Promise<void>}
  */
 export async function assertJudgeProfileStaged(
   _family,
-  stagingDir,
+  judgeProfilesDir,
   judgeProfile,
 ) {
-  const candidate = join(stagingDir, ".claude", "agents", `${judgeProfile}.md`);
+  const candidate = join(judgeProfilesDir, `${judgeProfile}.md`);
   try {
     await access(candidate);
   } catch {
-    throw new Error(
-      `judge profile not staged: ${candidate} (createSupervisor resolves profiles relative to <supervisorCwd>/.claude/agents)`,
-    );
+    throw new Error(`judge profile not staged: ${candidate}`);
   }
 }
-async function readApmLockBytes(rootPath) {
-  const lockPath = join(rootPath, "apm.lock.yaml");
-  try {
-    const raw = await readFile(lockPath);
-    return normalizeLf(raw);
-  } catch (e) {
-    if (e.code === "ENOENT") {
-      throw new Error(
-        `task family missing apm.lock.yaml at ${lockPath} (matches libpack stager.js:126; .yml is not accepted)`,
-      );
-    }
-    throw e;
-  }
-}
-/**
- * Replace CRLF with LF so cross-OS authored lockfiles hash identically.
- * @param {Buffer} buf
- * @returns {Buffer}
- */
-function normalizeLf(buf) {
-  const out = [];
-  for (let i = 0; i < buf.length; i++) {
-    if (buf[i] === 0x0d && i + 1 < buf.length && buf[i + 1] === 0x0a) continue;
-    out.push(buf[i]);
-  }
-  return Buffer.from(out);
-}
 async function discoverTasks(rootPath) {
   const tasksRoot = join(rootPath, "tasks");
   const tasks = [];
@@ -135,12 +103,12 @@ async function discoverTasks(rootPath) {
     tasks.push({
       id: entry.name,
       paths: {
-        instructions: join(taskDir, "instructions.md"),
+        instructions: join(taskDir, "agent.task.md"),
         supervisor: join(taskDir, "supervisor.task.md"),
         judge: join(taskDir, "judge.task.md"),
+        hooks: join(taskDir, "hooks"),
         specs: join(taskDir, "specs"),
         workdir: join(taskDir, "workdir"),
-        scoring: join(taskDir, "scoring"),
       },
     });
   }
@@ -242,13 +210,12 @@ function run(cmd, args) {
 /**
  * @typedef {object} Task
  * @property {string} id - Task name (directory name under tasks/)
- * @property {{instructions: string, supervisor: string, judge: string, specs: string, workdir: string, scoring: string}} paths
+ * @property {{instructions: string, supervisor: string, judge: string, hooks: string, specs: string, workdir: string}} paths
  */
 /**
  * @typedef {object} TaskFamily
  * @property {string} rootPath
  * @property {string} familyRevision - `git:<sha>` or `sha256:<hex>`
- * @property {Buffer} apmLockBytes - LF-normalised
  * @property {() => Task[]} tasks
  */

package/src/benchmark/workdir.js CHANGED

@@ -11,9 +11,8 @@ import { spawn } from "node:child_process";
 import { cp, mkdir } from "node:fs/promises";
 import { createServer } from "node:net";
 import { connect } from "node:net";
-import { join, sep } from "node:path";
+import { join } from "node:path";
-const PREFLIGHT_REL = join("workdir", "scripts");
 const DEFAULT_TERM_GRACE_MS = 5_000;
 /**
@@ -24,6 +23,7 @@ const DEFAULT_TERM_GRACE_MS = 5_000;
  * @property {number} pgid - Process-group id captured from the preflight child.
  * @property {*} scaffold - Reserved per design § Components; v1 sets null.
  * @property {string} agentTracePath
+ * @property {string} supervisorTracePath
  * @property {string} judgeTracePath
  * @property {{phase: string, message: string, exitCode: number}} [preflightError]
  */
@@ -55,9 +55,8 @@ export class WorkdirManager {
     const cwd = join(runDir, "cwd");
     await mkdir(cwd, { recursive: true });
-    await cp(task.paths.workdir, cwd, {
-      recursive: true,
-      filter: (src) => !src.endsWith(sep + PREFLIGHT_REL),
+    await cp(task.paths.workdir, cwd, { recursive: true }).catch((e) => {
+      if (e.code !== "ENOENT") throw e;
     });
     await cp(task.paths.specs, join(cwd, "specs"), {
       recursive: true,
@@ -70,9 +69,10 @@ export class WorkdirManager {
     const port = await allocatePort();
     const agentTracePath = join(runDir, "agent.ndjson");
+    const supervisorTracePath = join(runDir, "supervisor.ndjson");
     const judgeTracePath = join(runDir, "judge.ndjson");
-    const preflightScript = join(task.paths.workdir, "scripts", "preflight.sh");
+    const preflightScript = join(task.paths.hooks, "preflight.sh");
     const preflight = await runPreflight(preflightScript, cwd, port);
     return {
@@ -82,6 +82,7 @@ export class WorkdirManager {
       pgid: preflight.pgid,
       scaffold: null,
       agentTracePath,
+      supervisorTracePath,
       judgeTracePath,
       ...(preflight.error && { preflightError: preflight.error }),
     };