npm - @forwardimpact/libeval - Versions diffs - 0.1.36 → 0.1.39 - Mend

@forwardimpact/libeval 0.1.36 → 0.1.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

package/bin/fit-benchmark.js +32 -7
package/bin/fit-eval.js +24 -3
package/bin/fit-trace.js +42 -0
package/package.json +2 -1
package/src/benchmark/apm-installer.js +78 -16
package/src/benchmark/env-loader.js +146 -0
package/src/benchmark/judge.js +4 -3
package/src/benchmark/report.js +43 -17
package/src/benchmark/result.js +9 -3
package/src/benchmark/runner.js +164 -117
package/src/benchmark/scorer.js +5 -5
package/src/benchmark/task-family.js +43 -50
package/src/benchmark/workdir.js +21 -8
package/src/commands/assert.js +145 -0
package/src/commands/benchmark-report.js +1 -2
package/src/commands/benchmark-run.js +11 -4
package/src/commands/facilitate.js +4 -2
package/src/commands/run.js +3 -3
package/src/commands/supervise.js +5 -2
package/src/facilitator.js +7 -3
package/src/supervisor.js +42 -12

package/src/benchmark/workdir.js CHANGED Viewed

@@ -11,9 +11,10 @@ import { spawn } from "node:child_process";
 import { cp, mkdir } from "node:fs/promises";
 import { createServer } from "node:net";
 import { connect } from "node:net";
-import { join, sep } from "node:path";
+import { join } from "node:path";
+import { loadEnv } from "./env-loader.js";
-const PREFLIGHT_REL = join("workdir", "scripts");
 const DEFAULT_TERM_GRACE_MS = 5_000;
 /**
@@ -24,7 +25,9 @@ const DEFAULT_TERM_GRACE_MS = 5_000;
  * @property {number} pgid - Process-group id captured from the preflight child.
  * @property {*} scaffold - Reserved per design § Components; v1 sets null.
  * @property {string} agentTracePath
+ * @property {string} supervisorTracePath
  * @property {string} judgeTracePath
+ * @property {string[]} [envNames] - Env var names loaded from .env files.
  * @property {{phase: string, message: string, exitCode: number}} [preflightError]
  */
@@ -35,12 +38,13 @@ export class WorkdirManager {
    * @param {string} deps.stagingDir - Output of `installApm(...)`.
    * @param {string} deps.runOutputDir - Root run-output directory (parent of `runs/`).
    */
-  constructor({ stagingDir, runOutputDir, termGraceMs }) {
+  constructor({ stagingDir, runOutputDir, termGraceMs, familyRootPath }) {
     if (!stagingDir) throw new Error("stagingDir is required");
     if (!runOutputDir) throw new Error("runOutputDir is required");
     this.stagingDir = stagingDir;
     this.runOutputDir = runOutputDir;
     this.termGraceMs = termGraceMs ?? DEFAULT_TERM_GRACE_MS;
+    this.familyRootPath = familyRootPath ?? null;
   }
   /**
@@ -55,9 +59,8 @@ export class WorkdirManager {
     const cwd = join(runDir, "cwd");
     await mkdir(cwd, { recursive: true });
-    await cp(task.paths.workdir, cwd, {
-      recursive: true,
-      filter: (src) => !src.endsWith(sep + PREFLIGHT_REL),
+    await cp(task.paths.workdir, cwd, { recursive: true }).catch((e) => {
+      if (e.code !== "ENOENT") throw e;
     });
     await cp(task.paths.specs, join(cwd, "specs"), {
       recursive: true,
@@ -68,12 +71,20 @@ export class WorkdirManager {
       recursive: true,
     });
+    const envDirs = [
+      ...(this.familyRootPath ? [this.familyRootPath] : []),
+      ...(task.paths.taskDir ? [task.paths.taskDir] : []),
+    ];
+    const envNames = envDirs.length > 0 ? await loadEnv(envDirs, cwd) : [];
     const port = await allocatePort();
     const agentTracePath = join(runDir, "agent.ndjson");
+    const supervisorTracePath = join(runDir, "supervisor.ndjson");
     const judgeTracePath = join(runDir, "judge.ndjson");
-    const preflightScript = join(task.paths.workdir, "scripts", "preflight.sh");
-    const preflight = await runPreflight(preflightScript, cwd, port);
+    const preflight = task.paths.preflight
+      ? await runPreflight(task.paths.preflight, cwd, port)
+      : { pgid: 0 };
     return {
       cwd,
@@ -82,7 +93,9 @@ export class WorkdirManager {
       pgid: preflight.pgid,
       scaffold: null,
       agentTracePath,
+      supervisorTracePath,
       judgeTracePath,
+      envNames,
       ...(preflight.error && { preflightError: preflight.error }),
     };
   }

package/src/commands/assert.js ADDED Viewed

@@ -0,0 +1,145 @@
+import { existsSync, readFileSync } from "node:fs";
+import { basename } from "node:path";
+import jmespath from "jmespath";
/**
 * Evaluate a single assertion and return the structured result.
 *
 * Exactly one mode must be selected via `values`: `grep`, `query`, `exists`,
 * or `cites-job`. A mode counts as selected when its value is neither
 * `undefined` nor `false`, and the same rule picks the handler, so an
 * empty-string value (e.g. `--grep ""`) is routed to its own mode instead of
 * falling through to the query handler.
 *
 * `not` inverts the outcome; `message` replaces the message of a failing
 * result.
 *
 * @param {object} values - { grep?: string, query?: string, exists?: boolean, "cites-job"?: string, not?: boolean, message?: string }
 * @param {string[]} args - [testName, file]
 * @returns {{ test: string, pass: boolean, message?: string }}
 * @throws {Error} When the test name is missing, when zero or more than one
 *   mode is selected, or when the file argument is missing.
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: assertion dispatch by type
export function evaluateAssertion(values, args) {
  const [testName, file] = args;
  if (!testName) throw new Error("assert: missing test name");

  // One predicate decides both "how many modes were given" and "which one".
  const isSelected = (value) => value !== undefined && value !== false;
  const selected = ["grep", "query", "exists", "cites-job"].filter((name) =>
    isSelected(values[name]),
  );
  if (selected.length === 0) {
    throw new Error(
      "assert: specify one of --grep, --query, --exists, or --cites-job",
    );
  }
  if (selected.length > 1) {
    throw new Error(
      "assert: specify only one of --grep, --query, --exists, or --cites-job",
    );
  }

  const [mode] = selected;
  let result;
  switch (mode) {
    case "exists":
      if (!file) throw new Error("assert: missing file argument");
      result = assertExists(file);
      break;
    case "grep":
      if (!file) throw new Error("assert: missing file argument for --grep");
      result = assertGrep(values.grep, file);
      break;
    case "cites-job":
      if (!file) {
        throw new Error("assert: missing file argument for --cites-job");
      }
      result = assertCitesJob(values["cites-job"], file);
      break;
    default:
      if (!file) throw new Error("assert: missing file argument for --query");
      result = assertQuery(values.query, file);
  }

  // Work on local copies so the helper's result object is never mutated.
  let pass = result.pass;
  let message = result.message;
  if (values.not) {
    pass = !pass;
    // A passing inverted assertion carries no message; a failing one gets a
    // generic explanation when the helper supplied none.
    message = pass
      ? undefined
      : (message ?? `inverted assertion failed for ${basename(file)}`);
  }
  if (!pass && values.message) {
    message = values.message;
  }

  const output = { test: testName, pass };
  if (message) output.message = message;
  return output;
}
/**
 * CLI entry point for `assert`: evaluates the assertion, prints the result
 * as one JSON line on stdout, and marks the process as failed (exit code 1)
 * when the assertion does not pass.
 * @param {object} values
 * @param {string[]} args
 */
export async function runAssertCommand(values, args) {
  const outcome = evaluateAssertion(values, args);
  const line = JSON.stringify(outcome) + "\n";
  process.stdout.write(line);
  if (outcome.pass) return;
  process.exitCode = 1;
}
/**
 * Pass when `file` exists on disk.
 * @param {string} file
 * @returns {{ pass: boolean, message?: string }}
 */
function assertExists(file) {
  return existsSync(file)
    ? { pass: true }
    : { pass: false, message: `${file} not found` };
}
/**
 * Pass when `pattern` (compiled case-insensitive and multiline) matches
 * somewhere in the UTF-8 contents of `file`.
 * @param {string} pattern - Regular expression source.
 * @param {string} file
 * @returns {{ pass: boolean, message?: string }}
 */
function assertGrep(pattern, file) {
  const text = readFileSync(file, "utf8");
  const matched = new RegExp(pattern, "im").test(text);
  if (!matched) {
    const name = basename(file);
    return {
      pass: false,
      message: `pattern "${pattern}" not found in ${name}`,
    };
  }
  return { pass: true };
}
/**
 * Run a JMESPath `expression` over the JSON / NDJSON contents of `file`.
 * The assertion fails when the query yields null, undefined, false, or an
 * empty array; any other result passes.
 * @param {string} expression
 * @param {string} file
 * @returns {{ pass: boolean, message?: string }}
 */
function assertQuery(expression, file) {
  const parsed = parseJsonOrNdjson(readFileSync(file, "utf8"));
  const found = jmespath.search(parsed, expression);

  const isEmptyArray = Array.isArray(found) && found.length === 0;
  const failed =
    found === null || found === undefined || found === false || isEmptyArray;

  if (failed) {
    return {
      pass: false,
      message: `query returned ${JSON.stringify(found)}`,
    };
  }
  return { pass: true };
}
// Matches an opening <job user="..." goal="..."> tag; captures user and goal.
const JOB_TAG_RE = /<job\s+user="([^"]*)"\s+goal="([^"]*)">/;

/**
 * Pass when `file` contains the citation "<user>: <goal>" built from the
 * first <job> tag found in `jobFile`.
 * @param {string} jobFile - File expected to contain a <job> tag.
 * @param {string} file - File that must contain the citation text.
 * @returns {{ pass: boolean, message?: string }}
 */
function assertCitesJob(jobFile, file) {
  const tag = readFileSync(jobFile, "utf8").match(JOB_TAG_RE);
  if (tag === null) {
    return {
      pass: false,
      message: `no <job> tag found in ${basename(jobFile)}`,
    };
  }

  const [, user, goal] = tag;
  const expected = `${user}: ${goal}`;
  return readFileSync(file, "utf8").includes(expected)
    ? { pass: true }
    : { pass: false, message: `missing "${expected}"` };
}
/**
 * Parse `content` as a single JSON document, falling back to NDJSON (one JSON
 * value per line) when that fails.
 *
 * A leading UTF-8 byte-order mark is removed first: `JSON.parse` rejects it,
 * which would otherwise push a valid BOM-prefixed JSON document into the
 * line-by-line fallback.
 *
 * In NDJSON mode blank lines and lines that are not valid JSON are skipped
 * on purpose (best effort over partially written traces).
 *
 * @param {string} content
 * @returns {*} The parsed document, or an array of parsed lines for NDJSON.
 * @throws {Error} When neither the whole content nor any line is valid JSON.
 */
function parseJsonOrNdjson(content) {
  const text = content.charCodeAt(0) === 0xfeff ? content.slice(1) : content;

  try {
    return JSON.parse(text);
  } catch {
    // Not a single JSON document; fall through to NDJSON.
  }

  const records = [];
  for (const raw of text.split("\n")) {
    const line = raw.trim();
    if (line === "") continue;
    try {
      records.push(JSON.parse(line));
    } catch {
      // Deliberately skip lines that are not valid JSON.
    }
  }
  if (records.length === 0) throw new Error("assert: no valid JSON in file");
  return records;
}

package/src/commands/benchmark-report.js CHANGED Viewed

@@ -13,8 +13,7 @@ import { aggregate, renderTextReport } from "../benchmark/report.js";
  * @param {string[]} _args
  */
 export async function runBenchmarkReportCommand(values, _args) {
-  const inputDir = values.input;
-  if (!inputDir) throw new Error("--input is required");
+  const inputDir = values.input ?? "benchmark-runs";
   const kRaw = values.k ?? "1,3,5";
   const kValues = kRaw.split(",").map((t) => {
     const n = Number.parseInt(t.trim(), 10);

package/src/commands/benchmark-run.js CHANGED Viewed

@@ -31,21 +31,28 @@ export async function runBenchmarkRunCommand(values, _args) {
 function parseRunOptions(values) {
   const family = values.family;
   if (!family) throw new Error("--family is required");
-  const output = values.output;
-  if (!output) throw new Error("--output is required");
-  const runs = Number.parseInt(values.runs ?? "1", 10);
+  const output = values.output ?? "benchmark-runs";
+  const runs = Number.parseInt(values.runs ?? "5", 10);
   if (!Number.isFinite(runs) || runs < 1)
     throw new Error("--runs must be a positive integer");
   return {
     family,
     runs,
     output: resolve(output),
-    model: values.model ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? "claude-sonnet-4-6",
+    supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7",
+    judgeModel: values["judge-model"] ?? "claude-opus-4-7",
     profiles: {
       agent: values["agent-profile"] ?? null,
       judge: values["judge-profile"] ?? null,
     },
     maxTurns: parseMaxTurns(values["max-turns"]),
+    allowedTools: values["allowed-tools"]
+      ? values["allowed-tools"]
+          .split(",")
+          .map((s) => s.trim())
+          .filter(Boolean)
+      : undefined,
   };
 }

package/src/commands/facilitate.js CHANGED Viewed

@@ -45,7 +45,8 @@ function parseFacilitateOptions(values) {
     taskAmend,
     agentConfigs,
     facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
-    model: values.model ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
+    facilitatorModel: values["facilitator-model"] ?? "claude-opus-4-7[1m]",
     maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
     outputPath: values.output,
     facilitatorProfile: values["facilitator-profile"] ?? undefined,
@@ -89,7 +90,8 @@ export async function runFacilitateCommand(values, _args) {
     agentConfigs: opts.agentConfigs,
     query,
     output,
-    model: opts.model,
+    agentModel: opts.agentModel,
+    facilitatorModel: opts.facilitatorModel,
     maxTurns: opts.maxTurns,
     facilitatorProfile: opts.facilitatorProfile,
     taskAmend: opts.taskAmend,

package/src/commands/run.js CHANGED Viewed

@@ -29,7 +29,7 @@ function parseRunOptions(values) {
     taskContent,
     taskAmend,
     cwd: resolve(values.cwd ?? "."),
-    model: values.model ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
     maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
     outputPath: values.output,
     agentProfile: values["agent-profile"] ?? undefined,
@@ -54,7 +54,7 @@ export async function runRunCommand(values, _args) {
     taskContent,
     taskAmend,
     cwd,
-    model,
+    agentModel,
     maxTurns,
     outputPath,
     agentProfile,
@@ -114,7 +114,7 @@ export async function runRunCommand(values, _args) {
     cwd,
     query,
     output: devNull,
-    model,
+    model: agentModel,
     maxTurns,
     allowedTools,
     onLine,

package/src/commands/supervise.js CHANGED Viewed

@@ -11,6 +11,7 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
  * @param {object} values - Parsed option values from cli.parse()
  * @returns {object}
  */
+// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
 function parseSuperviseOptions(values) {
   const taskFile = values["task-file"];
   const taskText = values["task-text"];
@@ -31,7 +32,8 @@ function parseSuperviseOptions(values) {
     agentCwd: resolve(
       values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
     ),
-    model: values.model ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
+    supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7[1m]",
     maxTurns: (() => {
       const raw = values["max-turns"] ?? "20";
       return raw === "0" ? 0 : parseInt(raw, 10);
@@ -102,7 +104,8 @@ export async function runSuperviseCommand(values, _args) {
     agentCwd: opts.agentCwd,
     query,
     output,
-    model: opts.model,
+    agentModel: opts.agentModel,
+    supervisorModel: opts.supervisorModel,
     maxTurns: opts.maxTurns,
     allowedTools: opts.allowedTools,
     supervisorAllowedTools: opts.supervisorAllowedTools,

package/src/facilitator.js CHANGED Viewed

@@ -390,7 +390,9 @@ const devNull = new Writable({
  * @param {Array<{name: string, role: string, cwd?: string, maxTurns?: number, allowedTools?: string[], agentProfile?: string, systemPromptAmend?: string}>} deps.agentConfigs
  * @param {function} deps.query
  * @param {import("stream").Writable} deps.output
- * @param {string} [deps.model]
+ * @param {string} [deps.model] - Default model for all participants.
+ * @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
+ * @param {string} [deps.facilitatorModel] - Facilitator model override (falls back to `model`).
  * @param {number} [deps.maxTurns]
  * @param {string} [deps.facilitatorProfile] - Facilitator profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
  * @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<facilitatorCwd>/.claude/agents`. Resolved once from the facilitator's cwd so profiles travel with the project, not with per-agent sandboxes.
@@ -403,6 +405,8 @@ export function createFacilitator({
   query,
   output,
   model,
+  agentModel,
+  facilitatorModel,
   maxTurns,
   facilitatorProfile,
   profilesDir,
@@ -450,7 +454,7 @@ export function createFacilitator({
       cwd: config.cwd ?? facilitatorCwd,
       query,
       output: devNull,
-      model,
+      model: agentModel ?? model,
       maxTurns: config.maxTurns ?? 50,
       allowedTools: config.allowedTools,
       onLine: (line) => facilitator.emitLine(config.name, line),
@@ -467,7 +471,7 @@ export function createFacilitator({
     cwd: facilitatorCwd,
     query,
     output: devNull,
-    model,
+    model: facilitatorModel ?? model,
     maxTurns: maxTurns ?? 20,
     onLine: (line) => facilitator.emitLine("facilitator", line),
     mcpServers: { orchestration: facilitatorServer },

package/src/supervisor.js CHANGED Viewed

@@ -100,17 +100,18 @@ export class Supervisor {
   /**
    * Run the supervisor ↔ agent relay loop.
    * @param {string} task - The initial task for the supervisor
-   * @returns {Promise<{success: boolean, turns: number}>}
+   * @returns {Promise<{success: boolean, turns: number, concluded: boolean}>}
    */
   async run(task) {
     const initialTask = this.taskAmend ? `${task}\n\n${this.taskAmend}` : task;
+    this.taskContext = initialTask;
     this.currentSource = "supervisor";
     this.currentTurn = 0;
     let supervisorResult = await this.supervisorRunner.run(initialTask);
     if (supervisorResult.error) {
       this.emitSummary({ success: false, turns: 0 });
-      return { success: false, turns: 0 };
+      return { success: false, turns: 0, concluded: false };
     }
     if (this.ctx.concluded) {
@@ -121,7 +122,7 @@ export class Supervisor {
         turns: 0,
         summary: this.ctx.summary,
       });
-      return { success, turns: 0 };
+      return { success, turns: 0, concluded: true };
     }
     let pendingRelay = null;
@@ -131,16 +132,20 @@ export class Supervisor {
         pendingRelay ?? this.#buildInitialRelay(supervisorResult.text);
       const turnOutcome = await this.#runAgentTurn(turn, relay);
-      if (turnOutcome.exit) return turnOutcome.exit;
+      if (turnOutcome.exit) {
+        return { ...turnOutcome.exit, concluded: this.ctx.concluded };
+      }
       const reviewOutcome = await this.#endOfTurnReview(turn);
-      if (reviewOutcome.exit) return reviewOutcome.exit;
+      if (reviewOutcome.exit) {
+        return { ...reviewOutcome.exit, concluded: this.ctx.concluded };
+      }
       supervisorResult = reviewOutcome.supervisorResult;
       pendingRelay = reviewOutcome.relay ?? null;
     }
     this.emitSummary({ success: false, turns: this.maxTurns });
-    return { success: false, turns: this.maxTurns };
+    return { success: false, turns: this.maxTurns, concluded: false };
   }
   #buildInitialRelay(fallbackText) {
@@ -247,6 +252,22 @@ export class Supervisor {
     return { type: "continue" };
   }
+  /**
+   * Resume the supervisor runner, falling back to a fresh session when the
+   * SDK reports that the conversation no longer exists (e.g. session GC'd
+   * while the agent was running). The fresh session includes the original
+   * task context so the supervisor can still evaluate the agent's work.
+   * @param {string} prompt
+   * @returns {Promise<object>}
+   */
+  async #resumeSupervisor(prompt) {
+    const result = await this.supervisorRunner.resume(prompt);
+    if (result.error && isSessionNotFound(result.error)) {
+      return this.supervisorRunner.run(`${this.taskContext}\n\n${prompt}`);
+    }
+    return result;
+  }
   /**
    * If the agent has an unanswered ask, drain reminders and return a
    * formatted relay string. Returns null when no relay is needed.
@@ -274,7 +295,7 @@ export class Supervisor {
     this.currentSource = "supervisor";
     this.ctx.redirect = null;
-    await this.supervisorRunner.resume(
+    await this.#resumeSupervisor(
       `The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` +
         `Review and use your tools if action is needed.`,
     );
@@ -312,7 +333,7 @@ export class Supervisor {
           `Review and decide how to proceed.`
         : `The agent reported:\n\n${agentTranscript}\n\nReview the agent's work and decide how to proceed.`;
-    let supervisorResult = await this.supervisorRunner.resume(reviewPrompt);
+    let supervisorResult = await this.#resumeSupervisor(reviewPrompt);
     if (supervisorResult.error) {
       this.emitSummary({ success: false, turns: turn });
@@ -333,7 +354,7 @@ export class Supervisor {
     if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
       const reminders = this.messageBus.drain("supervisor");
       if (reminders.length > 0) {
-        supervisorResult = await this.supervisorRunner.resume(
+        supervisorResult = await this.#resumeSupervisor(
           formatMessages(reminders),
         );
         if (this.ctx.concluded) {
@@ -478,7 +499,9 @@ const devNull = new Writable({
  * @param {string} deps.agentCwd
  * @param {function} deps.query
  * @param {import("stream").Writable} deps.output
- * @param {string} [deps.model]
+ * @param {string} [deps.model] - Default model for both runners.
+ * @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
+ * @param {string} [deps.supervisorModel] - Supervisor model override (falls back to `model`).
  * @param {number} [deps.maxTurns]
  * @param {string[]} [deps.allowedTools]
  * @param {string[]} [deps.supervisorAllowedTools]
@@ -496,6 +519,8 @@ export function createSupervisor({
   query,
   output,
   model,
+  agentModel,
+  supervisorModel,
   maxTurns,
   allowedTools,
   supervisorDisallowedTools,
@@ -543,7 +568,7 @@ export function createSupervisor({
     cwd: agentCwd,
     query,
     output: devNull,
-    model,
+    model: agentModel ?? model,
     maxTurns: perInvocationTurns,
     allowedTools,
     onLine,
@@ -562,7 +587,7 @@ export function createSupervisor({
     cwd: supervisorCwd,
     query,
     output: devNull,
-    model,
+    model: supervisorModel ?? model,
     maxTurns: perInvocationTurns,
     allowedTools: supervisorAllowedTools ?? [
       "Bash",
@@ -592,3 +617,8 @@ export function createSupervisor({
   });
   return supervisor;
 }
/**
 * Report whether `error` is the "conversation no longer exists" failure,
 * recognised by the text "No conversation found with session ID".
 *
 * Accepts an Error-like object or any other value. The message is coerced to
 * a string so a non-string `message` cannot throw while the caller is already
 * handling a failure.
 *
 * @param {*} error
 * @returns {boolean}
 */
function isSessionNotFound(error) {
  const text = String(error?.message ?? error);
  return text.includes("No conversation found with session ID");
}