npm - @forwardimpact/libeval - Version diff - 0.1.36 → 0.1.38 - Mend

@forwardimpact/libeval 0.1.36 → 0.1.38

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (20)

package/bin/fit-benchmark.js +27 -7
package/bin/fit-eval.js +24 -3
package/bin/fit-trace.js +42 -0
package/package.json +2 -1
package/src/benchmark/apm-installer.js +56 -10
package/src/benchmark/judge.js +4 -3
package/src/benchmark/report.js +43 -17
package/src/benchmark/result.js +7 -1
package/src/benchmark/runner.js +120 -75
package/src/benchmark/scorer.js +2 -5
package/src/benchmark/task-family.js +14 -47
package/src/benchmark/workdir.js +7 -6
package/src/commands/assert.js +145 -0
package/src/commands/benchmark-report.js +1 -2
package/src/commands/benchmark-run.js +5 -4
package/src/commands/facilitate.js +4 -2
package/src/commands/run.js +3 -3
package/src/commands/supervise.js +5 -2
package/src/facilitator.js +7 -3
package/src/supervisor.js +42 -12

package/src/commands/benchmark-run.js CHANGED Viewed

@@ -31,16 +31,17 @@ export async function runBenchmarkRunCommand(values, _args) {
 function parseRunOptions(values) {
   const family = values.family;
   if (!family) throw new Error("--family is required");
-  const output = values.output;
-  if (!output) throw new Error("--output is required");
-  const runs = Number.parseInt(values.runs ?? "1", 10);
+  const output = values.output ?? "benchmark-runs";
+  const runs = Number.parseInt(values.runs ?? "5", 10);
   if (!Number.isFinite(runs) || runs < 1)
     throw new Error("--runs must be a positive integer");
   return {
     family,
     runs,
     output: resolve(output),
-    model: values.model ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? "claude-sonnet-4-6",
+    supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7",
+    judgeModel: values["judge-model"] ?? "claude-opus-4-7",
     profiles: {
       agent: values["agent-profile"] ?? null,
       judge: values["judge-profile"] ?? null,

package/src/commands/facilitate.js CHANGED Viewed

@@ -45,7 +45,8 @@ function parseFacilitateOptions(values) {
     taskAmend,
     agentConfigs,
     facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
-    model: values.model ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
+    facilitatorModel: values["facilitator-model"] ?? "claude-opus-4-7[1m]",
     maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
     outputPath: values.output,
     facilitatorProfile: values["facilitator-profile"] ?? undefined,
@@ -89,7 +90,8 @@ export async function runFacilitateCommand(values, _args) {
     agentConfigs: opts.agentConfigs,
     query,
     output,
-    model: opts.model,
+    agentModel: opts.agentModel,
+    facilitatorModel: opts.facilitatorModel,
     maxTurns: opts.maxTurns,
     facilitatorProfile: opts.facilitatorProfile,
     taskAmend: opts.taskAmend,

package/src/commands/run.js CHANGED Viewed

@@ -29,7 +29,7 @@ function parseRunOptions(values) {
     taskContent,
     taskAmend,
     cwd: resolve(values.cwd ?? "."),
-    model: values.model ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
     maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
     outputPath: values.output,
     agentProfile: values["agent-profile"] ?? undefined,
@@ -54,7 +54,7 @@ export async function runRunCommand(values, _args) {
     taskContent,
     taskAmend,
     cwd,
-    model,
+    agentModel,
     maxTurns,
     outputPath,
     agentProfile,
@@ -114,7 +114,7 @@ export async function runRunCommand(values, _args) {
     cwd,
     query,
     output: devNull,
-    model,
+    model: agentModel,
     maxTurns,
     allowedTools,
     onLine,

package/src/commands/supervise.js CHANGED Viewed

@@ -11,6 +11,7 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
  * @param {object} values - Parsed option values from cli.parse()
  * @returns {object}
  */
+// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
 function parseSuperviseOptions(values) {
   const taskFile = values["task-file"];
   const taskText = values["task-text"];
@@ -31,7 +32,8 @@ function parseSuperviseOptions(values) {
     agentCwd: resolve(
       values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
     ),
-    model: values.model ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
+    supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7[1m]",
     maxTurns: (() => {
       const raw = values["max-turns"] ?? "20";
       return raw === "0" ? 0 : parseInt(raw, 10);
@@ -102,7 +104,8 @@ export async function runSuperviseCommand(values, _args) {
     agentCwd: opts.agentCwd,
     query,
     output,
-    model: opts.model,
+    agentModel: opts.agentModel,
+    supervisorModel: opts.supervisorModel,
     maxTurns: opts.maxTurns,
     allowedTools: opts.allowedTools,
     supervisorAllowedTools: opts.supervisorAllowedTools,

package/src/facilitator.js CHANGED Viewed

@@ -390,7 +390,9 @@ const devNull = new Writable({
  * @param {Array<{name: string, role: string, cwd?: string, maxTurns?: number, allowedTools?: string[], agentProfile?: string, systemPromptAmend?: string}>} deps.agentConfigs
  * @param {function} deps.query
  * @param {import("stream").Writable} deps.output
- * @param {string} [deps.model]
+ * @param {string} [deps.model] - Default model for all participants.
+ * @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
+ * @param {string} [deps.facilitatorModel] - Facilitator model override (falls back to `model`).
  * @param {number} [deps.maxTurns]
  * @param {string} [deps.facilitatorProfile] - Facilitator profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
  * @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<facilitatorCwd>/.claude/agents`. Resolved once from the facilitator's cwd so profiles travel with the project, not with per-agent sandboxes.
@@ -403,6 +405,8 @@ export function createFacilitator({
   query,
   output,
   model,
+  agentModel,
+  facilitatorModel,
   maxTurns,
   facilitatorProfile,
   profilesDir,
@@ -450,7 +454,7 @@ export function createFacilitator({
       cwd: config.cwd ?? facilitatorCwd,
       query,
       output: devNull,
-      model,
+      model: agentModel ?? model,
       maxTurns: config.maxTurns ?? 50,
       allowedTools: config.allowedTools,
       onLine: (line) => facilitator.emitLine(config.name, line),
@@ -467,7 +471,7 @@ export function createFacilitator({
     cwd: facilitatorCwd,
     query,
     output: devNull,
-    model,
+    model: facilitatorModel ?? model,
     maxTurns: maxTurns ?? 20,
     onLine: (line) => facilitator.emitLine("facilitator", line),
     mcpServers: { orchestration: facilitatorServer },

package/src/supervisor.js CHANGED Viewed

@@ -100,17 +100,18 @@ export class Supervisor {
   /**
    * Run the supervisor ↔ agent relay loop.
    * @param {string} task - The initial task for the supervisor
-   * @returns {Promise<{success: boolean, turns: number}>}
+   * @returns {Promise<{success: boolean, turns: number, concluded: boolean}>}
    */
   async run(task) {
     const initialTask = this.taskAmend ? `${task}\n\n${this.taskAmend}` : task;
+    this.taskContext = initialTask;
     this.currentSource = "supervisor";
     this.currentTurn = 0;
     let supervisorResult = await this.supervisorRunner.run(initialTask);
     if (supervisorResult.error) {
       this.emitSummary({ success: false, turns: 0 });
-      return { success: false, turns: 0 };
+      return { success: false, turns: 0, concluded: false };
     }
     if (this.ctx.concluded) {
@@ -121,7 +122,7 @@ export class Supervisor {
         turns: 0,
         summary: this.ctx.summary,
       });
-      return { success, turns: 0 };
+      return { success, turns: 0, concluded: true };
     }
     let pendingRelay = null;
@@ -131,16 +132,20 @@ export class Supervisor {
         pendingRelay ?? this.#buildInitialRelay(supervisorResult.text);
       const turnOutcome = await this.#runAgentTurn(turn, relay);
-      if (turnOutcome.exit) return turnOutcome.exit;
+      if (turnOutcome.exit) {
+        return { ...turnOutcome.exit, concluded: this.ctx.concluded };
+      }
       const reviewOutcome = await this.#endOfTurnReview(turn);
-      if (reviewOutcome.exit) return reviewOutcome.exit;
+      if (reviewOutcome.exit) {
+        return { ...reviewOutcome.exit, concluded: this.ctx.concluded };
+      }
       supervisorResult = reviewOutcome.supervisorResult;
       pendingRelay = reviewOutcome.relay ?? null;
     }
     this.emitSummary({ success: false, turns: this.maxTurns });
-    return { success: false, turns: this.maxTurns };
+    return { success: false, turns: this.maxTurns, concluded: false };
   }
   #buildInitialRelay(fallbackText) {
@@ -247,6 +252,22 @@ export class Supervisor {
     return { type: "continue" };
   }
+  /**
+   * Resume the supervisor runner, falling back to a fresh session when the
+   * SDK reports that the conversation no longer exists (e.g. session GC'd
+   * while the agent was running). The fresh session includes the original
+   * task context so the supervisor can still evaluate the agent's work.
+   * @param {string} prompt
+   * @returns {Promise<object>}
+   */
+  async #resumeSupervisor(prompt) {
+    const result = await this.supervisorRunner.resume(prompt);
+    if (result.error && isSessionNotFound(result.error)) {
+      return this.supervisorRunner.run(`${this.taskContext}\n\n${prompt}`);
+    }
+    return result;
+  }
   /**
    * If the agent has an unanswered ask, drain reminders and return a
    * formatted relay string. Returns null when no relay is needed.
@@ -274,7 +295,7 @@ export class Supervisor {
     this.currentSource = "supervisor";
     this.ctx.redirect = null;
-    await this.supervisorRunner.resume(
+    await this.#resumeSupervisor(
       `The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` +
         `Review and use your tools if action is needed.`,
     );
@@ -312,7 +333,7 @@ export class Supervisor {
           `Review and decide how to proceed.`
         : `The agent reported:\n\n${agentTranscript}\n\nReview the agent's work and decide how to proceed.`;
-    let supervisorResult = await this.supervisorRunner.resume(reviewPrompt);
+    let supervisorResult = await this.#resumeSupervisor(reviewPrompt);
     if (supervisorResult.error) {
       this.emitSummary({ success: false, turns: turn });
@@ -333,7 +354,7 @@ export class Supervisor {
     if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
       const reminders = this.messageBus.drain("supervisor");
       if (reminders.length > 0) {
-        supervisorResult = await this.supervisorRunner.resume(
+        supervisorResult = await this.#resumeSupervisor(
           formatMessages(reminders),
         );
         if (this.ctx.concluded) {
@@ -478,7 +499,9 @@ const devNull = new Writable({
  * @param {string} deps.agentCwd
  * @param {function} deps.query
  * @param {import("stream").Writable} deps.output
- * @param {string} [deps.model]
+ * @param {string} [deps.model] - Default model for both runners.
+ * @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
+ * @param {string} [deps.supervisorModel] - Supervisor model override (falls back to `model`).
  * @param {number} [deps.maxTurns]
  * @param {string[]} [deps.allowedTools]
  * @param {string[]} [deps.supervisorAllowedTools]
@@ -496,6 +519,8 @@ export function createSupervisor({
   query,
   output,
   model,
+  agentModel,
+  supervisorModel,
   maxTurns,
   allowedTools,
   supervisorDisallowedTools,
@@ -543,7 +568,7 @@ export function createSupervisor({
     cwd: agentCwd,
     query,
     output: devNull,
-    model,
+    model: agentModel ?? model,
     maxTurns: perInvocationTurns,
     allowedTools,
     onLine,
@@ -562,7 +587,7 @@ export function createSupervisor({
     cwd: supervisorCwd,
     query,
     output: devNull,
-    model,
+    model: supervisorModel ?? model,
     maxTurns: perInvocationTurns,
     allowedTools: supervisorAllowedTools ?? [
       "Bash",
@@ -592,3 +617,8 @@ export function createSupervisor({
   });
   return supervisor;
 }
+function isSessionNotFound(error) {
+  const msg = error?.message ?? String(error);
+  return msg.includes("No conversation found with session ID");
+}