@forwardimpact/libeval 0.1.36 → 0.1.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,16 +31,17 @@ export async function runBenchmarkRunCommand(values, _args) {
31
31
  function parseRunOptions(values) {
32
32
  const family = values.family;
33
33
  if (!family) throw new Error("--family is required");
34
- const output = values.output;
35
- if (!output) throw new Error("--output is required");
36
- const runs = Number.parseInt(values.runs ?? "1", 10);
34
+ const output = values.output ?? "benchmark-runs";
35
+ const runs = Number.parseInt(values.runs ?? "5", 10);
37
36
  if (!Number.isFinite(runs) || runs < 1)
38
37
  throw new Error("--runs must be a positive integer");
39
38
  return {
40
39
  family,
41
40
  runs,
42
41
  output: resolve(output),
43
- model: values.model ?? "claude-opus-4-7[1m]",
42
+ agentModel: values["agent-model"] ?? "claude-sonnet-4-6",
43
+ supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7",
44
+ judgeModel: values["judge-model"] ?? "claude-opus-4-7",
44
45
  profiles: {
45
46
  agent: values["agent-profile"] ?? null,
46
47
  judge: values["judge-profile"] ?? null,
@@ -45,7 +45,8 @@ function parseFacilitateOptions(values) {
45
45
  taskAmend,
46
46
  agentConfigs,
47
47
  facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
48
- model: values.model ?? "claude-opus-4-7[1m]",
48
+ agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
49
+ facilitatorModel: values["facilitator-model"] ?? "claude-opus-4-7[1m]",
49
50
  maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
50
51
  outputPath: values.output,
51
52
  facilitatorProfile: values["facilitator-profile"] ?? undefined,
@@ -89,7 +90,8 @@ export async function runFacilitateCommand(values, _args) {
89
90
  agentConfigs: opts.agentConfigs,
90
91
  query,
91
92
  output,
92
- model: opts.model,
93
+ agentModel: opts.agentModel,
94
+ facilitatorModel: opts.facilitatorModel,
93
95
  maxTurns: opts.maxTurns,
94
96
  facilitatorProfile: opts.facilitatorProfile,
95
97
  taskAmend: opts.taskAmend,
@@ -29,7 +29,7 @@ function parseRunOptions(values) {
29
29
  taskContent,
30
30
  taskAmend,
31
31
  cwd: resolve(values.cwd ?? "."),
32
- model: values.model ?? "claude-opus-4-7[1m]",
32
+ agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
33
33
  maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
34
34
  outputPath: values.output,
35
35
  agentProfile: values["agent-profile"] ?? undefined,
@@ -54,7 +54,7 @@ export async function runRunCommand(values, _args) {
54
54
  taskContent,
55
55
  taskAmend,
56
56
  cwd,
57
- model,
57
+ agentModel,
58
58
  maxTurns,
59
59
  outputPath,
60
60
  agentProfile,
@@ -114,7 +114,7 @@ export async function runRunCommand(values, _args) {
114
114
  cwd,
115
115
  query,
116
116
  output: devNull,
117
- model,
117
+ model: agentModel,
118
118
  maxTurns,
119
119
  allowedTools,
120
120
  onLine,
@@ -11,6 +11,7 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
11
11
  * @param {object} values - Parsed option values from cli.parse()
12
12
  * @returns {object}
13
13
  */
14
+ // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
14
15
  function parseSuperviseOptions(values) {
15
16
  const taskFile = values["task-file"];
16
17
  const taskText = values["task-text"];
@@ -31,7 +32,8 @@ function parseSuperviseOptions(values) {
31
32
  agentCwd: resolve(
32
33
  values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
33
34
  ),
34
- model: values.model ?? "claude-opus-4-7[1m]",
35
+ agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
36
+ supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7[1m]",
35
37
  maxTurns: (() => {
36
38
  const raw = values["max-turns"] ?? "20";
37
39
  return raw === "0" ? 0 : parseInt(raw, 10);
@@ -102,7 +104,8 @@ export async function runSuperviseCommand(values, _args) {
102
104
  agentCwd: opts.agentCwd,
103
105
  query,
104
106
  output,
105
- model: opts.model,
107
+ agentModel: opts.agentModel,
108
+ supervisorModel: opts.supervisorModel,
106
109
  maxTurns: opts.maxTurns,
107
110
  allowedTools: opts.allowedTools,
108
111
  supervisorAllowedTools: opts.supervisorAllowedTools,
@@ -390,7 +390,9 @@ const devNull = new Writable({
390
390
  * @param {Array<{name: string, role: string, cwd?: string, maxTurns?: number, allowedTools?: string[], agentProfile?: string, systemPromptAmend?: string}>} deps.agentConfigs
391
391
  * @param {function} deps.query
392
392
  * @param {import("stream").Writable} deps.output
393
- * @param {string} [deps.model]
393
+ * @param {string} [deps.model] - Default model for all participants.
394
+ * @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
395
+ * @param {string} [deps.facilitatorModel] - Facilitator model override (falls back to `model`).
394
396
  * @param {number} [deps.maxTurns]
395
397
  * @param {string} [deps.facilitatorProfile] - Facilitator profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
396
398
  * @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<facilitatorCwd>/.claude/agents`. Resolved once from the facilitator's cwd so profiles travel with the project, not with per-agent sandboxes.
@@ -403,6 +405,8 @@ export function createFacilitator({
403
405
  query,
404
406
  output,
405
407
  model,
408
+ agentModel,
409
+ facilitatorModel,
406
410
  maxTurns,
407
411
  facilitatorProfile,
408
412
  profilesDir,
@@ -450,7 +454,7 @@ export function createFacilitator({
450
454
  cwd: config.cwd ?? facilitatorCwd,
451
455
  query,
452
456
  output: devNull,
453
- model,
457
+ model: agentModel ?? model,
454
458
  maxTurns: config.maxTurns ?? 50,
455
459
  allowedTools: config.allowedTools,
456
460
  onLine: (line) => facilitator.emitLine(config.name, line),
@@ -467,7 +471,7 @@ export function createFacilitator({
467
471
  cwd: facilitatorCwd,
468
472
  query,
469
473
  output: devNull,
470
- model,
474
+ model: facilitatorModel ?? model,
471
475
  maxTurns: maxTurns ?? 20,
472
476
  onLine: (line) => facilitator.emitLine("facilitator", line),
473
477
  mcpServers: { orchestration: facilitatorServer },
package/src/supervisor.js CHANGED
@@ -100,17 +100,18 @@ export class Supervisor {
100
100
  /**
101
101
  * Run the supervisor ↔ agent relay loop.
102
102
  * @param {string} task - The initial task for the supervisor
103
- * @returns {Promise<{success: boolean, turns: number}>}
103
+ * @returns {Promise<{success: boolean, turns: number, concluded: boolean}>}
104
104
  */
105
105
  async run(task) {
106
106
  const initialTask = this.taskAmend ? `${task}\n\n${this.taskAmend}` : task;
107
+ this.taskContext = initialTask;
107
108
  this.currentSource = "supervisor";
108
109
  this.currentTurn = 0;
109
110
  let supervisorResult = await this.supervisorRunner.run(initialTask);
110
111
 
111
112
  if (supervisorResult.error) {
112
113
  this.emitSummary({ success: false, turns: 0 });
113
- return { success: false, turns: 0 };
114
+ return { success: false, turns: 0, concluded: false };
114
115
  }
115
116
 
116
117
  if (this.ctx.concluded) {
@@ -121,7 +122,7 @@ export class Supervisor {
121
122
  turns: 0,
122
123
  summary: this.ctx.summary,
123
124
  });
124
- return { success, turns: 0 };
125
+ return { success, turns: 0, concluded: true };
125
126
  }
126
127
 
127
128
  let pendingRelay = null;
@@ -131,16 +132,20 @@ export class Supervisor {
131
132
  pendingRelay ?? this.#buildInitialRelay(supervisorResult.text);
132
133
 
133
134
  const turnOutcome = await this.#runAgentTurn(turn, relay);
134
- if (turnOutcome.exit) return turnOutcome.exit;
135
+ if (turnOutcome.exit) {
136
+ return { ...turnOutcome.exit, concluded: this.ctx.concluded };
137
+ }
135
138
 
136
139
  const reviewOutcome = await this.#endOfTurnReview(turn);
137
- if (reviewOutcome.exit) return reviewOutcome.exit;
140
+ if (reviewOutcome.exit) {
141
+ return { ...reviewOutcome.exit, concluded: this.ctx.concluded };
142
+ }
138
143
  supervisorResult = reviewOutcome.supervisorResult;
139
144
  pendingRelay = reviewOutcome.relay ?? null;
140
145
  }
141
146
 
142
147
  this.emitSummary({ success: false, turns: this.maxTurns });
143
- return { success: false, turns: this.maxTurns };
148
+ return { success: false, turns: this.maxTurns, concluded: false };
144
149
  }
145
150
 
146
151
  #buildInitialRelay(fallbackText) {
@@ -247,6 +252,22 @@ export class Supervisor {
247
252
  return { type: "continue" };
248
253
  }
249
254
 
255
+ /**
256
+ * Resume the supervisor runner, falling back to a fresh session when the
257
+ * SDK reports that the conversation no longer exists (e.g. session GC'd
258
+ * while the agent was running). The fresh session includes the original
259
+ * task context so the supervisor can still evaluate the agent's work.
260
+ * @param {string} prompt - Message passed to `supervisorRunner.resume()`; on fallback it is appended to the saved task context and sent via `supervisorRunner.run()`.
261
+ * @returns {Promise<object>} The `resume()` result, or the result of the fresh `run()` when the session-not-found fallback is taken.
262
+ */
263
+ async #resumeSupervisor(prompt) {
264
+ const result = await this.supervisorRunner.resume(prompt);
265
+ if (result.error && isSessionNotFound(result.error)) { // only a session-not-found error triggers the fallback; any other error is returned to the caller unchanged
266
+ return this.supervisorRunner.run(`${this.taskContext}\n\n${prompt}`); // fresh session: saved task context, blank line, then the prompt that failed to resume
267
+ }
268
+ return result;
269
+ }
270
+
250
271
  /**
251
272
  * If the agent has an unanswered ask, drain reminders and return a
252
273
  * formatted relay string. Returns null when no relay is needed.
@@ -274,7 +295,7 @@ export class Supervisor {
274
295
  this.currentSource = "supervisor";
275
296
  this.ctx.redirect = null;
276
297
 
277
- await this.supervisorRunner.resume(
298
+ await this.#resumeSupervisor(
278
299
  `The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` +
279
300
  `Review and use your tools if action is needed.`,
280
301
  );
@@ -312,7 +333,7 @@ export class Supervisor {
312
333
  `Review and decide how to proceed.`
313
334
  : `The agent reported:\n\n${agentTranscript}\n\nReview the agent's work and decide how to proceed.`;
314
335
 
315
- let supervisorResult = await this.supervisorRunner.resume(reviewPrompt);
336
+ let supervisorResult = await this.#resumeSupervisor(reviewPrompt);
316
337
 
317
338
  if (supervisorResult.error) {
318
339
  this.emitSummary({ success: false, turns: turn });
@@ -333,7 +354,7 @@ export class Supervisor {
333
354
  if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
334
355
  const reminders = this.messageBus.drain("supervisor");
335
356
  if (reminders.length > 0) {
336
- supervisorResult = await this.supervisorRunner.resume(
357
+ supervisorResult = await this.#resumeSupervisor(
337
358
  formatMessages(reminders),
338
359
  );
339
360
  if (this.ctx.concluded) {
@@ -478,7 +499,9 @@ const devNull = new Writable({
478
499
  * @param {string} deps.agentCwd
479
500
  * @param {function} deps.query
480
501
  * @param {import("stream").Writable} deps.output
481
- * @param {string} [deps.model]
502
+ * @param {string} [deps.model] - Default model for both runners.
503
+ * @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
504
+ * @param {string} [deps.supervisorModel] - Supervisor model override (falls back to `model`).
482
505
  * @param {number} [deps.maxTurns]
483
506
  * @param {string[]} [deps.allowedTools]
484
507
  * @param {string[]} [deps.supervisorAllowedTools]
@@ -496,6 +519,8 @@ export function createSupervisor({
496
519
  query,
497
520
  output,
498
521
  model,
522
+ agentModel,
523
+ supervisorModel,
499
524
  maxTurns,
500
525
  allowedTools,
501
526
  supervisorDisallowedTools,
@@ -543,7 +568,7 @@ export function createSupervisor({
543
568
  cwd: agentCwd,
544
569
  query,
545
570
  output: devNull,
546
- model,
571
+ model: agentModel ?? model,
547
572
  maxTurns: perInvocationTurns,
548
573
  allowedTools,
549
574
  onLine,
@@ -562,7 +587,7 @@ export function createSupervisor({
562
587
  cwd: supervisorCwd,
563
588
  query,
564
589
  output: devNull,
565
- model,
590
+ model: supervisorModel ?? model,
566
591
  maxTurns: perInvocationTurns,
567
592
  allowedTools: supervisorAllowedTools ?? [
568
593
  "Bash",
@@ -592,3 +617,8 @@ export function createSupervisor({
592
617
  });
593
618
  return supervisor;
594
619
  }
620
+
621
+ function isSessionNotFound(error) { // true when the error text contains the "No conversation found with session ID" marker
622
+ const msg = error?.message ?? String(error); // prefer error.message; fall back to the stringified error when message is null/undefined
623
+ return msg.includes("No conversation found with session ID"); // NOTE(review): would throw if message is a non-nullish non-string — confirm callers pass an Error or a string
624
+ }