@forwardimpact/libeval 0.1.35 → 0.1.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
1
+ import { existsSync, readFileSync } from "node:fs";
2
+ import { basename } from "node:path";
3
+ import jmespath from "jmespath";
4
+
5
/**
 * Evaluate a single assertion and return its structured result.
 *
 * Exactly one assertion mode must be selected. A mode counts as selected when
 * its option is neither `undefined` nor `false`; a selected mode with an empty
 * value (e.g. `--grep ""`) is rejected instead of being silently treated as a
 * different mode.
 *
 * @param {object} values - Parsed options:
 *   `{ grep?: string, query?: string, exists?: boolean, "cites-job"?: string, not?: boolean, message?: string }`.
 *   `not` inverts the outcome; `message` replaces the failure message.
 * @param {string[]} args - `[testName, file]`
 * @returns {{ test: string, pass: boolean, message?: string }}
 * @throws {Error} When the test name or file argument is missing, when zero or
 *   several modes are selected, or when the selected mode has an empty value.
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: assertion dispatch by type
export function evaluateAssertion(values, args) {
  const testName = args[0];
  if (!testName) throw new Error("assert: missing test name");

  const file = args[1];
  const selected = ["grep", "query", "exists", "cites-job"].filter(
    (name) => values[name] !== undefined && values[name] !== false,
  );
  if (selected.length === 0) {
    throw new Error(
      "assert: specify one of --grep, --query, --exists, or --cites-job",
    );
  }
  if (selected.length > 1) {
    throw new Error(
      "assert: specify only one of --grep, --query, --exists, or --cites-job",
    );
  }

  // Dispatch on the mode that passed validation, not on truthiness, so an
  // empty-string option can never fall through to an unrelated branch.
  const mode = selected[0];
  const operand = values[mode];
  if (!operand) {
    throw new Error(`assert: --${mode} requires a non-empty value`);
  }
  if (!file) {
    throw new Error(
      mode === "exists"
        ? "assert: missing file argument"
        : `assert: missing file argument for --${mode}`,
    );
  }

  let result;
  if (mode === "exists") {
    result = assertExists(file);
  } else if (mode === "grep") {
    result = assertGrep(operand, file);
  } else if (mode === "cites-job") {
    result = assertCitesJob(operand, file);
  } else {
    result = assertQuery(operand, file);
  }

  let pass = result.pass;
  let message = result.message;

  if (values.not) {
    pass = !pass;
    // An inverted pass carries no message; an inverted failure needs one.
    message = pass
      ? undefined
      : (message ?? `inverted assertion failed for ${basename(file)}`);
  }

  // A caller-supplied message only ever replaces a failure message.
  if (!pass && values.message) {
    message = values.message;
  }

  const output = { test: testName, pass };
  if (message) output.message = message;
  return output;
}
67
+
68
/**
 * CLI entry point for a single assertion: evaluates it, prints the result as
 * one line of JSON on stdout, and marks the process as failed (exit code 1)
 * when the assertion does not pass.
 * @param {object} values - Parsed option values, forwarded to `evaluateAssertion`.
 * @param {string[]} args - Positional arguments, forwarded to `evaluateAssertion`.
 * @returns {Promise<void>}
 */
export async function runAssertCommand(values, args) {
  const outcome = evaluateAssertion(values, args);
  const line = `${JSON.stringify(outcome)}\n`;
  process.stdout.write(line);
  if (outcome.pass) return;
  process.exitCode = 1;
}
78
+
79
/**
 * Check that a path exists on disk.
 * @param {string} file - Path to probe.
 * @returns {{ pass: boolean, message?: string }} A failure carries a
 *   "<file> not found" message using the path exactly as given.
 */
function assertExists(file) {
  return existsSync(file)
    ? { pass: true }
    : { pass: false, message: `${file} not found` };
}
83
+
84
/**
 * Check that a file's content matches a regular expression.
 * The pattern is compiled case-insensitively and in multiline mode, so `^`
 * and `$` anchor at line boundaries.
 * @param {string} pattern - Regular expression source.
 * @param {string} file - Path of the file to read as UTF-8.
 * @returns {{ pass: boolean, message?: string }} A failure names the pattern
 *   and the file's base name.
 */
function assertGrep(pattern, file) {
  // Read before compiling so a missing file is reported ahead of a bad pattern.
  const text = readFileSync(file, "utf8");
  const found = new RegExp(pattern, "im").test(text);
  if (!found) {
    const message = `pattern "${pattern}" not found in ${basename(file)}`;
    return { pass: false, message };
  }
  return { pass: true };
}
93
+
94
/**
 * Run a JMESPath expression against a JSON or NDJSON file.
 * The assertion passes unless the query result is `null`, `undefined`,
 * `false`, or an empty array.
 * @param {string} expression - JMESPath expression.
 * @param {string} file - Path of the file to read as UTF-8.
 * @returns {{ pass: boolean, message?: string }} A failure reports the
 *   JSON-serialized query result.
 */
function assertQuery(expression, file) {
  const data = parseJsonOrNdjson(readFileSync(file, "utf8"));
  const found = jmespath.search(data, expression);

  const isEmptyArray = Array.isArray(found) && found.length === 0;
  const failed =
    found === null || found === undefined || found === false || isEmptyArray;

  if (failed) {
    return { pass: false, message: `query returned ${JSON.stringify(found)}` };
  }
  return { pass: true };
}
109
+
110
// Matches the first `<job user="…" goal="…">` tag; attributes must appear in this order.
const JOB_TAG_RE = /<job\s+user="([^"]*)"\s+goal="([^"]*)">/;

/**
 * Check that a file cites the job declared in a job file.
 * The citation is built from the job tag as `<user>: <goal>` and must appear
 * verbatim in the target file.
 * @param {string} jobFile - Path of the file containing the `<job>` tag.
 * @param {string} file - Path of the file expected to contain the citation.
 * @returns {{ pass: boolean, message?: string }}
 */
function assertCitesJob(jobFile, file) {
  const tag = readFileSync(jobFile, "utf8").match(JOB_TAG_RE);
  if (tag === null) {
    const message = `no <job> tag found in ${basename(jobFile)}`;
    return { pass: false, message };
  }

  const [, user, goal] = tag;
  const citation = `${user}: ${goal}`;
  const cited = readFileSync(file, "utf8").includes(citation);
  return cited
    ? { pass: true }
    : { pass: false, message: `missing "${citation}"` };
}
126
+
127
/**
 * Parse text as a single JSON document, or failing that as NDJSON.
 * In NDJSON mode blank lines and lines that are not valid JSON are skipped,
 * and the parsed lines are returned as an array in file order.
 * @param {string} content - Raw file content.
 * @returns {*} The parsed JSON value, or an array of parsed NDJSON records.
 * @throws {Error} When neither the whole text nor any single line is valid JSON.
 */
function parseJsonOrNdjson(content) {
  // Wrap the outcome so a legitimately parsed `null` is not mistaken for a failure.
  const tryParse = (text) => {
    try {
      return { ok: true, value: JSON.parse(text) };
    } catch {
      return { ok: false };
    }
  };

  const whole = tryParse(content);
  if (whole.ok) return whole.value;

  const records = content
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line !== "")
    .map((line) => tryParse(line))
    .filter((attempt) => attempt.ok)
    .map((attempt) => attempt.value);

  if (records.length === 0) throw new Error("assert: no valid JSON in file");
  return records;
}
@@ -13,8 +13,7 @@ import { aggregate, renderTextReport } from "../benchmark/report.js";
13
13
  * @param {string[]} _args
14
14
  */
15
15
  export async function runBenchmarkReportCommand(values, _args) {
16
- const inputDir = values.input;
17
- if (!inputDir) throw new Error("--input is required");
16
+ const inputDir = values.input ?? "benchmark-runs";
18
17
  const kRaw = values.k ?? "1,3,5";
19
18
  const kValues = kRaw.split(",").map((t) => {
20
19
  const n = Number.parseInt(t.trim(), 10);
@@ -30,7 +29,11 @@ export async function runBenchmarkReportCommand(values, _args) {
30
29
  throw new Error("--format must be 'json' or 'text'");
31
30
  }
32
31
 
33
- const report = await aggregate({ inputDir: resolve(inputDir), kValues });
32
+ const report = await aggregate({
33
+ inputDir: resolve(inputDir),
34
+ kValues,
35
+ includeRuns: format === "text",
36
+ });
34
37
  if (format === "text") {
35
38
  process.stdout.write(renderTextReport(report, kValues) + "\n");
36
39
  } else {
@@ -31,16 +31,17 @@ export async function runBenchmarkRunCommand(values, _args) {
31
31
  function parseRunOptions(values) {
32
32
  const family = values.family;
33
33
  if (!family) throw new Error("--family is required");
34
- const output = values.output;
35
- if (!output) throw new Error("--output is required");
36
- const runs = Number.parseInt(values.runs ?? "1", 10);
34
+ const output = values.output ?? "benchmark-runs";
35
+ const runs = Number.parseInt(values.runs ?? "5", 10);
37
36
  if (!Number.isFinite(runs) || runs < 1)
38
37
  throw new Error("--runs must be a positive integer");
39
38
  return {
40
39
  family,
41
40
  runs,
42
41
  output: resolve(output),
43
- model: values.model ?? "claude-opus-4-7[1m]",
42
+ agentModel: values["agent-model"] ?? "claude-sonnet-4-6",
43
+ supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7",
44
+ judgeModel: values["judge-model"] ?? "claude-opus-4-7",
44
45
  profiles: {
45
46
  agent: values["agent-profile"] ?? null,
46
47
  judge: values["judge-profile"] ?? null,
@@ -45,7 +45,8 @@ function parseFacilitateOptions(values) {
45
45
  taskAmend,
46
46
  agentConfigs,
47
47
  facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
48
- model: values.model ?? "claude-opus-4-7[1m]",
48
+ agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
49
+ facilitatorModel: values["facilitator-model"] ?? "claude-opus-4-7[1m]",
49
50
  maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
50
51
  outputPath: values.output,
51
52
  facilitatorProfile: values["facilitator-profile"] ?? undefined,
@@ -89,7 +90,8 @@ export async function runFacilitateCommand(values, _args) {
89
90
  agentConfigs: opts.agentConfigs,
90
91
  query,
91
92
  output,
92
- model: opts.model,
93
+ agentModel: opts.agentModel,
94
+ facilitatorModel: opts.facilitatorModel,
93
95
  maxTurns: opts.maxTurns,
94
96
  facilitatorProfile: opts.facilitatorProfile,
95
97
  taskAmend: opts.taskAmend,
@@ -29,7 +29,7 @@ function parseRunOptions(values) {
29
29
  taskContent,
30
30
  taskAmend,
31
31
  cwd: resolve(values.cwd ?? "."),
32
- model: values.model ?? "claude-opus-4-7[1m]",
32
+ agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
33
33
  maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
34
34
  outputPath: values.output,
35
35
  agentProfile: values["agent-profile"] ?? undefined,
@@ -54,7 +54,7 @@ export async function runRunCommand(values, _args) {
54
54
  taskContent,
55
55
  taskAmend,
56
56
  cwd,
57
- model,
57
+ agentModel,
58
58
  maxTurns,
59
59
  outputPath,
60
60
  agentProfile,
@@ -114,7 +114,7 @@ export async function runRunCommand(values, _args) {
114
114
  cwd,
115
115
  query,
116
116
  output: devNull,
117
- model,
117
+ model: agentModel,
118
118
  maxTurns,
119
119
  allowedTools,
120
120
  onLine,
@@ -11,6 +11,7 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
11
11
  * @param {object} values - Parsed option values from cli.parse()
12
12
  * @returns {object}
13
13
  */
14
+ // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
14
15
  function parseSuperviseOptions(values) {
15
16
  const taskFile = values["task-file"];
16
17
  const taskText = values["task-text"];
@@ -31,7 +32,8 @@ function parseSuperviseOptions(values) {
31
32
  agentCwd: resolve(
32
33
  values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
33
34
  ),
34
- model: values.model ?? "claude-opus-4-7[1m]",
35
+ agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
36
+ supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7[1m]",
35
37
  maxTurns: (() => {
36
38
  const raw = values["max-turns"] ?? "20";
37
39
  return raw === "0" ? 0 : parseInt(raw, 10);
@@ -102,7 +104,8 @@ export async function runSuperviseCommand(values, _args) {
102
104
  agentCwd: opts.agentCwd,
103
105
  query,
104
106
  output,
105
- model: opts.model,
107
+ agentModel: opts.agentModel,
108
+ supervisorModel: opts.supervisorModel,
106
109
  maxTurns: opts.maxTurns,
107
110
  allowedTools: opts.allowedTools,
108
111
  supervisorAllowedTools: opts.supervisorAllowedTools,
@@ -390,7 +390,9 @@ const devNull = new Writable({
390
390
  * @param {Array<{name: string, role: string, cwd?: string, maxTurns?: number, allowedTools?: string[], agentProfile?: string, systemPromptAmend?: string}>} deps.agentConfigs
391
391
  * @param {function} deps.query
392
392
  * @param {import("stream").Writable} deps.output
393
- * @param {string} [deps.model]
393
+ * @param {string} [deps.model] - Default model for all participants.
394
+ * @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
395
+ * @param {string} [deps.facilitatorModel] - Facilitator model override (falls back to `model`).
394
396
  * @param {number} [deps.maxTurns]
395
397
  * @param {string} [deps.facilitatorProfile] - Facilitator profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
396
398
  * @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<facilitatorCwd>/.claude/agents`. Resolved once from the facilitator's cwd so profiles travel with the project, not with per-agent sandboxes.
@@ -403,6 +405,8 @@ export function createFacilitator({
403
405
  query,
404
406
  output,
405
407
  model,
408
+ agentModel,
409
+ facilitatorModel,
406
410
  maxTurns,
407
411
  facilitatorProfile,
408
412
  profilesDir,
@@ -450,7 +454,7 @@ export function createFacilitator({
450
454
  cwd: config.cwd ?? facilitatorCwd,
451
455
  query,
452
456
  output: devNull,
453
- model,
457
+ model: agentModel ?? model,
454
458
  maxTurns: config.maxTurns ?? 50,
455
459
  allowedTools: config.allowedTools,
456
460
  onLine: (line) => facilitator.emitLine(config.name, line),
@@ -467,7 +471,7 @@ export function createFacilitator({
467
471
  cwd: facilitatorCwd,
468
472
  query,
469
473
  output: devNull,
470
- model,
474
+ model: facilitatorModel ?? model,
471
475
  maxTurns: maxTurns ?? 20,
472
476
  onLine: (line) => facilitator.emitLine("facilitator", line),
473
477
  mcpServers: { orchestration: facilitatorServer },
package/src/supervisor.js CHANGED
@@ -100,17 +100,18 @@ export class Supervisor {
100
100
  /**
101
101
  * Run the supervisor ↔ agent relay loop.
102
102
  * @param {string} task - The initial task for the supervisor
103
- * @returns {Promise<{success: boolean, turns: number}>}
103
+ * @returns {Promise<{success: boolean, turns: number, concluded: boolean}>}
104
104
  */
105
105
  async run(task) {
106
106
  const initialTask = this.taskAmend ? `${task}\n\n${this.taskAmend}` : task;
107
+ this.taskContext = initialTask;
107
108
  this.currentSource = "supervisor";
108
109
  this.currentTurn = 0;
109
110
  let supervisorResult = await this.supervisorRunner.run(initialTask);
110
111
 
111
112
  if (supervisorResult.error) {
112
113
  this.emitSummary({ success: false, turns: 0 });
113
- return { success: false, turns: 0 };
114
+ return { success: false, turns: 0, concluded: false };
114
115
  }
115
116
 
116
117
  if (this.ctx.concluded) {
@@ -121,7 +122,7 @@ export class Supervisor {
121
122
  turns: 0,
122
123
  summary: this.ctx.summary,
123
124
  });
124
- return { success, turns: 0 };
125
+ return { success, turns: 0, concluded: true };
125
126
  }
126
127
 
127
128
  let pendingRelay = null;
@@ -131,16 +132,20 @@ export class Supervisor {
131
132
  pendingRelay ?? this.#buildInitialRelay(supervisorResult.text);
132
133
 
133
134
  const turnOutcome = await this.#runAgentTurn(turn, relay);
134
- if (turnOutcome.exit) return turnOutcome.exit;
135
+ if (turnOutcome.exit) {
136
+ return { ...turnOutcome.exit, concluded: this.ctx.concluded };
137
+ }
135
138
 
136
139
  const reviewOutcome = await this.#endOfTurnReview(turn);
137
- if (reviewOutcome.exit) return reviewOutcome.exit;
140
+ if (reviewOutcome.exit) {
141
+ return { ...reviewOutcome.exit, concluded: this.ctx.concluded };
142
+ }
138
143
  supervisorResult = reviewOutcome.supervisorResult;
139
144
  pendingRelay = reviewOutcome.relay ?? null;
140
145
  }
141
146
 
142
147
  this.emitSummary({ success: false, turns: this.maxTurns });
143
- return { success: false, turns: this.maxTurns };
148
+ return { success: false, turns: this.maxTurns, concluded: false };
144
149
  }
145
150
 
146
151
  #buildInitialRelay(fallbackText) {
@@ -247,6 +252,22 @@ export class Supervisor {
247
252
  return { type: "continue" };
248
253
  }
249
254
 
255
+ /**
256
+ * Resume the supervisor runner, falling back to a fresh session when the
257
+ * SDK reports that the conversation no longer exists (e.g. session GC'd
258
+ * while the agent was running). The fresh session includes the original
259
+ * task context so the supervisor can still evaluate the agent's work.
260
+ * @param {string} prompt
261
+ * @returns {Promise<object>}
262
+ */
263
+ async #resumeSupervisor(prompt) {
264
+ const result = await this.supervisorRunner.resume(prompt);
265
+ if (result.error && isSessionNotFound(result.error)) {
266
+ return this.supervisorRunner.run(`${this.taskContext}\n\n${prompt}`);
267
+ }
268
+ return result;
269
+ }
270
+
250
271
  /**
251
272
  * If the agent has an unanswered ask, drain reminders and return a
252
273
  * formatted relay string. Returns null when no relay is needed.
@@ -274,7 +295,7 @@ export class Supervisor {
274
295
  this.currentSource = "supervisor";
275
296
  this.ctx.redirect = null;
276
297
 
277
- await this.supervisorRunner.resume(
298
+ await this.#resumeSupervisor(
278
299
  `The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` +
279
300
  `Review and use your tools if action is needed.`,
280
301
  );
@@ -312,7 +333,7 @@ export class Supervisor {
312
333
  `Review and decide how to proceed.`
313
334
  : `The agent reported:\n\n${agentTranscript}\n\nReview the agent's work and decide how to proceed.`;
314
335
 
315
- let supervisorResult = await this.supervisorRunner.resume(reviewPrompt);
336
+ let supervisorResult = await this.#resumeSupervisor(reviewPrompt);
316
337
 
317
338
  if (supervisorResult.error) {
318
339
  this.emitSummary({ success: false, turns: turn });
@@ -333,7 +354,7 @@ export class Supervisor {
333
354
  if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
334
355
  const reminders = this.messageBus.drain("supervisor");
335
356
  if (reminders.length > 0) {
336
- supervisorResult = await this.supervisorRunner.resume(
357
+ supervisorResult = await this.#resumeSupervisor(
337
358
  formatMessages(reminders),
338
359
  );
339
360
  if (this.ctx.concluded) {
@@ -478,7 +499,9 @@ const devNull = new Writable({
478
499
  * @param {string} deps.agentCwd
479
500
  * @param {function} deps.query
480
501
  * @param {import("stream").Writable} deps.output
481
- * @param {string} [deps.model]
502
+ * @param {string} [deps.model] - Default model for both runners.
503
+ * @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
504
+ * @param {string} [deps.supervisorModel] - Supervisor model override (falls back to `model`).
482
505
  * @param {number} [deps.maxTurns]
483
506
  * @param {string[]} [deps.allowedTools]
484
507
  * @param {string[]} [deps.supervisorAllowedTools]
@@ -496,6 +519,8 @@ export function createSupervisor({
496
519
  query,
497
520
  output,
498
521
  model,
522
+ agentModel,
523
+ supervisorModel,
499
524
  maxTurns,
500
525
  allowedTools,
501
526
  supervisorDisallowedTools,
@@ -536,12 +561,15 @@ export function createSupervisor({
536
561
 
537
562
  const onLine = (line) => supervisor.emitLine(line);
538
563
 
564
+ const perInvocationTurns =
565
+ maxTurns === 0 ? 0 : Math.max(maxTurns ?? 100, 200);
566
+
539
567
  const agentRunner = createAgentRunner({
540
568
  cwd: agentCwd,
541
569
  query,
542
570
  output: devNull,
543
- model,
544
- maxTurns: 50,
571
+ model: agentModel ?? model,
572
+ maxTurns: perInvocationTurns,
545
573
  allowedTools,
546
574
  onLine,
547
575
  settingSources: ["project"],
@@ -559,8 +587,8 @@ export function createSupervisor({
559
587
  cwd: supervisorCwd,
560
588
  query,
561
589
  output: devNull,
562
- model,
563
- maxTurns: 20,
590
+ model: supervisorModel ?? model,
591
+ maxTurns: perInvocationTurns,
564
592
  allowedTools: supervisorAllowedTools ?? [
565
593
  "Bash",
566
594
  "Read",
@@ -589,3 +617,8 @@ export function createSupervisor({
589
617
  });
590
618
  return supervisor;
591
619
  }
620
+
621
/**
 * Report whether an error says the session being resumed no longer exists.
 * Accepts Error instances, objects with a `message` property, bare strings,
 * and nullish values.
 * @param {unknown} error
 * @returns {boolean} True when the error text contains the
 *   "No conversation found with session ID" marker.
 */
function isSessionNotFound(error) {
  // Coerce to a string: `message` is not guaranteed to be one on arbitrary
  // error payloads, and a throw here would mask the original failure.
  const text = String(error?.message ?? error);
  return text.includes("No conversation found with session ID");
}
+ }