@forwardimpact/libeval 0.1.42 → 0.1.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -46,10 +46,10 @@ export const definition = {
46
46
  description:
47
47
  "Claude model for the agent-under-test (default: claude-sonnet-4-6)",
48
48
  },
49
- "supervisor-model": {
49
+ "lead-model": {
50
50
  type: "string",
51
51
  description:
52
- "Claude model for the supervisor (default: claude-opus-4-7)",
52
+ "Claude model for the lead role (default: claude-opus-4-7)",
53
53
  },
54
54
  "judge-model": {
55
55
  type: "string",
package/bin/fit-eval.js CHANGED
@@ -9,6 +9,8 @@ import { runTeeCommand } from "../src/commands/tee.js";
9
9
  import { runRunCommand } from "../src/commands/run.js";
10
10
  import { runSuperviseCommand } from "../src/commands/supervise.js";
11
11
  import { runFacilitateCommand } from "../src/commands/facilitate.js";
12
+ import { runDiscussCommand } from "../src/commands/discuss.js";
13
+ import { runCallbackCommand } from "../src/commands/callback.js";
12
14
 
13
15
  // `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
14
16
  // the readFileSync branch in the compiled binary (which would ENOENT against
@@ -18,6 +20,18 @@ const VERSION =
18
20
  JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
19
21
  .version;
20
22
 
23
/**
 * CLI option definitions for the lead role, declared once so the commands
 * that take a lead can spread the same `--lead-profile` / `--lead-model`
 * pair into their own option tables.
 *
 * Each entry is a `[flag, description]` pair; every flag is a string option.
 */
const LEAD_OPTIONS = Object.fromEntries(
  [
    [
      "lead-profile",
      "Lead role profile name (supervisor / facilitator / chair)",
    ],
    [
      "lead-model",
      "Claude model for the lead role (default: claude-opus-4-7[1m])",
    ],
  ].map(([flag, description]) => [flag, { type: "string", description }]),
);
34
+
21
35
  const definition = {
22
36
  name: "fit-eval",
23
37
  version: VERSION,
@@ -93,14 +107,11 @@ const definition = {
93
107
  description:
94
108
  "Claude model for the agent (default: claude-opus-4-7[1m])",
95
109
  },
96
- "supervisor-model": {
97
- type: "string",
98
- description:
99
- "Claude model for the supervisor (default: claude-opus-4-7[1m])",
100
- },
110
+ ...LEAD_OPTIONS,
101
111
  "max-turns": {
102
112
  type: "string",
103
- description: "Max agentic turns (default: 20, 0 = unlimited)",
113
+ description:
114
+ "Max agentic turns per runner invocation (default: 200, 0 = unlimited)",
104
115
  },
105
116
  output: {
106
117
  type: "string",
@@ -116,10 +127,6 @@ const definition = {
116
127
  description: "Supervisor working directory",
117
128
  },
118
129
  "agent-cwd": { type: "string", description: "Agent working directory" },
119
- "supervisor-profile": {
120
- type: "string",
121
- description: "Supervisor (judge) profile name",
122
- },
123
130
  "supervisor-allowed-tools": {
124
131
  type: "string",
125
132
  description: "Supervisor tool allowlist",
@@ -153,11 +160,7 @@ const definition = {
153
160
  type: "string",
154
161
  description: "Claude model for agents (default: claude-opus-4-7[1m])",
155
162
  },
156
- "facilitator-model": {
157
- type: "string",
158
- description:
159
- "Claude model for the facilitator (default: claude-opus-4-7[1m])",
160
- },
163
+ ...LEAD_OPTIONS,
161
164
  "max-turns": {
162
165
  type: "string",
163
166
  description: "Max agentic turns (default: 20, 0 = unlimited)",
@@ -170,10 +173,6 @@ const definition = {
170
173
  type: "string",
171
174
  description: "Facilitator working directory",
172
175
  },
173
- "facilitator-profile": {
174
- type: "string",
175
- description: "Facilitator profile name",
176
- },
177
176
  "agent-profiles": {
178
177
  type: "string",
179
178
  description:
@@ -185,6 +184,56 @@ const definition = {
185
184
  },
186
185
  },
187
186
  },
187
+ {
188
+ name: "discuss",
189
+ args: "",
190
+ description:
191
+ "Run an async, suspendable discussion — Chair + N participants + bridge callback",
192
+ options: {
193
+ "task-file": {
194
+ type: "string",
195
+ description: "Path to a markdown task file",
196
+ },
197
+ "task-text": {
198
+ type: "string",
199
+ description: "Inline task text (alternative to --task-file)",
200
+ },
201
+ "task-amend": {
202
+ type: "string",
203
+ description: "Additional text appended to the task",
204
+ },
205
+ "agent-model": {
206
+ type: "string",
207
+ description: "Claude model for agents (default: claude-opus-4-7[1m])",
208
+ },
209
+ ...LEAD_OPTIONS,
210
+ "max-turns": {
211
+ type: "string",
212
+ description: "Max agentic turns (default: 40, 0 = unlimited)",
213
+ },
214
+ output: {
215
+ type: "string",
216
+ description: "Write the NDJSON trace to a file",
217
+ },
218
+ "agent-profiles": {
219
+ type: "string",
220
+ description: "Comma-separated participant profile names (optional)",
221
+ },
222
+ "agent-cwd": {
223
+ type: "string",
224
+ description: "Working directory shared by participants (default: .)",
225
+ },
226
+ "discussion-id": {
227
+ type: "string",
228
+ description:
229
+ "Stable id for the threaded conversation; carried through traces for linking",
230
+ },
231
+ "resume-context": {
232
+ type: "string",
233
+ description: "JSON-serialized prior state for a resumed run",
234
+ },
235
+ },
236
+ },
188
237
  {
189
238
  name: "output",
190
239
  args: "",
@@ -197,6 +246,35 @@ const definition = {
197
246
  description:
198
247
  "Stream readable text to stdout while saving raw NDJSON to a file",
199
248
  },
249
+ {
250
+ name: "callback",
251
+ args: "",
252
+ description:
253
+ "Extract the terminal summary from an NDJSON trace and POST it to a callback URL",
254
+ options: {
255
+ "trace-file": {
256
+ type: "string",
257
+ description: "Path to the NDJSON trace file",
258
+ },
259
+ "callback-url": {
260
+ type: "string",
261
+ description: "URL to POST the summary to",
262
+ },
263
+ "correlation-id": {
264
+ type: "string",
265
+ description: "Correlation ID to include in the payload",
266
+ },
267
+ "run-url": {
268
+ type: "string",
269
+ description: "GitHub Actions run URL (optional)",
270
+ },
271
+ "discussion-id": {
272
+ type: "string",
273
+ description:
274
+ "Discussion id (fallback when the trace lacks a meta event)",
275
+ },
276
+ },
277
+ },
200
278
  ],
201
279
  globalOptions: {
202
280
  format: { type: "string", description: "Output format (json|text)" },
@@ -206,8 +284,9 @@ const definition = {
206
284
  },
207
285
  examples: [
208
286
  "fit-eval run --task-file=task.md --output=trace.ndjson",
209
- "fit-eval supervise --task-file=task.md --supervisor-profile=judge --agent-profile=coder --output=trace.ndjson",
210
- 'fit-eval facilitate --task-file=task.md --facilitator-profile=lead --agent-profiles="security-engineer,technical-writer" --output=trace.ndjson',
287
+ "fit-eval supervise --task-file=task.md --lead-profile=judge --agent-profile=coder --output=trace.ndjson",
288
+ 'fit-eval facilitate --task-file=task.md --lead-profile=lead --agent-profiles="security-engineer,technical-writer" --output=trace.ndjson',
289
+ 'fit-eval discuss --task-file=task.md --lead-profile=release-engineer --agent-profiles="staff-engineer,security-engineer" --discussion-id=GD_kw...',
211
290
  "fit-eval output --format=text < trace.ndjson",
212
291
  ],
213
292
  documentation: [
@@ -233,7 +312,7 @@ const definition = {
233
312
  title: "Agent Teams",
234
313
  url: "https://www.forwardimpact.team/docs/products/agent-teams/index.md",
235
314
  description:
236
- "How to author the agent, supervisor, and facilitator profiles consumed by --agent-profile, --supervisor-profile, --facilitator-profile, and --agent-profiles.",
315
+ "How to author the profiles consumed by --agent-profile, --lead-profile, and --agent-profiles.",
237
316
  },
238
317
  ],
239
318
  };
@@ -247,6 +326,8 @@ const COMMANDS = {
247
326
  run: runRunCommand,
248
327
  supervise: runSuperviseCommand,
249
328
  facilitate: runFacilitateCommand,
329
+ discuss: runDiscussCommand,
330
+ callback: runCallbackCommand,
250
331
  };
251
332
 
252
333
  async function main() {
package/bin/fit-trace.js CHANGED
@@ -26,6 +26,7 @@ import {
26
26
  runSplitCommand,
27
27
  } from "../src/commands/trace.js";
28
28
  import { runAssertCommand } from "../src/commands/assert.js";
29
+ import { runByDiscussionCommand } from "../src/commands/by-discussion.js";
29
30
 
30
31
  // `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
31
32
  // the readFileSync branch in the compiled binary (which would ENOENT against
@@ -160,6 +161,18 @@ const definition = {
160
161
  args: "<file> <index>",
161
162
  description: "Single turn by index",
162
163
  },
164
+ {
165
+ name: "by-discussion",
166
+ args: "<discussion-id> [trace-dir]",
167
+ description:
168
+ "List trace files whose meta header carries the given discussion_id, ordered by first-event timestamp",
169
+ options: {
170
+ "trace-dir": {
171
+ type: "string",
172
+ description: "Directory to scan (default: traces)",
173
+ },
174
+ },
175
+ },
163
176
  {
164
177
  name: "filter",
165
178
  args: "<file>",
@@ -307,6 +320,7 @@ const COMMANDS = {
307
320
  filter: runFilterCommand,
308
321
  split: runSplitCommand,
309
322
  assert: runAssertCommand,
323
+ "by-discussion": runByDiscussionCommand,
310
324
  };
311
325
 
312
326
  async function main() {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.42",
3
+ "version": "0.1.44",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -40,7 +40,7 @@ function parseRunOptions(values) {
40
40
  runs,
41
41
  output: resolve(output),
42
42
  agentModel: values["agent-model"] ?? "claude-sonnet-4-6",
43
- supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7",
43
+ supervisorModel: values["lead-model"] ?? "claude-opus-4-7",
44
44
  judgeModel: values["judge-model"] ?? "claude-opus-4-7",
45
45
  profiles: {
46
46
  agent: values["agent-profile"] ?? null,
@@ -0,0 +1,84 @@
1
+ import { readdirSync, statSync, openSync, readSync, closeSync } from "node:fs";
2
+ import { join } from "node:path";
3
+
4
/**
 * Read the first line of a file without loading the whole file.
 *
 * Only the first 64 KiB are read. If no newline occurs inside that window
 * the whole window is returned, so callers must tolerate a truncated line.
 * The line terminator (`\n` or `\r\n`) is not part of the result.
 *
 * @param {string} path - File to read
 * @returns {string} First line, decoded as UTF-8
 * @throws {Error} Propagates any error from opening or reading the file
 */
function readFirstLine(path) {
  const fd = openSync(path, "r");
  try {
    const buf = Buffer.alloc(65536);
    const bytesRead = readSync(fd, buf, 0, buf.length, 0);
    // `subarray` replaces the deprecated `Buffer#slice`; both return a view.
    const chunk = buf.subarray(0, bytesRead);
    // Locate the newline on the raw bytes so only the first line is decoded,
    // not the full window.
    const newlineAt = chunk.indexOf(0x0a);
    const lineBytes = newlineAt === -1 ? chunk : chunk.subarray(0, newlineAt);
    const line = lineBytes.toString("utf8");
    // Files written with CRLF terminators would otherwise keep the "\r".
    return line.endsWith("\r") ? line.slice(0, -1) : line;
  } finally {
    closeSync(fd);
  }
}
23
+
24
/**
 * Scan a directory (non-recursively) for `.ndjson` files whose first line
 * is a `meta` event carrying the given `discussion_id`.
 *
 * Only the first line of each file is inspected, so a meta event located
 * anywhere else in a trace is not found. The first line may be either the
 * bare event or an envelope with the event under an `event` key. Files that
 * cannot be read, do not start with valid JSON, or do not start with a
 * matching meta event are skipped silently — they are not treated as errors.
 * An unreadable directory yields an empty result.
 *
 * @param {string} dir - Directory to scan
 * @param {string} discussionId - Discussion id to match exactly
 * @returns {Array<{path: string, mtimeMs: number}>} Matches sorted by file
 *   modification time, oldest first
 */
export function findTracesByDiscussion(dir, discussionId) {
  let entries;
  try {
    entries = readdirSync(dir);
  } catch {
    return [];
  }

  const matches = [];
  for (const entry of entries) {
    if (!entry.endsWith(".ndjson")) continue;
    const path = join(dir, entry);

    let parsed;
    try {
      parsed = JSON.parse(readFirstLine(path));
    } catch {
      // Unreadable file or a first line that is not JSON: not a trace header.
      continue;
    }

    // `JSON.parse` returns null for a bare `null` line, so the envelope
    // lookup must be null-safe or one odd file aborts the whole scan.
    const event = parsed?.event ?? parsed;
    if (event?.type !== "meta") continue;
    if (event.discussion_id !== discussionId) continue;

    let mtimeMs;
    try {
      mtimeMs = statSync(path).mtimeMs;
    } catch {
      // The file vanished between the read and the stat; skip it like any
      // other unreadable entry instead of failing the listing.
      continue;
    }
    matches.push({ path, mtimeMs });
  }

  matches.sort((a, b) => a.mtimeMs - b.mtimeMs);
  return matches;
}
66
+
67
+ /**
68
+ * `fit-trace by-discussion <discussion-id> [trace-dir]` — list trace
69
+ * files whose meta header carries the given discussion_id, one per
70
+ * line, ordered by first-event timestamp (file mtime ascending). The
71
+ * result is usable with `xargs cat` for a chronological merge.
72
+ *
73
+ * @param {object} values
74
+ * @param {string[]} args
75
+ */
76
+ export async function runByDiscussionCommand(values, args) {
77
+ const [discussionId, traceDirArg] = args;
78
+ if (!discussionId) throw new Error("<discussion-id> is required");
79
+ const dir = traceDirArg ?? values["trace-dir"] ?? "traces";
80
+ const matches = findTracesByDiscussion(dir, discussionId);
81
+ for (const { path } of matches) {
82
+ process.stdout.write(`${path}\n`);
83
+ }
84
+ }
@@ -0,0 +1,104 @@
1
+ import { readFileSync } from "node:fs";
2
+
3
/**
 * Scan an NDJSON trace and return the last orchestrator `summary` event in
 * a normalized shape, or `null` when the trace has none.
 *
 * Only records whose `source` is `"orchestrator"` are considered. Blank
 * lines, lines that are not valid JSON, and JSON values that are not
 * records (for example a bare `null`) are skipped.
 *
 * The verdict is passed through verbatim from the trace ("success" /
 * "failure" from supervise/facilitate; "adjourned" / "recessed" / "failed"
 * from discuss); a missing verdict is reported as "failed". The discussion
 * id comes from the summary event itself when present, otherwise from the
 * first orchestrator `meta` event that carries one.
 *
 * @param {string} traceFile - Path to the NDJSON trace
 * @returns {{verdict: string, summary: string, replies: object[], trigger?: object, discussionId?: string} | null}
 * @throws {Error} Propagates any error from reading the file
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: NDJSON scan with malformed-line tolerance + meta/summary dual extraction
function readTraceSummary(traceFile) {
  let summary = null;
  let metaDiscussionId = null;
  for (const line of readFileSync(traceFile, "utf8").split("\n")) {
    if (!line.trim()) continue;
    let record;
    try {
      record = JSON.parse(line);
    } catch {
      continue;
    }
    // A line holding the JSON literal `null` parses successfully; without
    // the optional chain it would throw instead of being skipped.
    if (record?.source !== "orchestrator") continue;
    const event = record.event;
    if (event?.type === "meta" && !metaDiscussionId) {
      metaDiscussionId = event.discussion_id ?? null;
    }
    if (event?.type === "summary") {
      // Later summaries overwrite earlier ones: the terminal one wins.
      summary = {
        verdict: event.verdict ?? "failed",
        summary: event.summary ?? "",
        replies: Array.isArray(event.replies) ? event.replies : [],
        ...(event.trigger && { trigger: event.trigger }),
        ...(event.discussion_id && { discussionId: event.discussion_id }),
      };
    }
  }
  if (summary && !summary.discussionId && metaDiscussionId) {
    summary.discussionId = metaDiscussionId;
  }
  return summary;
}
51
+
52
+ /**
53
+ * Callback command — read an NDJSON trace, extract the terminal
54
+ * orchestrator summary, and POST a canonical callback body to the
55
+ * configured URL. Used by `kata-dispatch.yml` to deliver the lead's
56
+ * conclusion to the bridge that dispatched the run.
57
+ *
58
+ * Wire shape (single shape across modes):
59
+ *
60
+ * ```
61
+ * {
62
+ * correlation_id, verdict, summary, run_url,
63
+ * discussion_id?, replies: [], trigger?
64
+ * }
65
+ * ```
66
+ *
67
+ * @param {object} values - Parsed option values from cli.parse()
68
+ * @param {string[]} _args - Positional arguments
69
+ */
70
+ export async function runCallbackCommand(values, _args) {
71
+ const traceFile = values["trace-file"];
72
+ const callbackUrl = values["callback-url"];
73
+ const correlationId = values["correlation-id"];
74
+ const runUrl = values["run-url"] ?? "";
75
+ const discussionIdOverride = values["discussion-id"] ?? null;
76
+
77
+ if (!traceFile) throw new Error("--trace-file is required");
78
+ if (!callbackUrl) throw new Error("--callback-url is required");
79
+
80
+ const found = readTraceSummary(traceFile) ?? {
81
+ verdict: "failed",
82
+ summary: "Run ended without producing a summary.",
83
+ replies: [],
84
+ };
85
+
86
+ const discussionId = found.discussionId ?? discussionIdOverride ?? null;
87
+ const payload = {
88
+ correlation_id: correlationId,
89
+ verdict: found.verdict,
90
+ summary: found.summary,
91
+ run_url: runUrl,
92
+ replies: found.replies,
93
+ ...(discussionId && { discussion_id: discussionId }),
94
+ ...(found.trigger && { trigger: found.trigger }),
95
+ };
96
+ const res = await fetch(callbackUrl, {
97
+ method: "POST",
98
+ headers: { "Content-Type": "application/json" },
99
+ body: JSON.stringify(payload),
100
+ });
101
+ if (!res.ok) {
102
+ throw new Error(`Callback POST failed: ${res.status}`);
103
+ }
104
+ }
@@ -0,0 +1,116 @@
1
+ import { readFileSync, createWriteStream } from "node:fs";
2
+ import { resolve } from "node:path";
3
+ import { createDiscusser } from "../discusser.js";
4
+ import { createRedactor } from "../redaction.js";
5
+ import { createTeeWriter } from "../tee-writer.js";
6
+
7
/**
 * Turn a comma-separated list of participant profile names into agent
 * configs. Each name is trimmed and used as the agent's name, role and
 * profile; every participant shares the same working directory and turn
 * budget.
 *
 * @param {string | undefined} raw - Comma-separated profile names; a
 *   missing or empty value yields no participants
 * @param {string} cwd - Working directory shared by all participants
 * @param {number} maxTurns - Turn budget passed to each participant
 * @returns {Array<{name: string, role: string, cwd: string, agentProfile: string, maxTurns: number}>}
 */
function parseAgentProfiles(raw, cwd, maxTurns) {
  if (!raw) return [];
  return (
    raw
      .split(",")
      .map((entry) => entry.trim())
      // A trailing or doubled comma ("a,,b", "a,") would otherwise create a
      // participant with an empty profile name.
      .filter((name) => name !== "")
      .map((name) => ({ name, role: name, cwd, agentProfile: name, maxTurns }))
  );
}
14
+
15
/**
 * Parse and validate the options of the `discuss` command.
 *
 * Exactly one of `--task-file` / `--task-text` must be given; a task file
 * is read synchronously as UTF-8. Defaults applied here: `--max-turns` 40
 * (0 is passed through as 0), `--agent-cwd` ".", `--lead-profile`
 * "release-engineer", and "claude-opus-4-7[1m]" for both models.
 * `--agent-profiles` is optional. `--resume-context`, when present, must be
 * a JSON document.
 *
 * Exported so tests can exercise the parsing in isolation.
 *
 * @param {object} values - Parsed option values
 * @returns {object} Normalized options: taskContent, taskAmend,
 *   agentConfigs, leadProfile, leadModel, agentModel, maxTurns, outputPath,
 *   discussionId, resumeContext
 * @throws {Error} On conflicting or missing task options, a `--max-turns`
 *   value that is not a non-negative integer, or invalid `--resume-context`
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
export function parseDiscussOptions(values) {
  const taskFile = values["task-file"];
  const taskText = values["task-text"];
  if (taskFile && taskText)
    throw new Error("--task-file and --task-text are mutually exclusive");
  if (!taskFile && !taskText)
    throw new Error("--task-file or --task-text is required");

  const taskAmend = values["task-amend"] ?? undefined;
  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;

  const profilesRaw = values["agent-profiles"];
  const agentCwd = resolve(values["agent-cwd"] ?? ".");

  const maxTurnsRaw = values["max-turns"] ?? "40";
  const maxTurns = Number.parseInt(maxTurnsRaw, 10);
  // Without this check a typo such as --max-turns=abc becomes NaN and is
  // handed to every participant as its turn budget.
  if (Number.isNaN(maxTurns) || maxTurns < 0) {
    throw new Error(
      `--max-turns must be a non-negative integer, got "${maxTurnsRaw}"`,
    );
  }

  const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd, maxTurns);

  const resumeContextRaw = values["resume-context"];
  let resumeContext = null;
  if (resumeContextRaw) {
    try {
      resumeContext = JSON.parse(resumeContextRaw);
    } catch (err) {
      throw new Error(`--resume-context is not valid JSON: ${err.message}`, {
        cause: err,
      });
    }
  }

  return {
    taskContent,
    taskAmend,
    agentConfigs,
    leadProfile: values["lead-profile"] ?? "release-engineer",
    leadModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
    maxTurns,
    outputPath: values.output,
    discussionId: values["discussion-id"] ?? null,
    resumeContext,
  };
}
64
+
65
/**
 * Discuss command — build a discusser from the parsed options, run it on
 * the task content, and exit the process with 0 when the result reports
 * success and 1 otherwise.
 *
 * With `--output`, the trace goes through a tee writer (file stream plus
 * stdout) and both streams are ended before exiting; without it, the
 * discusser writes straight to stdout. The lead profile is also published
 * through the `LIBEVAL_AGENT_PROFILE` environment variable before the
 * agent SDK is loaded.
 *
 * @param {object} values - Parsed option values
 * @param {string[]} _args - Positional arguments (unused)
 */
export async function runDiscussCommand(values, _args) {
  const opts = parseDiscussOptions(values);
  const redactor = createRedactor();

  let fileStream = null;
  let output = process.stdout;
  if (opts.outputPath) {
    fileStream = createWriteStream(opts.outputPath);
    output = createTeeWriter({
      fileStream,
      textStream: process.stdout,
      mode: "supervised",
    });
  }

  if (opts.leadProfile) {
    process.env.LIBEVAL_AGENT_PROFILE = opts.leadProfile;
  }

  // Loaded lazily, after the environment has been prepared.
  const { query } = await import("@anthropic-ai/claude-agent-sdk");
  const { leadProfile, leadModel, agentModel, agentConfigs } = opts;
  const discusser = createDiscusser({
    leadProfile,
    leadModel,
    agentModel,
    agentConfigs,
    discussionId: opts.discussionId,
    resumeContext: opts.resumeContext,
    query,
    output,
    maxTurns: opts.maxTurns,
    taskAmend: opts.taskAmend,
    redactor,
  });

  const result = await discusser.run(opts.taskContent);

  if (fileStream) {
    // End the tee first, then the underlying file, waiting for each.
    await new Promise((done) => output.end(done));
    await new Promise((done) => fileStream.end(done));
  }

  const exitCode = result.success ? 0 : 1;
  process.exit(exitCode);
}
@@ -10,19 +10,21 @@ import { createTeeWriter } from "../tee-writer.js";
10
10
  * @param {string} cwd - Shared working directory for all agents
11
11
  * @returns {Array<{name: string, role: string, cwd: string, agentProfile: string}>}
12
12
  */
13
- function parseAgentProfiles(raw, cwd) {
13
+ function parseAgentProfiles(raw, cwd, maxTurns) {
14
14
  return raw.split(",").map((entry) => {
15
15
  const name = entry.trim();
16
- return { name, role: name, cwd, agentProfile: name };
16
+ return { name, role: name, cwd, agentProfile: name, maxTurns };
17
17
  });
18
18
  }
19
19
 
20
20
  /**
21
- * Parse and validate facilitate command options.
21
+ * Parse and validate facilitate command options. Exported for test
22
+ * coverage of the `--max-turns` → per-agent threading contract; not part
23
+ * of the package's public API.
22
24
  * @param {object} values - Parsed option values
23
25
  * @returns {object} Parsed options
24
26
  */
25
- function parseFacilitateOptions(values) {
27
+ export function parseFacilitateOptions(values) {
26
28
  const taskFile = values["task-file"];
27
29
  const taskText = values["task-text"];
28
30
  if (taskFile && taskText)
@@ -36,9 +38,15 @@ function parseFacilitateOptions(values) {
36
38
  const profilesRaw = values["agent-profiles"];
37
39
  if (!profilesRaw) throw new Error("--agent-profiles is required");
38
40
  const agentCwd = resolve(values["agent-cwd"] ?? ".");
39
- const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd);
40
41
 
41
42
  const maxTurnsRaw = values["max-turns"] ?? "20";
43
+ const maxTurns = maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10);
44
+
45
+ // Thread --max-turns into each participant: without this, every facilitated
46
+ // agent silently falls back to the 50-turn default in facilitator.js even
47
+ // when the caller raises the budget. Observed in run 26078312414 where
48
+ // staff-engineer terminated at 51 turns despite --max-turns=200.
49
+ const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd, maxTurns);
42
50
 
43
51
  return {
44
52
  taskContent,
@@ -46,10 +54,10 @@ function parseFacilitateOptions(values) {
46
54
  agentConfigs,
47
55
  facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
48
56
  agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
49
- facilitatorModel: values["facilitator-model"] ?? "claude-opus-4-7[1m]",
50
- maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
57
+ facilitatorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
58
+ maxTurns,
51
59
  outputPath: values.output,
52
- facilitatorProfile: values["facilitator-profile"] ?? undefined,
60
+ facilitatorProfile: values["lead-profile"] ?? undefined,
53
61
  };
54
62
  }
55
63
 
@@ -12,7 +12,7 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
12
12
  * @returns {object}
13
13
  */
14
14
  // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
15
- function parseSuperviseOptions(values) {
15
+ export function parseSuperviseOptions(values) {
16
16
  const taskFile = values["task-file"];
17
17
  const taskText = values["task-text"];
18
18
  if (taskFile && taskText)
@@ -33,13 +33,13 @@ function parseSuperviseOptions(values) {
33
33
  values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
34
34
  ),
35
35
  agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
36
- supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7[1m]",
36
+ supervisorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
37
37
  maxTurns: (() => {
38
- const raw = values["max-turns"] ?? "20";
38
+ const raw = values["max-turns"] ?? "200";
39
39
  return raw === "0" ? 0 : parseInt(raw, 10);
40
40
  })(),
41
41
  outputPath: values.output,
42
- supervisorProfile: values["supervisor-profile"] ?? undefined,
42
+ supervisorProfile: values["lead-profile"] ?? undefined,
43
43
  agentProfile: values["agent-profile"] ?? undefined,
44
44
  allowedTools: (
45
45
  values["allowed-tools"] ??