@forwardimpact/libeval 0.1.43 → 0.1.45

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
@@ -1,7 +1,7 @@
1
1
  /**
2
- * AgentRunner — runs a single Claude Agent SDK session and emits raw NDJSON
3
- * events to an output stream. Building block for both `fit-eval run` and
4
- * `fit-eval supervise`.
2
+ * AgentRunner — runs a single Claude Agent SDK session and emits raw
3
+ * NDJSON events to an output stream. Building block for `fit-eval run`,
4
+ * `fit-eval supervise`, `fit-eval facilitate`, and `fit-eval discuss`.
5
5
  *
6
6
  * Follows OO+DI: constructor injection, factory function, tests bypass factory.
7
7
  */
@@ -13,25 +13,6 @@ const DEFAULT_ALLOWED_TOOLS = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];
13
13
  // overridable — so a future caller can't accidentally reduce permissions.
14
14
  const PERMISSION_MODE = "bypassPermissions";
15
15
 
16
- function applyDefaults(deps) {
17
- return {
18
- cwd: deps.cwd,
19
- query: deps.query,
20
- output: deps.output,
21
- model: deps.model ?? "claude-opus-4-7[1m]",
22
- maxTurns: deps.maxTurns ?? 50,
23
- allowedTools: deps.allowedTools ?? DEFAULT_ALLOWED_TOOLS,
24
- onLine: deps.onLine ?? null,
25
- onBatch: deps.onBatch ?? null,
26
- batchSize: deps.batchSize ?? 3,
27
- settingSources: deps.settingSources ?? [],
28
- systemPrompt: deps.systemPrompt ?? null,
29
- disallowedTools: deps.disallowedTools ?? [],
30
- mcpServers: deps.mcpServers ?? null,
31
- taskAmend: deps.taskAmend ?? null,
32
- };
33
- }
34
-
35
16
  /** Run a single Claude Agent SDK session and emit raw NDJSON events to an output stream. */
36
17
  export class AgentRunner {
37
18
  /**
@@ -43,29 +24,38 @@ export class AgentRunner {
43
24
  * @param {number} [deps.maxTurns] - Maximum agentic turns; 0 means unlimited
44
25
  * @param {string[]} [deps.allowedTools] - Tools the agent may use
45
26
  * @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
46
- * @param {function} [deps.onBatch] - Async callback invoked with a batch of NDJSON lines at flush boundaries: every `batchSize` assistant text blocks, the terminal `result` message, and — on iterator crash/abort — once more in a final flush carrying any lines that never reached a boundary. Receives `(lines, { abort })` where calling `abort()` stops the in-flight SDK session via the AbortController. Optional; assignable at runtime so the Supervisor can swap it per turn.
47
- * @param {number} [deps.batchSize] - Assistant text-block messages to accumulate before firing onBatch. Tool-only assistant messages ride along without counting. Default 3: the supervisor reviews the agent every three text turns instead of every turn. The terminal `result` always flushes regardless of count.
48
27
  * @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
49
28
  * @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
50
29
  * @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
51
30
  * @param {Record<string, object>} [deps.mcpServers] - MCP server configs to pass to the SDK query
31
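+ * @param {object} deps.redactor - Required. Its `redactValue(message)` is applied to every SDK message before the line is written to the output stream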
52
32
  */
53
33
  constructor(deps) {
54
34
  if (!deps.cwd) throw new Error("cwd is required");
55
35
  if (!deps.query) throw new Error("query is required");
56
36
  if (!deps.output) throw new Error("output is required");
57
37
  if (!deps.redactor) throw new Error("redactor is required");
58
- Object.assign(this, applyDefaults(deps));
38
+ this.cwd = deps.cwd;
39
+ this.query = deps.query;
40
+ this.output = deps.output;
59
41
  this.redactor = deps.redactor;
42
+ this.model = deps.model ?? "claude-opus-4-7[1m]";
43
+ this.maxTurns = deps.maxTurns ?? 50;
44
+ this.allowedTools = deps.allowedTools ?? DEFAULT_ALLOWED_TOOLS;
45
+ this.onLine = deps.onLine ?? null;
46
+ this.settingSources = deps.settingSources ?? [];
47
+ this.systemPrompt = deps.systemPrompt ?? null;
48
+ this.disallowedTools = deps.disallowedTools ?? [];
49
+ this.mcpServers = deps.mcpServers ?? null;
50
+ this.taskAmend = deps.taskAmend ?? null;
60
51
  this.sessionId = null;
61
- this.buffer = [];
62
52
  /** @type {AbortController|null} */
63
53
  this.currentAbortController = null;
64
54
  }
65
55
 
66
56
  /**
67
57
  * Run a new agent session with the given task.
68
- * @param {string} task - The task prompt
58
+ * @param {string} task
69
59
  * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
70
60
  */
71
61
  async run(task) {
@@ -87,7 +77,7 @@ export class AgentRunner {
87
77
 
88
78
  /**
89
79
  * Resume an existing session with a follow-up prompt.
90
- * @param {string} prompt - The follow-up prompt
80
+ * @param {string} prompt
91
81
  * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
92
82
  */
93
83
  async resume(prompt) {
@@ -108,17 +98,16 @@ export class AgentRunner {
108
98
  }
109
99
 
110
100
  /**
111
- * Build the options passed to every SDK query() call. Shared by run() and
112
- * resume() so the agent's configuration — cwd, tools, prompt, setting
113
- * sources, turn budget — is identical across the session's lifetime. Only
114
- * resume() layers `resume: this.sessionId` on top.
101
+ * Build the options passed to every SDK query() call. Shared by run()
102
+ * and resume() so the agent's configuration — cwd, tools, prompt,
103
+ * setting sources, turn budget — is identical across the session's
104
+ * lifetime. Only resume() layers `resume: this.sessionId` on top.
115
105
  *
116
- * SDK options are call-attached, not session-attached: the resumed call
117
- * loads the prior conversation but otherwise uses whatever options this
118
- * call passes. Omitting tool/prompt/setting options on resume causes the
119
- * agent to silently lose its restrictions and persona between turns.
120
- * @param {AbortController} abortController
121
- * @returns {object}
106
+ * SDK options are call-attached, not session-attached: the resumed
107
+ * call loads the prior conversation but otherwise uses whatever
108
+ * options this call passes. Omitting tool/prompt/setting options on
109
+ * resume causes the agent to silently lose its restrictions and
110
+ * persona between turns.
122
111
  */
123
112
  #callOptions(abortController) {
124
113
  return {
@@ -139,59 +128,28 @@ export class AgentRunner {
139
128
  }
140
129
 
141
130
  /**
142
- * Shared consumer for both `run()` and `resume()`. Iterates the SDK query
143
- * iterator, mirroring every line to the output stream / buffer / onLine
144
- * callback, and — when `onBatch` is set flushes accumulated lines to it
145
- * at coarse boundaries: every `batchSize` assistant text-block messages,
146
- * and the terminal `result` message. Tool-only assistant messages still
147
- * accumulate in the pending batch and ride along in the next flush, so
148
- * the supervisor always sees the tool calls that led up to each text
149
- * block. Raising `batchSize` above 1 is the knob that makes the mid-turn
150
- * supervisor review less chatty — with the default of 3, the supervisor
151
- * sees the agent in chunks of three text turns instead of every turn.
152
- *
153
- * Corollary: a turn that is *entirely* tool_use with no text blocks and
154
- * then hits `result` produces exactly one flush at `result` regardless
155
- * of how many tools ran. That is deliberate — the supervisor only needs
156
- * to weigh in when the agent surfaces something text-like to react to.
157
- *
158
- * INVARIANT: the `await this.onBatch(...)` call below is the ONLY
159
- * suspension point in this loop. While it is pending, no further lines
160
- * are pulled from the SDK generator. The Supervisor relies on this — its
161
- * onBatch callback flips `currentSource` to "supervisor" for the duration
162
- * of its mid-turn LLM call, and the invariant guarantees no agent line
163
- * can arrive concurrently and be mis-tagged.
164
- *
165
- * If the supervisor calls `abort()` from inside the callback, the next
166
- * iteration of the for-await loop will throw. We catch the throw, check
167
- * `currentAbortController.signal.aborted` (avoiding fragility around
168
- * AbortError vs DOMException shapes), and report `aborted: true` so the
169
- * caller can distinguish "supervisor asked us to stop" from a real error.
131
+ * Iterate the SDK query iterator, mirroring every message to the
132
+ * output stream and the `onLine` callback. Captures `sessionId` from
133
+ * the SDK's `system/init` message and tracks Skill invocations into
134
+ * `LIBEVAL_SKILL` for downstream metrics.
170
135
  *
171
- * If the iterator throws before a flush boundary, any lines still in the
172
- * pending batch would otherwise vanish without the supervisor seeing
173
- * them. The `finally` block emits a terminal batch so the supervisor can
174
- * observe the partial state (e.g. note a crash or react to an external
175
- * abort). A throw from that final flush becomes the returned `error`
176
- * only if no earlier error was captured — the original failure wins.
177
- * @param {AsyncIterable<object>} iterator
178
- * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
136
+ * If the iterator throws and we triggered the abort ourselves
137
+ * (`currentAbortController.signal.aborted`), we report `aborted:
138
+ * true`; otherwise the error propagates as `error`.
179
139
  */
180
140
  async #consumeQuery(iterator) {
181
141
  let text = "";
182
142
  let stopReason = null;
183
143
  let error = null;
184
144
  let aborted = false;
185
- const state = { pendingBatch: [], assistantTextCount: 0 };
186
145
 
187
146
  try {
188
147
  for await (const message of iterator) {
189
- this.#recordLine(message, state);
148
+ this.#recordLine(message);
190
149
  if (message.type === "result") {
191
150
  text = message.result ?? "";
192
151
  stopReason = message.subtype;
193
152
  }
194
- await this.#maybeFlushBatch(message, state);
195
153
  }
196
154
  } catch (err) {
197
155
  if (this.currentAbortController?.signal.aborted) {
@@ -201,118 +159,28 @@ export class AgentRunner {
201
159
  }
202
160
  }
203
161
 
204
- const flushErr = await this.#terminalFlush(state, { error, aborted });
205
- if (flushErr && !error) error = flushErr;
206
-
207
- const success = stopReason === "success";
208
- return { success, text, sessionId: this.sessionId, error, aborted };
162
+ return {
163
+ success: stopReason === "success",
164
+ text,
165
+ sessionId: this.sessionId,
166
+ error,
167
+ aborted,
168
+ };
209
169
  }
210
170
 
211
- /**
212
- * Mirror a single SDK message to the output stream, buffer, onLine
213
- * callback, and (when set) the pending-batch state. Also handles
214
- * session id capture and text-block counting so `#consumeQuery` can
215
- * stay within the complexity budget.
216
- * @param {object} message
217
- * @param {{pendingBatch: string[], assistantTextCount: number}} state
218
- */
219
- #recordLine(message, state) {
171
+ #recordLine(message) {
220
172
  const redacted = this.redactor.redactValue(message);
221
173
  const line = JSON.stringify(redacted);
222
174
  this.output.write(line + "\n");
223
- this.buffer.push(line);
224
175
  if (this.onLine) this.onLine(line);
225
- if (this.onBatch) state.pendingBatch.push(line);
226
176
 
227
- // Session-id / text-block tracking reads the ORIGINAL message —
228
- // these fields are not secret carriers, and the trackers rely on
229
- // shape, not string contents.
230
177
  if (message.type === "system" && message.subtype === "init") {
231
178
  this.sessionId = message.session_id;
232
179
  }
233
- if (message.type === "assistant") {
234
- if (hasTextBlock(message)) state.assistantTextCount++;
235
- trackSkillInvocation(message);
236
- }
237
- }
238
-
239
- /**
240
- * Terminal flush — only fires on the abnormal-end paths (iterator
241
- * threw or was aborted mid-stream). Delivers any pending lines so the
242
- * supervisor sees the partial state instead of losing the tail of
243
- * the run. A natural-end iterator that simply ran out of messages
244
- * without a `result` marker is treated as an incomplete stub (the
245
- * real SDK always terminates with `result`) and its pending batch is
246
- * not re-flushed. Returns an error thrown by the flush callback, or
247
- * `null` if the flush succeeded or did not fire.
248
- * @param {{pendingBatch: string[], assistantTextCount: number}} state
249
- * @param {{error: Error|null, aborted: boolean}} outcome
250
- * @returns {Promise<Error|null>}
251
- */
252
- async #terminalFlush(state, { error, aborted }) {
253
- const loopEndedAbnormally = Boolean(error || aborted);
254
- if (!loopEndedAbnormally) return null;
255
- if (!this.onBatch || state.pendingBatch.length === 0) return null;
256
- try {
257
- const batchLines = state.pendingBatch.splice(0);
258
- await this.onBatch(batchLines, {
259
- abort: () => this.currentAbortController?.abort(),
260
- });
261
- return null;
262
- } catch (flushErr) {
263
- return flushErr;
264
- }
265
- }
266
-
267
- /**
268
- * Flush the pending batch to `onBatch` if either the batchSize threshold
269
- * has been reached or the current message is the terminal `result`.
270
- * Extracted so that `#consumeQuery` stays within the project's complexity
271
- * budget — the flush is one cohesive unit of logic in its own right.
272
- * @param {object} message
273
- * @param {{pendingBatch: string[], assistantTextCount: number}} state
274
- */
275
- async #maybeFlushBatch(message, state) {
276
- if (!this.onBatch) return;
277
- const shouldFlush =
278
- message.type === "result" || state.assistantTextCount >= this.batchSize;
279
- if (!shouldFlush) return;
280
- state.assistantTextCount = 0;
281
- const batchLines = state.pendingBatch.splice(0);
282
- await this.onBatch(batchLines, {
283
- abort: () => this.currentAbortController?.abort(),
284
- });
285
- }
286
-
287
- /**
288
- * Drain buffered output lines. Used by Supervisor to tag and re-emit lines.
289
- * @returns {string[]}
290
- */
291
- drainOutput() {
292
- const lines = [...this.buffer];
293
- this.buffer = [];
294
- return lines;
180
+ if (message.type === "assistant") trackSkillInvocation(message);
295
181
  }
296
182
  }
297
183
 
298
- /**
299
- * Whether an SDK assistant message contains at least one text block.
300
- * Only text-block messages count toward the `batchSize` threshold — tool-only
301
- * assistant messages accumulate silently into the pending batch and ride along
302
- * in the next flush, keeping supervisor LLM cost bounded. Exported so the mock
303
- * runner can mirror the real flush predicate without duplicating the logic.
304
- * @param {object} message
305
- * @returns {boolean}
306
- */
307
- export function hasTextBlock(message) {
308
- const content = message.message?.content ?? message.content;
309
- if (!Array.isArray(content)) return false;
310
- for (const block of content) {
311
- if (block.type === "text" && block.text) return true;
312
- }
313
- return false;
314
- }
315
-
316
184
  function trackSkillInvocation(message) {
317
185
  const content = message.message?.content ?? message.content;
318
186
  if (!Array.isArray(content)) return;
@@ -327,11 +195,7 @@ function trackSkillInvocation(message) {
327
195
  }
328
196
  }
329
197
 
330
- /**
331
- * Factory function — wires real dependencies.
332
- * @param {object} deps - Same as AgentRunner constructor
333
- * @returns {AgentRunner}
334
- */
198
+ /** Factory function — wires real dependencies. */
335
199
  export function createAgentRunner(deps) {
336
200
  return new AgentRunner(deps);
337
201
  }
@@ -3,7 +3,7 @@
3
3
  *
4
4
  * Phases per (task, runIndex):
5
5
  * 1. WorkdirManager.start → seed CWD + run pre-flight probe
6
- * 2. Supervisor relay (agent + supervisor) → produce traces + submission
6
+ * 2. Supervisor session (agent + supervisor) → produce traces + submission
7
7
  * 3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
8
8
  * 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
9
9
  * 5. WorkdirManager.teardown → process-group cleanup
@@ -272,7 +272,7 @@ export class BenchmarkRunner {
272
272
  }
273
273
 
274
274
  /**
275
- * Run the agent-under-test via a Supervisor relay. The supervisor writes
275
+ * Run the agent-under-test under a Supervisor. The supervisor writes
276
276
  * a combined tagged NDJSON trace; after the session we split it into
277
277
  * agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
278
278
  */
@@ -40,7 +40,7 @@ function parseRunOptions(values) {
40
40
  runs,
41
41
  output: resolve(output),
42
42
  agentModel: values["agent-model"] ?? "claude-sonnet-4-6",
43
- supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7",
43
+ supervisorModel: values["lead-model"] ?? "claude-opus-4-7",
44
44
  judgeModel: values["judge-model"] ?? "claude-opus-4-7",
45
45
  profiles: {
46
46
  agent: values["agent-profile"] ?? null,
@@ -0,0 +1,84 @@
1
+ import { readdirSync, statSync, openSync, readSync, closeSync } from "node:fs";
2
+ import { join } from "node:path";
3
+
4
+ /**
5
+ * Read the first line of a file. Only the first 64 KiB are read, which is
6
+ * well above any orchestrator envelope; if no newline occurs within that window, the whole window is returned.
7
+ *
8
+ * @param {string} path
9
+ * @returns {string}
10
+ */
11
+ function readFirstLine(path) {
12
+ const fd = openSync(path, "r");
13
+ try {
14
+ const buf = Buffer.alloc(65536);
15
+ const bytes = readSync(fd, buf, 0, buf.length, 0);
16
+ const slice = buf.slice(0, bytes).toString("utf8");
17
+ const nl = slice.indexOf("\n");
18
+ return nl === -1 ? slice : slice.slice(0, nl);
19
+ } finally {
20
+ closeSync(fd);
21
+ }
22
+ }
23
+
24
+ /**
25
+ * Scan a directory for `.ndjson` files whose meta header carries the
26
+ * given discussion_id. The Step 2.6 first-line guarantee makes the
27
+ * lookup cheap: we read only the first line per file. Files without a
28
+ * meta header (e.g. legacy supervise/facilitate traces) are skipped
29
+ * silently — not erroneous.
30
+ *
31
+ * @param {string} dir
32
+ * @param {string} discussionId
33
+ * @returns {Array<{path: string, mtimeMs: number}>}
34
+ */
35
+ export function findTracesByDiscussion(dir, discussionId) {
36
+ const matches = [];
37
+ let entries;
38
+ try {
39
+ entries = readdirSync(dir);
40
+ } catch {
41
+ return [];
42
+ }
43
+ for (const entry of entries) {
44
+ if (!entry.endsWith(".ndjson")) continue;
45
+ const path = join(dir, entry);
46
+ let firstLine;
47
+ try {
48
+ firstLine = readFirstLine(path);
49
+ } catch {
50
+ continue;
51
+ }
52
+ let parsed;
53
+ try {
54
+ parsed = JSON.parse(firstLine);
55
+ } catch {
56
+ continue;
57
+ }
58
+ const event = parsed.event ?? parsed;
59
+ if (event?.type !== "meta") continue;
60
+ if (event.discussion_id !== discussionId) continue;
61
+ matches.push({ path, mtimeMs: statSync(path).mtimeMs });
62
+ }
63
+ matches.sort((a, b) => a.mtimeMs - b.mtimeMs);
64
+ return matches;
65
+ }
66
+
67
+ /**
68
+ * `fit-trace by-discussion <discussion-id> [trace-dir]` — list trace
69
+ * files whose meta header carries the given discussion_id, one per
70
+ * line, ordered by file modification time (mtime) ascending — i.e. when each trace was last written, not when its first event was emitted. The
71
+ * result is usable with `xargs cat` for an approximately chronological merge.
72
+ *
73
+ * @param {object} values
74
+ * @param {string[]} args
75
+ */
76
+ export async function runByDiscussionCommand(values, args) {
77
+ const [discussionId, traceDirArg] = args;
78
+ if (!discussionId) throw new Error("<discussion-id> is required");
79
+ const dir = traceDirArg ?? values["trace-dir"] ?? "traces";
80
+ const matches = findTracesByDiscussion(dir, discussionId);
81
+ for (const { path } of matches) {
82
+ process.stdout.write(`${path}\n`);
83
+ }
84
+ }
@@ -0,0 +1,104 @@
1
+ import { readFileSync } from "node:fs";
2
+
3
+ /**
4
+ * Scan an NDJSON trace and return the last orchestrator summary event,
5
+ * the first `meta` event's `discussion_id`, and any structured replies
6
+ * collected by the discusser. Skips malformed lines.
7
+ *
8
+ * The runner is verdict-agnostic — verbatim passthrough of whatever the
9
+ * trace carries ("success"/"failure" from supervise/facilitate; canonical
10
+ * "adjourned"/"recessed"/"failed" from discuss). The bridge layer maps to
11
+ * its channel semantics.
12
+ *
13
+ * @param {string} traceFile
14
+ * @returns {{verdict: string, summary: string, replies: object[], trigger?: object, discussionId?: string} | null}
15
+ */
16
+ // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: NDJSON scan with malformed-line tolerance + meta/summary dual extraction
17
+ function readTraceSummary(traceFile) {
18
+ let summary = null;
19
+ let metaDiscussionId = null;
20
+ for (const line of readFileSync(traceFile, "utf8").split("\n")) {
21
+ if (!line.trim()) continue;
22
+ let record;
23
+ try {
24
+ record = JSON.parse(line);
25
+ } catch {
26
+ continue;
27
+ }
28
+ if (record.source !== "orchestrator") continue;
29
+ if (record.event?.type === "meta" && !metaDiscussionId) {
30
+ metaDiscussionId = record.event.discussion_id ?? null;
31
+ }
32
+ if (record.event?.type === "summary") {
33
+ summary = {
34
+ verdict: record.event.verdict ?? "failed",
35
+ summary: record.event.summary ?? "",
36
+ replies: Array.isArray(record.event.replies)
37
+ ? record.event.replies
38
+ : [],
39
+ ...(record.event.trigger && { trigger: record.event.trigger }),
40
+ ...(record.event.discussion_id && {
41
+ discussionId: record.event.discussion_id,
42
+ }),
43
+ };
44
+ }
45
+ }
46
+ if (summary && !summary.discussionId && metaDiscussionId) {
47
+ summary.discussionId = metaDiscussionId;
48
+ }
49
+ return summary;
50
+ }
51
+
52
+ /**
53
+ * Callback command — read an NDJSON trace, extract the terminal
54
+ * orchestrator summary, and POST a canonical callback body to the
55
+ * configured URL. Used by `kata-dispatch.yml` to deliver the lead's
56
+ * conclusion to the bridge that dispatched the run.
57
+ *
58
+ * Wire shape (single shape across modes):
59
+ *
60
+ * ```
61
+ * {
62
+ * correlation_id, verdict, summary, run_url,
63
+ * discussion_id?, replies: [], trigger?
64
+ * }
65
+ * ```
66
+ *
67
+ * @param {object} values - Parsed option values from cli.parse()
68
+ * @param {string[]} _args - Positional arguments
69
+ */
70
+ export async function runCallbackCommand(values, _args) {
71
+ const traceFile = values["trace-file"];
72
+ const callbackUrl = values["callback-url"];
73
+ const correlationId = values["correlation-id"];
74
+ const runUrl = values["run-url"] ?? "";
75
+ const discussionIdOverride = values["discussion-id"] ?? null;
76
+
77
+ if (!traceFile) throw new Error("--trace-file is required");
78
+ if (!callbackUrl) throw new Error("--callback-url is required");
79
+
80
+ const found = readTraceSummary(traceFile) ?? {
81
+ verdict: "failed",
82
+ summary: "Run ended without producing a summary.",
83
+ replies: [],
84
+ };
85
+
86
+ const discussionId = found.discussionId ?? discussionIdOverride ?? null;
87
+ const payload = {
88
+ correlation_id: correlationId,
89
+ verdict: found.verdict,
90
+ summary: found.summary,
91
+ run_url: runUrl,
92
+ replies: found.replies,
93
+ ...(discussionId && { discussion_id: discussionId }),
94
+ ...(found.trigger && { trigger: found.trigger }),
95
+ };
96
+ const res = await fetch(callbackUrl, {
97
+ method: "POST",
98
+ headers: { "Content-Type": "application/json" },
99
+ body: JSON.stringify(payload),
100
+ });
101
+ if (!res.ok) {
102
+ throw new Error(`Callback POST failed: ${res.status}`);
103
+ }
104
+ }
@@ -0,0 +1,116 @@
1
+ import { readFileSync, createWriteStream } from "node:fs";
2
+ import { resolve } from "node:path";
3
+ import { createDiscusser } from "../discusser.js";
4
+ import { createRedactor } from "../redaction.js";
5
+ import { createTeeWriter } from "../tee-writer.js";
6
+
7
+ function parseAgentProfiles(raw, cwd, maxTurns) {
8
+ if (!raw) return [];
9
+ return raw.split(",").map((entry) => {
10
+ const name = entry.trim();
11
+ return { name, role: name, cwd, agentProfile: name, maxTurns };
12
+ });
13
+ }
14
+
15
+ /**
16
+ * Parse and validate discuss command options. Exported so tests can verify
17
+ * defaults and the legacy-flag clean break.
18
+ * @param {object} values - Parsed option values
19
+ * @returns {object}
20
+ */
21
+ // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
22
+ export function parseDiscussOptions(values) {
23
+ const taskFile = values["task-file"];
24
+ const taskText = values["task-text"];
25
+ if (taskFile && taskText)
26
+ throw new Error("--task-file and --task-text are mutually exclusive");
27
+ if (!taskFile && !taskText)
28
+ throw new Error("--task-file or --task-text is required");
29
+
30
+ const taskAmend = values["task-amend"] ?? undefined;
31
+ const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
32
+
33
+ const profilesRaw = values["agent-profiles"];
34
+ const agentCwd = resolve(values["agent-cwd"] ?? ".");
35
+
36
+ const maxTurnsRaw = values["max-turns"] ?? "40";
37
+ const maxTurns = maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10);
38
+
39
+ const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd, maxTurns);
40
+
41
+ const resumeContextRaw = values["resume-context"];
42
+ let resumeContext = null;
43
+ if (resumeContextRaw) {
44
+ try {
45
+ resumeContext = JSON.parse(resumeContextRaw);
46
+ } catch (err) {
47
+ throw new Error(`--resume-context is not valid JSON: ${err.message}`);
48
+ }
49
+ }
50
+
51
+ return {
52
+ taskContent,
53
+ taskAmend,
54
+ agentConfigs,
55
+ leadProfile: values["lead-profile"] ?? "release-engineer",
56
+ leadModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
57
+ agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
58
+ maxTurns,
59
+ outputPath: values.output,
60
+ discussionId: values["discussion-id"] ?? null,
61
+ resumeContext,
62
+ };
63
+ }
64
+
65
+ /**
66
+ * Discuss command — run a discusser-led session with suspend/resume
67
+ * semantics, threading `discussion_id` through the trace so multi-run
68
+ * conversations are queryable as one.
69
+ *
70
+ * @param {object} values - Parsed option values
71
+ * @param {string[]} _args - Positional arguments
72
+ */
73
+ export async function runDiscussCommand(values, _args) {
74
+ const opts = parseDiscussOptions(values);
75
+
76
+ const redactor = createRedactor();
77
+
78
+ const fileStream = opts.outputPath
79
+ ? createWriteStream(opts.outputPath)
80
+ : null;
81
+ const output = fileStream
82
+ ? createTeeWriter({
83
+ fileStream,
84
+ textStream: process.stdout,
85
+ mode: "supervised",
86
+ })
87
+ : process.stdout;
88
+
89
+ if (opts.leadProfile) {
90
+ process.env.LIBEVAL_AGENT_PROFILE = opts.leadProfile;
91
+ }
92
+
93
+ const { query } = await import("@anthropic-ai/claude-agent-sdk");
94
+ const discusser = createDiscusser({
95
+ leadProfile: opts.leadProfile,
96
+ leadModel: opts.leadModel,
97
+ agentModel: opts.agentModel,
98
+ agentConfigs: opts.agentConfigs,
99
+ discussionId: opts.discussionId,
100
+ resumeContext: opts.resumeContext,
101
+ query,
102
+ output,
103
+ maxTurns: opts.maxTurns,
104
+ taskAmend: opts.taskAmend,
105
+ redactor,
106
+ });
107
+
108
+ const result = await discusser.run(opts.taskContent);
109
+
110
+ if (fileStream) {
111
+ await new Promise((r) => output.end(r));
112
+ await new Promise((r) => fileStream.end(r));
113
+ }
114
+
115
+ process.exit(result.success ? 0 : 1);
116
+ }
@@ -54,10 +54,10 @@ export function parseFacilitateOptions(values) {
54
54
  agentConfigs,
55
55
  facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
56
56
  agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
57
- facilitatorModel: values["facilitator-model"] ?? "claude-opus-4-7[1m]",
57
+ facilitatorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
58
58
  maxTurns,
59
59
  outputPath: values.output,
60
- facilitatorProfile: values["facilitator-profile"] ?? undefined,
60
+ facilitatorProfile: values["lead-profile"] ?? undefined,
61
61
  };
62
62
  }
63
63