npm - @forwardimpact/libeval - Versions diffs - 0.1.43 → 0.1.45 - Mend

@forwardimpact/libeval 0.1.43 → 0.1.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

package/README.md +212 -13
package/bin/fit-benchmark.js +2 -2
package/bin/fit-eval.js +101 -21
package/bin/fit-trace.js +14 -0
package/package.json +1 -1
package/src/agent-runner.js +45 -181
package/src/benchmark/runner.js +2 -2
package/src/commands/benchmark-run.js +1 -1
package/src/commands/by-discussion.js +84 -0
package/src/commands/callback.js +104 -0
package/src/commands/discuss.js +116 -0
package/src/commands/facilitate.js +2 -2
package/src/commands/supervise.js +6 -4
package/src/discuss-tools.js +135 -0
package/src/discusser.js +315 -0
package/src/facilitator.js +46 -357
package/src/index.js +12 -0
package/src/judge.js +1 -1
package/src/message-bus.js +27 -81
package/src/orchestration-loop.js +316 -0
package/src/orchestration-toolkit.js +272 -303
package/src/orchestrator-helpers.js +9 -45
package/src/redaction.js +12 -0
package/src/render/orchestrator-filter.js +1 -8
package/src/supervisor.js +79 -465
package/src/trace-collector.js +4 -0

package/src/agent-runner.js CHANGED Viewed

@@ -1,7 +1,7 @@
 /**
- * AgentRunner — runs a single Claude Agent SDK session and emits raw NDJSON
- * events to an output stream. Building block for both `fit-eval run` and
- * `fit-eval supervise`.
+ * AgentRunner — runs a single Claude Agent SDK session and emits raw
+ * NDJSON events to an output stream. Building block for `fit-eval run`,
+ * `fit-eval supervise`, `fit-eval facilitate`, and `fit-eval discuss`.
  *
  * Follows OO+DI: constructor injection, factory function, tests bypass factory.
  */
@@ -13,25 +13,6 @@ const DEFAULT_ALLOWED_TOOLS = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];
 // overridable — so a future caller can't accidentally reduce permissions.
 const PERMISSION_MODE = "bypassPermissions";
-function applyDefaults(deps) {
-  return {
-    cwd: deps.cwd,
-    query: deps.query,
-    output: deps.output,
-    model: deps.model ?? "claude-opus-4-7[1m]",
-    maxTurns: deps.maxTurns ?? 50,
-    allowedTools: deps.allowedTools ?? DEFAULT_ALLOWED_TOOLS,
-    onLine: deps.onLine ?? null,
-    onBatch: deps.onBatch ?? null,
-    batchSize: deps.batchSize ?? 3,
-    settingSources: deps.settingSources ?? [],
-    systemPrompt: deps.systemPrompt ?? null,
-    disallowedTools: deps.disallowedTools ?? [],
-    mcpServers: deps.mcpServers ?? null,
-    taskAmend: deps.taskAmend ?? null,
-  };
-}
 /** Run a single Claude Agent SDK session and emit raw NDJSON events to an output stream. */
 export class AgentRunner {
   /**
@@ -43,29 +24,38 @@ export class AgentRunner {
    * @param {number} [deps.maxTurns] - Maximum agentic turns; 0 means unlimited
    * @param {string[]} [deps.allowedTools] - Tools the agent may use
    * @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
-   * @param {function} [deps.onBatch] - Async callback invoked with a batch of NDJSON lines at flush boundaries: every `batchSize` assistant text blocks, the terminal `result` message, and — on iterator crash/abort — once more in a final flush carrying any lines that never reached a boundary. Receives `(lines, { abort })` where calling `abort()` stops the in-flight SDK session via the AbortController. Optional; assignable at runtime so the Supervisor can swap it per turn.
-   * @param {number} [deps.batchSize] - Assistant text-block messages to accumulate before firing onBatch. Tool-only assistant messages ride along without counting. Default 3: the supervisor reviews the agent every three text turns instead of every turn. The terminal `result` always flushes regardless of count.
    * @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
    * @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
    * @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
    * @param {Record<string, object>} [deps.mcpServers] - MCP server configs to pass to the SDK query
+   * @param {object} deps.redactor
    */
   constructor(deps) {
     if (!deps.cwd) throw new Error("cwd is required");
     if (!deps.query) throw new Error("query is required");
     if (!deps.output) throw new Error("output is required");
     if (!deps.redactor) throw new Error("redactor is required");
-    Object.assign(this, applyDefaults(deps));
+    this.cwd = deps.cwd;
+    this.query = deps.query;
+    this.output = deps.output;
     this.redactor = deps.redactor;
+    this.model = deps.model ?? "claude-opus-4-7[1m]";
+    this.maxTurns = deps.maxTurns ?? 50;
+    this.allowedTools = deps.allowedTools ?? DEFAULT_ALLOWED_TOOLS;
+    this.onLine = deps.onLine ?? null;
+    this.settingSources = deps.settingSources ?? [];
+    this.systemPrompt = deps.systemPrompt ?? null;
+    this.disallowedTools = deps.disallowedTools ?? [];
+    this.mcpServers = deps.mcpServers ?? null;
+    this.taskAmend = deps.taskAmend ?? null;
     this.sessionId = null;
-    this.buffer = [];
     /** @type {AbortController|null} */
     this.currentAbortController = null;
   }
   /**
    * Run a new agent session with the given task.
-   * @param {string} task - The task prompt
+   * @param {string} task
    * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
    */
   async run(task) {
@@ -87,7 +77,7 @@ export class AgentRunner {
   /**
    * Resume an existing session with a follow-up prompt.
-   * @param {string} prompt - The follow-up prompt
+   * @param {string} prompt
    * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
    */
   async resume(prompt) {
@@ -108,17 +98,16 @@ export class AgentRunner {
   }
   /**
-   * Build the options passed to every SDK query() call. Shared by run() and
-   * resume() so the agent's configuration — cwd, tools, prompt, setting
-   * sources, turn budget — is identical across the session's lifetime. Only
-   * resume() layers `resume: this.sessionId` on top.
+   * Build the options passed to every SDK query() call. Shared by run()
+   * and resume() so the agent's configuration — cwd, tools, prompt,
+   * setting sources, turn budget — is identical across the session's
+   * lifetime. Only resume() layers `resume: this.sessionId` on top.
    *
-   * SDK options are call-attached, not session-attached: the resumed call
-   * loads the prior conversation but otherwise uses whatever options this
-   * call passes. Omitting tool/prompt/setting options on resume causes the
-   * agent to silently lose its restrictions and persona between turns.
-   * @param {AbortController} abortController
-   * @returns {object}
+   * SDK options are call-attached, not session-attached: the resumed
+   * call loads the prior conversation but otherwise uses whatever
+   * options this call passes. Omitting tool/prompt/setting options on
+   * resume causes the agent to silently lose its restrictions and
+   * persona between turns.
    */
   #callOptions(abortController) {
     return {
@@ -139,59 +128,28 @@ export class AgentRunner {
   }
   /**
-   * Shared consumer for both `run()` and `resume()`. Iterates the SDK query
-   * iterator, mirroring every line to the output stream / buffer / onLine
-   * callback, and — when `onBatch` is set — flushes accumulated lines to it
-   * at coarse boundaries: every `batchSize` assistant text-block messages,
-   * and the terminal `result` message. Tool-only assistant messages still
-   * accumulate in the pending batch and ride along in the next flush, so
-   * the supervisor always sees the tool calls that led up to each text
-   * block. Raising `batchSize` above 1 is the knob that makes the mid-turn
-   * supervisor review less chatty — with the default of 3, the supervisor
-   * sees the agent in chunks of three text turns instead of every turn.
-   *
-   * Corollary: a turn that is *entirely* tool_use with no text blocks and
-   * then hits `result` produces exactly one flush at `result` regardless
-   * of how many tools ran. That is deliberate — the supervisor only needs
-   * to weigh in when the agent surfaces something text-like to react to.
-   *
-   * INVARIANT: the `await this.onBatch(...)` call below is the ONLY
-   * suspension point in this loop. While it is pending, no further lines
-   * are pulled from the SDK generator. The Supervisor relies on this — its
-   * onBatch callback flips `currentSource` to "supervisor" for the duration
-   * of its mid-turn LLM call, and the invariant guarantees no agent line
-   * can arrive concurrently and be mis-tagged.
-   *
-   * If the supervisor calls `abort()` from inside the callback, the next
-   * iteration of the for-await loop will throw. We catch the throw, check
-   * `currentAbortController.signal.aborted` (avoiding fragility around
-   * AbortError vs DOMException shapes), and report `aborted: true` so the
-   * caller can distinguish "supervisor asked us to stop" from a real error.
+   * Iterate the SDK query iterator, mirroring every message to the
+   * output stream and the `onLine` callback. Captures `sessionId` from
+   * the SDK's `system/init` message and tracks Skill invocations into
+   * `LIBEVAL_SKILL` for downstream metrics.
    *
-   * If the iterator throws before a flush boundary, any lines still in the
-   * pending batch would otherwise vanish without the supervisor seeing
-   * them. The `finally` block emits a terminal batch so the supervisor can
-   * observe the partial state (e.g. note a crash or react to an external
-   * abort). A throw from that final flush becomes the returned `error`
-   * only if no earlier error was captured — the original failure wins.
-   * @param {AsyncIterable<object>} iterator
-   * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
+   * If the iterator throws and we triggered the abort ourselves
+   * (`currentAbortController.signal.aborted`), we report `aborted:
+   * true`; otherwise the error propagates as `error`.
    */
   async #consumeQuery(iterator) {
     let text = "";
     let stopReason = null;
     let error = null;
     let aborted = false;
-    const state = { pendingBatch: [], assistantTextCount: 0 };
     try {
       for await (const message of iterator) {
-        this.#recordLine(message, state);
+        this.#recordLine(message);
         if (message.type === "result") {
           text = message.result ?? "";
           stopReason = message.subtype;
         }
-        await this.#maybeFlushBatch(message, state);
       }
     } catch (err) {
       if (this.currentAbortController?.signal.aborted) {
@@ -201,118 +159,28 @@ export class AgentRunner {
       }
     }
-    const flushErr = await this.#terminalFlush(state, { error, aborted });
-    if (flushErr && !error) error = flushErr;
-    const success = stopReason === "success";
-    return { success, text, sessionId: this.sessionId, error, aborted };
+    return {
+      success: stopReason === "success",
+      text,
+      sessionId: this.sessionId,
+      error,
+      aborted,
+    };
   }
-  /**
-   * Mirror a single SDK message to the output stream, buffer, onLine
-   * callback, and (when set) the pending-batch state. Also handles
-   * session id capture and text-block counting so `#consumeQuery` can
-   * stay within the complexity budget.
-   * @param {object} message
-   * @param {{pendingBatch: string[], assistantTextCount: number}} state
-   */
-  #recordLine(message, state) {
+  #recordLine(message) {
     const redacted = this.redactor.redactValue(message);
     const line = JSON.stringify(redacted);
     this.output.write(line + "\n");
-    this.buffer.push(line);
     if (this.onLine) this.onLine(line);
-    if (this.onBatch) state.pendingBatch.push(line);
-    // Session-id / text-block tracking reads the ORIGINAL message —
-    // these fields are not secret carriers, and the trackers rely on
-    // shape, not string contents.
     if (message.type === "system" && message.subtype === "init") {
       this.sessionId = message.session_id;
     }
-    if (message.type === "assistant") {
-      if (hasTextBlock(message)) state.assistantTextCount++;
-      trackSkillInvocation(message);
-    }
-  }
-  /**
-   * Terminal flush — only fires on the abnormal-end paths (iterator
-   * threw or was aborted mid-stream). Delivers any pending lines so the
-   * supervisor sees the partial state instead of losing the tail of
-   * the run. A natural-end iterator that simply ran out of messages
-   * without a `result` marker is treated as an incomplete stub (the
-   * real SDK always terminates with `result`) and its pending batch is
-   * not re-flushed. Returns an error thrown by the flush callback, or
-   * `null` if the flush succeeded or did not fire.
-   * @param {{pendingBatch: string[], assistantTextCount: number}} state
-   * @param {{error: Error|null, aborted: boolean}} outcome
-   * @returns {Promise<Error|null>}
-   */
-  async #terminalFlush(state, { error, aborted }) {
-    const loopEndedAbnormally = Boolean(error || aborted);
-    if (!loopEndedAbnormally) return null;
-    if (!this.onBatch || state.pendingBatch.length === 0) return null;
-    try {
-      const batchLines = state.pendingBatch.splice(0);
-      await this.onBatch(batchLines, {
-        abort: () => this.currentAbortController?.abort(),
-      });
-      return null;
-    } catch (flushErr) {
-      return flushErr;
-    }
-  }
-  /**
-   * Flush the pending batch to `onBatch` if either the batchSize threshold
-   * has been reached or the current message is the terminal `result`.
-   * Extracted so that `#consumeQuery` stays within the project's complexity
-   * budget — the flush is one cohesive unit of logic in its own right.
-   * @param {object} message
-   * @param {{pendingBatch: string[], assistantTextCount: number}} state
-   */
-  async #maybeFlushBatch(message, state) {
-    if (!this.onBatch) return;
-    const shouldFlush =
-      message.type === "result" || state.assistantTextCount >= this.batchSize;
-    if (!shouldFlush) return;
-    state.assistantTextCount = 0;
-    const batchLines = state.pendingBatch.splice(0);
-    await this.onBatch(batchLines, {
-      abort: () => this.currentAbortController?.abort(),
-    });
-  }
-  /**
-   * Drain buffered output lines. Used by Supervisor to tag and re-emit lines.
-   * @returns {string[]}
-   */
-  drainOutput() {
-    const lines = [...this.buffer];
-    this.buffer = [];
-    return lines;
+    if (message.type === "assistant") trackSkillInvocation(message);
   }
 }
-/**
- * Whether an SDK assistant message contains at least one text block.
- * Only text-block messages count toward the `batchSize` threshold — tool-only
- * assistant messages accumulate silently into the pending batch and ride along
- * in the next flush, keeping supervisor LLM cost bounded. Exported so the mock
- * runner can mirror the real flush predicate without duplicating the logic.
- * @param {object} message
- * @returns {boolean}
- */
-export function hasTextBlock(message) {
-  const content = message.message?.content ?? message.content;
-  if (!Array.isArray(content)) return false;
-  for (const block of content) {
-    if (block.type === "text" && block.text) return true;
-  }
-  return false;
-}
 function trackSkillInvocation(message) {
   const content = message.message?.content ?? message.content;
   if (!Array.isArray(content)) return;
@@ -327,11 +195,7 @@ function trackSkillInvocation(message) {
   }
 }
-/**
- * Factory function — wires real dependencies.
- * @param {object} deps - Same as AgentRunner constructor
- * @returns {AgentRunner}
- */
+/** Factory function — wires real dependencies. */
 export function createAgentRunner(deps) {
   return new AgentRunner(deps);
 }

package/src/benchmark/runner.js CHANGED Viewed

@@ -3,7 +3,7 @@
  *
  * Phases per (task, runIndex):
  *   1. WorkdirManager.start → seed CWD + run pre-flight probe
- *   2. Supervisor relay (agent + supervisor) → produce traces + submission
+ *   2. Supervisor session (agent + supervisor) → produce traces + submission
  *   3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
  *   4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
  *   5. WorkdirManager.teardown → process-group cleanup
@@ -272,7 +272,7 @@ export class BenchmarkRunner {
   }
   /**
-   * Run the agent-under-test via a Supervisor relay. The supervisor writes
+   * Run the agent-under-test under a Supervisor. The supervisor writes
    * a combined tagged NDJSON trace; after the session we split it into
    * agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
    */

package/src/commands/benchmark-run.js CHANGED Viewed

@@ -40,7 +40,7 @@ function parseRunOptions(values) {
     runs,
     output: resolve(output),
     agentModel: values["agent-model"] ?? "claude-sonnet-4-6",
-    supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7",
+    supervisorModel: values["lead-model"] ?? "claude-opus-4-7",
     judgeModel: values["judge-model"] ?? "claude-opus-4-7",
     profiles: {
       agent: values["agent-profile"] ?? null,

package/src/commands/by-discussion.js ADDED Viewed

@@ -0,0 +1,84 @@
+import { readdirSync, statSync, openSync, readSync, closeSync } from "node:fs";
+import { join } from "node:path";
+/**
+ * Read the first line of a file without loading the whole file. A
+ * single read of at most 64 KiB is issued from offset 0; the author's
+ * stated assumption is that this is well above any orchestrator
+ * envelope. If no newline falls inside that window, everything that was
+ * read is returned, so a longer first line comes back truncated.
+ *
+ * @param {string} path - File to read
+ * @returns {string} Text before the first "\n" (newline excluded), or all decoded bytes when none is found
+ */
+function readFirstLine(path) {
+  const fd = openSync(path, "r");
+  try {
+    const buf = Buffer.alloc(65536);
+    // Position 0: always read from the start of the file.
+    const bytes = readSync(fd, buf, 0, buf.length, 0);
+    // Decode only the bytes actually read, not the unused tail of the buffer.
+    const slice = buf.slice(0, bytes).toString("utf8");
+    const nl = slice.indexOf("\n");
+    return nl === -1 ? slice : slice.slice(0, nl);
+  } finally {
+    // Release the descriptor even when the read throws.
+    closeSync(fd);
+  }
+}
+/**
+ * Scan a directory (non-recursively) for `.ndjson` files whose first
+ * line is a `meta` event carrying the given discussion_id. Only the
+ * first line of each file is read, which relies on the meta header
+ * being written first (the "Step 2.6 first-line guarantee" the original
+ * comment refers to). Files that cannot be read, whose first line is
+ * not JSON, or that lack a matching meta header (e.g. legacy
+ * supervise/facilitate traces) are skipped silently — not erroneous.
+ * An unreadable directory yields an empty result.
+ *
+ * @param {string} dir - Directory to scan
+ * @param {string} discussionId - discussion_id to match exactly
+ * @returns {Array<{path: string, mtimeMs: number}>} Matches sorted by mtime, oldest first
+ */
+export function findTracesByDiscussion(dir, discussionId) {
+  const matches = [];
+  let entries;
+  try {
+    entries = readdirSync(dir);
+  } catch {
+    // Missing or unreadable directory: report no matches rather than fail.
+    return [];
+  }
+  for (const entry of entries) {
+    if (!entry.endsWith(".ndjson")) continue;
+    const path = join(dir, entry);
+    let firstLine;
+    try {
+      firstLine = readFirstLine(path);
+    } catch {
+      continue;
+    }
+    let parsed;
+    try {
+      parsed = JSON.parse(firstLine);
+    } catch {
+      continue;
+    }
+    // Accept both an enveloped record ({ event: {...} }) and a bare event.
+    // NOTE(review): a first line of literal `null` parses successfully and
+    // makes `parsed.event` throw — confirm whether that can occur.
+    const event = parsed.event ?? parsed;
+    if (event?.type !== "meta") continue;
+    if (event.discussion_id !== discussionId) continue;
+    // NOTE(review): statSync is outside any try/catch, so a file removed
+    // after the read above would abort the whole scan — confirm intent.
+    matches.push({ path, mtimeMs: statSync(path).mtimeMs });
+  }
+  matches.sort((a, b) => a.mtimeMs - b.mtimeMs);
+  return matches;
+}
+/**
+ * `fit-trace by-discussion <discussion-id> [trace-dir]` — list trace
+ * files whose meta header carries the given discussion_id, one path
+ * per line on stdout, ordered by file mtime ascending. Note that mtime
+ * is the last-modification time, so it only approximates first-event
+ * order. The result is usable with `xargs cat` for a chronological
+ * merge.
+ *
+ * The directory is taken from the positional `[trace-dir]`, then the
+ * `trace-dir` option, then defaults to "traces".
+ *
+ * @param {object} values - Parsed option values; `trace-dir` is read
+ * @param {string[]} args - Positional arguments: `[discussionId, traceDir?]`
+ * @throws {Error} When no discussion id is given
+ */
+export async function runByDiscussionCommand(values, args) {
+  const [discussionId, traceDirArg] = args;
+  if (!discussionId) throw new Error("<discussion-id> is required");
+  // Precedence: positional argument, then --trace-dir, then the default.
+  const dir = traceDirArg ?? values["trace-dir"] ?? "traces";
+  const matches = findTracesByDiscussion(dir, discussionId);
+  for (const { path } of matches) {
+    process.stdout.write(`${path}\n`);
+  }
+}

package/src/commands/callback.js ADDED Viewed

@@ -0,0 +1,104 @@
+import { readFileSync } from "node:fs";
+/**
+ * Scan an NDJSON trace and return the last orchestrator `summary`
+ * event, together with any structured replies and trigger it carries.
+ * Only records whose `source` is "orchestrator" are considered; blank
+ * and malformed lines are skipped.
+ *
+ * The discussion id comes from the summary event when present,
+ * otherwise from the first orchestrator `meta` event that carries a
+ * `discussion_id`.
+ *
+ * The runner is verdict-agnostic — verbatim passthrough of whatever the
+ * trace carries ("success"/"failure" from supervise/facilitate; canonical
+ * "adjourned"/"recessed"/"failed" from discuss). The bridge layer maps to
+ * its channel semantics. A summary event with no verdict is reported as
+ * "failed".
+ *
+ * @param {string} traceFile - Path to the NDJSON trace
+ * @returns {{verdict: string, summary: string, replies: object[], trigger?: object, discussionId?: string} | null} `null` when the trace has no summary event
+ */
+// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: NDJSON scan with malformed-line tolerance + meta/summary dual extraction
+function readTraceSummary(traceFile) {
+  let summary = null;
+  let metaDiscussionId = null;
+  for (const line of readFileSync(traceFile, "utf8").split("\n")) {
+    if (!line.trim()) continue;
+    let record;
+    try {
+      record = JSON.parse(line);
+    } catch {
+      continue;
+    }
+    // NOTE(review): a line containing literal `null` parses successfully
+    // and makes `record.source` throw — confirm whether that can occur.
+    if (record.source !== "orchestrator") continue;
+    // Keep the first meta discussion_id seen; later meta events are ignored.
+    if (record.event?.type === "meta" && !metaDiscussionId) {
+      metaDiscussionId = record.event.discussion_id ?? null;
+    }
+    // Each summary event overwrites the previous one, so the last one wins.
+    if (record.event?.type === "summary") {
+      summary = {
+        verdict: record.event.verdict ?? "failed",
+        summary: record.event.summary ?? "",
+        replies: Array.isArray(record.event.replies)
+          ? record.event.replies
+          : [],
+        // Optional keys are added only when the event carries a truthy value.
+        ...(record.event.trigger && { trigger: record.event.trigger }),
+        ...(record.event.discussion_id && {
+          discussionId: record.event.discussion_id,
+        }),
+      };
+    }
+  }
+  // Fall back to the meta header when the summary itself has no discussion id.
+  if (summary && !summary.discussionId && metaDiscussionId) {
+    summary.discussionId = metaDiscussionId;
+  }
+  return summary;
+}
+/**
+ * Callback command — read an NDJSON trace, extract the terminal
+ * orchestrator summary, and POST a canonical callback body to the
+ * configured URL. Used by `kata-dispatch.yml` to deliver the lead's
+ * conclusion to the bridge that dispatched the run.
+ *
+ * When the trace has no summary event, a "failed" verdict with a
+ * placeholder summary is sent instead. The discussion id found in the
+ * trace takes precedence; the `discussion-id` option is only used when
+ * the trace has none.
+ *
+ * Wire shape (single shape across modes):
+ *
+ * ```
+ * {
+ *   correlation_id, verdict, summary, run_url,
+ *   discussion_id?, replies: [], trigger?
+ * }
+ * ```
+ *
+ * @param {object} values - Parsed option values from cli.parse(); reads `trace-file`, `callback-url`, `correlation-id`, `run-url`, `discussion-id`
+ * @param {string[]} _args - Positional arguments (unused)
+ * @throws {Error} When `trace-file` or `callback-url` is missing, or the POST response is not ok
+ */
+export async function runCallbackCommand(values, _args) {
+  const traceFile = values["trace-file"];
+  const callbackUrl = values["callback-url"];
+  // NOTE(review): correlation-id is not validated; when absent the
+  // `correlation_id` key is undefined and JSON.stringify omits it —
+  // confirm the receiving bridge tolerates that.
+  const correlationId = values["correlation-id"];
+  const runUrl = values["run-url"] ?? "";
+  const discussionIdOverride = values["discussion-id"] ?? null;
+  if (!traceFile) throw new Error("--trace-file is required");
+  if (!callbackUrl) throw new Error("--callback-url is required");
+  // No summary in the trace: still call back, reporting a failed run.
+  const found = readTraceSummary(traceFile) ?? {
+    verdict: "failed",
+    summary: "Run ended without producing a summary.",
+    replies: [],
+  };
+  // Despite its name, the option acts as a fallback: the trace value wins.
+  const discussionId = found.discussionId ?? discussionIdOverride ?? null;
+  const payload = {
+    correlation_id: correlationId,
+    verdict: found.verdict,
+    summary: found.summary,
+    run_url: runUrl,
+    replies: found.replies,
+    ...(discussionId && { discussion_id: discussionId }),
+    ...(found.trigger && { trigger: found.trigger }),
+  };
+  const res = await fetch(callbackUrl, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(payload),
+  });
+  if (!res.ok) {
+    throw new Error(`Callback POST failed: ${res.status}`);
+  }
+}

package/src/commands/discuss.js ADDED Viewed

@@ -0,0 +1,116 @@
+import { readFileSync, createWriteStream } from "node:fs";
+import { resolve } from "node:path";
+import { createDiscusser } from "../discusser.js";
+import { createRedactor } from "../redaction.js";
+import { createTeeWriter } from "../tee-writer.js";
+/**
+ * Expand a comma-separated list of agent profile names into agent
+ * configs. Each trimmed name is used as the config's `name`, `role`,
+ * and `agentProfile`; all agents share the same `cwd` and `maxTurns`.
+ *
+ * NOTE(review): empty segments (e.g. a trailing comma) are not filtered
+ * and produce a config with an empty name — confirm whether intended.
+ *
+ * @param {string|undefined} raw - Comma-separated profile names
+ * @param {string} cwd - Working directory assigned to every agent
+ * @param {number} maxTurns - Turn budget assigned to every agent
+ * @returns {Array<{name: string, role: string, cwd: string, agentProfile: string, maxTurns: number}>} Empty when `raw` is falsy
+ */
+function parseAgentProfiles(raw, cwd, maxTurns) {
+  if (!raw) return [];
+  return raw.split(",").map((entry) => {
+    const name = entry.trim();
+    return { name, role: name, cwd, agentProfile: name, maxTurns };
+  });
+}
+/**
+ * Parse and validate discuss command options. Exported so tests can verify
+ * defaults and the legacy-flag clean break.
+ *
+ * Exactly one of `task-file` / `task-text` must be supplied; a task file
+ * is read synchronously as UTF-8. `resume-context`, when given, must be
+ * a JSON string.
+ *
+ * @param {object} values - Parsed option values
+ * @returns {object} Normalized options: taskContent, taskAmend, agentConfigs, leadProfile, leadModel, agentModel, maxTurns, outputPath, discussionId, resumeContext
+ * @throws {Error} On conflicting or missing task options, or invalid `resume-context` JSON
+ */
+// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
+export function parseDiscussOptions(values) {
+  const taskFile = values["task-file"];
+  const taskText = values["task-text"];
+  if (taskFile && taskText)
+    throw new Error("--task-file and --task-text are mutually exclusive");
+  if (!taskFile && !taskText)
+    throw new Error("--task-file or --task-text is required");
+  const taskAmend = values["task-amend"] ?? undefined;
+  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
+  const profilesRaw = values["agent-profiles"];
+  // Agent working directory is made absolute; defaults to the current directory.
+  const agentCwd = resolve(values["agent-cwd"] ?? ".");
+  const maxTurnsRaw = values["max-turns"] ?? "40";
+  // NOTE(review): a non-numeric value yields NaN here and is not
+  // rejected — confirm whether validation is wanted.
+  const maxTurns = maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10);
+  const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd, maxTurns);
+  const resumeContextRaw = values["resume-context"];
+  let resumeContext = null;
+  if (resumeContextRaw) {
+    try {
+      resumeContext = JSON.parse(resumeContextRaw);
+    } catch (err) {
+      // Re-throw with the flag name so the CLI user can tell which input was bad.
+      throw new Error(`--resume-context is not valid JSON: ${err.message}`);
+    }
+  }
+  return {
+    taskContent,
+    taskAmend,
+    agentConfigs,
+    leadProfile: values["lead-profile"] ?? "release-engineer",
+    leadModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
+    maxTurns,
+    outputPath: values.output,
+    discussionId: values["discussion-id"] ?? null,
+    resumeContext,
+  };
+}
+/**
+ * Discuss command — run a discusser-led session with suspend/resume
+ * semantics, threading `discussion_id` through the trace so multi-run
+ * conversations are queryable as one.
+ *
+ * Output goes to stdout, or through a tee writer built over a trace
+ * file stream and stdout when an output path is given. After the run
+ * the process is terminated via `process.exit`: status 0 when the
+ * discusser reports success, 1 otherwise.
+ *
+ * @param {object} values - Parsed option values
+ * @param {string[]} _args - Positional arguments (unused)
+ */
+export async function runDiscussCommand(values, _args) {
+  const opts = parseDiscussOptions(values);
+  const redactor = createRedactor();
+  const fileStream = opts.outputPath
+    ? createWriteStream(opts.outputPath)
+    : null;
+  const output = fileStream
+    ? createTeeWriter({
+        fileStream,
+        textStream: process.stdout,
+        mode: "supervised",
+      })
+    : process.stdout;
+  // parseDiscussOptions defaults leadProfile, so this branch is normally taken.
+  if (opts.leadProfile) {
+    process.env.LIBEVAL_AGENT_PROFILE = opts.leadProfile;
+  }
+  // Imported inside the function, so the SDK is loaded only when the command runs.
+  const { query } = await import("@anthropic-ai/claude-agent-sdk");
+  const discusser = createDiscusser({
+    leadProfile: opts.leadProfile,
+    leadModel: opts.leadModel,
+    agentModel: opts.agentModel,
+    agentConfigs: opts.agentConfigs,
+    discussionId: opts.discussionId,
+    resumeContext: opts.resumeContext,
+    query,
+    output,
+    maxTurns: opts.maxTurns,
+    taskAmend: opts.taskAmend,
+    redactor,
+  });
+  // NOTE(review): there is no try/finally here, so if run() rejects the
+  // streams below are never ended — confirm that is acceptable.
+  const result = await discusser.run(opts.taskContent);
+  if (fileStream) {
+    // End the tee writer first, then the file stream, waiting on each
+    // callback before the process exits.
+    await new Promise((r) => output.end(r));
+    await new Promise((r) => fileStream.end(r));
+  }
+  process.exit(result.success ? 0 : 1);
+}

package/src/commands/facilitate.js CHANGED Viewed

@@ -54,10 +54,10 @@ export function parseFacilitateOptions(values) {
     agentConfigs,
     facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
     agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
-    facilitatorModel: values["facilitator-model"] ?? "claude-opus-4-7[1m]",
+    facilitatorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
     maxTurns,
     outputPath: values.output,
-    facilitatorProfile: values["facilitator-profile"] ?? undefined,
+    facilitatorProfile: values["lead-profile"] ?? undefined,
   };
 }