npm - @forwardimpact/libeval - Versions diffs - 0.1.43 → 0.1.45 - Mend

@forwardimpact/libeval 0.1.43 → 0.1.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

package/README.md +212 -13
package/bin/fit-benchmark.js +2 -2
package/bin/fit-eval.js +101 -21
package/bin/fit-trace.js +14 -0
package/package.json +1 -1
package/src/agent-runner.js +45 -181
package/src/benchmark/runner.js +2 -2
package/src/commands/benchmark-run.js +1 -1
package/src/commands/by-discussion.js +84 -0
package/src/commands/callback.js +104 -0
package/src/commands/discuss.js +116 -0
package/src/commands/facilitate.js +2 -2
package/src/commands/supervise.js +6 -4
package/src/discuss-tools.js +135 -0
package/src/discusser.js +315 -0
package/src/facilitator.js +46 -357
package/src/index.js +12 -0
package/src/judge.js +1 -1
package/src/message-bus.js +27 -81
package/src/orchestration-loop.js +316 -0
package/src/orchestration-toolkit.js +272 -303
package/src/orchestrator-helpers.js +9 -45
package/src/redaction.js +12 -0
package/src/render/orchestrator-filter.js +1 -8
package/src/supervisor.js +79 -465
package/src/trace-collector.js +4 -0

package/src/commands/supervise.js CHANGED Viewed

@@ -12,7 +12,7 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
  * @returns {object}
  */
 // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
-function parseSuperviseOptions(values) {
+export function parseSuperviseOptions(values) {
   const taskFile = values["task-file"];
   const taskText = values["task-text"];
   if (taskFile && taskText)
@@ -33,13 +33,13 @@ function parseSuperviseOptions(values) {
       values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
     ),
     agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
-    supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7[1m]",
+    supervisorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
     maxTurns: (() => {
       const raw = values["max-turns"] ?? "200";
       return raw === "0" ? 0 : parseInt(raw, 10);
     })(),
     outputPath: values.output,
-    supervisorProfile: values["supervisor-profile"] ?? undefined,
+    supervisorProfile: values["lead-profile"] ?? undefined,
     agentProfile: values["agent-profile"] ?? undefined,
     allowedTools: (
       values["allowed-tools"] ??
@@ -53,7 +53,9 @@ function parseSuperviseOptions(values) {
 }
 /**
- * Supervise command — run two agents in a relay loop via the Claude Agent SDK.
+ * Supervise command — run one agent under a supervisor via the
+ * orchestration loop. The supervisor delegates work through Ask, sees
+ * each reply on its next turn, and ends with Conclude.
  *
  * Usage: fit-eval supervise [options]
  *

package/src/discuss-tools.js ADDED Viewed

@@ -0,0 +1,135 @@
+/**
+ * DiscussTools — discuss-mode tool servers. The lead's surface extends the
+ * base set with three discuss-only terminal tools:
+ *
+ * - `RequestForComment` posts a fire-and-forget message to a human channel
+ *   via the bridge; the reply arrives on a later workflow run.
+ * - `Recess` suspends the session with a resumption trigger.
+ * - `Adjourn` ends the discussion with a verdict.
+ *
+ * `Conclude` is absent — discuss mode ends via Adjourn or Recess. The
+ * agent surface is identical to the facilitated agent's: Ask / Answer /
+ * Announce / RollCall, with Ask defaulting to the lead.
+ */
+import { tool } from "@anthropic-ai/claude-agent-sdk";
+import { z } from "zod";
+import {
+  baseTools,
+  concludeSession,
+  orchestrationServer,
+} from "./orchestration-toolkit.js";
+/**
+ * System prompt appended for discuss-mode agent runners.
+ *
+ * Plain-text instructions covering the agent tool surface (Answer, Ask,
+ * Announce, RollCall) and the `[ask#N]` / `[answer#N]` tagging convention
+ * that pairs a reply with its question. This is runtime text read by the
+ * model, not documentation: editing the wording changes agent behavior.
+ *
+ * @type {string}
+ */
+export const DISCUSS_AGENT_SYSTEM_PROMPT =
+  "You participate in an asynchronous discussion. " +
+  "Each question you receive carries an [ask#N] header — quote that N back as the askId field on Answer so the reply pairs with the right question. " +
+  "Answer replies to an ask addressed to you. askId is optional: omit it and the handler auto-picks if exactly one ask is owed to you, otherwise it routes your message as an Announce. " +
+  "Ask sends a question to the lead or another participant and returns immediately with {askIds:[N]}; the reply arrives on a later turn as `[answer#N] <participant>: <text>` in your inbox. " +
+  "Announce broadcasts a message to every other participant — use this for unsolicited remarks or to reply to an Announce. " +
+  "RollCall lists participants.";
+const RESUME_TRIGGER_SCHEMA = z // validates the `trigger` argument of the Recess tool
+  .object({
+    kind: z.enum(["responses", "elapsed", "either"]), // which condition resumes the run
+    responses: z.number().optional(), // presumably a reply count to wait for — TODO confirm against the bridge
+    elapsed: z.string().optional(), // presumably a duration; only checked to be a string here
+  })
+  .strict(); // unknown keys are rejected rather than silently dropped
+/**
+ * Discuss-mode lead tool server.
+ *
+ * Combines the shared base tools (built by `baseTools` with `from: "lead"`,
+ * no default recipient, and `broadcast: true`) with the three discuss-only
+ * tools: RequestForComment, Recess, and Adjourn. Each discuss-only tool is
+ * registered with a zod argument schema and a handler bound to `ctx`.
+ *
+ * @param {object} ctx - Orchestration context; passed to `baseTools` and to
+ *   every handler factory, which read and write fields on it.
+ * @returns {object} Whatever `orchestrationServer` returns for the tool list.
+ */
+export function createDiscussLeadToolServer(ctx) {
+  return orchestrationServer([
+    ...baseTools(ctx, { from: "lead", defaultTo: undefined, broadcast: true }),
+    tool(
+      "RequestForComment",
+      "Post a fire-and-forget message to a channel via the bridge. Returns a correlation id; the reply arrives on a later workflow run.",
+      {
+        channel: z.string(),
+        body: z.string(),
+        addressees: z.array(z.string()).optional(),
+      },
+      createRequestForCommentHandler(ctx),
+    ),
+    tool(
+      "Recess",
+      "Suspend the run. The bridge re-dispatches the workflow when the trigger fires.",
+      { reason: z.string(), trigger: RESUME_TRIGGER_SCHEMA },
+      createRecessHandler(ctx),
+    ),
+    tool(
+      "Adjourn",
+      "End the discussion with a verdict ('adjourned' / 'failed') and a summary.",
+      {
+        verdict: z.enum(["adjourned", "failed"]),
+        summary: z.string(),
+        outcome: z.string().optional(),
+      },
+      createAdjournHandler(ctx),
+    ),
+  ]);
+}
+/**
+ * Discuss-mode agent tool server.
+ *
+ * Exposes only the shared base tools. The recipient defaults to the lead
+ * (`defaultTo: "lead"`) and broadcasting is enabled; none of the lead's
+ * discuss-only tools (RequestForComment, Recess, Adjourn) are registered.
+ *
+ * @param {object} ctx - Orchestration context passed to `baseTools`.
+ * @param {object} options
+ * @param {string} options.from - Sender identity passed to `baseTools`.
+ * @returns {object} Whatever `orchestrationServer` returns for the tool list.
+ */
+export function createDiscussAgentToolServer(ctx, { from }) {
+  return orchestrationServer(
+    baseTools(ctx, { from, defaultTo: "lead", broadcast: true }),
+  );
+}
+/**
+ * RequestForComment handler — queues structured replies on `ctx.replies[]`.
+ *
+ * Each call increments `ctx.rfcCounter` and derives one correlation id
+ * (`rfc_<counter>`) shared by every reply queued in that call. One reply is
+ * pushed per addressee; when `addressees` is missing or empty a single
+ * reply without an `addressee` field is pushed instead. `thread_id` is
+ * included only when `ctx.discussionId` is truthy.
+ *
+ * `channel` is echoed back in the tool result but is not stored on the
+ * queued reply objects.
+ *
+ * @param {object} ctx - Context providing `rfcCounter` (number), `replies`
+ *   (array, appended to in place) and optionally `discussionId`.
+ * @returns {function} Async handler taking `{ channel, body, addressees }`
+ *   and resolving to a text tool result whose text is the JSON string
+ *   `{ correlation_id, channel }`.
+ */
+export function createRequestForCommentHandler(ctx) {
+  return async ({ channel, body, addressees }) => {
+    const correlationId = `rfc_${++ctx.rfcCounter}`; // pre-increment: the id uses the new counter value
+    const addresseeList = addressees?.length ? addressees : [null]; // no addressees → one unaddressed reply
+    for (const addressee of addresseeList) {
+      ctx.replies.push({
+        ...(addressee && { addressee }), // key omitted entirely for the unaddressed case
+        body,
+        ...(ctx.discussionId && { thread_id: ctx.discussionId }),
+        correlation_id: correlationId,
+      });
+    }
+    return {
+      content: [
+        {
+          type: "text",
+          text: JSON.stringify({ correlation_id: correlationId, channel }),
+        },
+      ],
+    };
+  };
+}
+/**
+ * Recess handler — ends the run with a structured pause + resumption
+ * trigger; cancels any open Asks so askers see a synthetic null answer.
+ * `concluded` flips true (same as Adjourn); the `recessed` verdict
+ * distinguishes them, and `recessTrigger` carries the resume shape for
+ * the bridge.
+ *
+ * The tool's `reason` argument is forwarded to `concludeSession` as the
+ * session `summary`; the `reason` field passed to `concludeSession` is the
+ * fixed string "session recessed". Ask cancellation and the `concluded`
+ * flag are handled inside `concludeSession`, not in this function.
+ *
+ * @param {object} ctx - Orchestration context; `recessTrigger` is written
+ *   on it before the session is concluded.
+ * @returns {function} Async handler taking `{ reason, trigger }` and
+ *   resolving to a text tool result ("Recess queued.").
+ */
+export function createRecessHandler(ctx) {
+  return async ({ reason, trigger }) => {
+    ctx.recessTrigger = trigger; // stored first so it is set by the time the session concludes
+    concludeSession(ctx, {
+      verdict: "recessed",
+      summary: reason,
+      reason: "session recessed",
+    });
+    return { content: [{ type: "text", text: "Recess queued." }] };
+  };
+}
+/**
+ * Adjourn handler — ends the discussion with a verdict.
+ *
+ * Stores `outcome` on `ctx.outcome` only when the caller supplied one
+ * (an explicit empty string is still stored), then concludes the session
+ * with the given verdict and summary and the fixed reason
+ * "session adjourned".
+ *
+ * @param {object} ctx - Orchestration context; `outcome` may be written on
+ *   it, and it is passed to `concludeSession`.
+ * @returns {function} Async handler taking `{ verdict, summary, outcome }`
+ *   and resolving to a text tool result ("Session adjourned.").
+ */
+export function createAdjournHandler(ctx) {
+  return async ({ verdict, summary, outcome }) => {
+    if (outcome !== undefined) ctx.outcome = outcome; // leave any existing value untouched when omitted
+    concludeSession(ctx, {
+      verdict,
+      summary,
+      reason: "session adjourned",
+    });
+    return { content: [{ type: "text", text: "Session adjourned." }] };
+  };
+}

package/src/discusser.js ADDED Viewed

@@ -0,0 +1,315 @@
+/**
+ * Discusser — async, suspendable orchestration on top of a within-run
+ * `OrchestrationLoop`. The lead role uses `DiscussTools` (Adjourn / Recess
+ * / RequestForComment) instead of the facilitator's Conclude.
+ *
+ * Discuss mode is a sibling of facilitate mode, not a subset of it. The
+ * within-run turn loop is shared via `OrchestrationLoop`, but the lead
+ * role, tool set, system prompts, and participant naming all stay
+ * mode-local.
+ */
+import { Writable } from "node:stream";
+import { resolve } from "node:path";
+import { createAgentRunner } from "./agent-runner.js";
+import { composeProfilePrompt } from "./profile-prompt.js";
+import { SequenceCounter } from "./sequence-counter.js";
+import { createMessageBus } from "./message-bus.js";
+import { createOrchestrationContext } from "./orchestration-toolkit.js";
+import {
+  createDiscussLeadToolServer,
+  createDiscussAgentToolServer,
+  DISCUSS_AGENT_SYSTEM_PROMPT,
+} from "./discuss-tools.js";
+import { OrchestrationLoop } from "./orchestration-loop.js";
+/**
+ * System prompt appended for the lead (Chair) runner in discuss mode.
+ *
+ * Plain-text instructions covering the lead's tool surface (Ask, Answer,
+ * Announce, RollCall, RequestForComment, Recess, Adjourn) and the rules
+ * for ending a run. This is runtime text read by the model, not
+ * documentation: editing the wording changes lead behavior.
+ *
+ * @type {string}
+ */
+export const DISCUSS_SYSTEM_PROMPT =
+  "You lead an asynchronous discussion across multiple participants and a human channel. " +
+  "Ask sends a question and returns immediately with {askIds:[N,…]}. The reply arrives on a later turn as `[answer#N] <participant>: <text>` in your inbox — between turns you can plan, reflect, or send more Asks while participants work in parallel. End your turn with text after you've asked everything you intend to; the orchestrator wakes you when the next message lands. " +
+  "Answer replies to an ask a participant addressed to you (you'll see it tagged `[ask#N] <participant>: …` in your inbox). Quote askId from the [ask#N] tag; omit it and the handler auto-picks the only pending ask or routes your message as an Announce. " +
+  "Announce delivers a message with no reply obligation. " +
+  "RollCall returns the participant roster. " +
+  "RequestForComment posts a message to the human thread via the bridge. Every reply you want the human to see MUST go through RequestForComment — the bridge delivers only queued replies, not your text output. " +
+  "Recess suspends the run with a resumption trigger (responses / elapsed / either); any open Asks get a synthetic '[no answer: session concluded]' on the asker's queue so nothing dangles. " +
+  "Adjourn ends the discussion with a verdict ('adjourned' / 'failed') and a summary. " +
+  "Multiple Ask / Announce calls in one assistant turn dispatch in parallel — issue them as parallel tool_use blocks rather than sending the same question both broadcast and individually. " +
+  "You MUST call RequestForComment with your response before calling Adjourn. You MUST end every run by calling Adjourn or Recess — never end a turn with only text *after* every Ask round has resolved.";
+/**
+ * Augment a base orchestration context with discuss-mode fields.
+ *
+ * Mutates `ctx` in place and returns the same object. Any existing values
+ * of the five fields are overwritten with their initial state:
+ * `discussionId` (as given), `recessTrigger` (null), `replies` (new empty
+ * array), `rfcCounter` (0) and `outcome` (null).
+ *
+ * @param {object} ctx - Base orchestration context to extend.
+ * @param {string|null} discussionId - Identifier stored on the context.
+ * @returns {object} The same `ctx` instance that was passed in.
+ */
+export function augmentContextForDiscuss(ctx, discussionId) {
+  ctx.discussionId = discussionId;
+  ctx.recessTrigger = null;
+  ctx.replies = [];
+  ctx.rfcCounter = 0;
+  ctx.outcome = null;
+  return ctx;
+}
+const devNull = new Writable({ // sink stream: accepts every chunk and discards it
+  write(_chunk, _enc, cb) {
+    cb(); // signal completion immediately without storing or forwarding the chunk
+  },
+});
+/**
+ * Async orchestrator for the `discuss` mode. Composes an
+ * `OrchestrationLoop` for the within-run turns but owns the discussion id,
+ * the resumption trigger, and the discuss-augmented terminal summary.
+ *
+ * Every event this class emits is one JSON line written to `output`,
+ * shaped `{ source: "orchestrator", seq, event }` and passed through
+ * `redactor.redactValue` before serialization.
+ */
+export class Discusser {
+  /**
+   * @param {object} deps
+   * @param {OrchestrationLoop} deps.loop
+   * @param {object} deps.ctx
+   * @param {import("stream").Writable} deps.output
+   * @param {object} deps.redactor
+   * @param {string|null} [deps.discussionId] - Defaults to null when omitted.
+   * @param {SequenceCounter} [deps.counter] - Defaults to a new
+   *   `SequenceCounter` when omitted.
+   * @throws {Error} When `loop`, `ctx`, `output` or `redactor` is missing.
+   */
+  constructor({ loop, ctx, output, discussionId, counter, redactor }) {
+    if (!loop) throw new Error("loop is required");
+    if (!ctx) throw new Error("ctx is required");
+    if (!output) throw new Error("output is required");
+    if (!redactor) throw new Error("redactor is required");
+    this.loop = loop;
+    this.ctx = ctx;
+    this.output = output;
+    this.discussionId = discussionId ?? null;
+    this.counter = counter ?? new SequenceCounter();
+    this.redactor = redactor;
+  }
+  /**
+   * Run the discussion. Emits the meta header first (when a discussion_id
+   * is set), delegates the within-run loop to `OrchestrationLoop`, then
+   * emits the discuss-augmented summary (overrides the loop's earlier
+   * summary; trace consumers keep the last summary they see).
+   *
+   * `success` is true only for the "adjourned" verdict; "recessed",
+   * "failed" and a verdict left unset on `ctx` (reported as "failed") all
+   * yield `success: false`. The returned `replies` is a shallow copy of
+   * `ctx.replies`.
+   *
+   * @param {string} task
+   * @returns {Promise<{success: boolean, verdict: string, turns: number, replies: object[], trigger: object|null}>}
+   */
+  async run(task) {
+    this.#emitMeta();
+    // The loop owns within-run turns. Its emitSummary fires once before
+    // run() returns; ours replaces it as the last summary line.
+    await this.loop.run(task);
+    const verdict = this.ctx.verdict ?? "failed"; // verdict left unset on ctx → reported as failed
+    const success = verdict === "adjourned"; // a recess is not counted as success
+    this.#emitDiscussSummary({
+      success,
+      verdict,
+      turns: this.loop.leadTurns,
+    });
+    return {
+      success,
+      verdict,
+      turns: this.loop.leadTurns,
+      replies: this.ctx.replies.slice(), // copy, so callers cannot mutate the context's array
+      trigger: this.ctx.recessTrigger ?? null,
+    };
+  }
+  #emitMeta() { // writes the `meta` event line carrying the discussion id
+    if (!this.discussionId) return; // nothing to announce without a discussion id
+    this.output.write(
+      JSON.stringify(
+        this.redactor.redactValue({
+          source: "orchestrator",
+          seq: this.counter.next(),
+          event: { type: "meta", discussion_id: this.discussionId },
+        }),
+      ) + "\n",
+    );
+  }
+  #emitDiscussSummary({ success, verdict, turns }) { // writes the final `summary` event line
+    const event = {
+      type: "summary",
+      success,
+      verdict,
+      turns,
+      ...(this.ctx.summary && { summary: this.ctx.summary }), // optional keys are omitted when falsy
+      ...(this.ctx.outcome && { outcome: this.ctx.outcome }),
+      replies: this.ctx.replies, // always present, even when empty
+      ...(this.ctx.recessTrigger && { trigger: this.ctx.recessTrigger }),
+      ...(this.discussionId && { discussion_id: this.discussionId }),
+    };
+    this.output.write(
+      JSON.stringify(
+        this.redactor.redactValue({
+          source: "orchestrator",
+          seq: this.counter.next(),
+          event,
+        }),
+      ) + "\n",
+    );
+  }
+}
+/**
+ * Factory — wires the lead and agent runners with `DiscussTools`, builds
+ * the `OrchestrationLoop` (with `leadName: "lead"` and discuss-mode
+ * protocol tagging) and the wrapping `Discusser`.
+ *
+ * Resume semantics: Recess ends the run, cancels any open Asks via
+ * `cancelPendingAsks`, and emits a synthetic null answer per cancelled
+ * ask so nothing dangles in the trace. The bridge later re-dispatches
+ * the workflow against a fresh context; the human reads the trail of
+ * events to decide what to re-ask.
+ *
+ * When `resumeContext` is supplied, its `participants`, `replies`,
+ * `askIdCounter` and `rfcCounter` are copied onto the new context, each
+ * only if it has the expected type (array / number).
+ *
+ * Both runners write their own output to a discarding stream; their lines
+ * are forwarded through `onLine` to the loop's `emitLine` instead.
+ *
+ * @param {object} deps
+ * @param {string} [deps.leadProfile]
+ * @param {string} [deps.leadModel] - Defaults to "claude-opus-4-7[1m]".
+ * @param {string} [deps.agentModel] - Defaults to "claude-opus-4-7[1m]".
+ * @param {Array<object>} [deps.agentConfigs] - Defaults to an empty list.
+ *   Fields read per entry: `name`, `role`, `cwd`, `maxTurns` (default 50),
+ *   `allowedTools`, `agentProfile`, `systemPromptAmend`.
+ * @param {string|null} [deps.discussionId]
+ * @param {object|null} [deps.resumeContext]
+ * @param {function} deps.query
+ * @param {import("stream").Writable} deps.output
+ * @param {number} [deps.maxTurns] - Lead runner turn limit; defaults to 80.
+ * @param {string} [deps.leadCwd] - Defaults to "." (resolved to an absolute path).
+ * @param {string} [deps.profilesDir] - Defaults to `.claude/agents` under
+ *   the resolved lead cwd.
+ * @param {string} [deps.taskAmend]
+ * @param {object} deps.redactor
+ * @returns {Discusser}
+ * @throws {Error} When `redactor` is missing.
+ */
+// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: factory wires N runners + resume hydration paths
+export function createDiscusser({
+  leadProfile,
+  leadModel,
+  agentModel,
+  agentConfigs,
+  discussionId,
+  resumeContext,
+  query,
+  output,
+  maxTurns,
+  leadCwd,
+  profilesDir,
+  taskAmend,
+  redactor,
+}) {
+  if (!redactor) throw new Error("redactor is required");
+  const resolvedLeadCwd = resolve(leadCwd ?? ".");
+  const resolvedProfilesDir =
+    profilesDir ?? resolve(resolvedLeadCwd, ".claude/agents");
+  const resolvedConfigs = agentConfigs ?? [];
+  const ctx = augmentContextForDiscuss(
+    createOrchestrationContext(),
+    discussionId ?? null,
+  );
+  // Hydrate resume context — participants, replies, counters. `pendingAsks`
+  // is intentionally not restored: Recess cancelled every in-flight Ask
+  // with a synthetic null answer, so there's nothing meaningful to carry
+  // forward.
+  if (resumeContext) {
+    if (Array.isArray(resumeContext.participants))
+      ctx.participants = resumeContext.participants;
+    if (Array.isArray(resumeContext.replies))
+      ctx.replies = resumeContext.replies; // NOTE(review): same array object as the caller's, so later pushes mutate it
+    if (typeof resumeContext.askIdCounter === "number")
+      ctx.askIdCounter = resumeContext.askIdCounter;
+    if (typeof resumeContext.rfcCounter === "number")
+      ctx.rfcCounter = resumeContext.rfcCounter;
+  }
+  const messageBus = createMessageBus({
+    participants: ["lead", ...resolvedConfigs.map((a) => a.name)], // NOTE(review): built from agentConfigs even when ctx.participants came from resumeContext — confirm the two rosters cannot diverge
+  });
+  ctx.messageBus = messageBus;
+  if (ctx.participants.length === 0) { // only when nothing was hydrated from resumeContext
+    ctx.participants = [
+      { name: "lead", role: "lead" },
+      ...resolvedConfigs.map((a) => ({ name: a.name, role: a.role })),
+    ];
+  }
+  const systemPromptFor = (profile, trailer) => { // profile prompt + trailer, or the claude_code preset with the trailer appended
+    if (!trailer) throw new Error("trailer is required");
+    return profile
+      ? composeProfilePrompt(profile, {
+          profilesDir: resolvedProfilesDir,
+          trailer,
+        })
+      : { type: "preset", preset: "claude_code", append: trailer };
+  };
+  let discusser; // assigned at the end; the onLine callbacks below read it lazily at call time
+  const leadServer = createDiscussLeadToolServer(ctx);
+  const agents = resolvedConfigs.map((config) => {
+    const agentServer = createDiscussAgentToolServer(ctx, {
+      from: config.name,
+    });
+    const agentTrailer = config.systemPromptAmend
+      ? `${DISCUSS_AGENT_SYSTEM_PROMPT}\n\n${config.systemPromptAmend}`
+      : DISCUSS_AGENT_SYSTEM_PROMPT;
+    const runner = createAgentRunner({
+      cwd: config.cwd ?? resolvedLeadCwd, // agents share the lead's cwd unless configured otherwise
+      query,
+      output: devNull, // runner's own output is discarded; lines are forwarded via onLine
+      model: agentModel ?? "claude-opus-4-7[1m]",
+      maxTurns: config.maxTurns ?? 50,
+      allowedTools: config.allowedTools,
+      onLine: (line) => discusser.loop.emitLine(config.name, line),
+      mcpServers: { orchestration: agentServer },
+      settingSources: ["project"],
+      systemPrompt: systemPromptFor(config.agentProfile, agentTrailer),
+      redactor,
+    });
+    return { name: config.name, role: config.role, runner };
+  });
+  const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
+  const leadRunner = createAgentRunner({
+    cwd: resolvedLeadCwd,
+    query,
+    output: devNull, // runner's own output is discarded; lines are forwarded via onLine
+    model: leadModel ?? "claude-opus-4-7[1m]",
+    maxTurns: maxTurns ?? 80,
+    allowedTools: ["Bash", "Read", "Glob", "Grep", "Write", "Edit"],
+    disallowedTools: defaultDisallowed,
+    onLine: (line) => discusser.loop.emitLine("lead", line),
+    mcpServers: { orchestration: leadServer },
+    settingSources: ["project"],
+    systemPrompt: systemPromptFor(leadProfile, DISCUSS_SYSTEM_PROMPT),
+    redactor,
+  });
+  const loop = new OrchestrationLoop({
+    leadRunner,
+    agents,
+    messageBus,
+    output,
+    leadName: "lead",
+    mode: "discussion",
+    ctx,
+    taskAmend,
+    redactor,
+  });
+  discusser = new Discusser({
+    loop,
+    ctx,
+    output,
+    discussionId: discussionId ?? null,
+    redactor,
+    counter: loop.counter, // share the loop's counter so both emit into one sequence
+  });
+  return discusser;
+}