@forwardimpact/libeval 0.1.43 → 0.1.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,7 +12,7 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
12
12
  * @returns {object}
13
13
  */
14
14
  // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
15
- function parseSuperviseOptions(values) {
15
+ export function parseSuperviseOptions(values) {
16
16
  const taskFile = values["task-file"];
17
17
  const taskText = values["task-text"];
18
18
  if (taskFile && taskText)
@@ -33,13 +33,13 @@ function parseSuperviseOptions(values) {
33
33
  values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
34
34
  ),
35
35
  agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
36
- supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7[1m]",
36
+ supervisorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
37
37
  maxTurns: (() => {
38
38
  const raw = values["max-turns"] ?? "200";
39
39
  return raw === "0" ? 0 : parseInt(raw, 10);
40
40
  })(),
41
41
  outputPath: values.output,
42
- supervisorProfile: values["supervisor-profile"] ?? undefined,
42
+ supervisorProfile: values["lead-profile"] ?? undefined,
43
43
  agentProfile: values["agent-profile"] ?? undefined,
44
44
  allowedTools: (
45
45
  values["allowed-tools"] ??
@@ -53,7 +53,9 @@ function parseSuperviseOptions(values) {
53
53
  }
54
54
 
55
55
  /**
56
- * Supervise command — run two agents in a relay loop via the Claude Agent SDK.
56
+ * Supervise command — run one agent under a supervisor via the
57
+ * orchestration loop. The supervisor delegates work through Ask, sees
58
+ * each reply on its next turn, and ends with Conclude.
57
59
  *
58
60
  * Usage: fit-eval supervise [options]
59
61
  *
@@ -0,0 +1,135 @@
1
+ /**
2
+ * DiscussTools — discuss-mode tool servers. The lead's surface extends the
3
+ * base set with three discuss-only terminal tools:
4
+ *
5
+ * - `RequestForComment` posts a fire-and-forget message to a human channel
6
+ * via the bridge; the reply arrives on a later workflow run.
7
+ * - `Recess` suspends the session with a resumption trigger.
8
+ * - `Adjourn` ends the discussion with a verdict.
9
+ *
10
+ * `Conclude` is absent — discuss mode ends via Adjourn or Recess. The
11
+ * agent surface is identical to the facilitated agent's: Ask / Answer /
12
+ * Announce / RollCall, with Ask defaulting to the lead.
13
+ */
14
+
15
+ import { tool } from "@anthropic-ai/claude-agent-sdk";
16
+ import { z } from "zod";
17
+
18
+ import {
19
+ baseTools,
20
+ concludeSession,
21
+ orchestrationServer,
22
+ } from "./orchestration-toolkit.js";
23
+
24
/**
 * Trailer appended to the system prompt of discuss-mode agent runners.
 * Each array entry is one instruction; entries are joined without a
 * separator, so every entry except the last carries its own trailing space.
 */
export const DISCUSS_AGENT_SYSTEM_PROMPT = [
  "You participate in an asynchronous discussion. ",
  "Each question you receive carries an [ask#N] header — quote that N back as the askId field on Answer so the reply pairs with the right question. ",
  "Answer replies to an ask addressed to you. askId is optional: omit it and the handler auto-picks if exactly one ask is owed to you, otherwise it routes your message as an Announce. ",
  "Ask sends a question to the lead or another participant and returns immediately with {askIds:[N]}; the reply arrives on a later turn as `[answer#N] <participant>: <text>` in your inbox. ",
  "Announce broadcasts a message to every other participant — use this for unsolicited remarks or to reply to an Announce. ",
  "RollCall lists participants.",
].join("");
32
+
33
/**
 * Fields of the `trigger` argument accepted by the Recess tool: a `kind`
 * selector plus optional `responses` (number) and `elapsed` (string) values.
 */
const RESUME_TRIGGER_FIELDS = {
  kind: z.enum(["responses", "elapsed", "either"]),
  responses: z.number().optional(),
  elapsed: z.string().optional(),
};

/** Strict object schema for the Recess resumption trigger (unknown keys are rejected). */
const RESUME_TRIGGER_SCHEMA = z.object(RESUME_TRIGGER_FIELDS).strict();
40
+
41
/**
 * Build the discuss-mode tool server for the lead.
 *
 * The server exposes the shared base tools (sender "lead", no default
 * recipient, broadcast enabled) followed by the three discuss-only tools:
 * RequestForComment, Recess and Adjourn.
 *
 * @param {object} ctx - Orchestration context handed to every tool handler.
 * @returns {object} Whatever `orchestrationServer` returns for the tool list.
 */
export function createDiscussLeadToolServer(ctx) {
  // Materialise the base tools first so they are built before the
  // discuss-only tools, as in a single array literal.
  const sharedTools = [
    ...baseTools(ctx, { from: "lead", defaultTo: undefined, broadcast: true }),
  ];

  const requestForCommentTool = tool(
    "RequestForComment",
    "Post a fire-and-forget message to a channel via the bridge. Returns a correlation id; the reply arrives on a later workflow run.",
    {
      channel: z.string(),
      body: z.string(),
      addressees: z.array(z.string()).optional(),
    },
    createRequestForCommentHandler(ctx),
  );

  const recessTool = tool(
    "Recess",
    "Suspend the run. The bridge re-dispatches the workflow when the trigger fires.",
    { reason: z.string(), trigger: RESUME_TRIGGER_SCHEMA },
    createRecessHandler(ctx),
  );

  const adjournTool = tool(
    "Adjourn",
    "End the discussion with a verdict ('adjourned' / 'failed') and a summary.",
    {
      verdict: z.enum(["adjourned", "failed"]),
      summary: z.string(),
      outcome: z.string().optional(),
    },
    createAdjournHandler(ctx),
  );

  return orchestrationServer([
    ...sharedTools,
    requestForCommentTool,
    recessTool,
    adjournTool,
  ]);
}
73
+
74
/**
 * Build the discuss-mode tool server for a participating agent.
 *
 * Agents get only the shared base tools, sent under their own name, with
 * "lead" as the default recipient and broadcast enabled.
 *
 * @param {object} ctx - Orchestration context handed to the base tools.
 * @param {object} options
 * @param {string} options.from - Participant name the tools send as.
 * @returns {object} Whatever `orchestrationServer` returns for the tool list.
 */
export function createDiscussAgentToolServer(ctx, { from }) {
  const agentTools = baseTools(ctx, {
    from,
    defaultTo: "lead",
    broadcast: true,
  });
  return orchestrationServer(agentTools);
}
80
+
81
/**
 * RequestForComment handler factory — queues structured replies on
 * `ctx.replies[]`.
 *
 * Each call allocates one correlation id (`rfc_<n>`, from `ctx.rfcCounter`)
 * and pushes one reply per addressee, or a single unaddressed reply when
 * `addressees` is missing or empty. `thread_id` is attached only when
 * `ctx.discussionId` is set. The tool result echoes the correlation id and
 * the requested channel as JSON text.
 *
 * @param {object} ctx - Orchestration context; `replies` and `rfcCounter`
 *   are created on first use if the context does not carry them yet.
 * @returns {function({channel: string, body: string, addressees?: string[]}): Promise<{content: object[]}>}
 */
export function createRequestForCommentHandler(ctx) {
  return async ({ channel, body, addressees }) => {
    // This handler is exported on its own, so it may see a context that
    // never had its discuss fields initialised. Without these guards the id
    // would be "rfc_NaN" on every call and the push below would throw.
    ctx.replies ??= [];
    const previousCount = Number.isFinite(ctx.rfcCounter) ? ctx.rfcCounter : 0;
    ctx.rfcCounter = previousCount + 1;

    const correlationId = `rfc_${ctx.rfcCounter}`;
    // `[null]` yields exactly one reply with no `addressee` key.
    const addresseeList = addressees?.length ? addressees : [null];
    for (const addressee of addresseeList) {
      ctx.replies.push({
        ...(addressee && { addressee }),
        body,
        ...(ctx.discussionId && { thread_id: ctx.discussionId }),
        correlation_id: correlationId,
      });
    }

    return {
      content: [
        {
          type: "text",
          text: JSON.stringify({ correlation_id: correlationId, channel }),
        },
      ],
    };
  };
}
104
+
105
/**
 * Recess handler factory — pauses the session with a resumption trigger.
 *
 * The handler stores the trigger on `ctx.recessTrigger` (the resume shape
 * for the bridge), then ends the session through `concludeSession` with
 * the `recessed` verdict and the caller's reason as the summary. The
 * `recessed` verdict is what distinguishes a pause from an Adjourn.
 * NOTE(review): cancelling open Asks is presumably done inside
 * `concludeSession`; that helper is not visible here — confirm.
 *
 * @param {object} ctx - Orchestration context to update.
 * @returns {function({reason: string, trigger: object}): Promise<{content: object[]}>}
 */
export function createRecessHandler(ctx) {
  return async ({ reason, trigger }) => {
    // Record the trigger before concluding so it is already on the context
    // when the session ends.
    ctx.recessTrigger = trigger;

    const conclusion = {
      verdict: "recessed",
      summary: reason,
      reason: "session recessed",
    };
    concludeSession(ctx, conclusion);

    const acknowledgement = { type: "text", text: "Recess queued." };
    return { content: [acknowledgement] };
  };
}
123
+
124
/**
 * Adjourn handler factory — ends the discussion with a verdict.
 *
 * The handler records `outcome` on the context only when one was supplied,
 * then ends the session through `concludeSession` with the given verdict
 * and summary.
 *
 * @param {object} ctx - Orchestration context to update.
 * @returns {function({verdict: string, summary: string, outcome?: string}): Promise<{content: object[]}>}
 */
export function createAdjournHandler(ctx) {
  return async ({ verdict, summary, outcome }) => {
    const hasOutcome = outcome !== undefined;
    if (hasOutcome) {
      ctx.outcome = outcome;
    }

    const conclusion = { verdict, summary, reason: "session adjourned" };
    concludeSession(ctx, conclusion);

    const acknowledgement = { type: "text", text: "Session adjourned." };
    return { content: [acknowledgement] };
  };
}
@@ -0,0 +1,315 @@
1
+ /**
2
+ * Discusser — async, suspendable orchestration on top of a within-run
3
+ * `OrchestrationLoop`. The lead role uses `DiscussTools` (Adjourn / Recess
4
+ * / RequestForComment) instead of the facilitator's Conclude.
5
+ *
6
+ * Discuss mode is a sibling of facilitate mode, not a subset of it. The
7
+ * within-run turn loop is shared via `OrchestrationLoop`, but the lead
8
+ * role, tool set, system prompts, and participant naming all stay
9
+ * mode-local.
10
+ */
11
+
12
+ import { Writable } from "node:stream";
13
+ import { resolve } from "node:path";
14
+
15
+ import { createAgentRunner } from "./agent-runner.js";
16
+ import { composeProfilePrompt } from "./profile-prompt.js";
17
+ import { SequenceCounter } from "./sequence-counter.js";
18
+ import { createMessageBus } from "./message-bus.js";
19
+ import { createOrchestrationContext } from "./orchestration-toolkit.js";
20
+ import {
21
+ createDiscussLeadToolServer,
22
+ createDiscussAgentToolServer,
23
+ DISCUSS_AGENT_SYSTEM_PROMPT,
24
+ } from "./discuss-tools.js";
25
+ import { OrchestrationLoop } from "./orchestration-loop.js";
26
+
27
/**
 * Trailer appended to the system prompt of the lead (Chair) runner in
 * discuss mode. Each array entry covers one tool or rule; entries are
 * joined without a separator, so every entry except the last carries its
 * own trailing space.
 */
export const DISCUSS_SYSTEM_PROMPT = [
  "You lead an asynchronous discussion across multiple participants and a human channel. ",
  "Ask sends a question and returns immediately with {askIds:[N,…]}. The reply arrives on a later turn as `[answer#N] <participant>: <text>` in your inbox — between turns you can plan, reflect, or send more Asks while participants work in parallel. End your turn with text after you've asked everything you intend to; the orchestrator wakes you when the next message lands. ",
  "Answer replies to an ask a participant addressed to you (you'll see it tagged `[ask#N] <participant>: …` in your inbox). Quote askId from the [ask#N] tag; omit it and the handler auto-picks the only pending ask or routes your message as an Announce. ",
  "Announce delivers a message with no reply obligation. ",
  "RollCall returns the participant roster. ",
  "RequestForComment posts a message to the human thread via the bridge. Every reply you want the human to see MUST go through RequestForComment — the bridge delivers only queued replies, not your text output. ",
  "Recess suspends the run with a resumption trigger (responses / elapsed / either); any open Asks get a synthetic '[no answer: session concluded]' on the asker's queue so nothing dangles. ",
  "Adjourn ends the discussion with a verdict ('adjourned' / 'failed') and a summary. ",
  "Multiple Ask / Announce calls in one assistant turn dispatch in parallel — issue them as parallel tool_use blocks rather than sending the same question both broadcast and individually. ",
  "You MUST call RequestForComment with your response before calling Adjourn. You MUST end every run by calling Adjourn or Recess — never end a turn with only text *after* every Ask round has resolved.",
].join("");
39
+
40
/**
 * Add the discuss-mode fields to an orchestration context, in place.
 *
 * Sets the discussion id and resets the recess trigger, the queued
 * replies, the RequestForComment counter and the outcome to their
 * starting values. Existing values for those fields are overwritten.
 *
 * @param {object} ctx - Context to mutate.
 * @param {string|null} discussionId - Stored as `ctx.discussionId`.
 * @returns {object} The same `ctx` instance.
 */
export function augmentContextForDiscuss(ctx, discussionId) {
  return Object.assign(ctx, {
    discussionId,
    recessTrigger: null,
    replies: [],
    rfcCounter: 0,
    outcome: null,
  });
}
54
+
55
/** Writable sink that discards every chunk and signals completion immediately. */
const devNull = new Writable({
  write: (_chunk, _encoding, done) => done(),
});
60
+
61
/**
 * Async orchestrator for the `discuss` mode. It wraps an
 * `OrchestrationLoop` that runs the within-run turns, and itself owns the
 * discussion id and the discuss-specific summary line written to the
 * trace output.
 */
export class Discusser {
  /**
   * @param {object} deps
   * @param {OrchestrationLoop} deps.loop - Runs the turns; must expose `run(task)` and `leadTurns`.
   * @param {object} deps.ctx - Shared orchestration context (verdict, replies, recess trigger, …).
   * @param {import("stream").Writable} deps.output - Receives one JSON line per emitted event.
   * @param {object} deps.redactor - Must expose `redactValue(value)`; applied to every emitted record.
   * @param {string|null} [deps.discussionId] - Defaults to `null`.
   * @param {SequenceCounter} [deps.counter] - Sequence source; a fresh counter is created when omitted.
   * @throws {Error} When `loop`, `ctx`, `output` or `redactor` is missing.
   */
  constructor({ loop, ctx, output, discussionId, counter, redactor }) {
    // Checked in this order so the first missing dependency is the one reported.
    const required = { loop, ctx, output, redactor };
    for (const [name, value] of Object.entries(required)) {
      if (!value) throw new Error(`${name} is required`);
    }

    this.loop = loop;
    this.ctx = ctx;
    this.output = output;
    this.discussionId = discussionId ?? null;
    this.counter = counter ?? new SequenceCounter();
    this.redactor = redactor;
  }

  /**
   * Run the discussion: write the meta line (only when a discussion id is
   * set), run the loop, then write the discuss summary line.
   *
   * A missing `ctx.verdict` is reported as "failed"; only "adjourned"
   * counts as success. The returned `replies` array is a copy.
   *
   * @param {string} task
   * @returns {Promise<{success: boolean, verdict: string, turns: number, replies: object[], trigger: object|null}>}
   */
  async run(task) {
    this.#emitMeta();

    // The summary below is written after the loop has finished, so it is
    // the last summary line in the trace.
    await this.loop.run(task);

    const verdict = this.ctx.verdict ?? "failed";
    const outcome = {
      success: verdict === "adjourned",
      verdict,
      turns: this.loop.leadTurns,
    };
    this.#emitDiscussSummary(outcome);

    return {
      ...outcome,
      replies: this.ctx.replies.slice(),
      trigger: this.ctx.recessTrigger ?? null,
    };
  }

  /** Write the `meta` event carrying the discussion id; a no-op without one. */
  #emitMeta() {
    if (!this.discussionId) return;
    this.#writeEvent({ type: "meta", discussion_id: this.discussionId });
  }

  /**
   * Write the discuss `summary` event. Optional fields (summary, outcome,
   * trigger, discussion_id) are included only when they are truthy.
   */
  #emitDiscussSummary({ success, verdict, turns }) {
    const { summary, outcome, replies, recessTrigger } = this.ctx;
    this.#writeEvent({
      type: "summary",
      success,
      verdict,
      turns,
      ...(summary && { summary }),
      ...(outcome && { outcome }),
      replies,
      ...(recessTrigger && { trigger: recessTrigger }),
      ...(this.discussionId && { discussion_id: this.discussionId }),
    });
  }

  /** Wrap an event in an orchestrator record, redact it, and write it as one JSON line. */
  #writeEvent(event) {
    const record = this.redactor.redactValue({
      source: "orchestrator",
      seq: this.counter.next(),
      event,
    });
    this.output.write(`${JSON.stringify(record)}\n`);
  }
}
158
+
159
/**
 * Factory — wires the lead and agent runners with the discuss tool
 * servers, builds the `OrchestrationLoop` (with `leadName: "lead"` and
 * `mode: "discussion"`) and returns the wrapping `Discusser`.
 *
 * Resume semantics: when `resumeContext` is given, its participants,
 * replies and counters are copied onto the fresh context. The arrays are
 * copied, not shared, so replies queued during this run never leak back
 * into the caller's resume payload. `pendingAsks` is intentionally not
 * restored.
 *
 * @param {object} deps
 * @param {string} [deps.leadProfile] - Profile used to compose the lead's system prompt.
 * @param {string} [deps.leadModel] - Lead model; defaults to "claude-opus-4-7[1m]".
 * @param {string} [deps.agentModel] - Model for every agent; same default as the lead.
 * @param {Array<object>} [deps.agentConfigs] - One entry per agent; reads `name`, `role`,
 *   `cwd`, `maxTurns`, `allowedTools`, `agentProfile` and `systemPromptAmend`.
 * @param {string|null} [deps.discussionId]
 * @param {object|null} [deps.resumeContext] - Optional `participants`, `replies`,
 *   `askIdCounter` and `rfcCounter` from an earlier run.
 * @param {function} deps.query - Passed through to every agent runner.
 * @param {import("stream").Writable} deps.output - Trace output for the loop and the Discusser.
 * @param {number} [deps.maxTurns] - Lead turn budget; defaults to 80.
 * @param {string} [deps.leadCwd] - Lead working directory; defaults to ".".
 * @param {string} [deps.profilesDir] - Defaults to `<leadCwd>/.claude/agents`.
 * @param {string} [deps.taskAmend] - Passed through to the loop.
 * @param {object} deps.redactor - Required; shared by runners, loop and Discusser.
 * @returns {Discusser}
 * @throws {Error} When `redactor` is missing.
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: factory wires N runners + resume hydration paths
export function createDiscusser({
  leadProfile,
  leadModel,
  agentModel,
  agentConfigs,
  discussionId,
  resumeContext,
  query,
  output,
  maxTurns,
  leadCwd,
  profilesDir,
  taskAmend,
  redactor,
}) {
  if (!redactor) throw new Error("redactor is required");

  // Single source for the model used when the caller names none.
  const defaultModel = "claude-opus-4-7[1m]";
  const resolvedLeadCwd = resolve(leadCwd ?? ".");
  const resolvedProfilesDir =
    profilesDir ?? resolve(resolvedLeadCwd, ".claude/agents");
  const resolvedConfigs = agentConfigs ?? [];

  const ctx = augmentContextForDiscuss(
    createOrchestrationContext(),
    discussionId ?? null,
  );

  // Hydrate resume context — participants, replies, counters. `pendingAsks`
  // is intentionally not restored. Arrays are copied so later pushes onto
  // `ctx.replies` / `ctx.participants` cannot mutate the caller's object.
  if (resumeContext) {
    if (Array.isArray(resumeContext.participants))
      ctx.participants = [...resumeContext.participants];
    if (Array.isArray(resumeContext.replies))
      ctx.replies = [...resumeContext.replies];
    if (typeof resumeContext.askIdCounter === "number")
      ctx.askIdCounter = resumeContext.askIdCounter;
    if (typeof resumeContext.rfcCounter === "number")
      ctx.rfcCounter = resumeContext.rfcCounter;
  }

  const messageBus = createMessageBus({
    participants: ["lead", ...resolvedConfigs.map((a) => a.name)],
  });
  ctx.messageBus = messageBus;
  // Build the roster from the configs only when resume did not supply one.
  if (ctx.participants.length === 0) {
    ctx.participants = [
      { name: "lead", role: "lead" },
      ...resolvedConfigs.map((a) => ({ name: a.name, role: a.role })),
    ];
  }

  // With a profile the prompt is composed from it; otherwise the
  // claude_code preset is used with the trailer appended.
  const systemPromptFor = (profile, trailer) => {
    if (!trailer) throw new Error("trailer is required");
    return profile
      ? composeProfilePrompt(profile, {
          profilesDir: resolvedProfilesDir,
          trailer,
        })
      : { type: "preset", preset: "claude_code", append: trailer };
  };

  // Assigned at the end; the runners' `onLine` callbacks close over this
  // binding and only dereference it once lines start flowing.
  let discusser;
  const leadServer = createDiscussLeadToolServer(ctx);

  const agents = resolvedConfigs.map((config) => {
    const agentServer = createDiscussAgentToolServer(ctx, {
      from: config.name,
    });

    const agentTrailer = config.systemPromptAmend
      ? `${DISCUSS_AGENT_SYSTEM_PROMPT}\n\n${config.systemPromptAmend}`
      : DISCUSS_AGENT_SYSTEM_PROMPT;

    const runner = createAgentRunner({
      cwd: config.cwd ?? resolvedLeadCwd,
      query,
      // Runner output is discarded; lines reach the trace through `onLine`.
      output: devNull,
      model: agentModel ?? defaultModel,
      maxTurns: config.maxTurns ?? 50,
      allowedTools: config.allowedTools,
      onLine: (line) => discusser.loop.emitLine(config.name, line),
      mcpServers: { orchestration: agentServer },
      settingSources: ["project"],
      systemPrompt: systemPromptFor(config.agentProfile, agentTrailer),
      redactor,
    });

    return { name: config.name, role: config.role, runner };
  });

  const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
  const leadRunner = createAgentRunner({
    cwd: resolvedLeadCwd,
    query,
    output: devNull,
    model: leadModel ?? defaultModel,
    maxTurns: maxTurns ?? 80,
    allowedTools: ["Bash", "Read", "Glob", "Grep", "Write", "Edit"],
    disallowedTools: defaultDisallowed,
    onLine: (line) => discusser.loop.emitLine("lead", line),
    mcpServers: { orchestration: leadServer },
    settingSources: ["project"],
    systemPrompt: systemPromptFor(leadProfile, DISCUSS_SYSTEM_PROMPT),
    redactor,
  });

  const loop = new OrchestrationLoop({
    leadRunner,
    agents,
    messageBus,
    output,
    leadName: "lead",
    mode: "discussion",
    ctx,
    taskAmend,
    redactor,
  });

  // Share the loop's counter so Discusser and loop lines draw `seq` from
  // one sequence.
  discusser = new Discusser({
    loop,
    ctx,
    output,
    discussionId: discussionId ?? null,
    redactor,
    counter: loop.counter,
  });
  return discusser;
}