@forwardimpact/libeval 0.1.13 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-eval.js +92 -44
- package/package.json +3 -2
- package/src/agent-runner.js +29 -40
- package/src/commands/facilitate.js +109 -0
- package/src/commands/run.js +17 -1
- package/src/facilitator.js +492 -0
- package/src/index.js +15 -2
- package/src/message-bus.js +100 -0
- package/src/orchestration-toolkit.js +209 -0
- package/src/sequence-counter.js +17 -0
- package/src/supervisor.js +128 -210
- package/src/tee-writer.js +20 -26
package/src/supervisor.js
CHANGED
|
@@ -4,50 +4,37 @@
|
|
|
4
4
|
* introduces itself, and delegates work to the agent. The loop then alternates:
|
|
5
5
|
* agent → supervisor → agent.
|
|
6
6
|
*
|
|
7
|
+
* Signaling uses orchestration tools (Conclude, Redirect, Ask) via in-process
|
|
8
|
+
* MCP servers. No text-token detection.
|
|
9
|
+
*
|
|
7
10
|
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
|
|
8
11
|
*/
|
|
9
12
|
|
|
10
|
-
import {
|
|
13
|
+
import { Writable } from "node:stream";
|
|
11
14
|
import { createAgentRunner } from "./agent-runner.js";
|
|
12
15
|
import { TraceCollector } from "./trace-collector.js";
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
* @param {string} text
|
|
20
|
-
* @returns {boolean}
|
|
21
|
-
*/
|
|
22
|
-
export function isComplete(text) {
|
|
23
|
-
return /(?:^|[\s*_~`])EVALUATION_COMPLETE(?:[\s*_~`.,!?]|$)/m.test(text);
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
/**
|
|
27
|
-
* Check if the supervisor's response signals a mid-turn intervention.
|
|
28
|
-
* Same tolerance rules as isComplete (markdown formatting, word boundaries),
|
|
29
|
-
* but matches the EVALUATION_INTERVENTION keyword instead.
|
|
30
|
-
* @param {string} text
|
|
31
|
-
* @returns {boolean}
|
|
32
|
-
*/
|
|
33
|
-
export function isIntervention(text) {
|
|
34
|
-
return /(?:^|[\s*_~`])EVALUATION_INTERVENTION(?:[\s*_~`.,!?]|$)/m.test(text);
|
|
35
|
-
}
|
|
16
|
+
import { SequenceCounter } from "./sequence-counter.js";
|
|
17
|
+
import {
|
|
18
|
+
createOrchestrationContext,
|
|
19
|
+
createSupervisorToolServer,
|
|
20
|
+
createSupervisedAgentToolServer,
|
|
21
|
+
} from "./orchestration-toolkit.js";
|
|
36
22
|
|
|
37
23
|
/** System prompt appended for the supervisor runner in supervise mode. */
|
|
38
24
|
export const SUPERVISOR_SYSTEM_PROMPT =
|
|
39
25
|
"You relay messages to one persistent agent session — your only output " +
|
|
40
26
|
"channel. Spawning sub-agents or restarting the agent is blocked. Do not " +
|
|
41
|
-
"do the work yourself. Reply briefly to let the agent continue
|
|
42
|
-
"
|
|
43
|
-
"
|
|
44
|
-
"relayed.";
|
|
27
|
+
"do the work yourself. Reply briefly to let the agent continue. Use your " +
|
|
28
|
+
"Redirect tool to interrupt and correct the agent. Use your Conclude tool " +
|
|
29
|
+
"with a summary when the task is fully done. Only your final message each " +
|
|
30
|
+
"turn is relayed.";
|
|
45
31
|
|
|
46
32
|
/** System prompt appended for the agent runner in supervise mode. */
|
|
47
33
|
export const AGENT_SYSTEM_PROMPT =
|
|
48
34
|
"A supervisor watches your work and may interrupt with new instructions " +
|
|
49
35
|
"mid-task. Treat any new prompt as authoritative and adjust course. " +
|
|
50
|
-
"When uncertain,
|
|
36
|
+
"When uncertain, use your Ask tool to ask the supervisor a clarifying " +
|
|
37
|
+
"question — you will receive a direct answer.";
|
|
51
38
|
|
|
52
39
|
/**
|
|
53
40
|
* Maximum number of mid-turn interventions allowed within a single agent turn.
|
|
@@ -64,8 +51,9 @@ export class Supervisor {
|
|
|
64
51
|
* @param {import("./agent-runner.js").AgentRunner} deps.supervisorRunner - Runs the supervisor sessions
|
|
65
52
|
* @param {import("stream").Writable} deps.output - Stream to emit tagged NDJSON to
|
|
66
53
|
* @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
|
|
54
|
+
* @param {object} [deps.ctx] - Orchestration context (injected by factory)
|
|
67
55
|
*/
|
|
68
|
-
constructor({ agentRunner, supervisorRunner, output, maxTurns }) {
|
|
56
|
+
constructor({ agentRunner, supervisorRunner, output, maxTurns, ctx }) {
|
|
69
57
|
if (!agentRunner) throw new Error("agentRunner is required");
|
|
70
58
|
if (!supervisorRunner) throw new Error("supervisorRunner is required");
|
|
71
59
|
if (!output) throw new Error("output is required");
|
|
@@ -73,51 +61,22 @@ export class Supervisor {
|
|
|
73
61
|
this.supervisorRunner = supervisorRunner;
|
|
74
62
|
this.output = output;
|
|
75
63
|
this.maxTurns = maxTurns ?? 100;
|
|
64
|
+
this.ctx = ctx ?? createOrchestrationContext();
|
|
65
|
+
this.counter = new SequenceCounter();
|
|
76
66
|
/** @type {"agent"|"supervisor"} */
|
|
77
67
|
this.currentSource = "agent";
|
|
78
68
|
/** @type {number} */
|
|
79
69
|
this.currentTurn = 0;
|
|
80
|
-
/**
|
|
81
|
-
* Set to true when any supervisor message contains the success signal.
|
|
82
|
-
* The SDK result text only reflects the last assistant message, so when
|
|
83
|
-
* the supervisor writes EVALUATION_COMPLETE in an early message and
|
|
84
|
-
* then continues with follow-up work, the result text won't contain it.
|
|
85
|
-
* This flag captures the signal from the full message stream.
|
|
86
|
-
* @type {boolean}
|
|
87
|
-
*/
|
|
88
|
-
this.completeSignalSeen = false;
|
|
89
|
-
/**
|
|
90
|
-
* Set to true when any supervisor message contains EVALUATION_INTERVENTION.
|
|
91
|
-
* Mirrors completeSignalSeen — populated by emitLine when a supervisor
|
|
92
|
-
* assistant text block matches isIntervention(...). The mid-turn loop
|
|
93
|
-
* reads this flag after each supervisor invocation to decide whether to
|
|
94
|
-
* abort the agent's in-flight SDK session.
|
|
95
|
-
* @type {boolean}
|
|
96
|
-
*/
|
|
97
|
-
this.interventionSignalSeen = false;
|
|
98
|
-
/**
|
|
99
|
-
* The most recent supervisor SDK result captured inside the mid-turn
|
|
100
|
-
* onBatch callback. The outer loop reads this after the agent aborts to
|
|
101
|
-
* build the next relay prompt without re-running the supervisor.
|
|
102
|
-
* @type {{success: boolean, text: string}|null}
|
|
103
|
-
*/
|
|
104
|
-
this.lastSupervisorResult = null;
|
|
105
70
|
}
|
|
106
71
|
|
|
107
72
|
/**
|
|
108
73
|
* Run the supervisor ↔ agent relay loop.
|
|
109
|
-
* The supervisor receives the task first, introduces itself, and delegates
|
|
110
|
-
* work to the agent. The loop then alternates: agent → supervisor → agent.
|
|
111
74
|
* @param {string} task - The initial task for the supervisor
|
|
112
75
|
* @returns {Promise<{success: boolean, turns: number}>}
|
|
113
76
|
*/
|
|
114
77
|
async run(task) {
|
|
115
|
-
// Turn 0: Supervisor receives the task and introduces it to the agent
|
|
116
78
|
this.currentSource = "supervisor";
|
|
117
79
|
this.currentTurn = 0;
|
|
118
|
-
this.completeSignalSeen = false;
|
|
119
|
-
this.interventionSignalSeen = false;
|
|
120
|
-
this.lastSupervisorResult = null;
|
|
121
80
|
let supervisorResult = await this.supervisorRunner.run(task);
|
|
122
81
|
|
|
123
82
|
if (supervisorResult.error) {
|
|
@@ -125,38 +84,25 @@ export class Supervisor {
|
|
|
125
84
|
return { success: false, turns: 0 };
|
|
126
85
|
}
|
|
127
86
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
// assistant message, so when the supervisor writes EVALUATION_COMPLETE
|
|
131
|
-
// early and then continues (e.g. filing issues), we must also check the
|
|
132
|
-
// flag set by emitLine during streaming.
|
|
133
|
-
if (this.completeSignalSeen || isComplete(supervisorResult.text)) {
|
|
134
|
-
this.emitSummary({ success: true, turns: 0 });
|
|
87
|
+
if (this.ctx.concluded) {
|
|
88
|
+
this.emitSummary({ success: true, turns: 0, summary: this.ctx.summary });
|
|
135
89
|
return { success: true, turns: 0 };
|
|
136
90
|
}
|
|
137
91
|
|
|
92
|
+
let pendingRelay = null;
|
|
138
93
|
const turnLimit = this.maxTurns === 0 ? Infinity : this.maxTurns;
|
|
139
94
|
for (let turn = 1; turn <= turnLimit; turn++) {
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
const relay = this.extractLastText(
|
|
144
|
-
this.supervisorRunner,
|
|
145
|
-
supervisorResult.text,
|
|
146
|
-
);
|
|
95
|
+
const relay =
|
|
96
|
+
pendingRelay ??
|
|
97
|
+
this.extractLastText(this.supervisorRunner, supervisorResult.text);
|
|
147
98
|
|
|
148
|
-
// Drive the agent through interventions until its SDK session ends
|
|
149
|
-
// naturally, the supervisor signals completion mid-turn, or the
|
|
150
|
-
// per-turn intervention budget is exhausted.
|
|
151
99
|
const turnOutcome = await this.#runAgentTurn(turn, relay);
|
|
152
100
|
if (turnOutcome.exit) return turnOutcome.exit;
|
|
153
101
|
|
|
154
|
-
// End-of-turn review (existing behaviour). Returns either an exit
|
|
155
|
-
// outcome (error or completion) or the supervisor result for the
|
|
156
|
-
// next turn's relay.
|
|
157
102
|
const reviewOutcome = await this.#endOfTurnReview(turn);
|
|
158
103
|
if (reviewOutcome.exit) return reviewOutcome.exit;
|
|
159
104
|
supervisorResult = reviewOutcome.supervisorResult;
|
|
105
|
+
pendingRelay = reviewOutcome.relay ?? null;
|
|
160
106
|
}
|
|
161
107
|
|
|
162
108
|
this.emitSummary({ success: false, turns: this.maxTurns });
|
|
@@ -165,9 +111,8 @@ export class Supervisor {
|
|
|
165
111
|
|
|
166
112
|
/**
|
|
167
113
|
* Drive the agent through one turn, allowing the supervisor to interrupt
|
|
168
|
-
*
|
|
169
|
-
*
|
|
170
|
-
* end-of-turn review).
|
|
114
|
+
* via the Redirect tool. Returns either an `exit` outcome (the loop should
|
|
115
|
+
* return immediately) or `{exit: null}` (proceed to end-of-turn review).
|
|
171
116
|
* @param {number} turn
|
|
172
117
|
* @param {string} initialRelay
|
|
173
118
|
* @returns {Promise<{exit: {success: boolean, turns: number}|null}>}
|
|
@@ -176,11 +121,6 @@ export class Supervisor {
|
|
|
176
121
|
let relay = initialRelay;
|
|
177
122
|
let interventions = 0;
|
|
178
123
|
|
|
179
|
-
// Wire the mid-turn observation hook on the agent runner. The bound
|
|
180
|
-
// callback captures `turn` so the inner loop's multiple resume(...)
|
|
181
|
-
// calls all see the same turn id. The supervisorRunner does NOT get
|
|
182
|
-
// an onBatch callback — it only fires onLine, which is enough for
|
|
183
|
-
// emitLine to detect EVALUATION_COMPLETE / EVALUATION_INTERVENTION.
|
|
184
124
|
this.agentRunner.onBatch = (batchLines, ctx) =>
|
|
185
125
|
this.#midTurnReview(turn, batchLines, ctx);
|
|
186
126
|
|
|
@@ -198,124 +138,109 @@ export class Supervisor {
|
|
|
198
138
|
return { exit: { success: false, turns: turn } };
|
|
199
139
|
}
|
|
200
140
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
141
|
+
if (this.ctx.concluded) {
|
|
142
|
+
this.emitSummary({
|
|
143
|
+
success: true,
|
|
144
|
+
turns: turn,
|
|
145
|
+
summary: this.ctx.summary,
|
|
146
|
+
});
|
|
204
147
|
return { exit: { success: true, turns: turn } };
|
|
205
148
|
}
|
|
206
149
|
|
|
207
|
-
if (agentResult.aborted && this.
|
|
150
|
+
if (agentResult.aborted && this.ctx.redirect) {
|
|
208
151
|
interventions++;
|
|
152
|
+
const redirect = this.ctx.redirect;
|
|
153
|
+
this.ctx.redirect = null;
|
|
209
154
|
if (interventions >= MAX_INTERVENTIONS_PER_TURN) {
|
|
210
155
|
this.emitOrchestratorEvent({ type: "intervention_limit", turn });
|
|
211
156
|
return { exit: null };
|
|
212
157
|
}
|
|
213
|
-
relay =
|
|
214
|
-
this.supervisorRunner,
|
|
215
|
-
this.lastSupervisorResult?.text ?? "",
|
|
216
|
-
);
|
|
158
|
+
relay = redirect.message;
|
|
217
159
|
this.emitOrchestratorEvent({ type: "intervention_relayed", turn });
|
|
218
160
|
continue;
|
|
219
161
|
}
|
|
220
162
|
|
|
221
|
-
// Agent's SDK session finished naturally — proceed to end-of-turn.
|
|
222
163
|
return { exit: null };
|
|
223
164
|
}
|
|
224
165
|
} finally {
|
|
225
|
-
// Detach onBatch before the end-of-turn review so the supervisor's
|
|
226
|
-
// own SDK session does not trigger nested onBatch fires.
|
|
227
166
|
this.agentRunner.onBatch = null;
|
|
228
167
|
}
|
|
229
168
|
}
|
|
230
169
|
|
|
231
170
|
/**
|
|
232
171
|
* Mid-turn supervisor review fired from inside the agent's onBatch hook.
|
|
233
|
-
*
|
|
234
|
-
*
|
|
235
|
-
* EVALUATION_INTERVENTION or EVALUATION_COMPLETE.
|
|
172
|
+
* Runs the supervisor's LLM against the batch and aborts the agent if
|
|
173
|
+
* the supervisor calls Redirect or Conclude.
|
|
236
174
|
* @param {number} turn
|
|
237
175
|
* @param {string[]} batchLines
|
|
238
176
|
* @param {{abort: () => void}} ctx
|
|
239
177
|
*/
|
|
240
178
|
async #midTurnReview(turn, batchLines, { abort }) {
|
|
241
179
|
const batchTranscript = this.renderBatch(batchLines);
|
|
242
|
-
|
|
243
|
-
// Order matters: emit the orchestrator marker BEFORE the supervisor
|
|
244
|
-
// LLM call so the trace reads
|
|
245
|
-
// agent line → orchestrator:mid_turn_review
|
|
246
|
-
// → supervisor lines (tagged turn:N)
|
|
247
|
-
// → orchestrator:intervention_requested|complete_requested
|
|
248
180
|
this.emitOrchestratorEvent({ type: "mid_turn_review", turn });
|
|
249
181
|
|
|
250
|
-
// currentTurn stays = turn so mid-turn supervisor lines share the
|
|
251
|
-
// agent's turn id. They are distinguishable from end-of-turn reviews
|
|
252
|
-
// by the surrounding orchestrator events emitted around this call.
|
|
253
182
|
this.currentSource = "supervisor";
|
|
254
|
-
this.
|
|
255
|
-
this.interventionSignalSeen = false;
|
|
183
|
+
this.ctx.redirect = null;
|
|
256
184
|
|
|
257
|
-
|
|
185
|
+
await this.supervisorRunner.resume(
|
|
258
186
|
`The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` +
|
|
259
|
-
`
|
|
260
|
-
`EVALUATION_INTERVENTION followed by a corrective message to stop ` +
|
|
261
|
-
`and relay a new instruction. Write EVALUATION_COMPLETE only when ` +
|
|
262
|
-
`the task is fully done.`,
|
|
187
|
+
`Review and use your tools if action is needed.`,
|
|
263
188
|
);
|
|
264
189
|
this.currentSource = "agent";
|
|
265
190
|
|
|
266
|
-
if (this.
|
|
191
|
+
if (this.ctx.redirect) {
|
|
267
192
|
this.emitOrchestratorEvent({ type: "intervention_requested", turn });
|
|
268
193
|
abort();
|
|
269
194
|
return;
|
|
270
195
|
}
|
|
271
|
-
if (this.
|
|
196
|
+
if (this.ctx.concluded) {
|
|
272
197
|
this.emitOrchestratorEvent({ type: "complete_requested", turn });
|
|
273
198
|
abort();
|
|
274
199
|
}
|
|
275
|
-
// Non-intervention: do nothing; the agent loop pulls the next line.
|
|
276
200
|
}
|
|
277
201
|
|
|
278
202
|
/**
|
|
279
|
-
* End-of-turn supervisor review
|
|
280
|
-
*
|
|
281
|
-
*
|
|
203
|
+
* End-of-turn supervisor review. Returns either an exit outcome (error or
|
|
204
|
+
* completion) or the supervisor result so the outer loop can build the
|
|
205
|
+
* next turn's relay.
|
|
282
206
|
* @param {number} turn
|
|
283
|
-
* @returns {Promise<{exit: {success: boolean, turns: number}|null, supervisorResult?: object}>}
|
|
207
|
+
* @returns {Promise<{exit: {success: boolean, turns: number}|null, supervisorResult?: object, relay?: string}>}
|
|
284
208
|
*/
|
|
285
209
|
async #endOfTurnReview(turn) {
|
|
286
|
-
// Build the full agent transcript from buffered NDJSON events so the
|
|
287
|
-
// supervisor sees tool calls and reasoning, not just the SDK result.
|
|
288
210
|
const agentTranscript = this.extractTranscript(this.agentRunner);
|
|
289
|
-
|
|
290
|
-
const supervisorPrompt =
|
|
291
|
-
`The agent reported:\n\n${agentTranscript}\n\n` +
|
|
292
|
-
`Review the agent's work and decide how to proceed.`;
|
|
293
|
-
|
|
294
211
|
this.currentSource = "supervisor";
|
|
295
212
|
this.currentTurn = turn;
|
|
296
|
-
this.
|
|
297
|
-
|
|
298
|
-
const supervisorResult =
|
|
299
|
-
|
|
213
|
+
this.ctx.redirect = null;
|
|
214
|
+
|
|
215
|
+
const supervisorResult = await this.supervisorRunner.resume(
|
|
216
|
+
`The agent reported:\n\n${agentTranscript}\n\nReview the agent's work and decide how to proceed.`,
|
|
217
|
+
);
|
|
300
218
|
|
|
301
219
|
if (supervisorResult.error) {
|
|
302
220
|
this.emitSummary({ success: false, turns: turn });
|
|
303
221
|
return { exit: { success: false, turns: turn } };
|
|
304
222
|
}
|
|
305
223
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
224
|
+
if (this.ctx.concluded) {
|
|
225
|
+
this.emitSummary({
|
|
226
|
+
success: true,
|
|
227
|
+
turns: turn,
|
|
228
|
+
summary: this.ctx.summary,
|
|
229
|
+
});
|
|
310
230
|
return { exit: { success: true, turns: turn } };
|
|
311
231
|
}
|
|
312
232
|
|
|
233
|
+
if (this.ctx.redirect) {
|
|
234
|
+
const redirect = this.ctx.redirect;
|
|
235
|
+
this.ctx.redirect = null;
|
|
236
|
+
return { exit: null, supervisorResult, relay: redirect.message };
|
|
237
|
+
}
|
|
238
|
+
|
|
313
239
|
return { exit: null, supervisorResult };
|
|
314
240
|
}
|
|
315
241
|
|
|
316
242
|
/**
|
|
317
243
|
* Extract a human-readable transcript from an AgentRunner's buffered output.
|
|
318
|
-
* Drains the buffer and replays events through a TraceCollector.
|
|
319
244
|
* @param {import("./agent-runner.js").AgentRunner} runner
|
|
320
245
|
* @returns {string}
|
|
321
246
|
*/
|
|
@@ -330,11 +255,8 @@ export class Supervisor {
|
|
|
330
255
|
|
|
331
256
|
/**
|
|
332
257
|
* Extract only the last assistant text block from an AgentRunner's buffer.
|
|
333
|
-
* Scans buffered NDJSON events in reverse to find the final assistant message
|
|
334
|
-
* with a text content block. This prevents intermediate reasoning (tool calls,
|
|
335
|
-
* research notes) from leaking to the agent.
|
|
336
258
|
* @param {import("./agent-runner.js").AgentRunner} runner
|
|
337
|
-
* @param {string} fallback
|
|
259
|
+
* @param {string} fallback
|
|
338
260
|
* @returns {string}
|
|
339
261
|
*/
|
|
340
262
|
extractLastText(runner, fallback) {
|
|
@@ -354,43 +276,21 @@ export class Supervisor {
|
|
|
354
276
|
}
|
|
355
277
|
|
|
356
278
|
/**
|
|
357
|
-
* Emit a single NDJSON line tagged with the current source and
|
|
358
|
-
* Called in real-time via the AgentRunner onLine callback.
|
|
359
|
-
*
|
|
360
|
-
* When the current source is the supervisor, also scans assistant text
|
|
361
|
-
* content for the EVALUATION_COMPLETE and EVALUATION_INTERVENTION signals,
|
|
362
|
-
* setting completeSignalSeen / interventionSignalSeen respectively.
|
|
279
|
+
* Emit a single NDJSON line tagged with the current source and seq.
|
|
363
280
|
* @param {string} line - Raw NDJSON line from the runner
|
|
364
281
|
*/
|
|
365
282
|
emitLine(line) {
|
|
366
283
|
const event = JSON.parse(line);
|
|
367
284
|
const tagged = {
|
|
368
285
|
source: this.currentSource,
|
|
369
|
-
|
|
286
|
+
seq: this.counter.next(),
|
|
370
287
|
event,
|
|
371
288
|
};
|
|
372
289
|
this.output.write(JSON.stringify(tagged) + "\n");
|
|
373
|
-
|
|
374
|
-
// Scan supervisor assistant messages for the signals in real time.
|
|
375
|
-
// The SDK result text only reflects the final assistant message, but the
|
|
376
|
-
// supervisor may write EVALUATION_COMPLETE / EVALUATION_INTERVENTION in
|
|
377
|
-
// an earlier message and then continue with follow-up tool calls.
|
|
378
|
-
if (this.currentSource === "supervisor" && event.type === "assistant") {
|
|
379
|
-
const content = event.message?.content ?? event.content ?? [];
|
|
380
|
-
if (Array.isArray(content)) {
|
|
381
|
-
for (const block of content) {
|
|
382
|
-
if (block.type !== "text" || !block.text) continue;
|
|
383
|
-
if (isComplete(block.text)) this.completeSignalSeen = true;
|
|
384
|
-
if (isIntervention(block.text)) this.interventionSignalSeen = true;
|
|
385
|
-
}
|
|
386
|
-
}
|
|
387
|
-
}
|
|
388
290
|
}
|
|
389
291
|
|
|
390
292
|
/**
|
|
391
|
-
* Render a batch of buffered NDJSON lines as human-readable text
|
|
392
|
-
* mid-turn supervisor prompt. Reuses the TraceCollector pipeline so the
|
|
393
|
-
* supervisor sees tool calls and reasoning, not just raw events.
|
|
293
|
+
* Render a batch of buffered NDJSON lines as human-readable text.
|
|
394
294
|
* @param {string[]} batchLines
|
|
395
295
|
* @returns {string}
|
|
396
296
|
*/
|
|
@@ -404,53 +304,59 @@ export class Supervisor {
|
|
|
404
304
|
}
|
|
405
305
|
|
|
406
306
|
/**
|
|
407
|
-
* Emit an orchestrator-source NDJSON line.
|
|
408
|
-
* mark mid_turn_review / intervention_requested / intervention_relayed /
|
|
409
|
-
* intervention_limit / complete_requested boundaries in the trace, so the
|
|
410
|
-
* improvement coach can distinguish mid-turn supervisor activity from
|
|
411
|
-
* end-of-turn reviews. Additive to existing trace shape — the parser
|
|
412
|
-
* already reads `source` and ignores unknown event types.
|
|
307
|
+
* Emit an orchestrator-source NDJSON line.
|
|
413
308
|
* @param {{type: string, turn?: number}} event
|
|
414
309
|
*/
|
|
415
310
|
emitOrchestratorEvent(event) {
|
|
416
311
|
this.output.write(
|
|
417
312
|
JSON.stringify({
|
|
418
313
|
source: "orchestrator",
|
|
419
|
-
|
|
314
|
+
seq: this.counter.next(),
|
|
420
315
|
event,
|
|
421
316
|
}) + "\n",
|
|
422
317
|
);
|
|
423
318
|
}
|
|
424
319
|
|
|
425
320
|
/**
|
|
426
|
-
* Emit a final orchestrator summary line.
|
|
427
|
-
* @param {{success: boolean, turns: number}} result
|
|
321
|
+
* Emit a final orchestrator summary line, wrapped in the universal envelope.
|
|
322
|
+
* @param {{success: boolean, turns: number, summary?: string}} result
|
|
428
323
|
*/
|
|
429
324
|
emitSummary(result) {
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
325
|
+
this.output.write(
|
|
326
|
+
JSON.stringify({
|
|
327
|
+
source: "orchestrator",
|
|
328
|
+
seq: this.counter.next(),
|
|
329
|
+
event: {
|
|
330
|
+
type: "summary",
|
|
331
|
+
success: result.success,
|
|
332
|
+
turns: result.turns,
|
|
333
|
+
...(result.summary && { summary: result.summary }),
|
|
334
|
+
},
|
|
335
|
+
}) + "\n",
|
|
336
|
+
);
|
|
437
337
|
}
|
|
438
338
|
}
|
|
439
339
|
|
|
340
|
+
const devNull = new Writable({
|
|
341
|
+
write(_chunk, _enc, cb) {
|
|
342
|
+
cb();
|
|
343
|
+
},
|
|
344
|
+
});
|
|
345
|
+
|
|
440
346
|
/**
|
|
441
347
|
* Factory function — wires both AgentRunners with their respective configs.
|
|
442
348
|
* @param {object} deps
|
|
443
|
-
* @param {string} deps.supervisorCwd
|
|
444
|
-
* @param {string} deps.agentCwd
|
|
445
|
-
* @param {function} deps.query
|
|
446
|
-
* @param {import("stream").Writable} deps.output
|
|
447
|
-
* @param {string} [deps.model]
|
|
448
|
-
* @param {number} [deps.maxTurns]
|
|
449
|
-
* @param {string[]} [deps.allowedTools]
|
|
450
|
-
* @param {string[]} [deps.supervisorAllowedTools]
|
|
451
|
-
* @param {string[]} [deps.supervisorDisallowedTools]
|
|
452
|
-
* @param {string} [deps.supervisorProfile]
|
|
453
|
-
* @param {string} [deps.agentProfile]
|
|
349
|
+
* @param {string} deps.supervisorCwd
|
|
350
|
+
* @param {string} deps.agentCwd
|
|
351
|
+
* @param {function} deps.query
|
|
352
|
+
* @param {import("stream").Writable} deps.output
|
|
353
|
+
* @param {string} [deps.model]
|
|
354
|
+
* @param {number} [deps.maxTurns]
|
|
355
|
+
* @param {string[]} [deps.allowedTools]
|
|
356
|
+
* @param {string[]} [deps.supervisorAllowedTools]
|
|
357
|
+
* @param {string[]} [deps.supervisorDisallowedTools]
|
|
358
|
+
* @param {string} [deps.supervisorProfile]
|
|
359
|
+
* @param {string} [deps.agentProfile]
|
|
454
360
|
* @returns {Supervisor}
|
|
455
361
|
*/
|
|
456
362
|
export function createSupervisor({
|
|
@@ -466,15 +372,31 @@ export function createSupervisor({
|
|
|
466
372
|
supervisorProfile,
|
|
467
373
|
agentProfile,
|
|
468
374
|
}) {
|
|
469
|
-
// Forward-reference: onLine captures `supervisor` before construction completes.
|
|
470
|
-
// This is safe because onLine is only called during run(), after construction.
|
|
471
375
|
let supervisor;
|
|
376
|
+
let supervisorRunner;
|
|
377
|
+
|
|
378
|
+
const ctx = createOrchestrationContext();
|
|
379
|
+
|
|
380
|
+
const supervisorServer = createSupervisorToolServer(ctx);
|
|
381
|
+
const agentServer = createSupervisedAgentToolServer(ctx, {
|
|
382
|
+
onAsk: async (question) => {
|
|
383
|
+
supervisor.currentSource = "supervisor";
|
|
384
|
+
supervisor.emitOrchestratorEvent({ type: "ask_received" });
|
|
385
|
+
await supervisorRunner.resume(
|
|
386
|
+
`The agent asks: "${question}"\n\nAnswer the question directly.`,
|
|
387
|
+
);
|
|
388
|
+
supervisor.currentSource = "agent";
|
|
389
|
+
supervisor.emitOrchestratorEvent({ type: "ask_answered" });
|
|
390
|
+
return supervisor.extractLastText(supervisorRunner, "No answer.");
|
|
391
|
+
},
|
|
392
|
+
});
|
|
393
|
+
|
|
472
394
|
const onLine = (line) => supervisor.emitLine(line);
|
|
473
395
|
|
|
474
396
|
const agentRunner = createAgentRunner({
|
|
475
397
|
cwd: agentCwd,
|
|
476
398
|
query,
|
|
477
|
-
output:
|
|
399
|
+
output: devNull,
|
|
478
400
|
model,
|
|
479
401
|
maxTurns: 50,
|
|
480
402
|
allowedTools,
|
|
@@ -486,24 +408,18 @@ export function createSupervisor({
|
|
|
486
408
|
preset: "claude_code",
|
|
487
409
|
append: AGENT_SYSTEM_PROMPT,
|
|
488
410
|
},
|
|
411
|
+
mcpServers: { orchestration: agentServer },
|
|
489
412
|
});
|
|
490
413
|
|
|
491
|
-
// Block every sub-agent spawning tool so the supervisor cannot bypass the
|
|
492
|
-
// relay loop. The current Claude Agent SDK exposes the spawn tool to the
|
|
493
|
-
// model as `Agent`; older versions called it `Task`. Both are blocked
|
|
494
|
-
// (along with TaskOutput/TaskStop) so the supervisor sees no spawn tool
|
|
495
|
-
// regardless of which SDK version is installed. Letting the supervisor
|
|
496
|
-
// spawn its own sub-agent would bypass the relay and produce an empty
|
|
497
|
-
// agent trace, which is the failure mode that motivated this default.
|
|
498
414
|
const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
|
|
499
415
|
const disallowedTools = supervisorDisallowedTools
|
|
500
416
|
? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]
|
|
501
417
|
: defaultDisallowed;
|
|
502
418
|
|
|
503
|
-
|
|
419
|
+
supervisorRunner = createAgentRunner({
|
|
504
420
|
cwd: supervisorCwd,
|
|
505
421
|
query,
|
|
506
|
-
output:
|
|
422
|
+
output: devNull,
|
|
507
423
|
model,
|
|
508
424
|
maxTurns: 20,
|
|
509
425
|
allowedTools: supervisorAllowedTools ?? [
|
|
@@ -523,6 +439,7 @@ export function createSupervisor({
|
|
|
523
439
|
preset: "claude_code",
|
|
524
440
|
append: SUPERVISOR_SYSTEM_PROMPT,
|
|
525
441
|
},
|
|
442
|
+
mcpServers: { orchestration: supervisorServer },
|
|
526
443
|
});
|
|
527
444
|
|
|
528
445
|
supervisor = new Supervisor({
|
|
@@ -530,6 +447,7 @@ export function createSupervisor({
|
|
|
530
447
|
supervisorRunner,
|
|
531
448
|
output,
|
|
532
449
|
maxTurns,
|
|
450
|
+
ctx,
|
|
533
451
|
});
|
|
534
452
|
return supervisor;
|
|
535
453
|
}
|
package/src/tee-writer.js
CHANGED
|
@@ -3,9 +3,9 @@
|
|
|
3
3
|
* simultaneously streaming human-readable text to a separate stream (e.g.
|
|
4
4
|
* process.stdout).
|
|
5
5
|
*
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
6
|
+
* All modes emit the same { source, seq, event } envelope. The `mode`
|
|
7
|
+
* parameter controls display formatting: multi-participant modes show
|
|
8
|
+
* source labels on content lines.
|
|
9
9
|
*
|
|
10
10
|
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
|
|
11
11
|
*/
|
|
@@ -18,7 +18,7 @@ export class TeeWriter extends Writable {
|
|
|
18
18
|
* @param {object} deps
|
|
19
19
|
* @param {import("stream").Writable} deps.fileStream - Stream to write raw NDJSON to
|
|
20
20
|
* @param {import("stream").Writable} deps.textStream - Stream to write human-readable text to
|
|
21
|
-
* @param {"raw"|"supervised"} [deps.mode] -
|
|
21
|
+
* @param {"raw"|"supervised"} [deps.mode] - Display mode: "raw" (no source labels) or "supervised" (source labels) (default: "raw")
|
|
22
22
|
*/
|
|
23
23
|
constructor({ fileStream, textStream, mode }) {
|
|
24
24
|
super();
|
|
@@ -72,23 +72,10 @@ export class TeeWriter extends Writable {
|
|
|
72
72
|
}
|
|
73
73
|
|
|
74
74
|
/**
|
|
75
|
-
* Process a single NDJSON line —
|
|
75
|
+
* Process a single NDJSON line — unified envelope handling for all modes.
|
|
76
76
|
* @param {string} line
|
|
77
77
|
*/
|
|
78
78
|
processLine(line) {
|
|
79
|
-
if (this.mode === "supervised") {
|
|
80
|
-
this.processSupervisedLine(line);
|
|
81
|
-
} else {
|
|
82
|
-
this.collector.addLine(line);
|
|
83
|
-
this.flushTurns();
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
/**
|
|
88
|
-
* Handle a tagged supervisor line: unwrap event, show source labels.
|
|
89
|
-
* @param {string} line
|
|
90
|
-
*/
|
|
91
|
-
processSupervisedLine(line) {
|
|
92
79
|
let parsed;
|
|
93
80
|
try {
|
|
94
81
|
parsed = JSON.parse(line);
|
|
@@ -96,21 +83,28 @@ export class TeeWriter extends Writable {
|
|
|
96
83
|
return;
|
|
97
84
|
}
|
|
98
85
|
|
|
99
|
-
|
|
100
|
-
const status = parsed.success ? "completed" : "incomplete";
|
|
101
|
-
this.textStream.write(
|
|
102
|
-
`\n--- Evaluation ${status} after ${parsed.turns} turns ---\n`,
|
|
103
|
-
);
|
|
104
|
-
return;
|
|
105
|
-
}
|
|
106
|
-
|
|
86
|
+
// Universal envelope: { source, seq, event }
|
|
107
87
|
if (parsed.event) {
|
|
88
|
+
// Orchestrator summary event
|
|
89
|
+
if (parsed.source === "orchestrator" && parsed.event.type === "summary") {
|
|
90
|
+
const status = parsed.event.success ? "completed" : "incomplete";
|
|
91
|
+
this.textStream.write(
|
|
92
|
+
`\n--- Evaluation ${status} after ${parsed.event.turns} turns ---\n`,
|
|
93
|
+
);
|
|
94
|
+
return;
|
|
95
|
+
}
|
|
96
|
+
|
|
108
97
|
if (parsed.source && parsed.source !== this.lastSource) {
|
|
109
98
|
this.lastSource = parsed.source;
|
|
110
99
|
}
|
|
111
100
|
this.collector.addLine(JSON.stringify(parsed.event));
|
|
112
101
|
this.flushTurns();
|
|
102
|
+
return;
|
|
113
103
|
}
|
|
104
|
+
|
|
105
|
+
// Bare event (run mode pre-migration or direct feed)
|
|
106
|
+
this.collector.addLine(line);
|
|
107
|
+
this.flushTurns();
|
|
114
108
|
}
|
|
115
109
|
|
|
116
110
|
/**
|