@forwardimpact/libeval 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-eval.js +2 -2
- package/index.js +2 -0
- package/package.json +1 -1
- package/src/agent-runner.js +97 -39
- package/src/commands/run.js +43 -18
- package/src/commands/supervise.js +59 -37
- package/src/supervisor.js +298 -59
- package/test/mock-runner.js +101 -0
- package/test/supervisor-intervention.test.js +359 -0
- package/test/{supervisor.test.js → supervisor-output.test.js} +120 -306
- package/test/supervisor-run.test.js +310 -0
package/src/supervisor.js
CHANGED
|
@@ -13,25 +13,49 @@ import { TraceCollector } from "./trace-collector.js";
|
|
|
13
13
|
|
|
14
14
|
/**
 * Check if the supervisor's response signals evaluation success.
 *
 * Matches EVALUATION_COMPLETE anywhere in the text (the `m` flag lets the
 * anchors match at every line start/end), tolerating markdown formatting
 * such as **EVALUATION_COMPLETE**, `EVALUATION_COMPLETE` or
 * _EVALUATION_COMPLETE_, and trailing sentence punctuation (. , ! ?).
 *
 * The keyword must not be part of a longer identifier. An underscore is
 * therefore only accepted as a delimiter when it is markdown emphasis, i.e.
 * when the underscore itself is not attached to a letter or digit. This keeps
 * NOT_EVALUATION_COMPLETE and EVALUATION_COMPLETE_PENDING from matching.
 * @param {string} text
 * @returns {boolean}
 */
export function isComplete(text) {
  return /(?:^|[\s*~`]|(?<![A-Za-z0-9])_)EVALUATION_COMPLETE(?:[\s*~`.,!?]|_(?![A-Za-z0-9])|$)/m.test(
    text,
  );
}
|
|
25
|
+
|
|
26
|
+
/**
 * Check if the supervisor's response signals a mid-turn intervention.
 *
 * Matches EVALUATION_INTERVENTION anywhere in the text (the `m` flag lets the
 * anchors match at every line start/end), tolerating markdown formatting
 * such as **EVALUATION_INTERVENTION** or _EVALUATION_INTERVENTION_, and
 * trailing sentence punctuation (. , ! ?).
 *
 * The keyword must not be part of a longer identifier. An underscore is
 * therefore only accepted as a delimiter when it is markdown emphasis, i.e.
 * when the underscore itself is not attached to a letter or digit. This keeps
 * NO_EVALUATION_INTERVENTION and EVALUATION_INTERVENTION_COUNT from matching.
 * @param {string} text
 * @returns {boolean}
 */
export function isIntervention(text) {
  return /(?:^|[\s*~`]|(?<![A-Za-z0-9])_)EVALUATION_INTERVENTION(?:[\s*~`.,!?]|_(?![A-Za-z0-9])|$)/m.test(
    text,
  );
}
|
|
25
36
|
|
|
26
37
|
/**
 * System prompt appended for the supervisor runner in supervise mode.
 * Written one sentence per entry; the sentences are joined with single
 * spaces to form the final prompt text.
 */
export const SUPERVISOR_SYSTEM_PROMPT = [
  "You relay messages to one persistent agent session — your only output channel.",
  "Spawning sub-agents or restarting the agent is blocked.",
  "Do not do the work yourself.",
  "Reply briefly to let the agent continue, write EVALUATION_INTERVENTION + instructions to interrupt mid-turn, or EVALUATION_COMPLETE when done.",
  "Only your final message each turn is relayed.",
].join(" ");
|
|
30
45
|
|
|
31
46
|
/**
 * System prompt appended for the agent runner in supervise mode.
 * Written one sentence per entry; the sentences are joined with single
 * spaces to form the final prompt text.
 */
export const AGENT_SYSTEM_PROMPT = [
  "A supervisor watches your work and may interrupt with new instructions mid-task.",
  "Treat any new prompt as authoritative and adjust course.",
  "When uncertain, stop and ask a clarifying question.",
].join(" ");
|
|
51
|
+
|
|
52
|
+
/**
 * Per-turn cap on mid-turn supervisor interventions.
 *
 * Kept small so that a supervisor stuck in an intervention loop runs out of
 * budget quickly and visibly, while still allowing a few legitimate
 * "intervene, observe, intervene again" rounds inside one agent turn.
 * Total runtime remains bounded separately by the outer maxTurns budget.
 */
const MAX_INTERVENTIONS_PER_TURN = 5;
|
|
35
59
|
|
|
36
60
|
export class Supervisor {
|
|
37
61
|
/**
|
|
@@ -56,12 +80,28 @@ export class Supervisor {
|
|
|
56
80
|
/**
|
|
57
81
|
* Set to true when any supervisor message contains the success signal.
|
|
58
82
|
* The SDK result text only reflects the last assistant message, so when
|
|
59
|
-
* the supervisor writes
|
|
83
|
+
* the supervisor writes EVALUATION_COMPLETE in an early message and
|
|
60
84
|
* then continues with follow-up work, the result text won't contain it.
|
|
61
85
|
* This flag captures the signal from the full message stream.
|
|
62
86
|
* @type {boolean}
|
|
63
87
|
*/
|
|
64
|
-
this.
|
|
88
|
+
this.completeSignalSeen = false;
|
|
89
|
+
/**
|
|
90
|
+
* Set to true when any supervisor message contains EVALUATION_INTERVENTION.
|
|
91
|
+
* Mirrors completeSignalSeen — populated by emitLine when a supervisor
|
|
92
|
+
* assistant text block matches isIntervention(...). The mid-turn loop
|
|
93
|
+
* reads this flag after each supervisor invocation to decide whether to
|
|
94
|
+
* abort the agent's in-flight SDK session.
|
|
95
|
+
* @type {boolean}
|
|
96
|
+
*/
|
|
97
|
+
this.interventionSignalSeen = false;
|
|
98
|
+
/**
|
|
99
|
+
* The most recent supervisor SDK result captured inside the mid-turn
|
|
100
|
+
* onBatch callback. The outer loop reads this after the agent aborts to
|
|
101
|
+
* build the next relay prompt without re-running the supervisor.
|
|
102
|
+
* @type {{success: boolean, text: string}|null}
|
|
103
|
+
*/
|
|
104
|
+
this.lastSupervisorResult = null;
|
|
65
105
|
}
|
|
66
106
|
|
|
67
107
|
/**
|
|
@@ -75,7 +115,9 @@ export class Supervisor {
|
|
|
75
115
|
// Turn 0: Supervisor receives the task and introduces it to the agent
|
|
76
116
|
this.currentSource = "supervisor";
|
|
77
117
|
this.currentTurn = 0;
|
|
78
|
-
this.
|
|
118
|
+
this.completeSignalSeen = false;
|
|
119
|
+
this.interventionSignalSeen = false;
|
|
120
|
+
this.lastSupervisorResult = null;
|
|
79
121
|
let supervisorResult = await this.supervisorRunner.run(task);
|
|
80
122
|
|
|
81
123
|
if (supervisorResult.error) {
|
|
@@ -85,58 +127,190 @@ export class Supervisor {
|
|
|
85
127
|
|
|
86
128
|
// Check for the success signal in either the SDK result text or the
|
|
87
129
|
// streamed message content. The SDK result text only reflects the last
|
|
88
|
-
// assistant message, so when the supervisor writes
|
|
130
|
+
// assistant message, so when the supervisor writes EVALUATION_COMPLETE
|
|
89
131
|
// early and then continues (e.g. filing issues), we must also check the
|
|
90
132
|
// flag set by emitLine during streaming.
|
|
91
|
-
if (this.
|
|
133
|
+
if (this.completeSignalSeen || isComplete(supervisorResult.text)) {
|
|
92
134
|
this.emitSummary({ success: true, turns: 0 });
|
|
93
135
|
return { success: true, turns: 0 };
|
|
94
136
|
}
|
|
95
137
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
}
|
|
138
|
+
const turnLimit = this.maxTurns === 0 ? Infinity : this.maxTurns;
|
|
139
|
+
for (let turn = 1; turn <= turnLimit; turn++) {
|
|
140
|
+
// Only the supervisor's final message is relayed to the agent.
|
|
141
|
+
// Extract the last assistant text block from the buffer to avoid
|
|
142
|
+
// leaking intermediate reasoning (research, tool calls, notes).
|
|
143
|
+
const relay = this.extractLastText(
|
|
144
|
+
this.supervisorRunner,
|
|
145
|
+
supervisorResult.text,
|
|
146
|
+
);
|
|
106
147
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
148
|
+
// Drive the agent through interventions until its SDK session ends
|
|
149
|
+
// naturally, the supervisor signals completion mid-turn, or the
|
|
150
|
+
// per-turn intervention budget is exhausted.
|
|
151
|
+
const turnOutcome = await this.#runAgentTurn(turn, relay);
|
|
152
|
+
if (turnOutcome.exit) return turnOutcome.exit;
|
|
153
|
+
|
|
154
|
+
// End-of-turn review (existing behaviour). Returns either an exit
|
|
155
|
+
// outcome (error or completion) or the supervisor result for the
|
|
156
|
+
// next turn's relay.
|
|
157
|
+
const reviewOutcome = await this.#endOfTurnReview(turn);
|
|
158
|
+
if (reviewOutcome.exit) return reviewOutcome.exit;
|
|
159
|
+
supervisorResult = reviewOutcome.supervisorResult;
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
this.emitSummary({ success: false, turns: this.maxTurns });
|
|
163
|
+
return { success: false, turns: this.maxTurns };
|
|
164
|
+
}
|
|
111
165
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
166
|
+
/**
|
|
167
|
+
* Drive the agent through one turn, allowing the supervisor to interrupt
|
|
168
|
+
* mid-stream via EVALUATION_INTERVENTION. Returns either an `exit` outcome
|
|
169
|
+
* (the loop should return immediately) or `{exit: null}` (proceed to the
|
|
170
|
+
* end-of-turn review).
|
|
171
|
+
* @param {number} turn
|
|
172
|
+
* @param {string} initialRelay
|
|
173
|
+
* @returns {Promise<{exit: {success: boolean, turns: number}|null}>}
|
|
174
|
+
*/
|
|
175
|
+
async #runAgentTurn(turn, initialRelay) {
|
|
176
|
+
let relay = initialRelay;
|
|
177
|
+
let interventions = 0;
|
|
115
178
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
179
|
+
// Wire the mid-turn observation hook on the agent runner. The bound
|
|
180
|
+
// callback captures `turn` so the inner loop's multiple resume(...)
|
|
181
|
+
// calls all see the same turn id. The supervisorRunner does NOT get
|
|
182
|
+
// an onBatch callback — it only fires onLine, which is enough for
|
|
183
|
+
// emitLine to detect EVALUATION_COMPLETE / EVALUATION_INTERVENTION.
|
|
184
|
+
this.agentRunner.onBatch = (batchLines, ctx) =>
|
|
185
|
+
this.#midTurnReview(turn, batchLines, ctx);
|
|
119
186
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
187
|
+
try {
|
|
188
|
+
while (true) {
|
|
189
|
+
this.currentSource = "agent";
|
|
190
|
+
this.currentTurn = turn;
|
|
191
|
+
const isFirstAgentCall = turn === 1 && interventions === 0;
|
|
192
|
+
const agentResult = isFirstAgentCall
|
|
193
|
+
? await this.agentRunner.run(relay)
|
|
194
|
+
: await this.agentRunner.resume(relay);
|
|
124
195
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
196
|
+
if (agentResult.error && !agentResult.aborted) {
|
|
197
|
+
this.emitSummary({ success: false, turns: turn });
|
|
198
|
+
return { exit: { success: false, turns: turn } };
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
// Mid-turn EVALUATION_COMPLETE: end the session immediately.
|
|
202
|
+
if (this.completeSignalSeen) {
|
|
203
|
+
this.emitSummary({ success: true, turns: turn });
|
|
204
|
+
return { exit: { success: true, turns: turn } };
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
if (agentResult.aborted && this.interventionSignalSeen) {
|
|
208
|
+
interventions++;
|
|
209
|
+
if (interventions >= MAX_INTERVENTIONS_PER_TURN) {
|
|
210
|
+
this.emitOrchestratorEvent({ type: "intervention_limit", turn });
|
|
211
|
+
return { exit: null };
|
|
212
|
+
}
|
|
213
|
+
relay = this.extractLastText(
|
|
214
|
+
this.supervisorRunner,
|
|
215
|
+
this.lastSupervisorResult?.text ?? "",
|
|
216
|
+
);
|
|
217
|
+
this.emitOrchestratorEvent({ type: "intervention_relayed", turn });
|
|
218
|
+
continue;
|
|
219
|
+
}
|
|
129
220
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
|
|
133
|
-
this.emitSummary({ success: true, turns: turn });
|
|
134
|
-
return { success: true, turns: turn };
|
|
221
|
+
// Agent's SDK session finished naturally — proceed to end-of-turn.
|
|
222
|
+
return { exit: null };
|
|
135
223
|
}
|
|
224
|
+
} finally {
|
|
225
|
+
// Detach onBatch before the end-of-turn review so the supervisor's
|
|
226
|
+
// own SDK session does not trigger nested onBatch fires.
|
|
227
|
+
this.agentRunner.onBatch = null;
|
|
136
228
|
}
|
|
229
|
+
}
|
|
137
230
|
|
|
138
|
-
|
|
139
|
-
|
|
231
|
+
/**
|
|
232
|
+
* Mid-turn supervisor review fired from inside the agent's onBatch hook.
|
|
233
|
+
* Emits a `mid_turn_review` orchestrator marker, runs the supervisor's
|
|
234
|
+
* LLM against the batch, and aborts the agent if the supervisor signals
|
|
235
|
+
* EVALUATION_INTERVENTION or EVALUATION_COMPLETE.
|
|
236
|
+
* @param {number} turn
|
|
237
|
+
* @param {string[]} batchLines
|
|
238
|
+
* @param {{abort: () => void}} ctx
|
|
239
|
+
*/
|
|
240
|
+
async #midTurnReview(turn, batchLines, { abort }) {
|
|
241
|
+
const batchTranscript = this.renderBatch(batchLines);
|
|
242
|
+
|
|
243
|
+
// Order matters: emit the orchestrator marker BEFORE the supervisor
|
|
244
|
+
// LLM call so the trace reads
|
|
245
|
+
// agent line → orchestrator:mid_turn_review
|
|
246
|
+
// → supervisor lines (tagged turn:N)
|
|
247
|
+
// → orchestrator:intervention_requested|complete_requested
|
|
248
|
+
this.emitOrchestratorEvent({ type: "mid_turn_review", turn });
|
|
249
|
+
|
|
250
|
+
// currentTurn stays = turn so mid-turn supervisor lines share the
|
|
251
|
+
// agent's turn id. They are distinguishable from end-of-turn reviews
|
|
252
|
+
// by the surrounding orchestrator events emitted around this call.
|
|
253
|
+
this.currentSource = "supervisor";
|
|
254
|
+
this.completeSignalSeen = false;
|
|
255
|
+
this.interventionSignalSeen = false;
|
|
256
|
+
|
|
257
|
+
this.lastSupervisorResult = await this.supervisorRunner.resume(
|
|
258
|
+
`The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` +
|
|
259
|
+
`Respond with a brief acknowledgement to let it continue, or write ` +
|
|
260
|
+
`EVALUATION_INTERVENTION followed by a corrective message to stop ` +
|
|
261
|
+
`and relay a new instruction. Write EVALUATION_COMPLETE only when ` +
|
|
262
|
+
`the task is fully done.`,
|
|
263
|
+
);
|
|
264
|
+
this.currentSource = "agent";
|
|
265
|
+
|
|
266
|
+
if (this.interventionSignalSeen) {
|
|
267
|
+
this.emitOrchestratorEvent({ type: "intervention_requested", turn });
|
|
268
|
+
abort();
|
|
269
|
+
return;
|
|
270
|
+
}
|
|
271
|
+
if (this.completeSignalSeen) {
|
|
272
|
+
this.emitOrchestratorEvent({ type: "complete_requested", turn });
|
|
273
|
+
abort();
|
|
274
|
+
}
|
|
275
|
+
// Non-intervention: do nothing; the agent loop pulls the next line.
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
/**
|
|
279
|
+
* End-of-turn supervisor review (existing behaviour). Returns either an
|
|
280
|
+
* exit outcome (error or completion) or the supervisor result so the
|
|
281
|
+
* outer loop can build the next turn's relay.
|
|
282
|
+
* @param {number} turn
|
|
283
|
+
* @returns {Promise<{exit: {success: boolean, turns: number}|null, supervisorResult?: object}>}
|
|
284
|
+
*/
|
|
285
|
+
async #endOfTurnReview(turn) {
|
|
286
|
+
// Build the full agent transcript from buffered NDJSON events so the
|
|
287
|
+
// supervisor sees tool calls and reasoning, not just the SDK result.
|
|
288
|
+
const agentTranscript = this.extractTranscript(this.agentRunner);
|
|
289
|
+
|
|
290
|
+
const supervisorPrompt =
|
|
291
|
+
`The agent reported:\n\n${agentTranscript}\n\n` +
|
|
292
|
+
`Review the agent's work and decide how to proceed.`;
|
|
293
|
+
|
|
294
|
+
this.currentSource = "supervisor";
|
|
295
|
+
this.currentTurn = turn;
|
|
296
|
+
this.completeSignalSeen = false;
|
|
297
|
+
this.interventionSignalSeen = false;
|
|
298
|
+
const supervisorResult =
|
|
299
|
+
await this.supervisorRunner.resume(supervisorPrompt);
|
|
300
|
+
|
|
301
|
+
if (supervisorResult.error) {
|
|
302
|
+
this.emitSummary({ success: false, turns: turn });
|
|
303
|
+
return { exit: { success: false, turns: turn } };
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
// The supervisor's turn is fully complete — check for success signal
|
|
307
|
+
// in either the SDK result text or streamed messages.
|
|
308
|
+
if (this.completeSignalSeen || isComplete(supervisorResult.text)) {
|
|
309
|
+
this.emitSummary({ success: true, turns: turn });
|
|
310
|
+
return { exit: { success: true, turns: turn } };
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
return { exit: null, supervisorResult };
|
|
140
314
|
}
|
|
141
315
|
|
|
142
316
|
/**
|
|
@@ -154,12 +328,38 @@ export class Supervisor {
|
|
|
154
328
|
return collector.toText() || "[The agent produced no output.]";
|
|
155
329
|
}
|
|
156
330
|
|
|
331
|
+
/**
|
|
332
|
+
* Extract only the last assistant text block from an AgentRunner's buffer.
|
|
333
|
+
* Scans buffered NDJSON events in reverse to find the final assistant message
|
|
334
|
+
* with a text content block. This prevents intermediate reasoning (tool calls,
|
|
335
|
+
* research notes) from leaking to the agent.
|
|
336
|
+
* @param {import("./agent-runner.js").AgentRunner} runner
|
|
337
|
+
* @param {string} fallback - Fallback text if no assistant text block is found
|
|
338
|
+
* @returns {string}
|
|
339
|
+
*/
|
|
340
|
+
extractLastText(runner, fallback) {
|
|
341
|
+
const lines = runner.buffer;
|
|
342
|
+
for (let i = lines.length - 1; i >= 0; i--) {
|
|
343
|
+
const event = JSON.parse(lines[i]);
|
|
344
|
+
if (event.type !== "assistant") continue;
|
|
345
|
+
const content = event.message?.content ?? event.content;
|
|
346
|
+
if (!Array.isArray(content)) continue;
|
|
347
|
+
for (let j = content.length - 1; j >= 0; j--) {
|
|
348
|
+
if (content[j].type === "text" && content[j].text) {
|
|
349
|
+
return content[j].text;
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
return fallback;
|
|
354
|
+
}
|
|
355
|
+
|
|
157
356
|
/**
|
|
158
357
|
* Emit a single NDJSON line tagged with the current source and turn.
|
|
159
358
|
* Called in real-time via the AgentRunner onLine callback.
|
|
160
359
|
*
|
|
161
360
|
* When the current source is the supervisor, also scans assistant text
|
|
162
|
-
* content for the
|
|
361
|
+
* content for the EVALUATION_COMPLETE and EVALUATION_INTERVENTION signals,
|
|
362
|
+
* setting completeSignalSeen / interventionSignalSeen respectively.
|
|
163
363
|
* @param {string} line - Raw NDJSON line from the runner
|
|
164
364
|
*/
|
|
165
365
|
emitLine(line) {
|
|
@@ -171,22 +371,57 @@ export class Supervisor {
|
|
|
171
371
|
};
|
|
172
372
|
this.output.write(JSON.stringify(tagged) + "\n");
|
|
173
373
|
|
|
174
|
-
// Scan supervisor assistant messages for the
|
|
374
|
+
// Scan supervisor assistant messages for the signals in real time.
|
|
175
375
|
// The SDK result text only reflects the final assistant message, but the
|
|
176
|
-
// supervisor may write
|
|
177
|
-
// then continue with follow-up tool calls.
|
|
376
|
+
// supervisor may write EVALUATION_COMPLETE / EVALUATION_INTERVENTION in
|
|
377
|
+
// an earlier message and then continue with follow-up tool calls.
|
|
178
378
|
if (this.currentSource === "supervisor" && event.type === "assistant") {
|
|
179
379
|
const content = event.message?.content ?? event.content ?? [];
|
|
180
380
|
if (Array.isArray(content)) {
|
|
181
381
|
for (const block of content) {
|
|
182
|
-
if (block.type
|
|
183
|
-
|
|
184
|
-
|
|
382
|
+
if (block.type !== "text" || !block.text) continue;
|
|
383
|
+
if (isComplete(block.text)) this.completeSignalSeen = true;
|
|
384
|
+
if (isIntervention(block.text)) this.interventionSignalSeen = true;
|
|
185
385
|
}
|
|
186
386
|
}
|
|
187
387
|
}
|
|
188
388
|
}
|
|
189
389
|
|
|
390
|
+
/**
|
|
391
|
+
* Render a batch of buffered NDJSON lines as human-readable text for the
|
|
392
|
+
* mid-turn supervisor prompt. Reuses the TraceCollector pipeline so the
|
|
393
|
+
* supervisor sees tool calls and reasoning, not just raw events.
|
|
394
|
+
* @param {string[]} batchLines
|
|
395
|
+
* @returns {string}
|
|
396
|
+
*/
|
|
397
|
+
renderBatch(batchLines) {
|
|
398
|
+
if (batchLines.length === 0) return "[empty]";
|
|
399
|
+
const collector = new TraceCollector();
|
|
400
|
+
for (const line of batchLines) {
|
|
401
|
+
collector.addLine(line);
|
|
402
|
+
}
|
|
403
|
+
return collector.toText() || "[empty]";
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
/**
|
|
407
|
+
* Emit an orchestrator-source NDJSON line. Used by the mid-turn loop to
|
|
408
|
+
* mark mid_turn_review / intervention_requested / intervention_relayed /
|
|
409
|
+
* intervention_limit / complete_requested boundaries in the trace, so the
|
|
410
|
+
* improvement coach can distinguish mid-turn supervisor activity from
|
|
411
|
+
* end-of-turn reviews. Additive to existing trace shape — the parser
|
|
412
|
+
* already reads `source` and ignores unknown event types.
|
|
413
|
+
* @param {{type: string, turn?: number}} event
|
|
414
|
+
*/
|
|
415
|
+
emitOrchestratorEvent(event) {
|
|
416
|
+
this.output.write(
|
|
417
|
+
JSON.stringify({
|
|
418
|
+
source: "orchestrator",
|
|
419
|
+
turn: this.currentTurn,
|
|
420
|
+
event,
|
|
421
|
+
}) + "\n",
|
|
422
|
+
);
|
|
423
|
+
}
|
|
424
|
+
|
|
190
425
|
/**
|
|
191
426
|
* Emit a final orchestrator summary line.
|
|
192
427
|
* @param {{success: boolean, turns: number}} result
|
|
@@ -253,10 +488,14 @@ export function createSupervisor({
|
|
|
253
488
|
},
|
|
254
489
|
});
|
|
255
490
|
|
|
256
|
-
// Block
|
|
257
|
-
//
|
|
258
|
-
//
|
|
259
|
-
|
|
491
|
+
// Block every sub-agent spawning tool so the supervisor cannot bypass the
|
|
492
|
+
// relay loop. The current Claude Agent SDK exposes the spawn tool to the
|
|
493
|
+
// model as `Agent`; older versions called it `Task`. Both are blocked
|
|
494
|
+
// (along with TaskOutput/TaskStop) so the supervisor sees no spawn tool
|
|
495
|
+
// regardless of which SDK version is installed. Letting the supervisor
|
|
496
|
+
// spawn its own sub-agent would bypass the relay and produce an empty
|
|
497
|
+
// agent trace, which is the failure mode that motivated this default.
|
|
498
|
+
const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
|
|
260
499
|
const disallowedTools = supervisorDisallowedTools
|
|
261
500
|
? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]
|
|
262
501
|
: defaultDisallowed;
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test-only mock factory for AgentRunner. Yields pre-scripted responses,
|
|
3
|
+
* and (when an `onBatch` callback is set) fires it at the same boundaries
|
|
4
|
+
* the real AgentRunner would: assistant messages with at least one text
|
|
5
|
+
* block, and the terminal `result` message. If the callback calls
|
|
6
|
+
* `abort()`, the mock stops iterating that response's messages and
|
|
7
|
+
* reports `aborted: true`.
|
|
8
|
+
*
|
|
9
|
+
* Intentionally a regular module (not a test file) so describe/test blocks
|
|
10
|
+
* here would not run. Lives under test/ to make its scope explicit.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { PassThrough } from "node:stream";
|
|
14
|
+
import { AgentRunner } from "@forwardimpact/libeval";
|
|
15
|
+
|
|
16
|
+
/**
 * Decide whether a scripted message triggers an onBatch flush.
 *
 * A terminal `result` message always flushes. An `assistant` message flushes
 * only when its content (taken from `message.content`, else `content`) is an
 * array holding at least one text block with non-empty text. Everything
 * else — other message types, tool-only content, string content — does not.
 * @param {object} message
 * @returns {boolean}
 */
export function shouldFlush(message) {
  switch (message.type) {
    case "result":
      return true;
    case "assistant":
      break;
    default:
      return false;
  }

  const blocks = message.message?.content ?? message.content;
  if (!Array.isArray(blocks)) {
    return false;
  }

  for (let i = 0; i < blocks.length; i += 1) {
    if (blocks[i].type === "text" && blocks[i].text) {
      return true;
    }
  }
  return false;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Create a mock AgentRunner that yields pre-scripted responses. Each call
 * to `run()` or `resume()` consumes the next entry of `responses`, in call
 * order, and rejects with a descriptive error once the script is exhausted.
 *
 * For every call the matching `messages[i]` list (or, when absent, a single
 * assistant message carrying the response text as string content) is pushed
 * to the runner's buffer and reported through `onLine`. When `onBatch` is
 * set it is awaited for each message that `shouldFlush` accepts; if the
 * callback calls `abort()`, the remaining messages of that call are dropped
 * and the result reports `aborted: true`.
 * @param {object[]} responses - Array of {text, success} objects
 * @param {object[]} [messages] - Messages to buffer per response
 * @returns {AgentRunner}
 */
export function createMockRunner(responses, messages) {
  const output = new PassThrough();
  let callIndex = 0;

  const runner = new AgentRunner({
    cwd: "/tmp",
    query: async function* () {},
    output,
  });

  // Replay one call's messages; resolves to true when onBatch aborted.
  const consume = async (msgs) => {
    let aborted = false;
    for (const m of msgs) {
      const line = JSON.stringify(m);
      runner.buffer.push(line);
      if (runner.onLine) runner.onLine(line);
      if (runner.onBatch && shouldFlush(m)) {
        await runner.onBatch([line], {
          abort: () => {
            aborted = true;
          },
        });
        if (aborted) break;
      }
    }
    return aborted;
  };

  // Shared by run() and resume(): take the next scripted response, replay
  // its messages, and report whether the replay was aborted.
  const playNext = async (method) => {
    const index = callIndex++;
    const resp = responses[index];
    if (resp == null) {
      throw new Error(
        `createMockRunner: ${method}() call #${index + 1} has no scripted ` +
          `response (${responses.length} provided)`,
      );
    }
    const msgs = messages?.[index] ?? [
      { type: "assistant", content: resp.text },
    ];
    const aborted = await consume(msgs);
    return { resp, aborted };
  };

  runner.run = async (_task) => {
    const { resp, aborted } = await playNext("run");
    // The session id is assigned only after the messages were replayed.
    runner.sessionId = "mock-session";
    return {
      success: resp.success ?? true,
      text: resp.text,
      sessionId: "mock-session",
      aborted,
      error: null,
    };
  };

  runner.resume = async (_prompt) => {
    const { resp, aborted } = await playNext("resume");
    return {
      success: resp.success ?? true,
      text: resp.text,
      sessionId: runner.sessionId,
      aborted,
      error: null,
    };
  };

  return runner;
}
|