@forwardimpact/libeval 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.8",
3
+ "version": "0.1.9",
4
4
  "description": "Process Claude Code stream-json output into structured traces",
5
5
  "license": "Apache-2.0",
6
6
  "author": "D. Olsson <hi@senzilla.io>",
@@ -17,7 +17,8 @@ export class AgentRunner {
17
17
  * @param {string[]} [deps.allowedTools] - Tools the agent may use
18
18
  * @param {string} [deps.permissionMode] - SDK permission mode
19
19
  * @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
20
- * @param {function} [deps.onBatch] - Async callback invoked with a batch of NDJSON lines at flush boundaries (assistant text blocks and result messages). Receives `(lines, { abort })` where calling `abort()` stops the in-flight SDK session via the AbortController. Optional; assignable at runtime so the Supervisor can swap it per turn.
20
+ * @param {function} [deps.onBatch] - Async callback invoked with a batch of NDJSON lines at flush boundaries: every `batchSize` assistant text blocks, the terminal `result` message, and — on iterator crash/abort — once more in a final flush carrying any lines that never reached a boundary. Receives `(lines, { abort })` where calling `abort()` stops the in-flight SDK session via the AbortController. Optional; assignable at runtime so the Supervisor can swap it per turn.
21
+ * @param {number} [deps.batchSize] - Assistant text-block messages to accumulate before firing onBatch. Tool-only assistant messages ride along without counting. Default 3: the supervisor reviews the agent every three text turns instead of every turn. The terminal `result` always flushes regardless of count.
21
22
  * @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
22
23
  * @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
23
24
  * @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
@@ -33,6 +34,7 @@ export class AgentRunner {
33
34
  permissionMode,
34
35
  onLine,
35
36
  onBatch,
37
+ batchSize,
36
38
  settingSources,
37
39
  agentProfile,
38
40
  systemPrompt,
@@ -57,6 +59,7 @@ export class AgentRunner {
57
59
  this.permissionMode = permissionMode ?? "bypassPermissions";
58
60
  this.onLine = onLine ?? null;
59
61
  this.onBatch = onBatch ?? null;
62
+ this.batchSize = batchSize ?? 3;
60
63
  this.settingSources = settingSources ?? [];
61
64
  this.agentProfile = agentProfile ?? null;
62
65
  this.systemPrompt = systemPrompt ?? null;
@@ -128,8 +131,18 @@ export class AgentRunner {
128
131
  * Shared consumer for both `run()` and `resume()`. Iterates the SDK query
129
132
  * iterator, mirroring every line to the output stream / buffer / onLine
130
133
  * callback, and — when `onBatch` is set — flushes accumulated lines to it
131
- * at natural boundaries (assistant messages with text blocks, and the
132
- * terminal `result` message).
134
+ * at coarse boundaries: every `batchSize` assistant text-block messages,
135
+ * and the terminal `result` message. Tool-only assistant messages still
136
+ * accumulate in the pending batch and ride along in the next flush, so
137
+ * the supervisor always sees the tool calls that led up to each text
138
+ * block. Raising `batchSize` above 1 is the knob that makes the mid-turn
139
+ * supervisor review less chatty — with the default of 3, the supervisor
140
+ * sees the agent in chunks of three text turns instead of every turn.
141
+ *
142
+ * Corollary: a turn that is *entirely* tool_use with no text blocks and
143
+ * then hits `result` produces exactly one flush at `result` regardless
144
+ * of how many tools ran. That is deliberate — the supervisor only needs
145
+ * to weigh in when the agent surfaces something text-like to react to.
133
146
  *
134
147
  * INVARIANT: the `await this.onBatch(...)` call below is the ONLY
135
148
  * suspension point in this loop. While it is pending, no further lines
@@ -143,6 +156,13 @@ export class AgentRunner {
143
156
  * `currentAbortController.signal.aborted` (avoiding fragility around
144
157
  * AbortError vs DOMException shapes), and report `aborted: true` so the
145
158
  * caller can distinguish "supervisor asked us to stop" from a real error.
159
+ *
160
+ * If the iterator throws before a flush boundary, any lines still in the
161
+ * pending batch would otherwise vanish without the supervisor seeing
162
+ * them. A terminal flush after the loop emits that batch so the supervisor can
163
+ * observe the partial state (e.g. note a crash or react to an external
164
+ * abort). A throw from that final flush becomes the returned `error`
165
+ * only if no earlier error was captured — the original failure wins.
146
166
  * @param {AsyncIterable<object>} iterator
147
167
  * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
148
168
  */
@@ -151,34 +171,16 @@ export class AgentRunner {
151
171
  let stopReason = null;
152
172
  let error = null;
153
173
  let aborted = false;
154
- const pendingBatch = [];
174
+ const state = { pendingBatch: [], assistantTextCount: 0 };
155
175
 
156
176
  try {
157
177
  for await (const message of iterator) {
158
- const line = JSON.stringify(message);
159
- this.output.write(line + "\n");
160
- this.buffer.push(line);
161
- if (this.onLine) this.onLine(line);
162
- if (this.onBatch) pendingBatch.push(line);
163
-
164
- if (message.type === "system" && message.subtype === "init") {
165
- this.sessionId = message.session_id;
166
- }
178
+ this.#recordLine(message, state);
167
179
  if (message.type === "result") {
168
180
  text = message.result ?? "";
169
181
  stopReason = message.subtype;
170
182
  }
171
-
172
- const shouldFlush =
173
- this.onBatch &&
174
- (message.type === "result" ||
175
- (message.type === "assistant" && hasTextBlock(message)));
176
- if (shouldFlush) {
177
- const batchLines = pendingBatch.splice(0, pendingBatch.length);
178
- await this.onBatch(batchLines, {
179
- abort: () => this.currentAbortController?.abort(),
180
- });
181
- }
183
+ await this.#maybeFlushBatch(message, state);
182
184
  }
183
185
  } catch (err) {
184
186
  if (this.currentAbortController?.signal.aborted) {
@@ -188,10 +190,84 @@ export class AgentRunner {
188
190
  }
189
191
  }
190
192
 
193
+ const flushErr = await this.#terminalFlush(state, { error, aborted });
194
+ if (flushErr && !error) error = flushErr;
195
+
191
196
  const success = stopReason === "success";
192
197
  return { success, text, sessionId: this.sessionId, error, aborted };
193
198
  }
194
199
 
200
+ /**
201
+ * Mirror a single SDK message to the output stream, buffer, onLine
202
+ * callback, and (when set) the pending-batch state. Also handles
203
+ * session id capture and text-block counting so `#consumeQuery` can
204
+ * stay within the complexity budget.
205
+ * @param {object} message
206
+ * @param {{pendingBatch: string[], assistantTextCount: number}} state
207
+ */
208
+ #recordLine(message, state) {
209
+ const line = JSON.stringify(message);
210
+ this.output.write(line + "\n");
211
+ this.buffer.push(line);
212
+ if (this.onLine) this.onLine(line);
213
+ if (this.onBatch) state.pendingBatch.push(line);
214
+
215
+ if (message.type === "system" && message.subtype === "init") {
216
+ this.sessionId = message.session_id;
217
+ }
218
+ if (message.type === "assistant" && hasTextBlock(message)) {
219
+ state.assistantTextCount++;
220
+ }
221
+ }
222
+
223
+ /**
224
+ * Terminal flush — only fires on the abnormal-end paths (iterator
225
+ * threw or was aborted mid-stream). Delivers any pending lines so the
226
+ * supervisor sees the partial state instead of losing the tail of
227
+ * the run. A natural-end iterator that simply ran out of messages
228
+ * without a `result` marker is treated as an incomplete stub (the
229
+ * real SDK always terminates with `result`) and its pending batch is
230
+ * not re-flushed. Returns an error thrown by the flush callback, or
231
+ * `null` if the flush succeeded or did not fire.
232
+ * @param {{pendingBatch: string[], assistantTextCount: number}} state
233
+ * @param {{error: Error|null, aborted: boolean}} outcome
234
+ * @returns {Promise<Error|null>}
235
+ */
236
+ async #terminalFlush(state, { error, aborted }) {
237
+ const loopEndedAbnormally = Boolean(error || aborted);
238
+ if (!loopEndedAbnormally) return null;
239
+ if (!this.onBatch || state.pendingBatch.length === 0) return null;
240
+ try {
241
+ const batchLines = state.pendingBatch.splice(0);
242
+ await this.onBatch(batchLines, {
243
+ abort: () => this.currentAbortController?.abort(),
244
+ });
245
+ return null;
246
+ } catch (flushErr) {
247
+ return flushErr;
248
+ }
249
+ }
250
+
251
+ /**
252
+ * Flush the pending batch to `onBatch` if either the batchSize threshold
253
+ * has been reached or the current message is the terminal `result`.
254
+ * Extracted so that `#consumeQuery` stays within the project's complexity
255
+ * budget — the flush is one cohesive unit of logic in its own right.
256
+ * @param {object} message
257
+ * @param {{pendingBatch: string[], assistantTextCount: number}} state
258
+ */
259
+ async #maybeFlushBatch(message, state) {
260
+ if (!this.onBatch) return;
261
+ const shouldFlush =
262
+ message.type === "result" || state.assistantTextCount >= this.batchSize;
263
+ if (!shouldFlush) return;
264
+ state.assistantTextCount = 0;
265
+ const batchLines = state.pendingBatch.splice(0);
266
+ await this.onBatch(batchLines, {
267
+ abort: () => this.currentAbortController?.abort(),
268
+ });
269
+ }
270
+
195
271
  /**
196
272
  * Drain buffered output lines. Used by Supervisor to tag and re-emit lines.
197
273
  * @returns {string[]}
@@ -205,13 +281,14 @@ export class AgentRunner {
205
281
 
206
282
  /**
207
283
  * Whether an SDK assistant message contains at least one text block.
208
- * Tool-only assistant messages return false so they accumulate into the
209
- * pending batch and flush with the next text block (or with the terminal
210
- * `result` message), keeping supervisor LLM cost bounded.
284
+ * Only text-block messages count toward the `batchSize` threshold; tool-only
285
+ * assistant messages accumulate silently into the pending batch and ride along
286
+ * in the next flush, keeping supervisor LLM cost bounded. Exported so the mock
287
+ * runner can mirror the real flush predicate without duplicating the logic.
211
288
  * @param {object} message
212
289
  * @returns {boolean}
213
290
  */
214
- function hasTextBlock(message) {
291
+ export function hasTextBlock(message) {
215
292
  const content = message.message?.content ?? message.content;
216
293
  if (!Array.isArray(content)) return false;
217
294
  for (const block of content) {
@@ -0,0 +1,271 @@
1
+ import { describe, test } from "node:test";
2
+ import assert from "node:assert";
3
+ import { PassThrough } from "node:stream";
4
+
5
+ import { AgentRunner } from "@forwardimpact/libeval";
6
+
7
+ /**
8
+ * Create a mock query function that yields canned messages.
9
+ * @param {object[]} messages - Messages to yield
10
+ * @returns {function}
11
+ */
12
+ function mockQuery(messages) {
13
+ return async function* () {
14
+ for (const msg of messages) {
15
+ yield msg;
16
+ }
17
+ };
18
+ }
19
+
20
+ const textBlock = (t) => ({
21
+ type: "assistant",
22
+ message: { content: [{ type: "text", text: t }] },
23
+ });
24
+
25
+ const toolOnly = (name) => ({
26
+ type: "assistant",
27
+ message: {
28
+ content: [{ type: "tool_use", id: "tu_" + name, name, input: {} }],
29
+ },
30
+ });
31
+
32
+ describe("AgentRunner - onBatch batching", () => {
33
+ test("batchSize defaults to 3", () => {
34
+ const runner = new AgentRunner({
35
+ cwd: "/tmp",
36
+ query: async function* () {},
37
+ output: new PassThrough(),
38
+ });
39
+ assert.strictEqual(runner.batchSize, 3);
40
+ });
41
+
42
+ test("onBatch fires every 3 assistant text-block messages by default", async () => {
43
+ // 5 text-block messages + terminal result. With the default batchSize
44
+ // of 3, onBatch should fire on the 3rd text message and again on the
45
+ // terminal result (flushing the remaining 2).
46
+ const messages = [
47
+ { type: "system", subtype: "init", session_id: "sess-batch" },
48
+ textBlock("one"),
49
+ textBlock("two"),
50
+ textBlock("three"),
51
+ textBlock("four"),
52
+ textBlock("five"),
53
+ { type: "result", subtype: "success", result: "Done." },
54
+ ];
55
+
56
+ const batches = [];
57
+ const runner = new AgentRunner({
58
+ cwd: "/tmp",
59
+ query: mockQuery(messages),
60
+ output: new PassThrough(),
61
+ });
62
+ runner.onBatch = async (lines) => {
63
+ batches.push(lines.map((l) => JSON.parse(l)));
64
+ };
65
+
66
+ await runner.run("Task");
67
+
68
+ // First flush carries init + first 3 text messages; second carries
69
+ // remaining 2 text messages + the result.
70
+ assert.strictEqual(batches.length, 2);
71
+ assert.strictEqual(batches[0].length, 4);
72
+ assert.strictEqual(batches[1].length, 3);
73
+ });
74
+
75
+ test("onBatch honours custom batchSize", async () => {
76
+ // batchSize = 2: 4 text messages produce 2 flushes; result adds a 3rd.
77
+ const messages = [
78
+ textBlock("a"),
79
+ textBlock("b"),
80
+ textBlock("c"),
81
+ textBlock("d"),
82
+ { type: "result", subtype: "success", result: "Done." },
83
+ ];
84
+
85
+ const batches = [];
86
+ const runner = new AgentRunner({
87
+ cwd: "/tmp",
88
+ query: mockQuery(messages),
89
+ output: new PassThrough(),
90
+ batchSize: 2,
91
+ });
92
+ runner.onBatch = async (lines) => {
93
+ batches.push(lines.length);
94
+ };
95
+
96
+ await runner.run("Task");
97
+
98
+ assert.deepStrictEqual(batches, [2, 2, 1]);
99
+ });
100
+
101
+ test("tool-only assistant messages ride along in the next flush", async () => {
102
+ // Tool-only assistant messages accumulate without incrementing the
103
+ // counter. The supervisor sees the preceding tool calls when the
104
+ // flush eventually fires.
105
+ const messages = [
106
+ toolOnly("Read"),
107
+ toolOnly("Grep"),
108
+ textBlock("found it"),
109
+ { type: "result", subtype: "success", result: "Done." },
110
+ ];
111
+
112
+ const batches = [];
113
+ const runner = new AgentRunner({
114
+ cwd: "/tmp",
115
+ query: mockQuery(messages),
116
+ output: new PassThrough(),
117
+ batchSize: 1,
118
+ });
119
+ runner.onBatch = async (lines) => {
120
+ batches.push(lines.map((l) => JSON.parse(l)));
121
+ };
122
+
123
+ await runner.run("Task");
124
+
125
+ // First flush triggered by the single text-block message; it carries
126
+ // the two preceding tool-only messages with it.
127
+ assert.strictEqual(batches.length, 2);
128
+ assert.strictEqual(batches[0].length, 3);
129
+ assert.strictEqual(batches[0][0].message.content[0].type, "tool_use");
130
+ assert.strictEqual(batches[0][1].message.content[0].type, "tool_use");
131
+ assert.strictEqual(batches[0][2].message.content[0].type, "text");
132
+ assert.strictEqual(batches[1].length, 1);
133
+ assert.strictEqual(batches[1][0].type, "result");
134
+ });
135
+
136
+ test("terminal result always flushes even if batchSize not yet reached", async () => {
137
+ // 1 text-block + result, batchSize = 5. The counter only reaches 1
138
+ // but the terminal result must still flush.
139
+ const messages = [
140
+ textBlock("only one"),
141
+ { type: "result", subtype: "success", result: "Done." },
142
+ ];
143
+
144
+ const batches = [];
145
+ const runner = new AgentRunner({
146
+ cwd: "/tmp",
147
+ query: mockQuery(messages),
148
+ output: new PassThrough(),
149
+ batchSize: 5,
150
+ });
151
+ runner.onBatch = async (lines) => {
152
+ batches.push(lines.length);
153
+ };
154
+
155
+ await runner.run("Task");
156
+
157
+ assert.deepStrictEqual(batches, [2]);
158
+ });
159
+ });
160
+
161
+ describe("AgentRunner - terminal flush on abnormal end", () => {
162
+ test("iterator crash before a flush boundary still delivers the pending batch", async () => {
163
+ // batchSize = 3: the first two text messages accumulate without
164
+ // flushing. The iterator then throws before the threshold — the
165
+ // pending batch must ship in a terminal flush.
166
+ async function* crashingQuery() {
167
+ yield { type: "system", subtype: "init", session_id: "sess-crash" };
168
+ yield textBlock("step 1");
169
+ yield textBlock("step 2");
170
+ throw new Error("Claude Code process exited with code 1");
171
+ }
172
+
173
+ const batches = [];
174
+ const runner = new AgentRunner({
175
+ cwd: "/tmp",
176
+ query: () => crashingQuery(),
177
+ output: new PassThrough(),
178
+ });
179
+ runner.onBatch = async (lines) => {
180
+ batches.push(lines.map((l) => JSON.parse(l)));
181
+ };
182
+
183
+ const result = await runner.run("Task");
184
+
185
+ assert.ok(result.error);
186
+ assert.match(result.error.message, /exited with code 1/);
187
+ assert.strictEqual(batches.length, 1);
188
+ assert.strictEqual(batches[0].length, 3);
189
+ assert.strictEqual(batches[0][0].type, "system");
190
+ assert.strictEqual(batches[0][1].type, "assistant");
191
+ assert.strictEqual(batches[0][2].type, "assistant");
192
+ });
193
+
194
+ test("iterator crash after a completed batch does not re-flush", async () => {
195
+ // batchSize = 2: two text messages trigger a normal flush, emptying
196
+ // the pending batch. The iterator then throws with nothing pending —
197
+ // the terminal flush must be a no-op, not an empty call.
198
+ async function* crashingQuery() {
199
+ yield textBlock("a");
200
+ yield textBlock("b");
201
+ throw new Error("boom");
202
+ }
203
+
204
+ const batches = [];
205
+ const runner = new AgentRunner({
206
+ cwd: "/tmp",
207
+ query: () => crashingQuery(),
208
+ output: new PassThrough(),
209
+ batchSize: 2,
210
+ });
211
+ runner.onBatch = async (lines) => {
212
+ batches.push(lines.length);
213
+ };
214
+
215
+ const result = await runner.run("Task");
216
+ assert.ok(result.error);
217
+ assert.match(result.error.message, /boom/);
218
+ assert.deepStrictEqual(batches, [2]);
219
+ });
220
+
221
+ test("natural-end iterator without a result does not trigger terminal flush", async () => {
222
+ // The real SDK always terminates with `result`. A mock that ends
223
+ // naturally with pending lines is treated as an incomplete stub —
224
+ // no phantom flush, since nothing about a natural end warrants a
225
+ // new mid-turn review.
226
+ async function* noResultQuery() {
227
+ yield textBlock("one");
228
+ yield textBlock("two");
229
+ // No result, no error — just ends.
230
+ }
231
+
232
+ const batches = [];
233
+ const runner = new AgentRunner({
234
+ cwd: "/tmp",
235
+ query: () => noResultQuery(),
236
+ output: new PassThrough(),
237
+ batchSize: 3,
238
+ });
239
+ runner.onBatch = async (lines) => {
240
+ batches.push(lines.length);
241
+ };
242
+
243
+ const result = await runner.run("Task");
244
+ assert.strictEqual(result.error, null);
245
+ assert.strictEqual(batches.length, 0);
246
+ });
247
+
248
+ test("onBatch throw during terminal flush does not mask an earlier error", async () => {
249
+ // The iterator threw first; the terminal flush also throws. The
250
+ // original iterator error must win — it is the more actionable
251
+ // condition to surface to the caller.
252
+ async function* crashingQuery() {
253
+ yield textBlock("partial");
254
+ throw new Error("original failure");
255
+ }
256
+
257
+ const runner = new AgentRunner({
258
+ cwd: "/tmp",
259
+ query: () => crashingQuery(),
260
+ output: new PassThrough(),
261
+ batchSize: 3,
262
+ });
263
+ runner.onBatch = async () => {
264
+ throw new Error("flush failure");
265
+ };
266
+
267
+ const result = await runner.run("Task");
268
+ assert.ok(result.error);
269
+ assert.match(result.error.message, /original failure/);
270
+ });
271
+ });
@@ -1,10 +1,13 @@
1
1
  /**
2
2
  * Test-only mock factory for AgentRunner. Yields pre-scripted responses,
3
3
  * and (when an `onBatch` callback is set) fires it at the same boundaries
4
- * the real AgentRunner would: assistant messages with at least one text
5
- * block, and the terminal `result` message. If the callback calls
6
- * `abort()`, the mock stops iterating that response's messages and
7
- * reports `aborted: true`.
4
+ * the real AgentRunner would: every `runner.batchSize` assistant messages
5
+ * with a text block, and the terminal `result` message. Tool-only
6
+ * assistant messages accumulate into the pending batch without counting
7
+ * toward the threshold. If the callback calls `abort()`, the mock stops
8
+ * iterating that response's messages and reports `aborted: true` — any
9
+ * lines that never made it through a flush boundary then ship in a
10
+ * terminal batch, mirroring the real runner's abnormal-end terminal flush.
8
11
  *
9
12
  * Intentionally a regular module (not a test file) so describe/test blocks
10
13
  * here would not run. Lives under test/ to make its scope explicit.
@@ -12,24 +15,7 @@
12
15
 
13
16
  import { PassThrough } from "node:stream";
14
17
  import { AgentRunner } from "@forwardimpact/libeval";
15
-
16
- /**
17
- * Whether a scripted message should trigger an onBatch flush. Mirrors the
18
- * real AgentRunner: assistant-with-text-block or terminal `result` message.
19
- * Tool-only or string-content messages accumulate without flushing.
20
- * @param {object} message
21
- * @returns {boolean}
22
- */
23
- export function shouldFlush(message) {
24
- if (message.type === "result") return true;
25
- if (message.type !== "assistant") return false;
26
- const content = message.message?.content ?? message.content;
27
- if (!Array.isArray(content)) return false;
28
- for (const block of content) {
29
- if (block.type === "text" && block.text) return true;
30
- }
31
- return false;
32
- }
18
+ import { hasTextBlock } from "../src/agent-runner.js";
33
19
 
34
20
  /**
35
21
  * Create a mock AgentRunner that yields pre-scripted responses. Each call
@@ -50,12 +36,25 @@ export function createMockRunner(responses, messages) {
50
36
 
51
37
  const consume = async (msgs) => {
52
38
  let aborted = false;
39
+ const pendingBatch = [];
40
+ let assistantTextCount = 0;
53
41
  for (const m of msgs) {
54
42
  const line = JSON.stringify(m);
55
43
  runner.buffer.push(line);
56
44
  if (runner.onLine) runner.onLine(line);
57
- if (runner.onBatch && shouldFlush(m)) {
58
- await runner.onBatch([line], {
45
+ if (runner.onBatch) pendingBatch.push(line);
46
+
47
+ if (hasTextBlock(m)) {
48
+ assistantTextCount++;
49
+ }
50
+
51
+ const shouldFlush =
52
+ runner.onBatch &&
53
+ (m.type === "result" || assistantTextCount >= runner.batchSize);
54
+ if (shouldFlush) {
55
+ assistantTextCount = 0;
56
+ const batchLines = pendingBatch.splice(0);
57
+ await runner.onBatch(batchLines, {
59
58
  abort: () => {
60
59
  aborted = true;
61
60
  },
@@ -63,6 +62,19 @@ export function createMockRunner(responses, messages) {
63
62
  if (aborted) break;
64
63
  }
65
64
  }
65
+ // Terminal flush: mirror the real AgentRunner's abnormal-end path —
66
+ // an aborted scripted run delivers any pending tail so the supervisor
67
+ // sees the partial state. Natural-end without a `result` marker is
68
+ // treated as a simplified stub (no phantom flush), matching the real
69
+ // runner's rule that terminal flush only fires on error/abort.
70
+ if (aborted && runner.onBatch && pendingBatch.length > 0) {
71
+ const batchLines = pendingBatch.splice(0);
72
+ await runner.onBatch(batchLines, {
73
+ abort: () => {
74
+ aborted = true;
75
+ },
76
+ });
77
+ }
66
78
  return aborted;
67
79
  };
68
80
 
@@ -0,0 +1,175 @@
1
+ import { describe, test } from "node:test";
2
+ import assert from "node:assert";
3
+ import { PassThrough } from "node:stream";
4
+
5
+ import { Supervisor } from "@forwardimpact/libeval";
6
+ import { createMockRunner } from "./mock-runner.js";
7
+
8
+ const textBlock = (t) => ({
9
+ type: "assistant",
10
+ message: { content: [{ type: "text", text: t }] },
11
+ });
12
+
13
+ describe("Supervisor - batching at the default batchSize", () => {
14
+ test("mid-turn review fires once per 3 agent text messages", async () => {
15
+ // Agent emits 7 text-block assistant messages in one turn. With the
16
+ // default batchSize of 3 the supervisor's mid-turn review should fire
17
+ // twice (after messages 3 and 6) plus once more from the terminal
18
+ // result flush carrying the remaining message — not seven times, as
19
+ // the old per-message flushing would have done.
20
+ const agentMessages = [
21
+ [
22
+ textBlock("step 1"),
23
+ textBlock("step 2"),
24
+ textBlock("step 3"),
25
+ textBlock("step 4"),
26
+ textBlock("step 5"),
27
+ textBlock("step 6"),
28
+ textBlock("step 7"),
29
+ { type: "result", subtype: "success", result: "Done." },
30
+ ],
31
+ ];
32
+
33
+ const agentRunner = createMockRunner(
34
+ [{ text: "Finished." }],
35
+ agentMessages,
36
+ );
37
+ // Leave batchSize at the default (3) — this is the behaviour we're
38
+ // verifying end-to-end through the supervisor loop.
39
+ assert.strictEqual(agentRunner.batchSize, 3);
40
+
41
+ const supervisorRunner = createMockRunner([
42
+ { text: "Welcome. Begin." },
43
+ { text: "Keep going." }, // mid-turn batch 1 (messages 1-3)
44
+ { text: "Keep going." }, // mid-turn batch 2 (messages 4-6)
45
+ { text: "Keep going." }, // terminal result flush (message 7 + result)
46
+ { text: "Good work.\n\nEVALUATION_COMPLETE" }, // end-of-turn review
47
+ ]);
48
+
49
+ const output = new PassThrough();
50
+ const supervisor = new Supervisor({
51
+ agentRunner,
52
+ supervisorRunner,
53
+ output,
54
+ maxTurns: 10,
55
+ });
56
+ agentRunner.onLine = (line) => supervisor.emitLine(line);
57
+ supervisorRunner.onLine = (line) => supervisor.emitLine(line);
58
+
59
+ const result = await supervisor.run("Do the task");
60
+ assert.strictEqual(result.success, true);
61
+
62
+ const midTurnReviews = (output.read()?.toString() ?? "")
63
+ .trim()
64
+ .split("\n")
65
+ .filter((l) => l.length > 0)
66
+ .map((l) => JSON.parse(l))
67
+ .filter(
68
+ (l) =>
69
+ l.source === "orchestrator" && l.event?.type === "mid_turn_review",
70
+ );
71
+
72
+ // 3 flushes total: two at the batchSize threshold (messages 3 and 6),
73
+ // one at the terminal result (trailing message + result marker).
74
+ assert.strictEqual(
75
+ midTurnReviews.length,
76
+ 3,
77
+ "Supervisor should review 3 times per turn, not 7",
78
+ );
79
+ });
80
+
81
+ test("EVALUATION_INTERVENTION at the default batchSize still aborts and relays", async () => {
82
+ // Companion to the observation test above: the 3-message batching and
83
+ // the intervention path exercised together.
84
+ //
85
+ // Agent call 1 emits 3 text-block messages (triggering a flush at the
86
+ // 3rd). The supervisor intervenes; the agent SDK session aborts and
87
+ // the supervisor's intervention text is relayed into resume(). Agent
88
+ // call 2 has 1 text block — below the batchSize threshold — so no
89
+ // extra mid-turn flush fires, and the supervisor jumps straight to
90
+ // the end-of-turn review.
91
+ const agentMessages = [
92
+ [
93
+ textBlock("reading docs"),
94
+ textBlock("running Bash"),
95
+ textBlock("found the wrong path"),
96
+ ],
97
+ [textBlock("corrected, using the documented path")],
98
+ ];
99
+
100
+ const agentRunner = createMockRunner(
101
+ [{ text: "wrong path" }, { text: "corrected" }],
102
+ agentMessages,
103
+ );
104
+ assert.strictEqual(agentRunner.batchSize, 3);
105
+
106
+ const supervisorMessages = [
107
+ undefined,
108
+ [
109
+ {
110
+ type: "assistant",
111
+ message: {
112
+ content: [
113
+ {
114
+ type: "text",
115
+ text: "EVALUATION_INTERVENTION Use the documented path.",
116
+ },
117
+ ],
118
+ },
119
+ },
120
+ ],
121
+ undefined,
122
+ ];
123
+
124
+ const supervisorRunner = createMockRunner(
125
+ [
126
+ { text: "Welcome. Begin." },
127
+ { text: "EVALUATION_INTERVENTION Use the documented path." },
128
+ { text: "Good.\n\nEVALUATION_COMPLETE" },
129
+ ],
130
+ supervisorMessages,
131
+ );
132
+
133
+ const output = new PassThrough();
134
+ const supervisor = new Supervisor({
135
+ agentRunner,
136
+ supervisorRunner,
137
+ output,
138
+ maxTurns: 10,
139
+ });
140
+ agentRunner.onLine = (line) => supervisor.emitLine(line);
141
+ supervisorRunner.onLine = (line) => supervisor.emitLine(line);
142
+
143
+ let resumePrompt = null;
144
+ const origResume = agentRunner.resume;
145
+ agentRunner.resume = async (prompt) => {
146
+ resumePrompt = prompt;
147
+ return origResume.call(agentRunner, prompt);
148
+ };
149
+
150
+ const result = await supervisor.run("Install");
151
+ assert.strictEqual(result.success, true);
152
+ assert.strictEqual(result.turns, 1);
153
+ assert.ok(
154
+ resumePrompt && resumePrompt.includes("documented path"),
155
+ "Resume prompt should carry the supervisor's intervention text",
156
+ );
157
+
158
+ const orchestratorEvents = (output.read()?.toString() ?? "")
159
+ .trim()
160
+ .split("\n")
161
+ .filter((l) => l.length > 0)
162
+ .map((l) => JSON.parse(l))
163
+ .filter((e) => e.source === "orchestrator");
164
+ assert.ok(
165
+ orchestratorEvents.some(
166
+ (e) => e.event?.type === "intervention_requested",
167
+ ),
168
+ "Trace should contain intervention_requested",
169
+ );
170
+ assert.ok(
171
+ orchestratorEvents.some((e) => e.event?.type === "intervention_relayed"),
172
+ "Trace should contain intervention_relayed",
173
+ );
174
+ });
175
+ });
@@ -65,6 +65,9 @@ describe("Supervisor - mid-turn intervention", () => {
65
65
  // Supervisor responds with "Keep going." — neither signal flag is set,
66
66
  // so the agent's SDK session completes naturally and the end-of-turn
67
67
  // review then emits EVALUATION_COMPLETE.
68
+ //
69
+ // batchSize = 1 keeps this test focused on intervention semantics, not
70
+ // on the coarser default batching (3) exercised by agent-runner.test.js.
68
71
  const agentMessages = [
69
72
  [
70
73
  {
@@ -80,6 +83,7 @@ describe("Supervisor - mid-turn intervention", () => {
80
83
  [{ text: "I'm working on it." }],
81
84
  agentMessages,
82
85
  );
86
+ agentRunner.batchSize = 1;
83
87
 
84
88
  const supervisorRunner = createMockRunner([
85
89
  { text: "Welcome! Please install." },
@@ -167,6 +171,7 @@ describe("Supervisor - mid-turn intervention", () => {
167
171
  ],
168
172
  agentMessages,
169
173
  );
174
+ agentRunner.batchSize = 1;
170
175
 
171
176
  // Supervisor responses (in order):
172
177
  // 0: turn 0 introduction
@@ -277,6 +282,7 @@ describe("Supervisor - mid-turn intervention", () => {
277
282
  [{ text: "Trying X." }, { text: "Trying Y." }],
278
283
  agentMessages,
279
284
  );
285
+ agentRunner.batchSize = 1;
280
286
 
281
287
  const supervisorMessages = [
282
288
  undefined,
@@ -146,6 +146,7 @@ describe("Supervisor - output and events", () => {
146
146
  [{ text: "Trying the wrong thing." }, { text: "Switching." }],
147
147
  agentMessages,
148
148
  );
149
+ agentRunner.batchSize = 1;
149
150
  const supervisorRunner = createMockRunner(
150
151
  [
151
152
  { text: "Welcome." },