@forwardimpact/libeval 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,359 @@
1
+ import { describe, test } from "node:test";
2
+ import assert from "node:assert";
3
+ import { PassThrough } from "node:stream";
4
+
5
+ import { Supervisor } from "@forwardimpact/libeval";
6
+ import { isIntervention } from "../src/supervisor.js";
7
+ import { createMockRunner } from "./mock-runner.js";
8
+
9
describe("isIntervention", () => {
  /**
   * Assert that isIntervention() returns exactly `expected` for every sample,
   * checking the samples in the order given.
   */
  const expectEach = (samples, expected) => {
    for (const sample of samples) {
      assert.strictEqual(isIntervention(sample), expected);
    }
  };

  test("detects EVALUATION_INTERVENTION on its own line", () => {
    // Signal alone, in the middle of a message, and as the final line.
    const samples = [
      "EVALUATION_INTERVENTION",
      "Some text\nEVALUATION_INTERVENTION\nMore text",
      "Stop.\n\nEVALUATION_INTERVENTION",
    ];
    expectEach(samples, true);
  });

  test("tolerates markdown formatting around the signal", () => {
    // Bold, italic (both markdown spellings) and inline-code wrappers,
    // followed by a wrapped signal embedded in a longer message.
    const samples = [
      "**EVALUATION_INTERVENTION**",
      "*EVALUATION_INTERVENTION*",
      "__EVALUATION_INTERVENTION__",
      "_EVALUATION_INTERVENTION_",
      "`EVALUATION_INTERVENTION`",
      "Wrong path.\n\n**EVALUATION_INTERVENTION**\n\nTry the documented one.",
    ];
    expectEach(samples, true);
  });

  test("matches EVALUATION_INTERVENTION inline", () => {
    // The signal does not need a line of its own.
    const samples = [
      "Stopping you with EVALUATION_INTERVENTION now.",
      "Note: EVALUATION_INTERVENTION. Switch to Y.",
    ];
    expectEach(samples, true);
  });

  test("does not match empty or unrelated text", () => {
    // Includes a bare "INTERVENTION" without the EVALUATION_ prefix.
    const samples = ["", "Stop and think.", "INTERVENTION"];
    expectEach(samples, false);
  });

  test("does not match EVALUATION_COMPLETE alone", () => {
    // The completion signal must not be reported as an intervention.
    const samples = ["EVALUATION_COMPLETE", "Good work.\n\nEVALUATION_COMPLETE"];
    expectEach(samples, false);
  });
});
61
+
62
describe("Supervisor - mid-turn intervention", () => {
  /**
   * Build one batch of structured messages: a single assistant message whose
   * content is one text block holding `text`.
   *
   * @param {string} text - Text placed in the assistant text block.
   * @returns {object[]} A one-element array with the assistant message.
   */
  const assistantBatch = (text) => [
    {
      type: "assistant",
      message: { content: [{ type: "text", text }] },
    },
  ];

  /**
   * Wire a Supervisor to the two mock runners, forward every runner line into
   * the supervisor's trace, and spy on `agentRunner.resume`.
   *
   * @param {object} agentRunner - Mock runner acting as the agent.
   * @param {object} supervisorRunner - Mock runner acting as the supervisor.
   * @returns {{ supervisor: Supervisor, output: PassThrough, resumePrompts: Array }}
   *   The supervisor under test, the stream its trace is written to, and the
   *   prompts passed to `agentRunner.resume` in call order.
   */
  const createHarness = (agentRunner, supervisorRunner) => {
    const output = new PassThrough();
    const supervisor = new Supervisor({
      agentRunner,
      supervisorRunner,
      output,
      maxTurns: 10,
    });
    agentRunner.onLine = (line) => supervisor.emitLine(line);
    supervisorRunner.onLine = (line) => supervisor.emitLine(line);

    // Record each resume prompt, then delegate to the original implementation
    // so the mock runner still behaves as it would without the spy.
    const resumePrompts = [];
    const originalResume = agentRunner.resume;
    agentRunner.resume = async (prompt) => {
      resumePrompts.push(prompt);
      return originalResume.call(agentRunner, prompt);
    };

    return { supervisor, output, resumePrompts };
  };

  /**
   * Drain whatever is buffered on `output`, parse it as newline-delimited
   * JSON, and keep only the events whose `source` is "orchestrator".
   *
   * @param {PassThrough} output - Stream the supervisor wrote its trace to.
   * @returns {object[]} Parsed orchestrator events (empty if nothing buffered).
   */
  const readOrchestratorEvents = (output) => {
    const raw = output.read()?.toString() ?? "";
    return raw
      .trim()
      .split("\n")
      .filter((line) => line.length > 0)
      .map((line) => JSON.parse(line))
      .filter((entry) => entry.source === "orchestrator");
  };

  /** True when at least one event in `events` has `event.type === type`. */
  const hasEvent = (events, type) =>
    events.some((entry) => entry.event?.type === type);

  test("observation without intervention does not interrupt the agent", async () => {
    // Agent emits one structured assistant text block — fires onBatch once.
    // Supervisor responds with "Keep going." — neither signal flag is set,
    // so the agent's SDK session completes naturally and the end-of-turn
    // review then emits EVALUATION_COMPLETE.
    const agentText = "I'm working on it.";
    const agentRunner = createMockRunner(
      [{ text: agentText }],
      [assistantBatch(agentText)],
    );

    const supervisorRunner = createMockRunner([
      { text: "Welcome! Please install." },
      { text: "Keep going." },
      { text: "Good work.\n\nEVALUATION_COMPLETE" },
    ]);

    const { supervisor, output, resumePrompts } = createHarness(
      agentRunner,
      supervisorRunner,
    );

    const result = await supervisor.run("Install");

    assert.strictEqual(result.success, true);
    assert.strictEqual(result.turns, 1);
    assert.strictEqual(
      resumePrompts.length,
      0,
      "Agent should not be resumed when supervisor never intervenes",
    );

    // Trace must contain a mid_turn_review marker but no intervention markers.
    const orchestratorEvents = readOrchestratorEvents(output);
    assert.ok(
      hasEvent(orchestratorEvents, "mid_turn_review"),
      "Trace should contain mid_turn_review when onBatch fires",
    );
    assert.ok(
      !hasEvent(orchestratorEvents, "intervention_requested"),
      "Trace should not contain intervention_requested when supervisor only observes",
    );
  });

  test("EVALUATION_INTERVENTION from mid-turn batch interrupts and relays", async () => {
    // Agent's first call fires onBatch on a structured assistant text block;
    // supervisor responds with EVALUATION_INTERVENTION → abort + relay.
    // Agent's second call (resume) finishes naturally; end-of-turn review
    // then emits EVALUATION_COMPLETE.
    const agentTexts = [
      "I'll try the wrong path.",
      "OK, switching to the documented path.",
    ];
    const agentRunner = createMockRunner(
      agentTexts.map((text) => ({ text })),
      agentTexts.map(assistantBatch),
    );

    // Supervisor responses (in order):
    //   0: turn 0 introduction
    //   1: mid-turn 1 batch 1 — intervene
    //   2: mid-turn 1 batch 1 (post-resume) — keep going
    //   3: end-of-turn 1 — EVALUATION_COMPLETE
    // Only response 1 is given structured messages; the others are undefined.
    const interventionText =
      "EVALUATION_INTERVENTION Stop and use the documented path.";
    const supervisorMessages = [
      undefined,
      assistantBatch(interventionText),
      undefined,
      undefined,
    ];

    const supervisorRunner = createMockRunner(
      [
        { text: "Welcome." },
        { text: interventionText },
        { text: "Keep going." },
        { text: "Good.\n\nEVALUATION_COMPLETE" },
      ],
      supervisorMessages,
    );

    const { supervisor, output, resumePrompts } = createHarness(
      agentRunner,
      supervisorRunner,
    );

    const result = await supervisor.run("Install");

    assert.strictEqual(result.success, true);
    assert.strictEqual(result.turns, 1);
    assert.strictEqual(
      resumePrompts.length,
      1,
      "Agent should be resumed exactly once after intervention",
    );
    assert.ok(
      resumePrompts[0]?.includes("documented path"),
      "Resume prompt should carry the supervisor's intervention text",
    );

    const orchestratorEvents = readOrchestratorEvents(output);
    assert.ok(
      hasEvent(orchestratorEvents, "intervention_requested"),
      "Trace should contain intervention_requested orchestrator event",
    );
    assert.ok(
      hasEvent(orchestratorEvents, "intervention_relayed"),
      "Trace should contain intervention_relayed orchestrator event",
    );
  });

  test("EVALUATION_INTERVENTION and EVALUATION_COMPLETE in the same turn", async () => {
    // Batch 1: supervisor intervenes (abort + relay).
    // After resume, batch 1 of resume: supervisor writes EVALUATION_COMPLETE
    // (mid-turn) — the loop must exit success without running an end-of-turn
    // review.
    //
    // Each agent text is defined once and used for both the response and the
    // streamed message, so the two fixtures cannot drift apart.
    const agentTexts = ["Trying X.", "OK trying Y."];
    const agentRunner = createMockRunner(
      agentTexts.map((text) => ({ text })),
      agentTexts.map(assistantBatch),
    );

    const interventionText = "EVALUATION_INTERVENTION Try Y instead.";
    const completeText = "Excellent. EVALUATION_COMPLETE";
    const supervisorMessages = [
      undefined,
      assistantBatch(interventionText),
      assistantBatch(completeText),
    ];

    const supervisorRunner = createMockRunner(
      [
        { text: "Welcome." },
        { text: interventionText },
        { text: completeText },
      ],
      supervisorMessages,
    );

    const { supervisor, output, resumePrompts } = createHarness(
      agentRunner,
      supervisorRunner,
    );

    const result = await supervisor.run("Install");

    assert.strictEqual(result.success, true);
    assert.strictEqual(result.turns, 1);
    assert.strictEqual(
      resumePrompts.length,
      1,
      "Agent.resume runs once (after intervention); EVALUATION_COMPLETE then ends the turn",
    );

    const orchestratorEvents = readOrchestratorEvents(output);
    assert.ok(
      hasEvent(orchestratorEvents, "intervention_requested"),
      "Trace should contain intervention_requested",
    );
    assert.ok(
      hasEvent(orchestratorEvents, "complete_requested"),
      "Trace should contain complete_requested for mid-turn EVALUATION_COMPLETE",
    );
  });
});