@forwardimpact/libeval 0.1.9 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,369 +0,0 @@
1
- import { describe, test } from "node:test";
2
- import assert from "node:assert";
3
- import { PassThrough } from "node:stream";
4
-
5
- import {
6
- Supervisor,
7
- createSupervisor,
8
- SUPERVISOR_SYSTEM_PROMPT,
9
- AGENT_SYSTEM_PROMPT,
10
- } from "@forwardimpact/libeval";
11
- import { createMockRunner } from "./mock-runner.js";
12
-
13
- describe("Supervisor - output and events", () => {
14
- test("output contains tagged lines with correct source and turn", async () => {
15
- const supervisorMessages = [
16
- [{ type: "assistant", content: "Go ahead" }],
17
- [{ type: "assistant", content: "EVALUATION_COMPLETE" }],
18
- ];
19
- const agentMessages = [[{ type: "assistant", content: "Working" }]];
20
-
21
- const supervisorRunner = createMockRunner(
22
- [{ text: "Go ahead" }, { text: "EVALUATION_COMPLETE" }],
23
- supervisorMessages,
24
- );
25
- const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
26
-
27
- const output = new PassThrough();
28
- const supervisor = new Supervisor({
29
- agentRunner,
30
- supervisorRunner,
31
- output,
32
- maxTurns: 10,
33
- });
34
- agentRunner.onLine = (line) => supervisor.emitLine(line);
35
- supervisorRunner.onLine = (line) => supervisor.emitLine(line);
36
-
37
- await supervisor.run("Task");
38
-
39
- const data = output.read()?.toString() ?? "";
40
- const lines = data
41
- .trim()
42
- .split("\n")
43
- .filter((l) => l.length > 0);
44
-
45
- // Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
46
- assert.ok(lines.length >= 4);
47
-
48
- const supervisorLine = JSON.parse(lines[0]);
49
- assert.strictEqual(supervisorLine.source, "supervisor");
50
- assert.strictEqual(supervisorLine.turn, 0);
51
- assert.ok("event" in supervisorLine);
52
-
53
- const agentLine = JSON.parse(lines[1]);
54
- assert.strictEqual(agentLine.source, "agent");
55
- assert.strictEqual(agentLine.turn, 1);
56
- assert.ok("event" in agentLine);
57
-
58
- const summaryLine = JSON.parse(lines[lines.length - 1]);
59
- assert.strictEqual(summaryLine.source, "orchestrator");
60
- assert.strictEqual(summaryLine.type, "summary");
61
- assert.strictEqual(summaryLine.success, true);
62
- });
63
-
64
- test("events are nested under event key (no field collisions)", async () => {
65
- const sourceEvent = {
66
- type: "assistant",
67
- source: "sdk-internal",
68
- content: "test",
69
- };
70
- const supervisorRunner = createMockRunner(
71
- [{ text: "Go" }, { text: "EVALUATION_COMPLETE" }],
72
- [
73
- [{ type: "assistant", content: "Go" }],
74
- [{ type: "assistant", content: "ok" }],
75
- ],
76
- );
77
- const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
78
-
79
- const output = new PassThrough();
80
- const supervisor = new Supervisor({
81
- agentRunner,
82
- supervisorRunner,
83
- output,
84
- maxTurns: 10,
85
- });
86
- agentRunner.onLine = (line) => supervisor.emitLine(line);
87
- supervisorRunner.onLine = (line) => supervisor.emitLine(line);
88
-
89
- await supervisor.run("Task");
90
-
91
- const data = output.read()?.toString() ?? "";
92
- const lines = data
93
- .trim()
94
- .split("\n")
95
- .filter((l) => l.length > 0);
96
-
97
- // First line is supervisor turn 0, second is agent turn 1
98
- const tagged = JSON.parse(lines[1]);
99
- assert.strictEqual(tagged.source, "agent");
100
- assert.strictEqual(tagged.event.source, "sdk-internal");
101
- });
102
-
103
- test("mid-turn intervention emits orchestrator events and shares the agent's turn id", async () => {
104
- // Agent emits one structured assistant text block on its first call —
105
- // supervisor intervenes mid-turn. Resume then completes naturally and
106
- // the end-of-turn review signals EVALUATION_COMPLETE.
107
- const agentMessages = [
108
- [
109
- {
110
- type: "assistant",
111
- message: {
112
- content: [{ type: "text", text: "Trying the wrong thing." }],
113
- },
114
- },
115
- ],
116
- [
117
- {
118
- type: "assistant",
119
- message: {
120
- content: [{ type: "text", text: "Switching to the right thing." }],
121
- },
122
- },
123
- ],
124
- ];
125
-
126
- const supervisorMessages = [
127
- undefined,
128
- [
129
- {
130
- type: "assistant",
131
- message: {
132
- content: [
133
- {
134
- type: "text",
135
- text: "EVALUATION_INTERVENTION Switch to the right path.",
136
- },
137
- ],
138
- },
139
- },
140
- ],
141
- undefined,
142
- undefined,
143
- ];
144
-
145
- const agentRunner = createMockRunner(
146
- [{ text: "Trying the wrong thing." }, { text: "Switching." }],
147
- agentMessages,
148
- );
149
- agentRunner.batchSize = 1;
150
- const supervisorRunner = createMockRunner(
151
- [
152
- { text: "Welcome." },
153
- { text: "EVALUATION_INTERVENTION Switch to the right path." },
154
- { text: "Keep going." },
155
- { text: "Done. EVALUATION_COMPLETE" },
156
- ],
157
- supervisorMessages,
158
- );
159
-
160
- const output = new PassThrough();
161
- const supervisor = new Supervisor({
162
- agentRunner,
163
- supervisorRunner,
164
- output,
165
- maxTurns: 10,
166
- });
167
- agentRunner.onLine = (line) => supervisor.emitLine(line);
168
- supervisorRunner.onLine = (line) => supervisor.emitLine(line);
169
-
170
- const result = await supervisor.run("Task");
171
- assert.strictEqual(result.success, true);
172
-
173
- const lines = (output.read()?.toString() ?? "")
174
- .trim()
175
- .split("\n")
176
- .filter((l) => l.length > 0)
177
- .map((l) => JSON.parse(l));
178
-
179
- // (1) Orchestrator event with intervention_requested.
180
- const interventionRequested = lines.find(
181
- (l) =>
182
- l.source === "orchestrator" &&
183
- l.event?.type === "intervention_requested",
184
- );
185
- assert.ok(
186
- interventionRequested,
187
- "Trace must contain intervention_requested orchestrator event",
188
- );
189
-
190
- // (2) At least one agent line and one supervisor line share a turn id —
191
- // mid-turn supervisor activity is tagged with the agent's turn.
192
- const agentTurns = new Set(
193
- lines.filter((l) => l.source === "agent").map((l) => l.turn),
194
- );
195
- const supervisorTurns = new Set(
196
- lines.filter((l) => l.source === "supervisor").map((l) => l.turn),
197
- );
198
- const sharedTurns = [...agentTurns].filter((t) => supervisorTurns.has(t));
199
- assert.ok(
200
- sharedTurns.length > 0,
201
- "At least one turn id must appear on both agent and supervisor lines",
202
- );
203
-
204
- // (3) Final summary line still emitted.
205
- const summary = lines[lines.length - 1];
206
- assert.strictEqual(summary.source, "orchestrator");
207
- assert.strictEqual(summary.type, "summary");
208
- assert.strictEqual(summary.success, true);
209
- });
210
-
211
- test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
212
- const supervisorMessages = [
213
- [{ type: "assistant", content: "Starting..." }],
214
- ];
215
- const supervisorRunner = createMockRunner(
216
- [{ text: "Starting...", success: false }],
217
- supervisorMessages,
218
- );
219
-
220
- const origRun = supervisorRunner.run;
221
- supervisorRunner.run = async (task) => {
222
- const result = await origRun.call(supervisorRunner, task);
223
- return { ...result, error: new Error("Process exited with code 1") };
224
- };
225
-
226
- const agentRunner = createMockRunner([]);
227
-
228
- const output = new PassThrough();
229
- const supervisor = new Supervisor({
230
- agentRunner,
231
- supervisorRunner,
232
- output,
233
- maxTurns: 10,
234
- });
235
- agentRunner.onLine = (line) => supervisor.emitLine(line);
236
- supervisorRunner.onLine = (line) => supervisor.emitLine(line);
237
-
238
- const result = await supervisor.run("Task");
239
-
240
- assert.strictEqual(result.success, false);
241
- assert.strictEqual(result.turns, 0);
242
-
243
- const data = output.read()?.toString() ?? "";
244
- const lines = data
245
- .trim()
246
- .split("\n")
247
- .filter((l) => l.length > 0);
248
-
249
- assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");
250
-
251
- const supervisorLine = JSON.parse(lines[0]);
252
- assert.strictEqual(supervisorLine.source, "supervisor");
253
- assert.strictEqual(supervisorLine.turn, 0);
254
-
255
- const summaryLine = JSON.parse(lines[lines.length - 1]);
256
- assert.strictEqual(summaryLine.source, "orchestrator");
257
- assert.strictEqual(summaryLine.success, false);
258
- assert.strictEqual(summaryLine.turns, 0);
259
- });
260
- });
261
-
262
- describe("Supervisor - createSupervisor factory", () => {
263
- test("createSupervisor factory returns a Supervisor instance", () => {
264
- const supervisor = createSupervisor({
265
- supervisorCwd: "/tmp/sup",
266
- agentCwd: "/tmp/agent",
267
- query: async function* () {},
268
- output: new PassThrough(),
269
- });
270
- assert.ok(supervisor instanceof Supervisor);
271
- });
272
-
273
- test("createSupervisor uses default supervisor tools when none specified", () => {
274
- const supervisor = createSupervisor({
275
- supervisorCwd: "/tmp/sup",
276
- agentCwd: "/tmp/agent",
277
- query: async function* () {},
278
- output: new PassThrough(),
279
- });
280
- assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
281
- "Bash",
282
- "Read",
283
- "Glob",
284
- "Grep",
285
- "Write",
286
- "Edit",
287
- ]);
288
- });
289
-
290
- test("createSupervisor passes custom supervisor tools", () => {
291
- const supervisor = createSupervisor({
292
- supervisorCwd: "/tmp/sup",
293
- agentCwd: "/tmp/agent",
294
- query: async function* () {},
295
- output: new PassThrough(),
296
- supervisorAllowedTools: ["Read", "Glob", "Grep"],
297
- });
298
- assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
299
- "Read",
300
- "Glob",
301
- "Grep",
302
- ]);
303
- });
304
-
305
- test("createSupervisor wires system prompts to both runners", () => {
306
- const supervisor = createSupervisor({
307
- supervisorCwd: "/tmp/sup",
308
- agentCwd: "/tmp/agent",
309
- query: async function* () {},
310
- output: new PassThrough(),
311
- });
312
-
313
- assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, {
314
- type: "preset",
315
- preset: "claude_code",
316
- append: AGENT_SYSTEM_PROMPT,
317
- });
318
- assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, {
319
- type: "preset",
320
- preset: "claude_code",
321
- append: SUPERVISOR_SYSTEM_PROMPT,
322
- });
323
- });
324
-
325
- test("createSupervisor blocks sub-agent spawn tools on supervisor by default", () => {
326
- const supervisor = createSupervisor({
327
- supervisorCwd: "/tmp/sup",
328
- agentCwd: "/tmp/agent",
329
- query: async function* () {},
330
- output: new PassThrough(),
331
- });
332
- assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [
333
- "Agent",
334
- "Task",
335
- "TaskOutput",
336
- "TaskStop",
337
- ]);
338
- assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []);
339
- });
340
-
341
- test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => {
342
- const supervisor = createSupervisor({
343
- supervisorCwd: "/tmp/sup",
344
- agentCwd: "/tmp/agent",
345
- query: async function* () {},
346
- output: new PassThrough(),
347
- supervisorDisallowedTools: ["WebSearch", "Task"],
348
- });
349
- const disallowed = supervisor.supervisorRunner.disallowedTools;
350
- assert.ok(disallowed.includes("Agent"));
351
- assert.ok(disallowed.includes("Task"));
352
- assert.ok(disallowed.includes("TaskOutput"));
353
- assert.ok(disallowed.includes("TaskStop"));
354
- assert.ok(disallowed.includes("WebSearch"));
355
- assert.strictEqual(disallowed.length, new Set(disallowed).size);
356
- });
357
-
358
- test("system prompt constants are non-empty strings", () => {
359
- assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string");
360
- assert.ok(typeof AGENT_SYSTEM_PROMPT === "string");
361
- assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0);
362
- assert.ok(AGENT_SYSTEM_PROMPT.length > 0);
363
- });
364
-
365
- test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => {
366
- assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay"));
367
- assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_COMPLETE"));
368
- });
369
- });
@@ -1,310 +0,0 @@
1
- import { describe, test } from "node:test";
2
- import assert from "node:assert";
3
- import { PassThrough } from "node:stream";
4
-
5
- import { Supervisor } from "@forwardimpact/libeval";
6
- import { isComplete } from "../src/supervisor.js";
7
- import { createMockRunner } from "./mock-runner.js";
8
-
9
- describe("isComplete", () => {
10
- test("detects EVALUATION_COMPLETE on its own line", () => {
11
- assert.strictEqual(isComplete("EVALUATION_COMPLETE"), true);
12
- assert.strictEqual(
13
- isComplete("Some text\nEVALUATION_COMPLETE\nMore text"),
14
- true,
15
- );
16
- assert.strictEqual(isComplete("Done.\n\nEVALUATION_COMPLETE"), true);
17
- });
18
-
19
- test("tolerates markdown formatting around the signal", () => {
20
- assert.strictEqual(isComplete("**EVALUATION_COMPLETE**"), true);
21
- assert.strictEqual(isComplete("*EVALUATION_COMPLETE*"), true);
22
- assert.strictEqual(isComplete("__EVALUATION_COMPLETE__"), true);
23
- assert.strictEqual(isComplete("_EVALUATION_COMPLETE_"), true);
24
- assert.strictEqual(isComplete("`EVALUATION_COMPLETE`"), true);
25
- assert.strictEqual(
26
- isComplete("Good work.\n\n**EVALUATION_COMPLETE**\n\nNow filing issues."),
27
- true,
28
- );
29
- });
30
-
31
- test("matches EVALUATION_COMPLETE anywhere in text", () => {
32
- assert.strictEqual(isComplete("not EVALUATION_COMPLETE yet"), true);
33
- assert.strictEqual(
34
- isComplete("The agent is EVALUATION_COMPLETE done"),
35
- true,
36
- );
37
- assert.strictEqual(
38
- isComplete("Great work! EVALUATION_COMPLETE. Now filing issues."),
39
- true,
40
- );
41
- });
42
-
43
- test("does not match empty or unrelated text", () => {
44
- assert.strictEqual(isComplete(""), false);
45
- assert.strictEqual(isComplete("All done!"), false);
46
- assert.strictEqual(isComplete("DONE"), false);
47
- });
48
-
49
- test("does not match old EVALUATION_SUCCESSFUL signal", () => {
50
- assert.strictEqual(isComplete("EVALUATION_SUCCESSFUL"), false);
51
- });
52
- });
53
-
54
- describe("Supervisor - run and turns", () => {
55
- test("constructor throws on missing agentRunner", () => {
56
- assert.throws(
57
- () =>
58
- new Supervisor({
59
- supervisorRunner: createMockRunner([]),
60
- output: new PassThrough(),
61
- }),
62
- /agentRunner is required/,
63
- );
64
- });
65
-
66
- test("constructor throws on missing supervisorRunner", () => {
67
- assert.throws(
68
- () =>
69
- new Supervisor({
70
- agentRunner: createMockRunner([]),
71
- output: new PassThrough(),
72
- }),
73
- /supervisorRunner is required/,
74
- );
75
- });
76
-
77
- test("constructor throws on missing output", () => {
78
- assert.throws(
79
- () =>
80
- new Supervisor({
81
- agentRunner: createMockRunner([]),
82
- supervisorRunner: createMockRunner([]),
83
- }),
84
- /output is required/,
85
- );
86
- });
87
-
88
- test("completes on EVALUATION_COMPLETE from supervisor at turn 0", async () => {
89
- const agentRunner = createMockRunner([]);
90
-
91
- const supervisorRunner = createMockRunner([
92
- { text: "EVALUATION_COMPLETE" },
93
- ]);
94
-
95
- const output = new PassThrough();
96
- const supervisor = new Supervisor({
97
- agentRunner,
98
- supervisorRunner,
99
- output,
100
- maxTurns: 10,
101
- });
102
-
103
- const result = await supervisor.run("Install stuff");
104
-
105
- assert.strictEqual(result.success, true);
106
- assert.strictEqual(result.turns, 0);
107
- });
108
-
109
- test("completes after one agent turn", async () => {
110
- const agentRunner = createMockRunner([
111
- { text: "I installed the packages." },
112
- ]);
113
-
114
- const supervisorRunner = createMockRunner([
115
- { text: "Welcome! Please install the packages." },
116
- { text: "Good work.\n\nEVALUATION_COMPLETE" },
117
- ]);
118
-
119
- const output = new PassThrough();
120
- const supervisor = new Supervisor({
121
- agentRunner,
122
- supervisorRunner,
123
- output,
124
- maxTurns: 10,
125
- });
126
-
127
- const result = await supervisor.run("Install stuff");
128
-
129
- assert.strictEqual(result.success, true);
130
- assert.strictEqual(result.turns, 1);
131
- });
132
-
133
- test("detects EVALUATION_COMPLETE in streamed messages when result text differs", async () => {
134
- const agentRunner = createMockRunner([
135
- { text: "I installed the packages." },
136
- ]);
137
-
138
- const supervisorMessages = [
139
- undefined,
140
- [
141
- {
142
- type: "assistant",
143
- message: {
144
- content: [
145
- {
146
- type: "text",
147
- text: "Good work.\n\nEVALUATION_COMPLETE\n\nNow filing issues.",
148
- },
149
- ],
150
- },
151
- },
152
- {
153
- type: "assistant",
154
- message: {
155
- content: [
156
- { type: "text", text: "## Summary\n\nAll issues filed." },
157
- ],
158
- },
159
- },
160
- ],
161
- ];
162
-
163
- const supervisorRunner = createMockRunner(
164
- [
165
- { text: "Welcome! Please install the packages." },
166
- { text: "## Summary\n\nAll issues filed." },
167
- ],
168
- supervisorMessages,
169
- );
170
-
171
- const output = new PassThrough();
172
- const supervisor = new Supervisor({
173
- agentRunner,
174
- supervisorRunner,
175
- output,
176
- maxTurns: 10,
177
- });
178
- agentRunner.onLine = (line) => supervisor.emitLine(line);
179
- supervisorRunner.onLine = (line) => supervisor.emitLine(line);
180
-
181
- const result = await supervisor.run("Install stuff");
182
-
183
- assert.strictEqual(result.success, true);
184
- assert.strictEqual(result.turns, 1);
185
- });
186
-
187
- test("relays only the last assistant text block to the agent", async () => {
188
- // Supervisor emits reasoning text ("Let me research...") then a tool call,
189
- // then a final task message. Only the final message should reach the agent.
190
- const supervisorMessages = [
191
- // Turn 0: multiple assistant messages with reasoning + task
192
- [
193
- {
194
- type: "assistant",
195
- message: {
196
- content: [
197
- { type: "text", text: "Let me research the product first." },
198
- ],
199
- },
200
- },
201
- {
202
- type: "assistant",
203
- message: {
204
- content: [
205
- {
206
- type: "text",
207
- text: "Hello! Here is your task: install the packages.",
208
- },
209
- ],
210
- },
211
- },
212
- ],
213
- // Turn 1: evaluation
214
- undefined,
215
- ];
216
-
217
- let capturedAgentPrompt = null;
218
- const agentRunner = createMockRunner([
219
- { text: "I installed the packages." },
220
- ]);
221
- const origRun = agentRunner.run;
222
- agentRunner.run = async (task) => {
223
- capturedAgentPrompt = task;
224
- return origRun.call(agentRunner, task);
225
- };
226
-
227
- const supervisorRunner = createMockRunner(
228
- [
229
- // SDK result text = last message text (but relay should use buffer)
230
- { text: "Hello! Here is your task: install the packages." },
231
- { text: "EVALUATION_COMPLETE" },
232
- ],
233
- supervisorMessages,
234
- );
235
-
236
- const output = new PassThrough();
237
- const supervisor = new Supervisor({
238
- agentRunner,
239
- supervisorRunner,
240
- output,
241
- maxTurns: 10,
242
- });
243
-
244
- await supervisor.run("Evaluate the product");
245
-
246
- // Agent should receive only the final text, not the reasoning
247
- assert.strictEqual(
248
- capturedAgentPrompt,
249
- "Hello! Here is your task: install the packages.",
250
- );
251
- assert.ok(
252
- !capturedAgentPrompt.includes("research"),
253
- "Reasoning text should not leak to agent",
254
- );
255
- });
256
-
257
- test("runs multiple turns before completion", async () => {
258
- const agentRunner = createMockRunner([
259
- { text: "Started working." },
260
- { text: "Made progress." },
261
- { text: "Finished everything." },
262
- ]);
263
-
264
- const supervisorRunner = createMockRunner([
265
- { text: "Here is your task. Do the work." },
266
- { text: "Keep going, you need to do more." },
267
- { text: "Almost there, continue." },
268
- { text: "EVALUATION_COMPLETE" },
269
- ]);
270
-
271
- const output = new PassThrough();
272
- const supervisor = new Supervisor({
273
- agentRunner,
274
- supervisorRunner,
275
- output,
276
- maxTurns: 10,
277
- });
278
-
279
- const result = await supervisor.run("Do the work");
280
-
281
- assert.strictEqual(result.success, true);
282
- assert.strictEqual(result.turns, 3);
283
- });
284
-
285
- test("enforces maxTurns limit", async () => {
286
- const agentRunner = createMockRunner([
287
- { text: "Turn 1" },
288
- { text: "Turn 2" },
289
- ]);
290
-
291
- const supervisorRunner = createMockRunner([
292
- { text: "Start working." },
293
- { text: "Continue." },
294
- { text: "Continue." },
295
- ]);
296
-
297
- const output = new PassThrough();
298
- const supervisor = new Supervisor({
299
- agentRunner,
300
- supervisorRunner,
301
- output,
302
- maxTurns: 2,
303
- });
304
-
305
- const result = await supervisor.run("Endless task");
306
-
307
- assert.strictEqual(result.success, false);
308
- assert.strictEqual(result.turns, 2);
309
- });
310
- });