@forwardimpact/libeval 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,342 @@
1
+ import { describe, test } from "node:test";
2
+ import assert from "node:assert";
3
+ import { PassThrough } from "node:stream";
4
+
5
+ import {
6
+ AgentRunner,
7
+ Supervisor,
8
+ createSupervisor,
9
+ } from "@forwardimpact/libeval";
10
+ import { isDone } from "../src/supervisor.js";
11
+
/**
 * Create a mock AgentRunner that replays pre-scripted responses.
 *
 * Every call to run() or resume() consumes the next entry of `responses`
 * (both methods share one cursor). For each turn the matching entry of
 * `messages` — or, when absent, a single synthetic assistant message built
 * from the response text — is serialized to JSON, appended to
 * `runner.buffer`, and forwarded to `runner.onLine` when one is set.
 *
 * @param {object[]} responses - Array of {text, success} objects
 * @param {object[]} [messages] - Messages to buffer per turn
 * @returns {AgentRunner}
 * @throws {Error} (as a rejected promise from run()/resume()) when more
 *   calls are made than responses were scripted
 */
function createMockRunner(responses, messages) {
  const output = new PassThrough();
  let callIndex = 0;

  const runner = new AgentRunner({
    cwd: "/tmp",
    query: async function* () {},
    output,
  });

  // Shared replay step for run() and resume(): advance the cursor, buffer
  // and forward this turn's messages, and hand back the scripted response.
  const playNextTurn = (method) => {
    const turn = callIndex++;
    const resp = responses[turn];
    if (resp === undefined) {
      // Fail with an explicit message instead of a TypeError on `undefined`.
      throw new Error(
        `Mock runner: ${method}() call #${turn + 1} has no scripted response ` +
          `(${responses.length} provided)`,
      );
    }

    const msgs = messages?.[turn] ?? [
      { type: "assistant", content: resp.text },
    ];
    for (const m of msgs) {
      const line = JSON.stringify(m);
      runner.buffer.push(line);
      if (runner.onLine) {
        runner.onLine(line);
      }
    }
    return resp;
  };

  // Override run and resume to return scripted responses.
  runner.run = async (_task) => {
    const resp = playNextTurn("run");
    // Only run() establishes the session id; resume() leaves it untouched.
    runner.sessionId = "mock-session";
    return {
      success: resp.success ?? true,
      text: resp.text,
      sessionId: "mock-session",
    };
  };

  runner.resume = async (_prompt) => {
    const resp = playNextTurn("resume");
    return { success: resp.success ?? true, text: resp.text };
  };

  return runner;
}
63
+
describe("isDone", () => {
  // Assert that isDone() returns `expected` for each sample, in order.
  const expectAll = (samples, expected) => {
    for (const sample of samples) {
      assert.strictEqual(isDone(sample), expected);
    }
  };

  test("detects EVALUATION_COMPLETE on its own line", () => {
    expectAll(
      [
        "EVALUATION_COMPLETE",
        "Some text\nEVALUATION_COMPLETE\nMore text",
        "Done.\n\nEVALUATION_COMPLETE",
      ],
      true,
    );
  });

  test("does not match EVALUATION_COMPLETE embedded in text", () => {
    expectAll(
      [
        "not EVALUATION_COMPLETE yet",
        "The agent is EVALUATION_COMPLETE done",
        "EVALUATION_COMPLETE_EXTRA",
      ],
      false,
    );
  });

  test("does not match empty or unrelated text", () => {
    expectAll(["", "All done!", "DONE"], false);
  });
});
86
+
describe("Supervisor", () => {
  /**
   * Build a Supervisor around the two runners with a fresh PassThrough as
   * its output stream.
   * @param {AgentRunner} agentRunner
   * @param {AgentRunner} supervisorRunner
   * @param {number} [maxTurns]
   * @returns {{supervisor: Supervisor, output: PassThrough}}
   */
  const buildSupervisor = (agentRunner, supervisorRunner, maxTurns = 10) => {
    const output = new PassThrough();
    const supervisor = new Supervisor({
      agentRunner,
      supervisorRunner,
      output,
      maxTurns,
    });
    return { supervisor, output };
  };

  /**
   * Route every line the given runners report through supervisor.emitLine().
   * @param {Supervisor} supervisor
   * @param {...AgentRunner} runners
   */
  const forwardLines = (supervisor, ...runners) => {
    for (const runner of runners) {
      runner.onLine = (line) => supervisor.emitLine(line);
    }
  };

  /**
   * Drain what is currently buffered on `output` and split it into
   * non-empty lines.
   * @param {PassThrough} output
   * @returns {string[]}
   */
  const readOutputLines = (output) => {
    const data = output.read()?.toString() ?? "";
    return data
      .trim()
      .split("\n")
      .filter((l) => l.length > 0);
  };

  test("constructor throws on missing agentRunner", () => {
    assert.throws(
      () =>
        new Supervisor({
          supervisorRunner: createMockRunner([]),
          output: new PassThrough(),
        }),
      /agentRunner is required/,
    );
  });

  test("constructor throws on missing supervisorRunner", () => {
    assert.throws(
      () =>
        new Supervisor({
          agentRunner: createMockRunner([]),
          output: new PassThrough(),
        }),
      /supervisorRunner is required/,
    );
  });

  test("constructor throws on missing output", () => {
    assert.throws(
      () =>
        new Supervisor({
          agentRunner: createMockRunner([]),
          supervisorRunner: createMockRunner([]),
        }),
      /output is required/,
    );
  });

  test("completes on EVALUATION_COMPLETE from supervisor", async () => {
    const agentRunner = createMockRunner([
      { text: "I installed the packages." },
    ]);
    const supervisorRunner = createMockRunner([
      { text: "Good work.\n\nEVALUATION_COMPLETE" },
    ]);
    const { supervisor } = buildSupervisor(agentRunner, supervisorRunner);

    const result = await supervisor.run("Install stuff");

    assert.strictEqual(result.success, true);
    assert.strictEqual(result.turns, 1);
  });

  test("runs multiple turns before completion", async () => {
    const agentRunner = createMockRunner([
      { text: "Started working." },
      { text: "Made progress." },
      { text: "Finished everything." },
    ]);
    const supervisorRunner = createMockRunner([
      { text: "Keep going, you need to do more." },
      { text: "Almost there, continue." },
      { text: "EVALUATION_COMPLETE" },
    ]);
    const { supervisor } = buildSupervisor(agentRunner, supervisorRunner);

    const result = await supervisor.run("Do the work");

    assert.strictEqual(result.success, true);
    assert.strictEqual(result.turns, 3);
  });

  test("enforces maxTurns limit", async () => {
    // Agent responds to every turn, supervisor never says done
    const agentRunner = createMockRunner([
      { text: "Turn 0" },
      { text: "Turn 1" },
      { text: "Turn 2" },
    ]);
    const supervisorRunner = createMockRunner([
      { text: "Continue." },
      { text: "Continue." },
    ]);
    const { supervisor } = buildSupervisor(agentRunner, supervisorRunner, 2);

    const result = await supervisor.run("Endless task");

    assert.strictEqual(result.success, false);
    assert.strictEqual(result.turns, 2);
  });

  test("output contains tagged lines with correct source and turn", async () => {
    const agentMessages = [[{ type: "assistant", content: "Working" }]];
    const supervisorMessages = [
      [{ type: "assistant", content: "EVALUATION_COMPLETE" }],
    ];

    const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
    const supervisorRunner = createMockRunner(
      [{ text: "EVALUATION_COMPLETE" }],
      supervisorMessages,
    );
    const { supervisor, output } = buildSupervisor(
      agentRunner,
      supervisorRunner,
    );
    forwardLines(supervisor, agentRunner, supervisorRunner);

    await supervisor.run("Task");

    const lines = readOutputLines(output);

    // Should have: agent turn 0, supervisor turn 1, orchestrator summary
    assert.ok(lines.length >= 3);

    const agentLine = JSON.parse(lines[0]);
    assert.strictEqual(agentLine.source, "agent");
    assert.strictEqual(agentLine.turn, 0);
    assert.ok("event" in agentLine);

    const supervisorLine = JSON.parse(lines[1]);
    assert.strictEqual(supervisorLine.source, "supervisor");
    assert.strictEqual(supervisorLine.turn, 1);
    assert.ok("event" in supervisorLine);

    const summaryLine = JSON.parse(lines[lines.length - 1]);
    assert.strictEqual(summaryLine.source, "orchestrator");
    assert.strictEqual(summaryLine.type, "summary");
    assert.strictEqual(summaryLine.success, true);
  });

  test("events are nested under event key (no field collisions)", async () => {
    const sourceEvent = {
      type: "assistant",
      source: "sdk-internal",
      content: "test",
    };
    const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
    const supervisorRunner = createMockRunner(
      [{ text: "EVALUATION_COMPLETE" }],
      [[{ type: "assistant", content: "ok" }]],
    );
    const { supervisor, output } = buildSupervisor(
      agentRunner,
      supervisorRunner,
    );
    forwardLines(supervisor, agentRunner, supervisorRunner);

    await supervisor.run("Task");

    const lines = readOutputLines(output);
    // Fail as an assertion (not a JSON.parse SyntaxError) if nothing was emitted.
    assert.ok(lines.length >= 1, "Expected at least one emitted line");

    const tagged = JSON.parse(lines[0]);
    // The original event's `source` field is preserved inside `event`
    assert.strictEqual(tagged.source, "agent");
    assert.strictEqual(tagged.event.source, "sdk-internal");
  });

  test("emits agent output and summary when agent errors on turn 0", async () => {
    const agentMessages = [[{ type: "assistant", content: "Partial work" }]];
    const agentRunner = createMockRunner(
      [{ text: "Partial work", success: false }],
      agentMessages,
    );

    // Override run to simulate an error return
    const scriptedRun = agentRunner.run;
    agentRunner.run = async (task) => {
      const result = await scriptedRun.call(agentRunner, task);
      return { ...result, error: new Error("Process exited with code 1") };
    };

    const supervisorRunner = createMockRunner([]);
    const { supervisor, output } = buildSupervisor(
      agentRunner,
      supervisorRunner,
    );
    forwardLines(supervisor, agentRunner, supervisorRunner);

    const result = await supervisor.run("Task");

    assert.strictEqual(result.success, false);
    assert.strictEqual(result.turns, 0);

    // Output should still contain the agent's buffered lines + summary
    const lines = readOutputLines(output);
    assert.ok(lines.length >= 2, "Expected at least agent line + summary");

    const agentLine = JSON.parse(lines[0]);
    assert.strictEqual(agentLine.source, "agent");
    assert.strictEqual(agentLine.turn, 0);

    const summaryLine = JSON.parse(lines[lines.length - 1]);
    assert.strictEqual(summaryLine.source, "orchestrator");
    assert.strictEqual(summaryLine.success, false);
    assert.strictEqual(summaryLine.turns, 0);
  });

  test("createSupervisor factory returns a Supervisor instance", () => {
    const supervisor = createSupervisor({
      supervisorCwd: "/tmp/sup",
      agentCwd: "/tmp/agent",
      query: async function* () {},
      output: new PassThrough(),
    });
    assert.ok(supervisor instanceof Supervisor);
  });
});
@@ -0,0 +1,326 @@
1
+ import { describe, test } from "node:test";
2
+ import assert from "node:assert";
3
+ import { PassThrough } from "node:stream";
4
+
5
+ import { TeeWriter, createTeeWriter } from "@forwardimpact/libeval";
6
+
/**
 * Read whatever is currently buffered on a PassThrough stream and return it
 * as a string; an empty string is returned when read() yields nothing.
 * @param {PassThrough} stream
 * @returns {string}
 */
function collect(stream) {
  const chunk = stream.read();
  if (!chunk) {
    return "";
  }
  return chunk.toString();
}
16
+
/**
 * Write each line (newline-terminated) to the writer, then end the writer
 * and resolve once its end callback has fired.
 * @param {TeeWriter} writer
 * @param {string[]} lines - JSON lines to write
 */
async function writeLines(writer, lines) {
  for (const entry of lines) {
    writer.write(`${entry}\n`);
  }
  await new Promise((done) => {
    writer.end(done);
  });
}
28
+
describe("TeeWriter", () => {
  /**
   * Create a TeeWriter in the given mode, backed by two fresh PassThrough
   * streams.
   * @param {string} mode - "raw" or "supervised"
   * @returns {{fileStream: PassThrough, textStream: PassThrough, writer: TeeWriter}}
   */
  const createWriter = (mode) => {
    const fileStream = new PassThrough();
    const textStream = new PassThrough();
    const writer = new TeeWriter({ fileStream, textStream, mode });
    return { fileStream, textStream, writer };
  };

  /**
   * Build an assistant event carrying a single text content block.
   * @param {string} text
   * @param {object} [usage]
   * @returns {object}
   */
  const textEvent = (text, usage = { input_tokens: 10, output_tokens: 5 }) => ({
    type: "assistant",
    message: {
      content: [{ type: "text", text }],
      usage,
    },
  });

  /**
   * Build an assistant event carrying a single Bash tool_use content block.
   * @param {object} input
   * @param {object} [usage]
   * @returns {object}
   */
  const toolEvent = (input, usage = { input_tokens: 10, output_tokens: 5 }) => ({
    type: "assistant",
    message: {
      content: [{ type: "tool_use", id: "t1", name: "Bash", input }],
      usage,
    },
  });

  test("constructor throws on missing fileStream", () => {
    assert.throws(
      () => new TeeWriter({ textStream: new PassThrough() }),
      /fileStream is required/,
    );
  });

  test("constructor throws on missing textStream", () => {
    assert.throws(
      () => new TeeWriter({ fileStream: new PassThrough() }),
      /textStream is required/,
    );
  });

  test("writes NDJSON to fileStream and text to textStream in raw mode", async () => {
    const { fileStream, textStream, writer } = createWriter("raw");

    const events = [
      JSON.stringify({
        type: "system",
        subtype: "init",
        session_id: "s1",
        model: "opus",
      }),
      JSON.stringify(textEvent("Hello world")),
      JSON.stringify(
        toolEvent({ command: "ls" }, { input_tokens: 20, output_tokens: 10 }),
      ),
      JSON.stringify({
        type: "result",
        subtype: "success",
        duration_ms: 5000,
        num_turns: 2,
        total_cost_usd: 0.05,
        usage: { input_tokens: 30, output_tokens: 15 },
      }),
    ];

    await writeLines(writer, events);

    const fileData = collect(fileStream);
    const textData = collect(textStream);

    // File should contain all NDJSON lines
    const fileLines = fileData.trim().split("\n");
    assert.strictEqual(fileLines.length, 4);
    assert.strictEqual(JSON.parse(fileLines[0]).type, "system");
    assert.strictEqual(JSON.parse(fileLines[3]).type, "result");

    // Text should contain human-readable output
    assert.ok(textData.includes("Hello world"));
    assert.ok(textData.includes("> Tool: Bash"));
    assert.ok(textData.includes("--- Result: success"));
  });

  test("streams text incrementally as events arrive", async () => {
    const { textStream, writer } = createWriter("raw");

    // Write first assistant message
    writer.write(`${JSON.stringify(textEvent("First message"))}\n`);

    // Text should be available before stream ends
    const firstText = collect(textStream);
    assert.ok(firstText.includes("First message"));

    writer.write(
      `${JSON.stringify(
        textEvent("Second message", { input_tokens: 20, output_tokens: 10 }),
      )}\n`,
    );

    const secondText = collect(textStream);
    assert.ok(secondText.includes("Second message"));

    await new Promise((resolve) => writer.end(resolve));
  });

  test("supervised mode shows source labels and unwraps events", async () => {
    const { fileStream, textStream, writer } = createWriter("supervised");

    const events = [
      JSON.stringify({
        source: "agent",
        turn: 0,
        event: textEvent("Working on it"),
      }),
      JSON.stringify({
        source: "supervisor",
        turn: 1,
        event: textEvent("Looks good", {
          input_tokens: 20,
          output_tokens: 10,
        }),
      }),
      JSON.stringify({
        source: "orchestrator",
        type: "summary",
        success: true,
        turns: 1,
      }),
    ];

    await writeLines(writer, events);

    const fileData = collect(fileStream);
    const textData = collect(textStream);

    // File should contain all raw tagged NDJSON
    const fileLines = fileData.trim().split("\n");
    assert.strictEqual(fileLines.length, 3);
    assert.strictEqual(JSON.parse(fileLines[0]).source, "agent");

    // Text should show source labels
    assert.ok(textData.includes("[agent]"));
    assert.ok(textData.includes("Working on it"));
    assert.ok(textData.includes("[supervisor]"));
    assert.ok(textData.includes("Looks good"));
    assert.ok(textData.includes("Evaluation completed after 1 turns"));
  });

  test("supervised mode shows incomplete status on failure", async () => {
    const { textStream, writer } = createWriter("supervised");

    await writeLines(writer, [
      JSON.stringify({
        source: "orchestrator",
        type: "summary",
        success: false,
        turns: 5,
      }),
    ]);

    const textData = collect(textStream);
    assert.ok(textData.includes("Evaluation incomplete after 5 turns"));
  });

  test("supervised mode only shows source label on change", async () => {
    const { textStream, writer } = createWriter("supervised");

    const events = ["Step 1", "Step 2"].map((text) =>
      JSON.stringify({ source: "agent", turn: 0, event: textEvent(text) }),
    );

    await writeLines(writer, events);

    const textData = collect(textStream);
    // [agent] label should appear only once
    const agentLabels = textData.split("[agent]").length - 1;
    assert.strictEqual(agentLabels, 1);
  });

  test("handles partial lines across chunks", async () => {
    const { textStream, writer } = createWriter("raw");

    const fullLine = JSON.stringify(textEvent("Split message"));

    // Split the line across two chunks
    const mid = Math.floor(fullLine.length / 2);
    writer.write(fullLine.slice(0, mid));
    writer.write(`${fullLine.slice(mid)}\n`);
    await new Promise((resolve) => writer.end(resolve));

    const textData = collect(textStream);
    assert.ok(textData.includes("Split message"));
  });

  test("truncates long tool input", async () => {
    const { textStream, writer } = createWriter("raw");

    const longInput = { command: "x".repeat(300) };
    const event = JSON.stringify(toolEvent(longInput));

    await writeLines(writer, [event]);

    const textData = collect(textStream);
    assert.ok(textData.includes("> Tool: Bash"));
    assert.ok(textData.includes("..."));
    // Truncated to ~200 chars
    const toolLine = textData.split("\n").find((l) => l.startsWith("> Tool:"));
    // Fail as an assertion (not a TypeError on `.length`) if no line starts
    // with the tool marker.
    assert.ok(toolLine, 'Expected a line starting with "> Tool:"');
    assert.ok(toolLine.length < 250);
  });

  test("defaults to raw mode", () => {
    const writer = new TeeWriter({
      fileStream: new PassThrough(),
      textStream: new PassThrough(),
    });
    assert.strictEqual(writer.mode, "raw");
  });

  test("createTeeWriter factory returns a TeeWriter instance", () => {
    const writer = createTeeWriter({
      fileStream: new PassThrough(),
      textStream: new PassThrough(),
    });
    assert.ok(writer instanceof TeeWriter);
  });
});