@forwardimpact/libeval 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
1
+ /**
2
+ * TeeWriter — a Writable stream that writes raw NDJSON to a file while
3
+ * simultaneously streaming human-readable text to a separate stream (e.g.
4
+ * process.stdout).
5
+ *
6
+ * Supports two modes:
7
+ * - "raw" (default): expects standard stream-json events from AgentRunner
8
+ * - "supervised": expects tagged events {source, turn, event} from Supervisor
9
+ *
10
+ * Follows OO+DI: constructor injection, factory function, tests bypass factory.
11
+ */
12
+
13
+ import { Writable } from "node:stream";
14
+ import { TraceCollector } from "./trace-collector.js";
15
+
16
export class TeeWriter extends Writable {
  /**
   * @param {object} deps
   * @param {import("stream").Writable} deps.fileStream - Stream to write raw NDJSON to
   * @param {import("stream").Writable} deps.textStream - Stream to write human-readable text to
   * @param {"raw"|"supervised"} [deps.mode] - Event format: "raw" or "supervised" (default: "raw")
   * @throws {Error} When fileStream or textStream is missing
   */
  constructor({ fileStream, textStream, mode }) {
    super();
    if (!fileStream) throw new Error("fileStream is required");
    if (!textStream) throw new Error("textStream is required");
    this.fileStream = fileStream;
    this.textStream = textStream;
    this.mode = mode ?? "raw";
    this.collector = new TraceCollector();
    // Index of the next collector turn that has not yet been rendered as text.
    this.turnsEmitted = 0;
    // Last source label printed in supervised mode; avoids repeating headers.
    this.lastSource = null;
    // Trailing text of the previous chunk that has not yet seen its newline.
    this.partial = "";
    // Streaming decoder so a multi-byte UTF-8 character split across two
    // chunks is reassembled instead of being turned into U+FFFD twice.
    // ignoreBOM keeps a BOM in the output, as chunk.toString() would.
    this.decoder = new TextDecoder("utf-8", { ignoreBOM: true });
  }

  /**
   * Split incoming data into lines, forward each non-blank line to the file
   * stream, and process it for text output. An incomplete trailing line is
   * held back until the next chunk (or `_final`).
   * @param {Buffer|string} chunk
   * @param {string} encoding
   * @param {function} callback
   */
  _write(chunk, encoding, callback) {
    const text =
      typeof chunk === "string"
        ? chunk
        : this.decoder.decode(chunk, { stream: true });
    const lines = (this.partial + text).split("\n");
    this.partial = lines.pop() ?? "";

    for (const line of lines) {
      if (!line.trim()) continue;
      this.fileStream.write(line + "\n");
      this.processLine(line);
    }
    callback();
  }

  /**
   * Flush any unterminated last line, then (raw mode only) write the tail of
   * the collector's text rendering starting at its last "\n---" marker.
   * @param {function} callback
   */
  _final(callback) {
    // Drain bytes the streaming decoder may still be holding.
    this.partial += this.decoder.decode();

    if (this.partial.trim()) {
      this.fileStream.write(this.partial + "\n");
      this.processLine(this.partial);
    }

    if (this.mode === "raw" && this.collector.result) {
      const text = this.collector.toText();
      // NOTE(review): presumably the "---" section is the collector's result
      // summary; only that trailing section is echoed here.
      const idx = text.lastIndexOf("\n---");
      if (idx !== -1) {
        this.textStream.write(text.slice(idx) + "\n");
      }
    }

    callback();
  }

  /**
   * Process a single NDJSON line — feed to collector and flush text.
   * @param {string} line
   */
  processLine(line) {
    if (this.mode === "supervised") {
      this.processSupervisedLine(line);
    } else {
      this.collector.addLine(line);
      this.flushTurns();
    }
  }

  /**
   * Handle a tagged supervisor line: unwrap event, show source labels.
   * Lines that are not valid JSON, or whose JSON value is not an object,
   * are ignored for text output (they are still written to the file stream
   * by the caller).
   * @param {string} line
   */
  processSupervisedLine(line) {
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      return;
    }

    // JSON.parse accepts bare values such as `null`; reading properties on
    // null would throw, so treat every non-object payload like a bad line.
    if (parsed === null || typeof parsed !== "object") return;

    if (parsed.source === "orchestrator" && parsed.type === "summary") {
      const status = parsed.success ? "completed" : "incomplete";
      this.textStream.write(
        `\n--- Evaluation ${status} after ${parsed.turns} turns ---\n`,
      );
      return;
    }

    if (parsed.event) {
      if (parsed.source && parsed.source !== this.lastSource) {
        this.lastSource = parsed.source;
        this.textStream.write(`\n[${parsed.source}]\n`);
      }
      // The collector consumes NDJSON strings, so re-serialize the inner event.
      this.collector.addLine(JSON.stringify(parsed.event));
      this.flushTurns();
    }
  }

  /**
   * Emit text for any new turns accumulated by the collector. Only assistant
   * turns produce output: text blocks are written verbatim and tool_use
   * blocks as a one-line "> Tool:" summary.
   */
  flushTurns() {
    const turns = this.collector.turns;
    while (this.turnsEmitted < turns.length) {
      const turn = turns[this.turnsEmitted++];
      if (turn.role !== "assistant") continue;
      for (const block of turn.content) {
        if (block.type === "text") {
          this.textStream.write(block.text + "\n");
        } else if (block.type === "tool_use") {
          const input = summarizeInput(block.input);
          this.textStream.write(`> Tool: ${block.name} ${input}\n`);
        }
      }
    }
  }
}
137
+
138
/**
 * Summarize tool input for text display, truncated to keep logs readable.
 *
 * Non-object (or null/undefined) input yields an empty string. Otherwise the
 * input is serialized with JSON.stringify; serializations longer than 200
 * UTF-16 code units are cut and suffixed with "...". The cut never falls
 * between the two halves of a surrogate pair, so the summary cannot end in a
 * lone high surrogate.
 * @param {object} input - Tool input object
 * @returns {string} Truncated summary
 */
function summarizeInput(input) {
  if (!input || typeof input !== "object") return "";
  const json = JSON.stringify(input);
  if (json.length <= 200) return json;

  let end = 197;
  const lastUnit = json.charCodeAt(end - 1);
  // 0xD800–0xDBFF is the high-surrogate range; its partner would be cut off.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  return json.slice(0, end) + "...";
}
149
+
150
/**
 * Factory for TeeWriter: forwards the dependency bag unchanged to the
 * constructor and hands back the new instance.
 * @param {object} deps - Same as TeeWriter constructor
 * @returns {TeeWriter}
 */
export function createTeeWriter(deps) {
  const writer = new TeeWriter(deps);
  return writer;
}
@@ -0,0 +1,292 @@
1
+ import { describe, test } from "node:test";
2
+ import assert from "node:assert";
3
+ import { PassThrough } from "node:stream";
4
+
5
+ import { AgentRunner, createAgentRunner } from "@forwardimpact/libeval";
6
+
7
/**
 * Build a stand-in for the query function: an async generator function that
 * replays the supplied messages in order. When a capture callback is given,
 * it receives the call parameters once iteration begins.
 * @param {object[]} messages - Messages to yield
 * @param {function} [captureOptions] - Callback to capture query options
 * @returns {function}
 */
function mockQuery(messages, captureOptions) {
  async function* replay(params) {
    if (captureOptions) {
      captureOptions(params);
    }
    for (const message of messages) {
      yield message;
    }
  }
  return replay;
}
21
+
22
/**
 * Read whatever is currently buffered in a PassThrough stream and return it
 * as an array of non-empty lines. Returns an empty array when nothing is
 * buffered.
 * @param {PassThrough} stream
 * @returns {string[]}
 */
function collectLines(stream) {
  const buffered = stream.read();
  if (!buffered) {
    return [];
  }
  const text = buffered.toString().trim();
  const pieces = text.split("\n");
  return pieces.filter((piece) => piece.length > 0);
}
36
+
37
describe("AgentRunner", () => {
  /**
   * Construct an AgentRunner rooted at /tmp with a fresh PassThrough output.
   * Extra options are spread last so individual tests can override defaults.
   * @param {function} query - Query function handed to the runner
   * @param {object} [extra] - Additional or overriding constructor options
   * @returns {{runner: AgentRunner, output: PassThrough}}
   */
  const buildRunner = (query, extra = {}) => {
    const output = new PassThrough();
    const runner = new AgentRunner({ cwd: "/tmp", query, output, ...extra });
    return { runner, output };
  };

  // --- Constructor validation -------------------------------------------

  test("constructor throws on missing cwd", () => {
    const construct = () =>
      new AgentRunner({
        query: async function* () {},
        output: new PassThrough(),
      });
    assert.throws(construct, /cwd is required/);
  });

  test("constructor throws on missing query", () => {
    const construct = () =>
      new AgentRunner({ cwd: "/tmp", output: new PassThrough() });
    assert.throws(construct, /query is required/);
  });

  test("constructor throws on missing output", () => {
    const construct = () =>
      new AgentRunner({
        cwd: "/tmp",
        query: async function* () {},
      });
    assert.throws(construct, /output is required/);
  });

  test("constructor uses defaults for optional params", () => {
    const { runner } = buildRunner(async function* () {});
    const expectedTools = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];

    assert.strictEqual(runner.model, "opus");
    assert.strictEqual(runner.maxTurns, 50);
    assert.deepStrictEqual(runner.allowedTools, expectedTools);
    assert.strictEqual(runner.permissionMode, "bypassPermissions");
    assert.strictEqual(runner.sessionId, null);
  });

  // --- run() ------------------------------------------------------------

  test("run() writes NDJSON lines to output stream", async () => {
    const messages = [
      { type: "system", subtype: "init", session_id: "sess-1" },
      { type: "assistant", content: "Working on it..." },
      { type: "result", subtype: "success", result: "Done." },
    ];
    const { runner, output } = buildRunner(mockQuery(messages));

    const result = await runner.run("Test task");
    const lines = collectLines(output);

    assert.strictEqual(lines.length, 3);
    for (let i = 0; i < 3; i++) {
      assert.deepStrictEqual(JSON.parse(lines[i]), messages[i]);
    }
    assert.strictEqual(result.success, true);
    assert.strictEqual(result.text, "Done.");
    assert.strictEqual(result.sessionId, "sess-1");
  });

  test("run() captures sessionId from init event", async () => {
    const { runner } = buildRunner(
      mockQuery([
        { type: "system", subtype: "init", session_id: "my-session" },
        { type: "result", subtype: "success", result: "OK" },
      ]),
    );

    await runner.run("Task");

    assert.strictEqual(runner.sessionId, "my-session");
  });

  test("run() passes options to query", async () => {
    let captured = null;
    const remember = (params) => {
      captured = params;
    };
    const query = mockQuery(
      [{ type: "result", subtype: "success", result: "OK" }],
      remember,
    );
    const { runner } = buildRunner(query, {
      cwd: "/work",
      model: "sonnet",
      maxTurns: 10,
      allowedTools: ["Read", "Grep"],
      permissionMode: "plan",
    });

    await runner.run("My task");

    assert.strictEqual(captured.prompt, "My task");
    const { options } = captured;
    assert.strictEqual(options.cwd, "/work");
    assert.strictEqual(options.model, "sonnet");
    assert.strictEqual(options.maxTurns, 10);
    assert.deepStrictEqual(options.allowedTools, ["Read", "Grep"]);
    assert.strictEqual(options.permissionMode, "plan");
    assert.strictEqual(options.allowDangerouslySkipPermissions, true);
  });

  test("run() returns success=false on non-success subtype", async () => {
    const { runner } = buildRunner(
      mockQuery([{ type: "result", subtype: "error", result: "Stopped" }]),
    );

    const result = await runner.run("Task");

    assert.strictEqual(result.success, false);
    assert.strictEqual(result.text, "Stopped");
  });

  // --- resume() ---------------------------------------------------------

  test("resume() passes sessionId via options.resume", async () => {
    const firstCallMessages = [
      { type: "system", subtype: "init", session_id: "sess-42" },
      { type: "result", subtype: "success", result: "First done" },
    ];
    let resumeCapture = null;
    let calls = 0;

    // First invocation replays the init script; later ones record params.
    const query = async function* (params) {
      calls += 1;
      if (calls === 1) {
        for (const message of firstCallMessages) yield message;
        return;
      }
      resumeCapture = params;
      yield { type: "result", subtype: "success", result: "Resumed" };
    };
    const { runner } = buildRunner(query);

    await runner.run("Initial task");
    const result = await runner.resume("Follow up");

    assert.strictEqual(resumeCapture.options.resume, "sess-42");
    assert.strictEqual(resumeCapture.prompt, "Follow up");
    assert.strictEqual(result.success, true);
    assert.strictEqual(result.text, "Resumed");
  });

  // --- drainOutput() ----------------------------------------------------

  test("drainOutput() returns buffered lines and clears buffer", async () => {
    const messages = [
      { type: "assistant", content: "Line 1" },
      { type: "result", subtype: "success", result: "Line 2" },
    ];
    const { runner } = buildRunner(mockQuery(messages));

    await runner.run("Task");

    const drained = runner.drainOutput();
    assert.strictEqual(drained.length, 2);
    assert.deepStrictEqual(JSON.parse(drained[0]), messages[0]);
    assert.deepStrictEqual(JSON.parse(drained[1]), messages[1]);

    // A second drain must find nothing left.
    const secondDrain = runner.drainOutput();
    assert.strictEqual(secondDrain.length, 0);
  });

  // --- Error capture ----------------------------------------------------

  test("run() captures error when query throws and returns buffered output", async () => {
    async function* failingQuery() {
      yield { type: "system", subtype: "init", session_id: "sess-err" };
      yield { type: "assistant", content: "Partial work" };
      throw new Error("Claude Code process exited with code 1");
    }
    const { runner } = buildRunner(() => failingQuery());

    const result = await runner.run("Task");

    assert.strictEqual(result.success, false);
    assert.ok(result.error);
    assert.match(result.error.message, /exited with code 1/);
    assert.strictEqual(result.sessionId, "sess-err");

    // Messages yielded before the throw should still be buffered.
    const drained = runner.drainOutput();
    assert.strictEqual(drained.length, 2);
  });

  test("resume() captures error when query throws", async () => {
    const firstCallMessages = [
      { type: "system", subtype: "init", session_id: "sess-r" },
      { type: "result", subtype: "success", result: "OK" },
    ];
    let calls = 0;

    // First invocation succeeds; any later invocation fails mid-stream.
    const query = async function* () {
      calls += 1;
      if (calls === 1) {
        for (const message of firstCallMessages) yield message;
        return;
      }
      yield { type: "assistant", content: "Resuming..." };
      throw new Error("Process crashed");
    };
    const { runner } = buildRunner(query);

    await runner.run("Task");
    const result = await runner.resume("Continue");

    assert.strictEqual(result.success, false);
    assert.ok(result.error);
    assert.match(result.error.message, /Process crashed/);
  });

  // --- Factory ----------------------------------------------------------

  test("createAgentRunner factory returns an AgentRunner instance", () => {
    const deps = {
      cwd: "/tmp",
      query: async function* () {},
      output: new PassThrough(),
    };
    const runner = createAgentRunner(deps);
    assert.ok(runner instanceof AgentRunner);
  });
});