@forwardimpact/libeval 0.1.9 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,271 +0,0 @@
1
- import { describe, test } from "node:test";
2
- import assert from "node:assert";
3
- import { PassThrough } from "node:stream";
4
-
5
- import { AgentRunner } from "@forwardimpact/libeval";
6
-
7
- /**
8
- * Create a mock query function that yields canned messages.
9
- * @param {object[]} messages - Messages to yield
10
- * @returns {function}
11
- */
12
- function mockQuery(messages) {
13
- return async function* () {
14
- for (const msg of messages) {
15
- yield msg;
16
- }
17
- };
18
- }
19
-
20
- const textBlock = (t) => ({
21
- type: "assistant",
22
- message: { content: [{ type: "text", text: t }] },
23
- });
24
-
25
- const toolOnly = (name) => ({
26
- type: "assistant",
27
- message: {
28
- content: [{ type: "tool_use", id: "tu_" + name, name, input: {} }],
29
- },
30
- });
31
-
32
- describe("AgentRunner - onBatch batching", () => {
33
- test("batchSize defaults to 3", () => {
34
- const runner = new AgentRunner({
35
- cwd: "/tmp",
36
- query: async function* () {},
37
- output: new PassThrough(),
38
- });
39
- assert.strictEqual(runner.batchSize, 3);
40
- });
41
-
42
- test("onBatch fires every 3 assistant text-block messages by default", async () => {
43
- // 5 text-block messages + terminal result. With the default batchSize
44
- // of 3, onBatch should fire on the 3rd text message and again on the
45
- // terminal result (flushing the remaining 2).
46
- const messages = [
47
- { type: "system", subtype: "init", session_id: "sess-batch" },
48
- textBlock("one"),
49
- textBlock("two"),
50
- textBlock("three"),
51
- textBlock("four"),
52
- textBlock("five"),
53
- { type: "result", subtype: "success", result: "Done." },
54
- ];
55
-
56
- const batches = [];
57
- const runner = new AgentRunner({
58
- cwd: "/tmp",
59
- query: mockQuery(messages),
60
- output: new PassThrough(),
61
- });
62
- runner.onBatch = async (lines) => {
63
- batches.push(lines.map((l) => JSON.parse(l)));
64
- };
65
-
66
- await runner.run("Task");
67
-
68
- // First flush carries init + first 3 text messages; second carries
69
- // remaining 2 text messages + the result.
70
- assert.strictEqual(batches.length, 2);
71
- assert.strictEqual(batches[0].length, 4);
72
- assert.strictEqual(batches[1].length, 3);
73
- });
74
-
75
- test("onBatch honours custom batchSize", async () => {
76
- // batchSize = 2: 4 text messages produce 2 flushes; result adds a 3rd.
77
- const messages = [
78
- textBlock("a"),
79
- textBlock("b"),
80
- textBlock("c"),
81
- textBlock("d"),
82
- { type: "result", subtype: "success", result: "Done." },
83
- ];
84
-
85
- const batches = [];
86
- const runner = new AgentRunner({
87
- cwd: "/tmp",
88
- query: mockQuery(messages),
89
- output: new PassThrough(),
90
- batchSize: 2,
91
- });
92
- runner.onBatch = async (lines) => {
93
- batches.push(lines.length);
94
- };
95
-
96
- await runner.run("Task");
97
-
98
- assert.deepStrictEqual(batches, [2, 2, 1]);
99
- });
100
-
101
- test("tool-only assistant messages ride along in the next flush", async () => {
102
- // Tool-only assistant messages accumulate without incrementing the
103
- // counter. The supervisor sees the preceding tool calls when the
104
- // flush eventually fires.
105
- const messages = [
106
- toolOnly("Read"),
107
- toolOnly("Grep"),
108
- textBlock("found it"),
109
- { type: "result", subtype: "success", result: "Done." },
110
- ];
111
-
112
- const batches = [];
113
- const runner = new AgentRunner({
114
- cwd: "/tmp",
115
- query: mockQuery(messages),
116
- output: new PassThrough(),
117
- batchSize: 1,
118
- });
119
- runner.onBatch = async (lines) => {
120
- batches.push(lines.map((l) => JSON.parse(l)));
121
- };
122
-
123
- await runner.run("Task");
124
-
125
- // First flush triggered by the single text-block message; it carries
126
- // the two preceding tool-only messages with it.
127
- assert.strictEqual(batches.length, 2);
128
- assert.strictEqual(batches[0].length, 3);
129
- assert.strictEqual(batches[0][0].message.content[0].type, "tool_use");
130
- assert.strictEqual(batches[0][1].message.content[0].type, "tool_use");
131
- assert.strictEqual(batches[0][2].message.content[0].type, "text");
132
- assert.strictEqual(batches[1].length, 1);
133
- assert.strictEqual(batches[1][0].type, "result");
134
- });
135
-
136
- test("terminal result always flushes even if batchSize not yet reached", async () => {
137
- // 1 text-block + result, batchSize = 5. The counter only reaches 1
138
- // but the terminal result must still flush.
139
- const messages = [
140
- textBlock("only one"),
141
- { type: "result", subtype: "success", result: "Done." },
142
- ];
143
-
144
- const batches = [];
145
- const runner = new AgentRunner({
146
- cwd: "/tmp",
147
- query: mockQuery(messages),
148
- output: new PassThrough(),
149
- batchSize: 5,
150
- });
151
- runner.onBatch = async (lines) => {
152
- batches.push(lines.length);
153
- };
154
-
155
- await runner.run("Task");
156
-
157
- assert.deepStrictEqual(batches, [2]);
158
- });
159
- });
160
-
161
- describe("AgentRunner - terminal flush on abnormal end", () => {
162
- test("iterator crash before a flush boundary still delivers the pending batch", async () => {
163
- // batchSize = 3: the first two text messages accumulate without
164
- // flushing. The iterator then throws before the threshold — the
165
- // pending batch must ship in a terminal flush.
166
- async function* crashingQuery() {
167
- yield { type: "system", subtype: "init", session_id: "sess-crash" };
168
- yield textBlock("step 1");
169
- yield textBlock("step 2");
170
- throw new Error("Claude Code process exited with code 1");
171
- }
172
-
173
- const batches = [];
174
- const runner = new AgentRunner({
175
- cwd: "/tmp",
176
- query: () => crashingQuery(),
177
- output: new PassThrough(),
178
- });
179
- runner.onBatch = async (lines) => {
180
- batches.push(lines.map((l) => JSON.parse(l)));
181
- };
182
-
183
- const result = await runner.run("Task");
184
-
185
- assert.ok(result.error);
186
- assert.match(result.error.message, /exited with code 1/);
187
- assert.strictEqual(batches.length, 1);
188
- assert.strictEqual(batches[0].length, 3);
189
- assert.strictEqual(batches[0][0].type, "system");
190
- assert.strictEqual(batches[0][1].type, "assistant");
191
- assert.strictEqual(batches[0][2].type, "assistant");
192
- });
193
-
194
- test("iterator crash after a completed batch does not re-flush", async () => {
195
- // batchSize = 2: two text messages trigger a normal flush, emptying
196
- // the pending batch. The iterator then throws with nothing pending —
197
- // the terminal flush must be a no-op, not an empty call.
198
- async function* crashingQuery() {
199
- yield textBlock("a");
200
- yield textBlock("b");
201
- throw new Error("boom");
202
- }
203
-
204
- const batches = [];
205
- const runner = new AgentRunner({
206
- cwd: "/tmp",
207
- query: () => crashingQuery(),
208
- output: new PassThrough(),
209
- batchSize: 2,
210
- });
211
- runner.onBatch = async (lines) => {
212
- batches.push(lines.length);
213
- };
214
-
215
- const result = await runner.run("Task");
216
- assert.ok(result.error);
217
- assert.match(result.error.message, /boom/);
218
- assert.deepStrictEqual(batches, [2]);
219
- });
220
-
221
- test("natural-end iterator without a result does not trigger terminal flush", async () => {
222
- // The real SDK always terminates with `result`. A mock that ends
223
- // naturally with pending lines is treated as an incomplete stub —
224
- // no phantom flush, since nothing about a natural end warrants a
225
- // new mid-turn review.
226
- async function* noResultQuery() {
227
- yield textBlock("one");
228
- yield textBlock("two");
229
- // No result, no error — just ends.
230
- }
231
-
232
- const batches = [];
233
- const runner = new AgentRunner({
234
- cwd: "/tmp",
235
- query: () => noResultQuery(),
236
- output: new PassThrough(),
237
- batchSize: 3,
238
- });
239
- runner.onBatch = async (lines) => {
240
- batches.push(lines.length);
241
- };
242
-
243
- const result = await runner.run("Task");
244
- assert.strictEqual(result.error, null);
245
- assert.strictEqual(batches.length, 0);
246
- });
247
-
248
- test("onBatch throw during terminal flush does not mask an earlier error", async () => {
249
- // The iterator threw first; the terminal flush also throws. The
250
- // original iterator error must win — it is the more actionable
251
- // condition to surface to the caller.
252
- async function* crashingQuery() {
253
- yield textBlock("partial");
254
- throw new Error("original failure");
255
- }
256
-
257
- const runner = new AgentRunner({
258
- cwd: "/tmp",
259
- query: () => crashingQuery(),
260
- output: new PassThrough(),
261
- batchSize: 3,
262
- });
263
- runner.onBatch = async () => {
264
- throw new Error("flush failure");
265
- };
266
-
267
- const result = await runner.run("Task");
268
- assert.ok(result.error);
269
- assert.match(result.error.message, /original failure/);
270
- });
271
- });
@@ -1,317 +0,0 @@
1
- import { describe, test } from "node:test";
2
- import assert from "node:assert";
3
- import { PassThrough } from "node:stream";
4
-
5
- import { AgentRunner, createAgentRunner } from "@forwardimpact/libeval";
6
-
7
- /**
8
- * Create a mock query function that yields canned messages.
9
- * @param {object[]} messages - Messages to yield
10
- * @param {function} [captureOptions] - Callback to capture query options
11
- * @returns {function}
12
- */
13
- function mockQuery(messages, captureOptions) {
14
- return async function* (params) {
15
- if (captureOptions) captureOptions(params);
16
- for (const msg of messages) {
17
- yield msg;
18
- }
19
- };
20
- }
21
-
22
- /**
23
- * Collect all NDJSON lines written to a PassThrough stream.
24
- * @param {PassThrough} stream
25
- * @returns {string[]}
26
- */
27
- function collectLines(stream) {
28
- const data = stream.read();
29
- if (!data) return [];
30
- return data
31
- .toString()
32
- .trim()
33
- .split("\n")
34
- .filter((l) => l.length > 0);
35
- }
36
-
37
- describe("AgentRunner", () => {
38
- test("constructor throws on missing cwd", () => {
39
- assert.throws(
40
- () =>
41
- new AgentRunner({
42
- query: async function* () {},
43
- output: new PassThrough(),
44
- }),
45
- /cwd is required/,
46
- );
47
- });
48
-
49
- test("constructor throws on missing query", () => {
50
- assert.throws(
51
- () => new AgentRunner({ cwd: "/tmp", output: new PassThrough() }),
52
- /query is required/,
53
- );
54
- });
55
-
56
- test("constructor throws on missing output", () => {
57
- assert.throws(
58
- () =>
59
- new AgentRunner({
60
- cwd: "/tmp",
61
- query: async function* () {},
62
- }),
63
- /output is required/,
64
- );
65
- });
66
-
67
- test("constructor uses defaults for optional params", () => {
68
- const runner = new AgentRunner({
69
- cwd: "/tmp",
70
- query: async function* () {},
71
- output: new PassThrough(),
72
- });
73
- assert.strictEqual(runner.model, "opus");
74
- assert.strictEqual(runner.maxTurns, 50);
75
- assert.deepStrictEqual(runner.allowedTools, [
76
- "Bash",
77
- "Read",
78
- "Glob",
79
- "Grep",
80
- "Write",
81
- "Edit",
82
- ]);
83
- assert.strictEqual(runner.permissionMode, "bypassPermissions");
84
- assert.deepStrictEqual(runner.settingSources, []);
85
- assert.strictEqual(runner.sessionId, null);
86
- });
87
-
88
- test("run() writes NDJSON lines to output stream", async () => {
89
- const messages = [
90
- { type: "system", subtype: "init", session_id: "sess-1" },
91
- { type: "assistant", content: "Working on it..." },
92
- { type: "result", subtype: "success", result: "Done." },
93
- ];
94
-
95
- const output = new PassThrough();
96
- const runner = new AgentRunner({
97
- cwd: "/tmp",
98
- query: mockQuery(messages),
99
- output,
100
- });
101
-
102
- const result = await runner.run("Test task");
103
- const lines = collectLines(output);
104
-
105
- assert.strictEqual(lines.length, 3);
106
- assert.deepStrictEqual(JSON.parse(lines[0]), messages[0]);
107
- assert.deepStrictEqual(JSON.parse(lines[1]), messages[1]);
108
- assert.deepStrictEqual(JSON.parse(lines[2]), messages[2]);
109
- assert.strictEqual(result.success, true);
110
- assert.strictEqual(result.text, "Done.");
111
- assert.strictEqual(result.sessionId, "sess-1");
112
- });
113
-
114
- test("run() captures sessionId from init event", async () => {
115
- const messages = [
116
- { type: "system", subtype: "init", session_id: "my-session" },
117
- { type: "result", subtype: "success", result: "OK" },
118
- ];
119
-
120
- const output = new PassThrough();
121
- const runner = new AgentRunner({
122
- cwd: "/tmp",
123
- query: mockQuery(messages),
124
- output,
125
- });
126
-
127
- await runner.run("Task");
128
- assert.strictEqual(runner.sessionId, "my-session");
129
- });
130
-
131
- test("run() passes options to query", async () => {
132
- let captured = null;
133
- const query = mockQuery(
134
- [{ type: "result", subtype: "success", result: "OK" }],
135
- (params) => {
136
- captured = params;
137
- },
138
- );
139
-
140
- const output = new PassThrough();
141
- const runner = new AgentRunner({
142
- cwd: "/work",
143
- query,
144
- output,
145
- model: "sonnet",
146
- maxTurns: 10,
147
- allowedTools: ["Read", "Grep"],
148
- permissionMode: "plan",
149
- settingSources: ["project"],
150
- });
151
-
152
- await runner.run("My task");
153
-
154
- assert.strictEqual(captured.prompt, "My task");
155
- assert.strictEqual(captured.options.cwd, "/work");
156
- assert.strictEqual(captured.options.model, "sonnet");
157
- assert.strictEqual(captured.options.maxTurns, 10);
158
- assert.deepStrictEqual(captured.options.allowedTools, ["Read", "Grep"]);
159
- assert.strictEqual(captured.options.permissionMode, "plan");
160
- assert.strictEqual(captured.options.allowDangerouslySkipPermissions, true);
161
- assert.deepStrictEqual(captured.options.settingSources, ["project"]);
162
- });
163
-
164
- test("run() returns success=false on non-success subtype", async () => {
165
- const messages = [{ type: "result", subtype: "error", result: "Stopped" }];
166
-
167
- const output = new PassThrough();
168
- const runner = new AgentRunner({
169
- cwd: "/tmp",
170
- query: mockQuery(messages),
171
- output,
172
- });
173
-
174
- const result = await runner.run("Task");
175
- assert.strictEqual(result.success, false);
176
- assert.strictEqual(result.text, "Stopped");
177
- });
178
-
179
- test("resume() passes sessionId via options.resume", async () => {
180
- let resumeCapture = null;
181
-
182
- const initMessages = [
183
- { type: "system", subtype: "init", session_id: "sess-42" },
184
- { type: "result", subtype: "success", result: "First done" },
185
- ];
186
-
187
- let callCount = 0;
188
- const query = async function* (params) {
189
- callCount++;
190
- if (callCount === 1) {
191
- for (const m of initMessages) yield m;
192
- } else {
193
- resumeCapture = params;
194
- yield { type: "result", subtype: "success", result: "Resumed" };
195
- }
196
- };
197
-
198
- const output = new PassThrough();
199
- const runner = new AgentRunner({ cwd: "/tmp", query, output });
200
-
201
- await runner.run("Initial task");
202
- const result = await runner.resume("Follow up");
203
-
204
- assert.strictEqual(resumeCapture.options.resume, "sess-42");
205
- assert.strictEqual(resumeCapture.prompt, "Follow up");
206
- assert.strictEqual(result.success, true);
207
- assert.strictEqual(result.text, "Resumed");
208
- });
209
-
210
- test("drainOutput() returns buffered lines and clears buffer", async () => {
211
- const messages = [
212
- { type: "assistant", content: "Line 1" },
213
- { type: "result", subtype: "success", result: "Line 2" },
214
- ];
215
-
216
- const output = new PassThrough();
217
- const runner = new AgentRunner({
218
- cwd: "/tmp",
219
- query: mockQuery(messages),
220
- output,
221
- });
222
-
223
- await runner.run("Task");
224
-
225
- const drained = runner.drainOutput();
226
- assert.strictEqual(drained.length, 2);
227
- assert.deepStrictEqual(JSON.parse(drained[0]), messages[0]);
228
- assert.deepStrictEqual(JSON.parse(drained[1]), messages[1]);
229
-
230
- // Buffer should be empty after drain
231
- const secondDrain = runner.drainOutput();
232
- assert.strictEqual(secondDrain.length, 0);
233
- });
234
-
235
- test("run() captures error when query throws and returns buffered output", async () => {
236
- async function* failingQuery() {
237
- yield { type: "system", subtype: "init", session_id: "sess-err" };
238
- yield { type: "assistant", content: "Partial work" };
239
- throw new Error("Claude Code process exited with code 1");
240
- }
241
-
242
- const output = new PassThrough();
243
- const runner = new AgentRunner({
244
- cwd: "/tmp",
245
- query: () => failingQuery(),
246
- output,
247
- });
248
-
249
- const result = await runner.run("Task");
250
- assert.strictEqual(result.success, false);
251
- assert.ok(result.error);
252
- assert.match(result.error.message, /exited with code 1/);
253
- assert.strictEqual(result.sessionId, "sess-err");
254
-
255
- // Buffered output should contain the messages yielded before the error
256
- const drained = runner.drainOutput();
257
- assert.strictEqual(drained.length, 2);
258
- });
259
-
260
- test("resume() captures error when query throws", async () => {
261
- const initMessages = [
262
- { type: "system", subtype: "init", session_id: "sess-r" },
263
- { type: "result", subtype: "success", result: "OK" },
264
- ];
265
-
266
- let callCount = 0;
267
- const query = async function* () {
268
- callCount++;
269
- if (callCount === 1) {
270
- for (const m of initMessages) yield m;
271
- } else {
272
- yield { type: "assistant", content: "Resuming..." };
273
- throw new Error("Process crashed");
274
- }
275
- };
276
-
277
- const output = new PassThrough();
278
- const runner = new AgentRunner({ cwd: "/tmp", query, output });
279
-
280
- await runner.run("Task");
281
- const result = await runner.resume("Continue");
282
- assert.strictEqual(result.success, false);
283
- assert.ok(result.error);
284
- assert.match(result.error.message, /Process crashed/);
285
- });
286
-
287
- test("run() succeeds when SDK throws after emitting successful result", async () => {
288
- async function* creditExhaustedQuery() {
289
- yield { type: "system", subtype: "init", session_id: "sess-credit" };
290
- yield { type: "assistant", content: "Analysis complete." };
291
- yield { type: "result", subtype: "success", result: "Done." };
292
- throw new Error("Credit balance is too low");
293
- }
294
-
295
- const output = new PassThrough();
296
- const runner = new AgentRunner({
297
- cwd: "/tmp",
298
- query: () => creditExhaustedQuery(),
299
- output,
300
- });
301
-
302
- const result = await runner.run("Task");
303
- assert.strictEqual(result.success, true);
304
- assert.strictEqual(result.text, "Done.");
305
- assert.ok(result.error);
306
- assert.match(result.error.message, /Credit balance/);
307
- });
308
-
309
- test("createAgentRunner factory returns an AgentRunner instance", () => {
310
- const runner = createAgentRunner({
311
- cwd: "/tmp",
312
- query: async function* () {},
313
- output: new PassThrough(),
314
- });
315
- assert.ok(runner instanceof AgentRunner);
316
- });
317
- });
@@ -1,7 +0,0 @@
1
- {"type":"system","subtype":"init","cwd":"/home/user/monorepo","session_id":"abc-123","tools":["Bash","Read","Glob","Grep","Write","Edit"],"mcp_servers":[],"model":"claude-opus-4-6","permissionMode":"default","claude_code_version":"2.1.87","uuid":"evt-1"}
2
- {"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01","type":"message","role":"assistant","content":[{"type":"text","text":"I'll start by checking the repository structure."}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":100,"cache_creation_input_tokens":500,"cache_read_input_tokens":200,"output_tokens":15,"service_tier":"standard"}},"session_id":"abc-123","uuid":"evt-2"}
3
- {"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_01","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01","name":"Bash","input":{"command":"ls -la"}}],"stop_reason":null,"stop_sequence":null,"usage":{"input_tokens":100,"cache_creation_input_tokens":500,"cache_read_input_tokens":200,"output_tokens":15,"service_tier":"standard"}},"session_id":"abc-123","uuid":"evt-3"}
4
- {"type":"user","message":{"role":"user","content":[{"tool_use_id":"toolu_01","type":"tool_result","content":"total 42\ndrwxr-xr-x 5 user user 4096 Mar 29 12:00 .\ndrwxr-xr-x 3 user user 4096 Mar 29 12:00 .."}]},"uuid":"evt-4","session_id":"abc-123"}
5
- {"type":"assistant","message":{"model":"claude-opus-4-6","id":"msg_02","type":"message","role":"assistant","content":[{"type":"text","text":"The repository looks good. No security issues found."}],"stop_reason":"end_turn","stop_sequence":null,"usage":{"input_tokens":150,"cache_creation_input_tokens":0,"cache_read_input_tokens":700,"output_tokens":12,"service_tier":"standard"}},"session_id":"abc-123","uuid":"evt-5"}
6
- {"type":"rate_limit_event","rate_limit_info":{"status":"allowed"},"uuid":"evt-6","session_id":"abc-123"}
7
- {"type":"result","subtype":"success","is_error":false,"duration_ms":5200,"duration_api_ms":4800,"num_turns":3,"result":"The repository looks good. No security issues found.","stop_reason":"end_turn","session_id":"abc-123","total_cost_usd":0.0523,"usage":{"input_tokens":350,"cache_creation_input_tokens":500,"cache_read_input_tokens":900,"output_tokens":42,"service_tier":"standard"},"modelUsage":{"claude-opus-4-6":{"inputTokens":350,"outputTokens":42,"cacheReadInputTokens":900,"cacheCreationInputTokens":500,"costUSD":0.0523}},"uuid":"evt-7"}