@forwardimpact/libeval 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-eval.js +26 -1
- package/index.js +3 -0
- package/package.json +6 -3
- package/src/agent-runner.js +154 -0
- package/src/commands/run.js +76 -0
- package/src/commands/supervise.js +86 -0
- package/src/commands/tee.js +13 -75
- package/src/supervisor.js +186 -0
- package/src/tee-writer.js +157 -0
- package/test/agent-runner.test.js +317 -0
- package/test/supervisor.test.js +342 -0
- package/test/tee-writer.test.js +326 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TeeWriter — a Writable stream that writes raw NDJSON to a file while
|
|
3
|
+
* simultaneously streaming human-readable text to a separate stream (e.g.
|
|
4
|
+
* process.stdout).
|
|
5
|
+
*
|
|
6
|
+
* Supports two modes:
|
|
7
|
+
* - "raw" (default): expects standard stream-json events from AgentRunner
|
|
8
|
+
* - "supervised": expects tagged events {source, turn, event} from Supervisor
|
|
9
|
+
*
|
|
10
|
+
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { Writable } from "node:stream";
|
|
14
|
+
import { TraceCollector } from "./trace-collector.js";
|
|
15
|
+
|
|
16
|
+
/**
 * Writable stream that tees NDJSON input: every non-blank line is written
 * verbatim (newline-terminated) to `fileStream`, while a human-readable
 * rendering of assistant turns is written to `textStream`.
 *
 * Incoming bytes are decoded with a streaming UTF-8 decoder, so a multi-byte
 * character that straddles two chunks is reassembled instead of being turned
 * into replacement characters on both sides of the chunk boundary.
 */
export class TeeWriter extends Writable {
  /**
   * @param {object} deps
   * @param {import("stream").Writable} deps.fileStream - Stream to write raw NDJSON to
   * @param {import("stream").Writable} deps.textStream - Stream to write human-readable text to
   * @param {"raw"|"supervised"} [deps.mode] - Event format: "raw" or "supervised" (default: "raw")
   * @throws {Error} If fileStream or textStream is missing
   */
  constructor({ fileStream, textStream, mode }) {
    super();
    if (!fileStream) throw new Error("fileStream is required");
    if (!textStream) throw new Error("textStream is required");
    this.fileStream = fileStream;
    this.textStream = textStream;
    this.mode = mode ?? "raw";
    this.collector = new TraceCollector();
    // Index of the next collector turn that has not been rendered yet.
    this.turnsEmitted = 0;
    // Last source label printed in supervised mode; avoids repeating headers.
    this.lastSource = null;
    // Trailing text of the previous chunk that has not seen its "\n" yet.
    this.partial = "";
    // Streaming decoder keeps incomplete UTF-8 sequences between chunks.
    // ignoreBOM: true leaves a leading BOM in place, like Buffer#toString().
    this.decoder = new TextDecoder("utf-8", { ignoreBOM: true });
  }

  /**
   * Split incoming data into lines, tee each complete non-blank line to the
   * file stream and feed it to the text renderer. The last (unterminated)
   * fragment is held back until more data or `_final` arrives.
   *
   * @param {Buffer|string} chunk
   * @param {string} encoding
   * @param {function} callback
   */
  _write(chunk, encoding, callback) {
    const text =
      typeof chunk === "string"
        ? chunk
        : this.decoder.decode(chunk, { stream: true });
    const lines = (this.partial + text).split("\n");
    this.partial = lines.pop() ?? "";

    for (const line of lines) {
      if (!line.trim()) continue;
      this.fileStream.write(line + "\n");
      this.processLine(line);
    }
    callback();
  }

  /**
   * Flush the held-back fragment as a final line and, in raw mode, write the
   * closing section of the collector's text rendering.
   *
   * @param {function} callback
   */
  _final(callback) {
    // Release any bytes of an incomplete trailing character still buffered.
    this.partial += this.decoder.decode();

    if (this.partial.trim()) {
      this.fileStream.write(this.partial + "\n");
      this.processLine(this.partial);
    }

    if (this.mode === "raw" && this.collector.result) {
      // Only the tail starting at the last "\n---" marker is emitted here;
      // presumably that is the result summary — confirm against TraceCollector.
      const text = this.collector.toText();
      const idx = text.lastIndexOf("\n---");
      if (idx !== -1) {
        this.textStream.write(text.slice(idx) + "\n");
      }
    }

    callback();
  }

  /**
   * Process a single NDJSON line — feed to collector and flush text.
   * @param {string} line
   */
  processLine(line) {
    if (this.mode === "supervised") {
      this.processSupervisedLine(line);
    } else {
      this.collector.addLine(line);
      this.flushTurns();
    }
  }

  /**
   * Handle a tagged supervisor line: unwrap event, show source labels.
   * Lines that are not valid JSON, or that parse to a non-object value such
   * as `null`, are ignored for text output (they are still teed to the file
   * by the caller).
   *
   * @param {string} line
   */
  processSupervisedLine(line) {
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Best-effort rendering: unparseable lines produce no text output.
      return;
    }

    // JSON.parse("null") yields null; property access below would throw.
    if (parsed === null || typeof parsed !== "object") return;

    if (parsed.source === "orchestrator" && parsed.type === "summary") {
      const status = parsed.success ? "completed" : "incomplete";
      this.textStream.write(
        `\n--- Evaluation ${status} after ${parsed.turns} turns ---\n`,
      );
      return;
    }

    if (parsed.event) {
      // Print a header only when the speaking source changes.
      if (parsed.source && parsed.source !== this.lastSource) {
        this.lastSource = parsed.source;
        this.textStream.write(`\n[${parsed.source}]\n`);
      }
      this.collector.addLine(JSON.stringify(parsed.event));
      this.flushTurns();
    }
  }

  /**
   * Emit text for any new turns accumulated by the collector.
   * Only assistant turns are rendered: text blocks verbatim, tool_use blocks
   * as a one-line "> Tool: name input" summary.
   */
  flushTurns() {
    const turns = this.collector.turns;
    while (this.turnsEmitted < turns.length) {
      const turn = turns[this.turnsEmitted++];
      if (turn.role === "assistant") {
        for (const block of turn.content) {
          if (block.type === "text") {
            this.textStream.write(block.text + "\n");
          } else if (block.type === "tool_use") {
            const input = summarizeInput(block.input);
            this.textStream.write(`> Tool: ${block.name} ${input}\n`);
          }
        }
      }
    }
  }
}
|
|
137
|
+
|
|
138
|
+
/**
 * Summarize tool input for text display, truncated to keep logs readable.
 *
 * The input is serialized with JSON.stringify. Results longer than 200
 * UTF-16 code units are cut to at most 197 and suffixed with "...". The cut
 * never lands between the two halves of a surrogate pair, so the summary
 * cannot end in a lone surrogate that would be written out as U+FFFD.
 *
 * @param {object} input - Tool input object
 * @returns {string} Truncated summary, or "" when input is not an object
 */
function summarizeInput(input) {
  if (!input || typeof input !== "object") return "";
  const json = JSON.stringify(input);
  if (json.length <= 200) return json;

  let end = 197;
  // If the last kept code unit is a high surrogate, its partner would be
  // sliced off; drop it as well so the output stays well-formed.
  const lastUnit = json.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  return json.slice(0, end) + "...";
}
|
|
149
|
+
|
|
150
|
+
/**
 * Factory for TeeWriter: forwards the dependency bag to the constructor
 * unchanged and hands back the new instance.
 *
 * @param {object} deps - Same shape as the TeeWriter constructor argument
 * @returns {TeeWriter} Newly constructed writer
 */
export function createTeeWriter(deps) {
  const writer = new TeeWriter(deps);
  return writer;
}
|
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
import { describe, test } from "node:test";
|
|
2
|
+
import assert from "node:assert";
|
|
3
|
+
import { PassThrough } from "node:stream";
|
|
4
|
+
|
|
5
|
+
import { AgentRunner, createAgentRunner } from "@forwardimpact/libeval";
|
|
6
|
+
|
|
7
|
+
/**
 * Build a fake query function for tests.
 *
 * The returned async generator function reports the params it was called
 * with to `captureOptions` (when one was supplied) and then yields each of
 * the canned messages in order.
 *
 * @param {object[]} messages - Messages to yield
 * @param {function} [captureOptions] - Callback to capture query options
 * @returns {function} Async generator function taking the query params
 */
function mockQuery(messages, captureOptions) {
  const shouldCapture = Boolean(captureOptions);
  return async function* (params) {
    if (shouldCapture) {
      captureOptions(params);
    }
    for (let index = 0; index < messages.length; index += 1) {
      yield messages[index];
    }
  };
}
|
|
21
|
+
|
|
22
|
+
/**
 * Read whatever is currently buffered in a PassThrough stream and return it
 * as an array of non-empty lines.
 *
 * @param {PassThrough} stream
 * @returns {string[]} Lines in write order; empty when nothing is buffered
 */
function collectLines(stream) {
  const buffered = stream.read();
  if (!buffered) {
    return [];
  }

  const text = buffered.toString().trim();
  const lines = [];
  for (const piece of text.split("\n")) {
    if (piece.length > 0) {
      lines.push(piece);
    }
  }
  return lines;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Unit tests for AgentRunner. Every case injects a fake `query` async
 * generator and a PassThrough `output`, so no real agent process is started.
 */
describe("AgentRunner", () => {
  // Query stub that yields nothing; enough for constructor-only cases.
  const emptyQuery = async function* () {};

  test("constructor throws on missing cwd", () => {
    const build = () =>
      new AgentRunner({ query: emptyQuery, output: new PassThrough() });
    assert.throws(build, /cwd is required/);
  });

  test("constructor throws on missing query", () => {
    const build = () =>
      new AgentRunner({ cwd: "/tmp", output: new PassThrough() });
    assert.throws(build, /query is required/);
  });

  test("constructor throws on missing output", () => {
    const build = () => new AgentRunner({ cwd: "/tmp", query: emptyQuery });
    assert.throws(build, /output is required/);
  });

  test("constructor uses defaults for optional params", () => {
    const runner = new AgentRunner({
      cwd: "/tmp",
      query: emptyQuery,
      output: new PassThrough(),
    });
    const expectedTools = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];

    assert.strictEqual(runner.model, "opus");
    assert.strictEqual(runner.maxTurns, 50);
    assert.deepStrictEqual(runner.allowedTools, expectedTools);
    assert.strictEqual(runner.permissionMode, "bypassPermissions");
    assert.deepStrictEqual(runner.settingSources, []);
    assert.strictEqual(runner.sessionId, null);
  });

  test("run() writes NDJSON lines to output stream", async () => {
    const events = [
      { type: "system", subtype: "init", session_id: "sess-1" },
      { type: "assistant", content: "Working on it..." },
      { type: "result", subtype: "success", result: "Done." },
    ];
    const sink = new PassThrough();
    const runner = new AgentRunner({
      cwd: "/tmp",
      query: mockQuery(events),
      output: sink,
    });

    const outcome = await runner.run("Test task");
    const written = collectLines(sink);

    // One NDJSON line per yielded message, in the same order.
    assert.strictEqual(written.length, 3);
    events.forEach((event, position) => {
      assert.deepStrictEqual(JSON.parse(written[position]), event);
    });
    assert.strictEqual(outcome.success, true);
    assert.strictEqual(outcome.text, "Done.");
    assert.strictEqual(outcome.sessionId, "sess-1");
  });

  test("run() captures sessionId from init event", async () => {
    const events = [
      { type: "system", subtype: "init", session_id: "my-session" },
      { type: "result", subtype: "success", result: "OK" },
    ];
    const runner = new AgentRunner({
      cwd: "/tmp",
      query: mockQuery(events),
      output: new PassThrough(),
    });

    await runner.run("Task");

    assert.strictEqual(runner.sessionId, "my-session");
  });

  test("run() passes options to query", async () => {
    const seen = { params: null };
    const query = mockQuery(
      [{ type: "result", subtype: "success", result: "OK" }],
      (params) => {
        seen.params = params;
      },
    );
    const runner = new AgentRunner({
      cwd: "/work",
      query,
      output: new PassThrough(),
      model: "sonnet",
      maxTurns: 10,
      allowedTools: ["Read", "Grep"],
      permissionMode: "plan",
      settingSources: ["project"],
    });

    await runner.run("My task");

    const { prompt, options } = seen.params;
    assert.strictEqual(prompt, "My task");
    assert.strictEqual(options.cwd, "/work");
    assert.strictEqual(options.model, "sonnet");
    assert.strictEqual(options.maxTurns, 10);
    assert.deepStrictEqual(options.allowedTools, ["Read", "Grep"]);
    assert.strictEqual(options.permissionMode, "plan");
    assert.strictEqual(options.allowDangerouslySkipPermissions, true);
    assert.deepStrictEqual(options.settingSources, ["project"]);
  });

  test("run() returns success=false on non-success subtype", async () => {
    const events = [{ type: "result", subtype: "error", result: "Stopped" }];
    const runner = new AgentRunner({
      cwd: "/tmp",
      query: mockQuery(events),
      output: new PassThrough(),
    });

    const outcome = await runner.run("Task");

    assert.strictEqual(outcome.success, false);
    assert.strictEqual(outcome.text, "Stopped");
  });

  test("resume() passes sessionId via options.resume", async () => {
    const firstRun = [
      { type: "system", subtype: "init", session_id: "sess-42" },
      { type: "result", subtype: "success", result: "First done" },
    ];
    let invocations = 0;
    let resumeParams = null;

    // First call replays the init run; any later call records its params.
    const query = async function* (params) {
      invocations += 1;
      if (invocations === 1) {
        for (const message of firstRun) yield message;
      } else {
        resumeParams = params;
        yield { type: "result", subtype: "success", result: "Resumed" };
      }
    };
    const runner = new AgentRunner({
      cwd: "/tmp",
      query,
      output: new PassThrough(),
    });

    await runner.run("Initial task");
    const outcome = await runner.resume("Follow up");

    assert.strictEqual(resumeParams.options.resume, "sess-42");
    assert.strictEqual(resumeParams.prompt, "Follow up");
    assert.strictEqual(outcome.success, true);
    assert.strictEqual(outcome.text, "Resumed");
  });

  test("drainOutput() returns buffered lines and clears buffer", async () => {
    const events = [
      { type: "assistant", content: "Line 1" },
      { type: "result", subtype: "success", result: "Line 2" },
    ];
    const runner = new AgentRunner({
      cwd: "/tmp",
      query: mockQuery(events),
      output: new PassThrough(),
    });

    await runner.run("Task");

    const firstDrain = runner.drainOutput();
    assert.strictEqual(firstDrain.length, 2);
    assert.deepStrictEqual(JSON.parse(firstDrain[0]), events[0]);
    assert.deepStrictEqual(JSON.parse(firstDrain[1]), events[1]);

    // A second drain must find nothing left.
    const secondDrain = runner.drainOutput();
    assert.strictEqual(secondDrain.length, 0);
  });

  test("run() captures error when query throws and returns buffered output", async () => {
    async function* failingQuery() {
      yield { type: "system", subtype: "init", session_id: "sess-err" };
      yield { type: "assistant", content: "Partial work" };
      throw new Error("Claude Code process exited with code 1");
    }
    const runner = new AgentRunner({
      cwd: "/tmp",
      query: () => failingQuery(),
      output: new PassThrough(),
    });

    const outcome = await runner.run("Task");

    assert.strictEqual(outcome.success, false);
    assert.ok(outcome.error);
    assert.match(outcome.error.message, /exited with code 1/);
    assert.strictEqual(outcome.sessionId, "sess-err");

    // Both messages yielded before the throw must still be buffered.
    const buffered = runner.drainOutput();
    assert.strictEqual(buffered.length, 2);
  });

  test("resume() captures error when query throws", async () => {
    const firstRun = [
      { type: "system", subtype: "init", session_id: "sess-r" },
      { type: "result", subtype: "success", result: "OK" },
    ];
    let invocations = 0;

    // First call succeeds; every later call fails after one message.
    const query = async function* () {
      invocations += 1;
      if (invocations === 1) {
        for (const message of firstRun) yield message;
      } else {
        yield { type: "assistant", content: "Resuming..." };
        throw new Error("Process crashed");
      }
    };
    const runner = new AgentRunner({
      cwd: "/tmp",
      query,
      output: new PassThrough(),
    });

    await runner.run("Task");
    const outcome = await runner.resume("Continue");

    assert.strictEqual(outcome.success, false);
    assert.ok(outcome.error);
    assert.match(outcome.error.message, /Process crashed/);
  });

  test("run() succeeds when SDK throws after emitting successful result", async () => {
    async function* creditExhaustedQuery() {
      yield { type: "system", subtype: "init", session_id: "sess-credit" };
      yield { type: "assistant", content: "Analysis complete." };
      yield { type: "result", subtype: "success", result: "Done." };
      throw new Error("Credit balance is too low");
    }
    const runner = new AgentRunner({
      cwd: "/tmp",
      query: () => creditExhaustedQuery(),
      output: new PassThrough(),
    });

    const outcome = await runner.run("Task");

    // The result event wins; the late error is still reported alongside it.
    assert.strictEqual(outcome.success, true);
    assert.strictEqual(outcome.text, "Done.");
    assert.ok(outcome.error);
    assert.match(outcome.error.message, /Credit balance/);
  });

  test("createAgentRunner factory returns an AgentRunner instance", () => {
    const runner = createAgentRunner({
      cwd: "/tmp",
      query: emptyQuery,
      output: new PassThrough(),
    });
    assert.ok(runner instanceof AgentRunner);
  });
});
|