@forwardimpact/libeval 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-eval.js +1 -1
- package/package.json +1 -1
- package/src/agent-runner.js +14 -2
- package/src/commands/run.js +1 -0
- package/src/commands/tee.js +13 -75
- package/src/supervisor.js +37 -16
- package/test/agent-runner.test.js +25 -0
- package/test/supervisor.test.js +13 -4
package/bin/fit-eval.js
CHANGED
package/package.json
CHANGED
package/src/agent-runner.js
CHANGED
|
@@ -16,6 +16,8 @@ export class AgentRunner {
|
|
|
16
16
|
* @param {number} [deps.maxTurns] - Maximum agentic turns
|
|
17
17
|
* @param {string[]} [deps.allowedTools] - Tools the agent may use
|
|
18
18
|
* @param {string} [deps.permissionMode] - SDK permission mode
|
|
19
|
+
* @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
|
|
20
|
+
* @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
|
|
19
21
|
*/
|
|
20
22
|
constructor({
|
|
21
23
|
cwd,
|
|
@@ -25,6 +27,8 @@ export class AgentRunner {
|
|
|
25
27
|
maxTurns,
|
|
26
28
|
allowedTools,
|
|
27
29
|
permissionMode,
|
|
30
|
+
onLine,
|
|
31
|
+
settingSources,
|
|
28
32
|
}) {
|
|
29
33
|
if (!cwd) throw new Error("cwd is required");
|
|
30
34
|
if (!query) throw new Error("query is required");
|
|
@@ -43,6 +47,8 @@ export class AgentRunner {
|
|
|
43
47
|
"Edit",
|
|
44
48
|
];
|
|
45
49
|
this.permissionMode = permissionMode ?? "bypassPermissions";
|
|
50
|
+
this.onLine = onLine ?? null;
|
|
51
|
+
this.settingSources = settingSources ?? [];
|
|
46
52
|
this.sessionId = null;
|
|
47
53
|
this.buffer = [];
|
|
48
54
|
}
|
|
@@ -67,11 +73,13 @@ export class AgentRunner {
|
|
|
67
73
|
model: this.model,
|
|
68
74
|
permissionMode: this.permissionMode,
|
|
69
75
|
allowDangerouslySkipPermissions: true,
|
|
76
|
+
settingSources: this.settingSources,
|
|
70
77
|
},
|
|
71
78
|
})) {
|
|
72
79
|
const line = JSON.stringify(message);
|
|
73
80
|
this.output.write(line + "\n");
|
|
74
81
|
this.buffer.push(line);
|
|
82
|
+
if (this.onLine) this.onLine(line);
|
|
75
83
|
|
|
76
84
|
if (message.type === "system" && message.subtype === "init") {
|
|
77
85
|
this.sessionId = message.session_id;
|
|
@@ -85,7 +93,10 @@ export class AgentRunner {
|
|
|
85
93
|
error = err;
|
|
86
94
|
}
|
|
87
95
|
|
|
88
|
-
|
|
96
|
+
// If the SDK already emitted a successful result, honour it even when the
|
|
97
|
+
// stream throws afterwards (e.g. "Credit balance is too low" during
|
|
98
|
+
// cleanup). Only treat errors as fatal when no result was received yet.
|
|
99
|
+
const success = stopReason === "success";
|
|
89
100
|
return { success, text, sessionId: this.sessionId, error };
|
|
90
101
|
}
|
|
91
102
|
|
|
@@ -107,6 +118,7 @@ export class AgentRunner {
|
|
|
107
118
|
const line = JSON.stringify(message);
|
|
108
119
|
this.output.write(line + "\n");
|
|
109
120
|
this.buffer.push(line);
|
|
121
|
+
if (this.onLine) this.onLine(line);
|
|
110
122
|
|
|
111
123
|
if (message.type === "result") {
|
|
112
124
|
text = message.result ?? "";
|
|
@@ -117,7 +129,7 @@ export class AgentRunner {
|
|
|
117
129
|
error = err;
|
|
118
130
|
}
|
|
119
131
|
|
|
120
|
-
const success =
|
|
132
|
+
const success = stopReason === "success";
|
|
121
133
|
return { success, text, error };
|
|
122
134
|
}
|
|
123
135
|
|
package/src/commands/run.js
CHANGED
package/src/commands/tee.js
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import { createWriteStream } from "fs";
|
|
2
|
-
import { createTraceCollector } from "@forwardimpact/libeval";
|
|
2
|
+
import { PassThrough } from "node:stream";
|
|
3
|
+
import { pipeline } from "node:stream/promises";
|
|
4
|
+
import { createTeeWriter } from "../tee-writer.js";
|
|
3
5
|
|
|
4
6
|
/**
|
|
5
7
|
* Tee command — stream text output to stdout while optionally saving the raw
|
|
@@ -12,46 +14,18 @@ import { createTraceCollector } from "@forwardimpact/libeval";
|
|
|
12
14
|
export async function runTeeCommand(args) {
|
|
13
15
|
const outputPath = args.find((a) => !a.startsWith("-")) ?? null;
|
|
14
16
|
const fileStream = outputPath ? createWriteStream(outputPath) : null;
|
|
15
|
-
const collector = createTraceCollector();
|
|
16
|
-
const turnsEmitted = { count: 0 };
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
const line = buffer.slice(0, newlineIdx);
|
|
27
|
-
buffer = buffer.slice(newlineIdx + 1);
|
|
28
|
-
|
|
29
|
-
if (fileStream) {
|
|
30
|
-
fileStream.write(line + "\n");
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
collector.addLine(line);
|
|
34
|
-
flushNewTurns(collector, turnsEmitted);
|
|
35
|
-
}
|
|
36
|
-
}
|
|
18
|
+
// TeeWriter requires a fileStream; when no output file is specified,
|
|
19
|
+
// use a PassThrough as a no-op sink (NDJSON is not saved).
|
|
20
|
+
const sink = fileStream ?? new PassThrough();
|
|
21
|
+
const tee = createTeeWriter({
|
|
22
|
+
fileStream: sink,
|
|
23
|
+
textStream: process.stdout,
|
|
24
|
+
mode: "raw",
|
|
25
|
+
});
|
|
37
26
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
if (fileStream) {
|
|
41
|
-
fileStream.write(buffer + "\n");
|
|
42
|
-
}
|
|
43
|
-
collector.addLine(buffer);
|
|
44
|
-
flushNewTurns(collector, turnsEmitted);
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
// Emit the result summary at the end
|
|
48
|
-
if (collector.result) {
|
|
49
|
-
const text = collector.toText();
|
|
50
|
-
const lastNewline = text.lastIndexOf("\n---");
|
|
51
|
-
if (lastNewline !== -1) {
|
|
52
|
-
process.stdout.write(text.slice(lastNewline) + "\n");
|
|
53
|
-
}
|
|
54
|
-
}
|
|
27
|
+
try {
|
|
28
|
+
await pipeline(process.stdin, tee);
|
|
55
29
|
} finally {
|
|
56
30
|
if (fileStream) {
|
|
57
31
|
await new Promise((resolve, reject) => {
|
|
@@ -61,39 +35,3 @@ export async function runTeeCommand(args) {
|
|
|
61
35
|
}
|
|
62
36
|
}
|
|
63
37
|
}
|
|
64
|
-
|
|
65
|
-
/**
|
|
66
|
-
* Write text for any new turns that haven't been emitted yet.
|
|
67
|
-
* @param {import("@forwardimpact/libeval").TraceCollector} collector
|
|
68
|
-
* @param {{ count: number }} turnsEmitted
|
|
69
|
-
*/
|
|
70
|
-
function flushNewTurns(collector, turnsEmitted) {
|
|
71
|
-
const turns = collector.turns;
|
|
72
|
-
while (turnsEmitted.count < turns.length) {
|
|
73
|
-
const turn = turns[turnsEmitted.count];
|
|
74
|
-
turnsEmitted.count++;
|
|
75
|
-
|
|
76
|
-
if (turn.role === "assistant") {
|
|
77
|
-
for (const block of turn.content) {
|
|
78
|
-
if (block.type === "text") {
|
|
79
|
-
process.stdout.write(block.text + "\n");
|
|
80
|
-
} else if (block.type === "tool_use") {
|
|
81
|
-
const inputSummary = summarizeInput(block.input);
|
|
82
|
-
process.stdout.write(`> Tool: ${block.name} ${inputSummary}\n`);
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
/**
|
|
90
|
-
* Summarize tool input for text display, truncated to keep logs readable.
|
|
91
|
-
* @param {object} input - Tool input object
|
|
92
|
-
* @returns {string} Truncated summary
|
|
93
|
-
*/
|
|
94
|
-
function summarizeInput(input) {
|
|
95
|
-
if (!input || typeof input !== "object") return "";
|
|
96
|
-
const json = JSON.stringify(input);
|
|
97
|
-
if (json.length <= 200) return json;
|
|
98
|
-
return json.slice(0, 197) + "...";
|
|
99
|
-
}
|
package/src/supervisor.js
CHANGED
|
@@ -36,6 +36,10 @@ export class Supervisor {
|
|
|
36
36
|
this.supervisorRunner = supervisorRunner;
|
|
37
37
|
this.output = output;
|
|
38
38
|
this.maxTurns = maxTurns ?? 20;
|
|
39
|
+
/** @type {"agent"|"supervisor"} */
|
|
40
|
+
this.currentSource = "agent";
|
|
41
|
+
/** @type {number} */
|
|
42
|
+
this.currentTurn = 0;
|
|
39
43
|
}
|
|
40
44
|
|
|
41
45
|
/**
|
|
@@ -45,8 +49,9 @@ export class Supervisor {
|
|
|
45
49
|
*/
|
|
46
50
|
async run(task) {
|
|
47
51
|
// Turn 0: Agent receives the task and starts working
|
|
52
|
+
this.currentSource = "agent";
|
|
53
|
+
this.currentTurn = 0;
|
|
48
54
|
let agentResult = await this.agentRunner.run(task);
|
|
49
|
-
this.emitTagged("agent", 0);
|
|
50
55
|
|
|
51
56
|
if (agentResult.error) {
|
|
52
57
|
this.emitSummary({ success: false, turns: 0 });
|
|
@@ -59,13 +64,14 @@ export class Supervisor {
|
|
|
59
64
|
`The agent reported:\n\n${agentResult.text}\n\n` +
|
|
60
65
|
`Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`;
|
|
61
66
|
|
|
67
|
+
this.currentSource = "supervisor";
|
|
68
|
+
this.currentTurn = turn;
|
|
62
69
|
let supervisorResult;
|
|
63
70
|
if (turn === 1) {
|
|
64
71
|
supervisorResult = await this.supervisorRunner.run(supervisorPrompt);
|
|
65
72
|
} else {
|
|
66
73
|
supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
|
|
67
74
|
}
|
|
68
|
-
this.emitTagged("supervisor", turn);
|
|
69
75
|
|
|
70
76
|
if (supervisorResult.error) {
|
|
71
77
|
this.emitSummary({ success: false, turns: turn });
|
|
@@ -78,8 +84,9 @@ export class Supervisor {
|
|
|
78
84
|
}
|
|
79
85
|
|
|
80
86
|
// Supervisor's response becomes the agent's next input
|
|
87
|
+
this.currentSource = "agent";
|
|
88
|
+
this.currentTurn = turn;
|
|
81
89
|
agentResult = await this.agentRunner.resume(supervisorResult.text);
|
|
82
|
-
this.emitTagged("agent", turn);
|
|
83
90
|
|
|
84
91
|
if (agentResult.error) {
|
|
85
92
|
this.emitSummary({ success: false, turns: turn });
|
|
@@ -92,19 +99,18 @@ export class Supervisor {
|
|
|
92
99
|
}
|
|
93
100
|
|
|
94
101
|
/**
|
|
95
|
-
*
|
|
96
|
-
*
|
|
97
|
-
* @param {
|
|
98
|
-
* @param {number} turn
|
|
102
|
+
* Emit a single NDJSON line tagged with the current source and turn.
|
|
103
|
+
* Called in real-time via the AgentRunner onLine callback.
|
|
104
|
+
* @param {string} line - Raw NDJSON line from the runner
|
|
99
105
|
*/
|
|
100
|
-
|
|
101
|
-
const
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
106
|
+
emitLine(line) {
|
|
107
|
+
const event = JSON.parse(line);
|
|
108
|
+
const tagged = {
|
|
109
|
+
source: this.currentSource,
|
|
110
|
+
turn: this.currentTurn,
|
|
111
|
+
event,
|
|
112
|
+
};
|
|
113
|
+
this.output.write(JSON.stringify(tagged) + "\n");
|
|
108
114
|
}
|
|
109
115
|
|
|
110
116
|
/**
|
|
@@ -143,6 +149,11 @@ export function createSupervisor({
|
|
|
143
149
|
maxTurns,
|
|
144
150
|
allowedTools,
|
|
145
151
|
}) {
|
|
152
|
+
// Forward-reference: onLine captures `supervisor` before construction completes.
|
|
153
|
+
// This is safe because onLine is only called during run(), after construction.
|
|
154
|
+
let supervisor;
|
|
155
|
+
const onLine = (line) => supervisor.emitLine(line);
|
|
156
|
+
|
|
146
157
|
const agentRunner = createAgentRunner({
|
|
147
158
|
cwd: agentCwd,
|
|
148
159
|
query,
|
|
@@ -150,6 +161,8 @@ export function createSupervisor({
|
|
|
150
161
|
model,
|
|
151
162
|
maxTurns: 50,
|
|
152
163
|
allowedTools,
|
|
164
|
+
onLine,
|
|
165
|
+
settingSources: ["project"],
|
|
153
166
|
});
|
|
154
167
|
|
|
155
168
|
const supervisorRunner = createAgentRunner({
|
|
@@ -159,7 +172,15 @@ export function createSupervisor({
|
|
|
159
172
|
model,
|
|
160
173
|
maxTurns: 10,
|
|
161
174
|
allowedTools: ["Read", "Glob", "Grep"],
|
|
175
|
+
onLine,
|
|
176
|
+
settingSources: ["project"],
|
|
162
177
|
});
|
|
163
178
|
|
|
164
|
-
|
|
179
|
+
supervisor = new Supervisor({
|
|
180
|
+
agentRunner,
|
|
181
|
+
supervisorRunner,
|
|
182
|
+
output,
|
|
183
|
+
maxTurns,
|
|
184
|
+
});
|
|
185
|
+
return supervisor;
|
|
165
186
|
}
|
|
package/test/agent-runner.test.js
CHANGED
|
@@ -81,6 +81,7 @@ describe("AgentRunner", () => {
|
|
|
81
81
|
"Edit",
|
|
82
82
|
]);
|
|
83
83
|
assert.strictEqual(runner.permissionMode, "bypassPermissions");
|
|
84
|
+
assert.deepStrictEqual(runner.settingSources, []);
|
|
84
85
|
assert.strictEqual(runner.sessionId, null);
|
|
85
86
|
});
|
|
86
87
|
|
|
@@ -145,6 +146,7 @@ describe("AgentRunner", () => {
|
|
|
145
146
|
maxTurns: 10,
|
|
146
147
|
allowedTools: ["Read", "Grep"],
|
|
147
148
|
permissionMode: "plan",
|
|
149
|
+
settingSources: ["project"],
|
|
148
150
|
});
|
|
149
151
|
|
|
150
152
|
await runner.run("My task");
|
|
@@ -156,6 +158,7 @@ describe("AgentRunner", () => {
|
|
|
156
158
|
assert.deepStrictEqual(captured.options.allowedTools, ["Read", "Grep"]);
|
|
157
159
|
assert.strictEqual(captured.options.permissionMode, "plan");
|
|
158
160
|
assert.strictEqual(captured.options.allowDangerouslySkipPermissions, true);
|
|
161
|
+
assert.deepStrictEqual(captured.options.settingSources, ["project"]);
|
|
159
162
|
});
|
|
160
163
|
|
|
161
164
|
test("run() returns success=false on non-success subtype", async () => {
|
|
@@ -281,6 +284,28 @@ describe("AgentRunner", () => {
|
|
|
281
284
|
assert.match(result.error.message, /Process crashed/);
|
|
282
285
|
});
|
|
283
286
|
|
|
287
|
+
test("run() succeeds when SDK throws after emitting successful result", async () => {
|
|
288
|
+
async function* creditExhaustedQuery() {
|
|
289
|
+
yield { type: "system", subtype: "init", session_id: "sess-credit" };
|
|
290
|
+
yield { type: "assistant", content: "Analysis complete." };
|
|
291
|
+
yield { type: "result", subtype: "success", result: "Done." };
|
|
292
|
+
throw new Error("Credit balance is too low");
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
const output = new PassThrough();
|
|
296
|
+
const runner = new AgentRunner({
|
|
297
|
+
cwd: "/tmp",
|
|
298
|
+
query: () => creditExhaustedQuery(),
|
|
299
|
+
output,
|
|
300
|
+
});
|
|
301
|
+
|
|
302
|
+
const result = await runner.run("Task");
|
|
303
|
+
assert.strictEqual(result.success, true);
|
|
304
|
+
assert.strictEqual(result.text, "Done.");
|
|
305
|
+
assert.ok(result.error);
|
|
306
|
+
assert.match(result.error.message, /Credit balance/);
|
|
307
|
+
});
|
|
308
|
+
|
|
284
309
|
test("createAgentRunner factory returns an AgentRunner instance", () => {
|
|
285
310
|
const runner = createAgentRunner({
|
|
286
311
|
cwd: "/tmp",
|
package/test/supervisor.test.js
CHANGED
|
@@ -29,12 +29,13 @@ function createMockRunner(responses, messages) {
|
|
|
29
29
|
// Override run and resume to return scripted responses
|
|
30
30
|
runner.run = async (_task) => {
|
|
31
31
|
const resp = responses[callIndex++];
|
|
32
|
-
// Buffer messages for drainOutput
|
|
33
32
|
const msgs = messages?.[callIndex - 1] ?? [
|
|
34
33
|
{ type: "assistant", content: resp.text },
|
|
35
34
|
];
|
|
36
35
|
for (const m of msgs) {
|
|
37
|
-
|
|
36
|
+
const line = JSON.stringify(m);
|
|
37
|
+
runner.buffer.push(line);
|
|
38
|
+
if (runner.onLine) runner.onLine(line);
|
|
38
39
|
}
|
|
39
40
|
runner.sessionId = "mock-session";
|
|
40
41
|
return {
|
|
@@ -50,7 +51,9 @@ function createMockRunner(responses, messages) {
|
|
|
50
51
|
{ type: "assistant", content: resp.text },
|
|
51
52
|
];
|
|
52
53
|
for (const m of msgs) {
|
|
53
|
-
|
|
54
|
+
const line = JSON.stringify(m);
|
|
55
|
+
runner.buffer.push(line);
|
|
56
|
+
if (runner.onLine) runner.onLine(line);
|
|
54
57
|
}
|
|
55
58
|
return { success: resp.success ?? true, text: resp.text };
|
|
56
59
|
};
|
|
@@ -211,6 +214,8 @@ describe("Supervisor", () => {
|
|
|
211
214
|
output,
|
|
212
215
|
maxTurns: 10,
|
|
213
216
|
});
|
|
217
|
+
agentRunner.onLine = (line) => supervisor.emitLine(line);
|
|
218
|
+
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
|
|
214
219
|
|
|
215
220
|
await supervisor.run("Task");
|
|
216
221
|
|
|
@@ -258,6 +263,8 @@ describe("Supervisor", () => {
|
|
|
258
263
|
output,
|
|
259
264
|
maxTurns: 10,
|
|
260
265
|
});
|
|
266
|
+
agentRunner.onLine = (line) => supervisor.emitLine(line);
|
|
267
|
+
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
|
|
261
268
|
|
|
262
269
|
await supervisor.run("Task");
|
|
263
270
|
|
|
@@ -273,7 +280,7 @@ describe("Supervisor", () => {
|
|
|
273
280
|
assert.strictEqual(tagged.event.source, "sdk-internal");
|
|
274
281
|
});
|
|
275
282
|
|
|
276
|
-
test("
|
|
283
|
+
test("emits agent output and summary when agent errors on turn 0", async () => {
|
|
277
284
|
const agentMessages = [[{ type: "assistant", content: "Partial work" }]];
|
|
278
285
|
const agentRunner = createMockRunner(
|
|
279
286
|
[{ text: "Partial work", success: false }],
|
|
@@ -296,6 +303,8 @@ describe("Supervisor", () => {
|
|
|
296
303
|
output,
|
|
297
304
|
maxTurns: 10,
|
|
298
305
|
});
|
|
306
|
+
agentRunner.onLine = (line) => supervisor.emitLine(line);
|
|
307
|
+
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
|
|
299
308
|
|
|
300
309
|
const result = await supervisor.run("Task");
|
|
301
310
|
|