@forwardimpact/libeval 0.1.9 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-eval.js +103 -72
- package/package.json +14 -3
- package/src/commands/output.js +7 -26
- package/src/commands/run.js +16 -41
- package/src/commands/supervise.js +19 -47
- package/src/commands/tee.js +3 -2
- package/src/index.js +11 -0
- package/index.js +0 -11
- package/test/agent-runner-batching.test.js +0 -271
- package/test/agent-runner.test.js +0 -317
- package/test/fixtures/stream.ndjson +0 -7
- package/test/mock-runner.js +0 -113
- package/test/supervisor-batching.test.js +0 -175
- package/test/supervisor-intervention.test.js +0 -365
- package/test/supervisor-output.test.js +0 -369
- package/test/supervisor-run.test.js +0 -310
- package/test/tee-writer.test.js +0 -324
- package/test/trace-collector.test.js +0 -424
|
@@ -1,424 +0,0 @@
|
|
|
1
|
-
import { describe, test } from "node:test";
|
|
2
|
-
import assert from "node:assert";
|
|
3
|
-
import fs from "node:fs";
|
|
4
|
-
import path from "node:path";
|
|
5
|
-
import { fileURLToPath } from "node:url";
|
|
6
|
-
|
|
7
|
-
import { TraceCollector, createTraceCollector } from "@forwardimpact/libeval";
|
|
8
|
-
|
|
9
|
-
// ES modules do not define __dirname, so derive this file's directory from import.meta.url.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Location of the NDJSON fixture, resolved relative to this file rather than the process cwd.
const fixturePath = path.join(__dirname, "fixtures", "stream.ndjson");
|
|
11
|
-
|
|
12
|
-
/**
 * Load fixture lines from an NDJSON file.
 *
 * Accepts LF or CRLF line endings and drops blank lines, so an empty (or
 * whitespace-only) file yields an empty array instead of `[""]`.
 *
 * @param {string} [file=fixturePath] - File to read; defaults to the stream fixture.
 * @returns {string[]} The non-blank lines, in file order.
 */
function loadFixture(file = fixturePath) {
  return fs
    .readFileSync(file, "utf8")
    .trim()
    .split(/\r?\n/)
    .filter((line) => line.trim() !== "");
}
|
|
19
|
-
|
|
20
|
-
/**
 * Feed all fixture lines into a fresh collector and return it.
 *
 * Lines are passed to `addLine` one at a time, in the order `loadFixture`
 * returns them.
 *
 * @returns {TraceCollector}
 */
function collectFixture() {
  const collector = new TraceCollector();
  for (const line of loadFixture()) {
    collector.addLine(line);
  }
  return collector;
}
|
|
31
|
-
|
|
32
|
-
describe("TraceCollector", () => {
|
|
33
|
-
// Each case feeds hand-built NDJSON lines to addLine() on a fresh collector
// and inspects the snapshot returned by toJSON().
describe("addLine", () => {
  test("extracts metadata from system init event", () => {
    const collector = new TraceCollector();
    collector.addLine(
      JSON.stringify({
        type: "system",
        subtype: "init",
        session_id: "sess-1",
        model: "claude-opus-4-6",
        claude_code_version: "2.1.87",
        tools: ["Bash", "Read"],
        permissionMode: "default",
      }),
    );

    // snake_case input fields are expected to surface as camelCase metadata.
    const trace = collector.toJSON();
    assert.strictEqual(trace.metadata.sessionId, "sess-1");
    assert.strictEqual(trace.metadata.model, "claude-opus-4-6");
    assert.strictEqual(trace.metadata.claudeCodeVersion, "2.1.87");
    assert.deepStrictEqual(trace.metadata.tools, ["Bash", "Read"]);
  });

  test("collects assistant text turns", () => {
    const collector = new TraceCollector();
    collector.addLine(
      JSON.stringify({
        type: "assistant",
        message: {
          content: [{ type: "text", text: "Hello world" }],
          usage: { input_tokens: 10, output_tokens: 5 },
        },
      }),
    );

    const trace = collector.toJSON();
    assert.strictEqual(trace.turns.length, 1);
    assert.strictEqual(trace.turns[0].role, "assistant");
    assert.strictEqual(trace.turns[0].content[0].text, "Hello world");
    assert.strictEqual(trace.turns[0].usage.inputTokens, 10);
  });

  test("collects assistant tool_use turns", () => {
    const collector = new TraceCollector();
    collector.addLine(
      JSON.stringify({
        type: "assistant",
        message: {
          content: [
            {
              type: "tool_use",
              id: "toolu_01",
              name: "Bash",
              input: { command: "ls" },
            },
          ],
          usage: { input_tokens: 20, output_tokens: 10 },
        },
      }),
    );

    // The block's `id` is expected to be exposed as `toolUseId`.
    const trace = collector.toJSON();
    assert.strictEqual(trace.turns[0].content[0].type, "tool_use");
    assert.strictEqual(trace.turns[0].content[0].name, "Bash");
    assert.strictEqual(trace.turns[0].content[0].toolUseId, "toolu_01");
  });

  test("collects tool_result from user events", () => {
    const collector = new TraceCollector();
    collector.addLine(
      JSON.stringify({
        type: "user",
        message: {
          role: "user",
          content: [
            {
              type: "tool_result",
              tool_use_id: "toolu_01",
              content: "file listing output",
            },
          ],
        },
      }),
    );

    // A user event carrying a tool_result becomes a turn with role "tool_result".
    const trace = collector.toJSON();
    assert.strictEqual(trace.turns.length, 1);
    assert.strictEqual(trace.turns[0].role, "tool_result");
    assert.strictEqual(trace.turns[0].toolUseId, "toolu_01");
    assert.strictEqual(trace.turns[0].content, "file listing output");
  });

  test("extracts summary from result event", () => {
    const collector = new TraceCollector();
    collector.addLine(
      JSON.stringify({
        type: "result",
        subtype: "success",
        is_error: false,
        total_cost_usd: 1.23,
        duration_ms: 45000,
        num_turns: 12,
        usage: {
          input_tokens: 5000,
          output_tokens: 2000,
          cache_read_input_tokens: 3000,
          cache_creation_input_tokens: 1000,
        },
        modelUsage: { "claude-opus-4-6": { costUSD: 1.23 } },
      }),
    );

    const trace = collector.toJSON();
    assert.strictEqual(trace.summary.result, "success");
    assert.strictEqual(trace.summary.totalCostUsd, 1.23);
    assert.strictEqual(trace.summary.durationMs, 45000);
    assert.strictEqual(trace.summary.numTurns, 12);
    assert.strictEqual(trace.summary.tokenUsage.inputTokens, 5000);
  });

  test("unwraps combined supervised trace format {source, turn, event}", () => {
    const collector = new TraceCollector();

    // System init wrapped in supervisor envelope
    collector.addLine(
      JSON.stringify({
        source: "agent",
        turn: 0,
        event: {
          type: "system",
          subtype: "init",
          session_id: "sess-supervised",
          model: "claude-opus-4-6",
          tools: ["Bash"],
        },
      }),
    );

    // Assistant message wrapped in supervisor envelope
    collector.addLine(
      JSON.stringify({
        source: "agent",
        turn: 1,
        event: {
          type: "assistant",
          message: {
            content: [{ type: "text", text: "I ran the tests." }],
            usage: { input_tokens: 100, output_tokens: 50 },
          },
        },
      }),
    );

    // Tool result wrapped in supervisor envelope
    collector.addLine(
      JSON.stringify({
        source: "agent",
        turn: 1,
        event: {
          type: "user",
          message: {
            role: "user",
            content: [
              {
                type: "tool_result",
                tool_use_id: "toolu_sup",
                content: "All tests passed",
              },
            ],
          },
        },
      }),
    );

    // Result event wrapped in supervisor envelope
    collector.addLine(
      JSON.stringify({
        source: "supervisor",
        turn: 1,
        event: {
          type: "result",
          subtype: "success",
          total_cost_usd: 0.44,
          duration_ms: 30000,
          num_turns: 2,
        },
      }),
    );

    // Only the assistant and tool_result events should produce turns; the
    // init and result envelopes feed metadata and summary respectively.
    const trace = collector.toJSON();
    assert.strictEqual(trace.metadata.sessionId, "sess-supervised");
    assert.strictEqual(trace.turns.length, 2);
    assert.strictEqual(trace.turns[0].role, "assistant");
    assert.strictEqual(trace.turns[0].content[0].text, "I ran the tests.");
    assert.strictEqual(trace.turns[1].role, "tool_result");
    assert.strictEqual(trace.turns[1].content, "All tests passed");
    assert.strictEqual(trace.summary.result, "success");
    assert.strictEqual(trace.summary.totalCostUsd, 0.44);
  });

  test("skips orchestrator summary lines from supervised traces", () => {
    const collector = new TraceCollector();
    collector.addLine(
      JSON.stringify({
        source: "orchestrator",
        type: "summary",
        success: true,
        turns: 3,
      }),
    );

    // Orchestrator summaries have no inner event and no recognized type
    // after unwrap — they should be silently skipped.
    assert.strictEqual(collector.toJSON().turns.length, 0);
  });

  test("skips rate_limit_event and unknown types", () => {
    const collector = new TraceCollector();
    collector.addLine(
      JSON.stringify({ type: "rate_limit_event", rate_limit_info: {} }),
    );
    collector.addLine(JSON.stringify({ type: "unknown_event" }));

    const trace = collector.toJSON();
    assert.strictEqual(trace.turns.length, 0);
  });

  test("skips malformed JSON lines", () => {
    const collector = new TraceCollector();
    // Unparsable text, an empty line, and a whitespace-only line must all be
    // tolerated without throwing and without producing turns.
    collector.addLine("not valid json {{{");
    collector.addLine("");
    collector.addLine(" ");

    const trace = collector.toJSON();
    assert.strictEqual(trace.turns.length, 0);
  });

  test("skips assistant event with missing message", () => {
    const collector = new TraceCollector();
    collector.addLine(JSON.stringify({ type: "assistant" }));
    collector.addLine(JSON.stringify({ type: "assistant", message: null }));

    assert.strictEqual(collector.toJSON().turns.length, 0);
  });

  test("skips user event with non-array content", () => {
    const collector = new TraceCollector();
    // Three shapes without an array `content`: a plain string, a message with
    // no content, and no message at all.
    collector.addLine(
      JSON.stringify({
        type: "user",
        message: { role: "user", content: "plain string" },
      }),
    );
    collector.addLine(
      JSON.stringify({ type: "user", message: { role: "user" } }),
    );
    collector.addLine(JSON.stringify({ type: "user" }));

    assert.strictEqual(collector.toJSON().turns.length, 0);
  });

  test("uses event timestamp when present in system init", () => {
    const collector = new TraceCollector();
    collector.addLine(
      JSON.stringify({
        type: "system",
        subtype: "init",
        timestamp: "2026-01-15T10:00:00Z",
        session_id: "sess-ts",
      }),
    );

    assert.strictEqual(
      collector.toJSON().metadata.timestamp,
      "2026-01-15T10:00:00Z",
    );
  });
});
|
|
310
|
-
|
|
311
|
-
// Checks the structured snapshot returned by toJSON(), both for the recorded
// fixture stream and for a collector that has received no lines.
describe("toJSON", () => {
  test("produces complete trace from fixture", () => {
    const collector = collectFixture();
    const trace = collector.toJSON();

    // Expected values mirror the contents of fixtures/stream.ndjson.
    assert.strictEqual(trace.version, "1.0.0");
    assert.strictEqual(trace.metadata.sessionId, "abc-123");
    assert.strictEqual(trace.metadata.model, "claude-opus-4-6");
    assert.strictEqual(trace.metadata.claudeCodeVersion, "2.1.87");
    assert.strictEqual(trace.metadata.tools.length, 6);
    assert.ok(trace.turns.length > 0);
    assert.strictEqual(trace.summary.result, "success");
    assert.strictEqual(trace.summary.totalCostUsd, 0.0523);
    assert.strictEqual(trace.summary.numTurns, 3);
  });

  test("assigns sequential turn indexes", () => {
    const collector = collectFixture();
    const trace = collector.toJSON();

    // Every turn's `index` must equal its position in the turns array.
    trace.turns.forEach((turn, i) => {
      assert.strictEqual(turn.index, i);
    });
  });

  test("returns defaults for empty input", () => {
    const collector = new TraceCollector();
    const trace = collector.toJSON();

    assert.strictEqual(trace.version, "1.0.0");
    assert.strictEqual(trace.metadata.sessionId, null);
    assert.strictEqual(trace.turns.length, 0);
    assert.strictEqual(trace.summary.result, "unknown");
  });
});
|
|
346
|
-
|
|
347
|
-
// Checks the human-readable rendering returned by toText().
describe("toText", () => {
  test("includes assistant text content", () => {
    const collector = collectFixture();
    const text = collector.toText();

    assert.ok(
      text.includes("I'll start by checking the repository structure"),
    );
    assert.ok(text.includes("No security issues found"));
  });

  test("includes tool call summaries", () => {
    const collector = collectFixture();
    const text = collector.toText();

    assert.ok(text.includes("> Tool: Bash"));
    assert.ok(text.includes("ls -la"));
  });

  test("includes result summary line", () => {
    const collector = collectFixture();
    const text = collector.toText();

    assert.ok(text.includes("--- Result: success"));
    assert.ok(text.includes("Turns: 3"));
    assert.ok(text.includes("Cost: $0.0523"));
    assert.ok(text.includes("Duration: 5s"));
  });

  test("truncates long tool input summaries", () => {
    const collector = new TraceCollector();
    const longCommand = "x".repeat(300);
    collector.addLine(
      JSON.stringify({
        type: "assistant",
        message: {
          content: [
            {
              type: "tool_use",
              name: "Bash",
              input: { command: longCommand },
            },
          ],
        },
      }),
    );

    const text = collector.toText();
    assert.ok(text.includes("> Tool: Bash"));
    assert.ok(text.includes("..."));
    // Total line should be truncated, not the full 300+ chars
    const toolLine = text.split("\n").find((l) => l.startsWith("> Tool:"));
    // Fail with an assertion (not a TypeError) if no line starts with "> Tool:".
    assert.ok(toolLine !== undefined, 'expected a line starting with "> Tool:"');
    assert.ok(toolLine.length < 250);
  });

  test("returns empty string for empty input", () => {
    const collector = new TraceCollector();
    const text = collector.toText();

    assert.strictEqual(text, "");
  });
});
|
|
409
|
-
|
|
410
|
-
// Checks the factory function exported alongside the TraceCollector class.
describe("createTraceCollector", () => {
  test("returns a TraceCollector instance", () => {
    const collector = createTraceCollector();
    assert.ok(collector instanceof TraceCollector);
  });

  test("accepts injectable clock for deterministic timestamps", () => {
    const fixedTime = "2026-01-01T00:00:00Z";
    // With no init event supplied, the metadata timestamp is expected to come
    // from the injected `now` function.
    const collector = createTraceCollector({ now: () => fixedTime });
    const trace = collector.toJSON();

    assert.strictEqual(trace.metadata.timestamp, fixedTime);
  });
});
|
|
424
|
-
});
|