@forwardimpact/libeval 0.1.63 → 0.1.65
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -201
- package/bin/fit-trace.js +166 -31
- package/package.json +1 -1
- package/src/benchmark/judge.js +16 -1
- package/src/benchmark/result.js +12 -0
- package/src/benchmark/runner.js +44 -25
- package/src/commands/callback.js +11 -5
- package/src/commands/trace.js +333 -53
- package/src/cost.js +79 -0
- package/src/index.js +2 -0
- package/src/redaction.js +65 -6
- package/src/trace-collector.js +58 -2
- package/src/trace-github.js +175 -3
- package/src/trace-multi.js +101 -0
- package/src/trace-query.js +294 -45
- package/src/trace-render.js +211 -0
- package/src/trace-usage.js +249 -0
package/src/benchmark/runner.js
CHANGED
|
@@ -18,6 +18,7 @@ import { createInterface } from "node:readline";
|
|
|
18
18
|
import { join, resolve as resolvePath } from "node:path";
|
|
19
19
|
|
|
20
20
|
import { DEFAULT_ENV_ALLOWLIST, createRedactor } from "../redaction.js";
|
|
21
|
+
import { sumTraceCost } from "../cost.js";
|
|
21
22
|
import { createSupervisor } from "../supervisor.js";
|
|
22
23
|
import { installApm as defaultInstallApm } from "./apm-installer.js";
|
|
23
24
|
import { installNpm as defaultInstallNpm } from "./npm-installer.js";
|
|
@@ -193,8 +194,9 @@ export class BenchmarkRunner {
|
|
|
193
194
|
resultsRecordKey(task, runIndex),
|
|
194
195
|
);
|
|
195
196
|
}
|
|
196
|
-
const
|
|
197
|
-
|
|
197
|
+
const agentRun = await this.#runAgentSafe(task, workdir);
|
|
198
|
+
const { costUsd, turns, submission, agentError } = agentRun;
|
|
199
|
+
const breakdown = agentRun.costBreakdown ?? { agent: 0, supervisor: 0 };
|
|
198
200
|
const invariants = await this._runInvariantsHook(
|
|
199
201
|
task,
|
|
200
202
|
{
|
|
@@ -206,13 +208,14 @@ export class BenchmarkRunner {
|
|
|
206
208
|
this.runtime,
|
|
207
209
|
);
|
|
208
210
|
let judgeVerdict = null;
|
|
211
|
+
let judgeCost = 0;
|
|
209
212
|
if (task.paths.judge) {
|
|
210
213
|
const judgeContext = await this.#buildJudgeContext(
|
|
211
214
|
task,
|
|
212
215
|
workdir,
|
|
213
216
|
skillSetHash,
|
|
214
217
|
);
|
|
215
|
-
|
|
218
|
+
const judgeResult = await this._runJudgeHook(
|
|
216
219
|
task,
|
|
217
220
|
workdir,
|
|
218
221
|
invariants,
|
|
@@ -225,6 +228,13 @@ export class BenchmarkRunner {
|
|
|
225
228
|
},
|
|
226
229
|
judgeContext,
|
|
227
230
|
);
|
|
231
|
+
judgeCost = judgeResult.costUsd ?? 0;
|
|
232
|
+
// The record's judgeVerdict carries only the verdict + summary; the
|
|
233
|
+
// judge's cost is folded into costUsd / costBreakdown instead.
|
|
234
|
+
judgeVerdict = {
|
|
235
|
+
verdict: judgeResult.verdict,
|
|
236
|
+
summary: judgeResult.summary,
|
|
237
|
+
};
|
|
228
238
|
}
|
|
229
239
|
const verdict =
|
|
230
240
|
invariants.verdict === "pass" &&
|
|
@@ -238,7 +248,12 @@ export class BenchmarkRunner {
|
|
|
238
248
|
invariants,
|
|
239
249
|
submission,
|
|
240
250
|
...(judgeVerdict && { judgeVerdict }),
|
|
241
|
-
costUsd,
|
|
251
|
+
costUsd: costUsd + judgeCost,
|
|
252
|
+
costBreakdown: {
|
|
253
|
+
agent: breakdown.agent ?? 0,
|
|
254
|
+
supervisor: breakdown.supervisor ?? 0,
|
|
255
|
+
judge: judgeCost,
|
|
256
|
+
},
|
|
242
257
|
turns,
|
|
243
258
|
agentTracePath: workdir.agentTracePath,
|
|
244
259
|
supervisorTracePath: workdir.supervisorTracePath,
|
|
@@ -280,6 +295,7 @@ export class BenchmarkRunner {
|
|
|
280
295
|
} catch (e) {
|
|
281
296
|
return {
|
|
282
297
|
costUsd: 0,
|
|
298
|
+
costBreakdown: { agent: 0, supervisor: 0 },
|
|
283
299
|
turns: 0,
|
|
284
300
|
submission: "",
|
|
285
301
|
agentError: { message: e.message ?? String(e), aborted: false },
|
|
@@ -334,8 +350,20 @@ export class BenchmarkRunner {
|
|
|
334
350
|
workdir.agentTracePath,
|
|
335
351
|
workdir.supervisorTracePath,
|
|
336
352
|
);
|
|
353
|
+
// Cost is summed across every participant's result events from the one
|
|
354
|
+
// combined trace, attributed per source. Read before unlinking.
|
|
355
|
+
const combined = await fs.readFile(combinedPath, "utf8");
|
|
356
|
+
const { totalCostUsd, bySource } = sumTraceCost(combined.split("\n"));
|
|
337
357
|
await fs.unlink(combinedPath).catch(() => {});
|
|
338
|
-
return {
|
|
358
|
+
return {
|
|
359
|
+
...summary,
|
|
360
|
+
costUsd: totalCostUsd,
|
|
361
|
+
costBreakdown: {
|
|
362
|
+
agent: bySource.agent ?? 0,
|
|
363
|
+
supervisor: bySource.supervisor ?? 0,
|
|
364
|
+
},
|
|
365
|
+
agentError,
|
|
366
|
+
};
|
|
339
367
|
}
|
|
340
368
|
|
|
341
369
|
async #buildJudgeContext(task, workdir, skillSetHash) {
|
|
@@ -441,10 +469,13 @@ async function writeRecord(stream, record) {
|
|
|
441
469
|
}
|
|
442
470
|
|
|
443
471
|
/**
|
|
444
|
-
* Split the combined supervisor trace into agent and supervisor files
|
|
445
|
-
* extract
|
|
446
|
-
*
|
|
447
|
-
*
|
|
472
|
+
* Split the combined supervisor trace into agent and supervisor files and
|
|
473
|
+
* extract turn count and submission in a single pass. Agent-source events go
|
|
474
|
+
* to `agentPath`; supervisor and orchestrator events go to `supervisorPath`.
|
|
475
|
+
*
|
|
476
|
+
* Cost is deliberately not summed here — the caller derives it from the same
|
|
477
|
+
* combined trace via `sumTraceCost`, so there is one cost path across the
|
|
478
|
+
* benchmark, callback, and `fit-trace cost` consumers.
|
|
448
479
|
*/
|
|
449
480
|
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
|
|
450
481
|
async function splitAndSummarize(
|
|
@@ -460,8 +491,6 @@ async function splitAndSummarize(
|
|
|
460
491
|
input: fs.createReadStream(combinedPath),
|
|
461
492
|
crlfDelay: Infinity,
|
|
462
493
|
});
|
|
463
|
-
let agentCost = 0;
|
|
464
|
-
let supervisorCost = 0;
|
|
465
494
|
let turns = 0;
|
|
466
495
|
let submission = "";
|
|
467
496
|
for await (const line of rl) {
|
|
@@ -476,19 +505,9 @@ async function splitAndSummarize(
|
|
|
476
505
|
target.write(line + "\n");
|
|
477
506
|
const inner = event.event;
|
|
478
507
|
if (!inner) continue;
|
|
479
|
-
if (event.source === "agent") {
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
}
|
|
483
|
-
if (inner.type === "assistant") {
|
|
484
|
-
const text = extractText(inner);
|
|
485
|
-
if (text) submission = text;
|
|
486
|
-
}
|
|
487
|
-
}
|
|
488
|
-
if (event.source === "supervisor") {
|
|
489
|
-
if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
|
|
490
|
-
supervisorCost = inner.total_cost_usd;
|
|
491
|
-
}
|
|
508
|
+
if (event.source === "agent" && inner.type === "assistant") {
|
|
509
|
+
const text = extractText(inner);
|
|
510
|
+
if (text) submission = text;
|
|
492
511
|
}
|
|
493
512
|
if (event.source === "orchestrator" && inner.type === "summary") {
|
|
494
513
|
turns = inner.turns ?? 0;
|
|
@@ -498,7 +517,7 @@ async function splitAndSummarize(
|
|
|
498
517
|
new Promise((r) => agentStream.end(r)),
|
|
499
518
|
new Promise((r) => supStream.end(r)),
|
|
500
519
|
]);
|
|
501
|
-
return {
|
|
520
|
+
return { turns, submission };
|
|
502
521
|
}
|
|
503
522
|
|
|
504
523
|
function extractText(inner) {
|
package/src/commands/callback.js
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import { sumTraceCost } from "../cost.js";
|
|
2
|
+
|
|
1
3
|
/**
|
|
2
4
|
* Scan an NDJSON trace and return the last orchestrator summary event,
|
|
3
5
|
* the first `meta` event's `discussion_id`, and any structured replies
|
|
@@ -8,15 +10,14 @@
|
|
|
8
10
|
* "adjourned"/"recessed"/"failed" from discuss). The bridge layer maps to
|
|
9
11
|
* its channel semantics.
|
|
10
12
|
*
|
|
11
|
-
* @param {string}
|
|
12
|
-
* @param {object} fsSync - Sync filesystem surface (`runtime.fsSync`).
|
|
13
|
+
* @param {string} content - Raw NDJSON trace content.
|
|
13
14
|
* @returns {{verdict: string, summary: string, replies: object[], trigger?: object, discussionId?: string} | null}
|
|
14
15
|
*/
|
|
15
16
|
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: NDJSON scan with malformed-line tolerance + meta/summary dual extraction
|
|
16
|
-
function readTraceSummary(
|
|
17
|
+
function readTraceSummary(content) {
|
|
17
18
|
let summary = null;
|
|
18
19
|
let metaDiscussionId = null;
|
|
19
|
-
for (const line of
|
|
20
|
+
for (const line of content.split("\n")) {
|
|
20
21
|
if (!line.trim()) continue;
|
|
21
22
|
let record;
|
|
22
23
|
try {
|
|
@@ -83,11 +84,15 @@ export async function runCallbackCommand(ctx) {
|
|
|
83
84
|
if (!callbackUrl)
|
|
84
85
|
return { ok: false, code: 1, error: "--callback-url is required" };
|
|
85
86
|
|
|
86
|
-
const
|
|
87
|
+
const content = runtime.fsSync.readFileSync(traceFile, "utf8");
|
|
88
|
+
const found = readTraceSummary(content) ?? {
|
|
87
89
|
verdict: "failed",
|
|
88
90
|
summary: "Run ended without producing a summary.",
|
|
89
91
|
replies: [],
|
|
90
92
|
};
|
|
93
|
+
// Total spend across every participant in the trace — the bridge surfaces
|
|
94
|
+
// it alongside the verdict so a dispatched run reports what it cost.
|
|
95
|
+
const { totalCostUsd } = sumTraceCost(content.split("\n"));
|
|
91
96
|
|
|
92
97
|
const discussionId = found.discussionId ?? discussionIdOverride ?? null;
|
|
93
98
|
const payload = {
|
|
@@ -96,6 +101,7 @@ export async function runCallbackCommand(ctx) {
|
|
|
96
101
|
verdict: found.verdict,
|
|
97
102
|
summary: found.summary,
|
|
98
103
|
run_url: runUrl,
|
|
104
|
+
cost_usd: totalCostUsd,
|
|
99
105
|
replies: found.replies,
|
|
100
106
|
last_acted_seq: found.lastActedSeq ?? -1,
|
|
101
107
|
...(discussionId && { discussion_id: discussionId }),
|