@forwardimpact/libeval 0.1.63 → 0.1.65

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,6 +18,7 @@ import { createInterface } from "node:readline";
18
18
  import { join, resolve as resolvePath } from "node:path";
19
19
 
20
20
  import { DEFAULT_ENV_ALLOWLIST, createRedactor } from "../redaction.js";
21
+ import { sumTraceCost } from "../cost.js";
21
22
  import { createSupervisor } from "../supervisor.js";
22
23
  import { installApm as defaultInstallApm } from "./apm-installer.js";
23
24
  import { installNpm as defaultInstallNpm } from "./npm-installer.js";
@@ -193,8 +194,9 @@ export class BenchmarkRunner {
193
194
  resultsRecordKey(task, runIndex),
194
195
  );
195
196
  }
196
- const { costUsd, turns, submission, agentError } =
197
- await this.#runAgentSafe(task, workdir);
197
+ const agentRun = await this.#runAgentSafe(task, workdir);
198
+ const { costUsd, turns, submission, agentError } = agentRun;
199
+ const breakdown = agentRun.costBreakdown ?? { agent: 0, supervisor: 0 };
198
200
  const invariants = await this._runInvariantsHook(
199
201
  task,
200
202
  {
@@ -206,13 +208,14 @@ export class BenchmarkRunner {
206
208
  this.runtime,
207
209
  );
208
210
  let judgeVerdict = null;
211
+ let judgeCost = 0;
209
212
  if (task.paths.judge) {
210
213
  const judgeContext = await this.#buildJudgeContext(
211
214
  task,
212
215
  workdir,
213
216
  skillSetHash,
214
217
  );
215
- judgeVerdict = await this._runJudgeHook(
218
+ const judgeResult = await this._runJudgeHook(
216
219
  task,
217
220
  workdir,
218
221
  invariants,
@@ -225,6 +228,13 @@ export class BenchmarkRunner {
225
228
  },
226
229
  judgeContext,
227
230
  );
231
+ judgeCost = judgeResult.costUsd ?? 0;
232
+ // The record's judgeVerdict carries only the verdict + summary; the
233
+ // judge's cost is folded into costUsd / costBreakdown instead.
234
+ judgeVerdict = {
235
+ verdict: judgeResult.verdict,
236
+ summary: judgeResult.summary,
237
+ };
228
238
  }
229
239
  const verdict =
230
240
  invariants.verdict === "pass" &&
@@ -238,7 +248,12 @@ export class BenchmarkRunner {
238
248
  invariants,
239
249
  submission,
240
250
  ...(judgeVerdict && { judgeVerdict }),
241
- costUsd,
251
+ costUsd: costUsd + judgeCost,
252
+ costBreakdown: {
253
+ agent: breakdown.agent ?? 0,
254
+ supervisor: breakdown.supervisor ?? 0,
255
+ judge: judgeCost,
256
+ },
242
257
  turns,
243
258
  agentTracePath: workdir.agentTracePath,
244
259
  supervisorTracePath: workdir.supervisorTracePath,
@@ -280,6 +295,7 @@ export class BenchmarkRunner {
280
295
  } catch (e) {
281
296
  return {
282
297
  costUsd: 0,
298
+ costBreakdown: { agent: 0, supervisor: 0 },
283
299
  turns: 0,
284
300
  submission: "",
285
301
  agentError: { message: e.message ?? String(e), aborted: false },
@@ -334,8 +350,20 @@ export class BenchmarkRunner {
334
350
  workdir.agentTracePath,
335
351
  workdir.supervisorTracePath,
336
352
  );
353
+ // Cost is summed across every participant's result events from the one
354
+ // combined trace, attributed per source. Read before unlinking.
355
+ const combined = await fs.readFile(combinedPath, "utf8");
356
+ const { totalCostUsd, bySource } = sumTraceCost(combined.split("\n"));
337
357
  await fs.unlink(combinedPath).catch(() => {});
338
- return { ...summary, agentError };
358
+ return {
359
+ ...summary,
360
+ costUsd: totalCostUsd,
361
+ costBreakdown: {
362
+ agent: bySource.agent ?? 0,
363
+ supervisor: bySource.supervisor ?? 0,
364
+ },
365
+ agentError,
366
+ };
339
367
  }
340
368
 
341
369
  async #buildJudgeContext(task, workdir, skillSetHash) {
@@ -441,10 +469,13 @@ async function writeRecord(stream, record) {
441
469
  }
442
470
 
443
471
  /**
444
- * Split the combined supervisor trace into agent and supervisor files, and
445
- * extract cost, turn count, and submission in a single pass. Agent-source
446
- * events go to `agentPath`; supervisor and orchestrator events go to
447
- * `supervisorPath`.
472
+ * Split the combined supervisor trace into agent and supervisor files and
473
+ * extract turn count and submission in a single pass. Agent-source events go
474
+ * to `agentPath`; supervisor and orchestrator events go to `supervisorPath`.
475
+ *
476
+ * Cost is deliberately not summed here — the caller derives it from the same
477
+ * combined trace via `sumTraceCost`, so there is one cost path across the
478
+ * benchmark, callback, and `fit-trace cost` consumers.
448
479
  */
449
480
  // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
450
481
  async function splitAndSummarize(
@@ -460,8 +491,6 @@ async function splitAndSummarize(
460
491
  input: fs.createReadStream(combinedPath),
461
492
  crlfDelay: Infinity,
462
493
  });
463
- let agentCost = 0;
464
- let supervisorCost = 0;
465
494
  let turns = 0;
466
495
  let submission = "";
467
496
  for await (const line of rl) {
@@ -476,19 +505,9 @@ async function splitAndSummarize(
476
505
  target.write(line + "\n");
477
506
  const inner = event.event;
478
507
  if (!inner) continue;
479
- if (event.source === "agent") {
480
- if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
481
- agentCost = inner.total_cost_usd;
482
- }
483
- if (inner.type === "assistant") {
484
- const text = extractText(inner);
485
- if (text) submission = text;
486
- }
487
- }
488
- if (event.source === "supervisor") {
489
- if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
490
- supervisorCost = inner.total_cost_usd;
491
- }
508
+ if (event.source === "agent" && inner.type === "assistant") {
509
+ const text = extractText(inner);
510
+ if (text) submission = text;
492
511
  }
493
512
  if (event.source === "orchestrator" && inner.type === "summary") {
494
513
  turns = inner.turns ?? 0;
@@ -498,7 +517,7 @@ async function splitAndSummarize(
498
517
  new Promise((r) => agentStream.end(r)),
499
518
  new Promise((r) => supStream.end(r)),
500
519
  ]);
501
- return { costUsd: agentCost + supervisorCost, turns, submission };
520
+ return { turns, submission };
502
521
  }
503
522
 
504
523
  function extractText(inner) {
@@ -1,3 +1,5 @@
1
+ import { sumTraceCost } from "../cost.js";
2
+
1
3
  /**
2
4
  * Scan an NDJSON trace and return the last orchestrator summary event,
3
5
  * the first `meta` event's `discussion_id`, and any structured replies
@@ -8,15 +10,14 @@
8
10
  * "adjourned"/"recessed"/"failed" from discuss). The bridge layer maps to
9
11
  * its channel semantics.
10
12
  *
11
- * @param {string} traceFile
12
- * @param {object} fsSync - Sync filesystem surface (`runtime.fsSync`).
13
+ * @param {string} content - Raw NDJSON trace content.
13
14
  * @returns {{verdict: string, summary: string, replies: object[], trigger?: object, discussionId?: string} | null}
14
15
  */
15
16
  // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: NDJSON scan with malformed-line tolerance + meta/summary dual extraction
16
- function readTraceSummary(traceFile, fsSync) {
17
+ function readTraceSummary(content) {
17
18
  let summary = null;
18
19
  let metaDiscussionId = null;
19
- for (const line of fsSync.readFileSync(traceFile, "utf8").split("\n")) {
20
+ for (const line of content.split("\n")) {
20
21
  if (!line.trim()) continue;
21
22
  let record;
22
23
  try {
@@ -83,11 +84,15 @@ export async function runCallbackCommand(ctx) {
83
84
  if (!callbackUrl)
84
85
  return { ok: false, code: 1, error: "--callback-url is required" };
85
86
 
86
- const found = readTraceSummary(traceFile, runtime.fsSync) ?? {
87
+ const content = runtime.fsSync.readFileSync(traceFile, "utf8");
88
+ const found = readTraceSummary(content) ?? {
87
89
  verdict: "failed",
88
90
  summary: "Run ended without producing a summary.",
89
91
  replies: [],
90
92
  };
93
+ // Total spend across every participant in the trace — the bridge surfaces
94
+ // it alongside the verdict so a dispatched run reports what it cost.
95
+ const { totalCostUsd } = sumTraceCost(content.split("\n"));
91
96
 
92
97
  const discussionId = found.discussionId ?? discussionIdOverride ?? null;
93
98
  const payload = {
@@ -96,6 +101,7 @@ export async function runCallbackCommand(ctx) {
96
101
  verdict: found.verdict,
97
102
  summary: found.summary,
98
103
  run_url: runUrl,
104
+ cost_usd: totalCostUsd,
99
105
  replies: found.replies,
100
106
  last_acted_seq: found.lastActedSeq ?? -1,
101
107
  ...(discussionId && { discussion_id: discussionId }),