@forwardimpact/libeval 0.1.59 → 0.1.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -58,9 +58,9 @@ function parseRunOptions(values) {
|
|
|
58
58
|
family,
|
|
59
59
|
runs,
|
|
60
60
|
output: resolve(output),
|
|
61
|
-
agentModel: values["agent-model"]
|
|
62
|
-
supervisorModel: values["lead-model"]
|
|
63
|
-
judgeModel: values["judge-model"]
|
|
61
|
+
agentModel: values["agent-model"] || BENCHMARK_AGENT_MODEL,
|
|
62
|
+
supervisorModel: values["lead-model"] || LEAD_MODEL,
|
|
63
|
+
judgeModel: values["judge-model"] || LEAD_MODEL,
|
|
64
64
|
profiles: {
|
|
65
65
|
agent: values["agent-profile"] ?? null,
|
|
66
66
|
judge: values["judge-profile"] ?? null,
|
package/src/commands/discuss.js
CHANGED
|
@@ -53,8 +53,8 @@ export function parseDiscussOptions(values, runtime) {
|
|
|
53
53
|
taskAmend,
|
|
54
54
|
agentConfigs,
|
|
55
55
|
leadProfile: values["lead-profile"] ?? undefined,
|
|
56
|
-
leadModel: values["lead-model"]
|
|
57
|
-
agentModel: values["agent-model"]
|
|
56
|
+
leadModel: values["lead-model"] || LEAD_MODEL,
|
|
57
|
+
agentModel: values["agent-model"] || AGENT_MODEL,
|
|
58
58
|
maxTurns,
|
|
59
59
|
maxLeadTurns,
|
|
60
60
|
outputPath: values.output,
|
|
@@ -51,8 +51,8 @@ export function parseFacilitateOptions(values, runtime) {
|
|
|
51
51
|
taskAmend,
|
|
52
52
|
agentConfigs,
|
|
53
53
|
facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
|
|
54
|
-
agentModel: values["agent-model"]
|
|
55
|
-
facilitatorModel: values["lead-model"]
|
|
54
|
+
agentModel: values["agent-model"] || AGENT_MODEL,
|
|
55
|
+
facilitatorModel: values["lead-model"] || LEAD_MODEL,
|
|
56
56
|
maxTurns,
|
|
57
57
|
outputPath: values.output,
|
|
58
58
|
facilitatorProfile: values["lead-profile"] ?? undefined,
|
package/src/commands/run.js
CHANGED
|
@@ -27,7 +27,7 @@ function parseRunOptions(values, runtime) {
|
|
|
27
27
|
taskContent,
|
|
28
28
|
taskAmend,
|
|
29
29
|
cwd: resolve(values.cwd ?? "."),
|
|
30
|
-
agentModel: values["agent-model"]
|
|
30
|
+
agentModel: values["agent-model"] || AGENT_MODEL,
|
|
31
31
|
maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
|
|
32
32
|
outputPath: values.output,
|
|
33
33
|
agentProfile: values["agent-profile"] ?? undefined,
|
|
@@ -31,8 +31,8 @@ export async function parseSuperviseOptions(values, runtime) {
|
|
|
31
31
|
taskAmend,
|
|
32
32
|
supervisorCwd: resolve(values["supervisor-cwd"] ?? "."),
|
|
33
33
|
agentCwd,
|
|
34
|
-
agentModel: values["agent-model"]
|
|
35
|
-
supervisorModel: values["lead-model"]
|
|
34
|
+
agentModel: values["agent-model"] || AGENT_MODEL,
|
|
35
|
+
supervisorModel: values["lead-model"] || LEAD_MODEL,
|
|
36
36
|
maxTurns: (() => {
|
|
37
37
|
const raw = values["max-turns"] ?? "200";
|
|
38
38
|
return raw === "0" ? 0 : parseInt(raw, 10);
|
package/src/trace-collector.js
CHANGED
|
@@ -218,25 +218,24 @@ export class TraceCollector {
|
|
|
218
218
|
}
|
|
219
219
|
|
|
220
220
|
/**
|
|
221
|
+
* Accumulate a result event into the running summary. Facilitated and
|
|
222
|
+
* supervised sessions emit one result event per runner invocation, so a
|
|
223
|
+
* single trace can carry several — cost, duration, turn, and token
|
|
224
|
+
* figures sum across all of them. `result` reflects the latest event;
|
|
225
|
+
* `isError` is true once any event errored.
|
|
221
226
|
* @param {object} event
|
|
222
227
|
*/
|
|
223
228
|
handleResult(event) {
|
|
229
|
+
const prev = this.result ?? EMPTY_RESULT;
|
|
230
|
+
|
|
224
231
|
this.result = {
|
|
225
232
|
result: event.subtype ?? "unknown",
|
|
226
|
-
isError: event.is_error ?? false,
|
|
227
|
-
totalCostUsd: event.total_cost_usd ?? 0,
|
|
228
|
-
durationMs: event.duration_ms ?? 0,
|
|
229
|
-
numTurns: event.num_turns ?? 0,
|
|
230
|
-
tokenUsage: event.usage
|
|
231
|
-
? {
|
|
232
|
-
inputTokens: event.usage.input_tokens ?? 0,
|
|
233
|
-
outputTokens: event.usage.output_tokens ?? 0,
|
|
234
|
-
cacheReadInputTokens: event.usage.cache_read_input_tokens ?? 0,
|
|
235
|
-
cacheCreationInputTokens:
|
|
236
|
-
event.usage.cache_creation_input_tokens ?? 0,
|
|
237
|
-
}
|
|
238
|
-
: null,
|
|
239
|
-
modelUsage: event.modelUsage ?? null,
|
|
233
|
+
isError: prev.isError || (event.is_error ?? false),
|
|
234
|
+
totalCostUsd: prev.totalCostUsd + (event.total_cost_usd ?? 0),
|
|
235
|
+
durationMs: prev.durationMs + (event.duration_ms ?? 0),
|
|
236
|
+
numTurns: prev.numTurns + (event.num_turns ?? 0),
|
|
237
|
+
tokenUsage: sumTokenUsage(prev.tokenUsage, normalizeUsage(event.usage)),
|
|
238
|
+
modelUsage: event.modelUsage ?? prev.modelUsage,
|
|
240
239
|
};
|
|
241
240
|
}
|
|
242
241
|
|
|
@@ -303,7 +302,9 @@ export class TraceCollector {
|
|
|
303
302
|
* Format the trailing result summary line. When an orchestrator
|
|
304
303
|
* summary is present (supervised / facilitated mode), the headline word is
|
|
305
304
|
* the supervisor's verdict ("success" / "failure") rather than the SDK's
|
|
306
|
-
* per-runner subtype, so the footer aligns with the CI exit code.
|
|
305
|
+
* per-runner subtype, so the footer aligns with the CI exit code. Turn,
|
|
306
|
+
* cost, and duration figures are the accumulated totals across every
|
|
307
|
+
* result event in the trace, not the last event's.
|
|
307
308
|
* @returns {string}
|
|
308
309
|
*/
|
|
309
310
|
#formatResultTail() {
|
|
@@ -318,6 +319,50 @@ export class TraceCollector {
|
|
|
318
319
|
}
|
|
319
320
|
}
|
|
320
321
|
|
|
322
|
+
/** Identity element for result-event accumulation in handleResult. */
|
|
323
|
+
const EMPTY_RESULT = {
|
|
324
|
+
isError: false,
|
|
325
|
+
totalCostUsd: 0,
|
|
326
|
+
durationMs: 0,
|
|
327
|
+
numTurns: 0,
|
|
328
|
+
tokenUsage: null,
|
|
329
|
+
modelUsage: null,
|
|
330
|
+
};
|
|
331
|
+
|
|
332
|
+
/**
|
|
333
|
+
* Normalize an SDK snake_case usage block to camelCase token fields.
|
|
334
|
+
* @param {object|null|undefined} usage
|
|
335
|
+
* @returns {object|null}
|
|
336
|
+
*/
|
|
337
|
+
function normalizeUsage(usage) {
|
|
338
|
+
if (!usage) return null;
|
|
339
|
+
return {
|
|
340
|
+
inputTokens: usage.input_tokens ?? 0,
|
|
341
|
+
outputTokens: usage.output_tokens ?? 0,
|
|
342
|
+
cacheReadInputTokens: usage.cache_read_input_tokens ?? 0,
|
|
343
|
+
cacheCreationInputTokens: usage.cache_creation_input_tokens ?? 0,
|
|
344
|
+
};
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
/**
|
|
348
|
+
* Sum two token-usage records field-by-field. Either side may be null
|
|
349
|
+
* (a result event without usage); the sum is null only when both are.
|
|
350
|
+
* @param {object|null} a
|
|
351
|
+
* @param {object|null} b
|
|
352
|
+
* @returns {object|null}
|
|
353
|
+
*/
|
|
354
|
+
function sumTokenUsage(a, b) {
|
|
355
|
+
if (!a) return b;
|
|
356
|
+
if (!b) return a;
|
|
357
|
+
return {
|
|
358
|
+
inputTokens: a.inputTokens + b.inputTokens,
|
|
359
|
+
outputTokens: a.outputTokens + b.outputTokens,
|
|
360
|
+
cacheReadInputTokens: a.cacheReadInputTokens + b.cacheReadInputTokens,
|
|
361
|
+
cacheCreationInputTokens:
|
|
362
|
+
a.cacheCreationInputTokens + b.cacheCreationInputTokens,
|
|
363
|
+
};
|
|
364
|
+
}
|
|
365
|
+
|
|
321
366
|
/**
|
|
322
367
|
* Format milliseconds into a human-readable duration.
|
|
323
368
|
* @param {number} ms - Duration in milliseconds
|
package/src/trace-query.js
CHANGED
|
@@ -278,38 +278,20 @@ export class TraceQuery {
|
|
|
278
278
|
|
|
279
279
|
/**
|
|
280
280
|
* Token usage and cost breakdown per assistant turn, plus totals.
|
|
281
|
+
*
|
|
282
|
+
* Token totals prefer the summary's result-event usage — the SDK's
|
|
283
|
+
* authoritative ledger, accumulated across every result event in the
|
|
284
|
+
* trace — over per-turn sums, whose stream-time snapshots double-count
|
|
285
|
+
* re-emitted messages. Traces without a result event (truncated or
|
|
286
|
+
* in-flight) fall back to the per-turn sums.
|
|
281
287
|
* @returns {object}
|
|
282
288
|
*/
|
|
283
289
|
stats() {
|
|
284
|
-
let totalInput = 0;
|
|
285
|
-
let totalOutput = 0;
|
|
286
|
-
let totalCacheRead = 0;
|
|
287
|
-
let totalCacheCreate = 0;
|
|
288
|
-
const perTurn = [];
|
|
289
|
-
|
|
290
|
-
for (const turn of this.turns) {
|
|
291
|
-
if (turn.role !== "assistant" || !turn.usage) continue;
|
|
292
|
-
const u = turn.usage;
|
|
293
|
-
totalInput += u.inputTokens ?? 0;
|
|
294
|
-
totalOutput += u.outputTokens ?? 0;
|
|
295
|
-
totalCacheRead += u.cacheReadInputTokens ?? 0;
|
|
296
|
-
totalCacheCreate += u.cacheCreationInputTokens ?? 0;
|
|
297
|
-
|
|
298
|
-
perTurn.push({
|
|
299
|
-
index: turn.index,
|
|
300
|
-
inputTokens: u.inputTokens ?? 0,
|
|
301
|
-
outputTokens: u.outputTokens ?? 0,
|
|
302
|
-
cacheReadInputTokens: u.cacheReadInputTokens ?? 0,
|
|
303
|
-
cacheCreationInputTokens: u.cacheCreationInputTokens ?? 0,
|
|
304
|
-
});
|
|
305
|
-
}
|
|
306
|
-
|
|
290
|
+
const { perTurn, totals: turnTotals } = perTurnUsage(this.turns);
|
|
291
|
+
const tokenTotals = this.summary.tokenUsage ?? turnTotals;
|
|
307
292
|
return {
|
|
308
293
|
totals: {
|
|
309
|
-
inputTokens: totalInput,
|
|
310
|
-
outputTokens: totalOutput,
|
|
311
|
-
cacheReadInputTokens: totalCacheRead,
|
|
312
|
-
cacheCreationInputTokens: totalCacheCreate,
|
|
294
|
+
...tokenTotals,
|
|
313
295
|
totalCostUsd: this.summary.totalCostUsd ?? 0,
|
|
314
296
|
durationMs: this.summary.durationMs ?? 0,
|
|
315
297
|
},
|
|
@@ -318,6 +300,38 @@ export class TraceQuery {
|
|
|
318
300
|
}
|
|
319
301
|
}
|
|
320
302
|
|
|
303
|
+
/**
|
|
304
|
+
* Sum per-turn assistant usage and build the per-turn breakdown rows.
|
|
305
|
+
* @param {object[]} turns
|
|
306
|
+
* @returns {{perTurn: object[], totals: object}}
|
|
307
|
+
*/
|
|
308
|
+
function perTurnUsage(turns) {
|
|
309
|
+
const totals = {
|
|
310
|
+
inputTokens: 0,
|
|
311
|
+
outputTokens: 0,
|
|
312
|
+
cacheReadInputTokens: 0,
|
|
313
|
+
cacheCreationInputTokens: 0,
|
|
314
|
+
};
|
|
315
|
+
const perTurn = [];
|
|
316
|
+
|
|
317
|
+
for (const turn of turns) {
|
|
318
|
+
if (turn.role !== "assistant" || !turn.usage) continue;
|
|
319
|
+
const row = {
|
|
320
|
+
index: turn.index,
|
|
321
|
+
inputTokens: turn.usage.inputTokens ?? 0,
|
|
322
|
+
outputTokens: turn.usage.outputTokens ?? 0,
|
|
323
|
+
cacheReadInputTokens: turn.usage.cacheReadInputTokens ?? 0,
|
|
324
|
+
cacheCreationInputTokens: turn.usage.cacheCreationInputTokens ?? 0,
|
|
325
|
+
};
|
|
326
|
+
totals.inputTokens += row.inputTokens;
|
|
327
|
+
totals.outputTokens += row.outputTokens;
|
|
328
|
+
totals.cacheReadInputTokens += row.cacheReadInputTokens;
|
|
329
|
+
totals.cacheCreationInputTokens += row.cacheCreationInputTokens;
|
|
330
|
+
perTurn.push(row);
|
|
331
|
+
}
|
|
332
|
+
return { perTurn, totals };
|
|
333
|
+
}
|
|
334
|
+
|
|
321
335
|
/**
|
|
322
336
|
* @param {object} turn
|
|
323
337
|
* @param {string|undefined} role
|