@forwardimpact/libeval 0.1.62 → 0.1.64
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -201
- package/bin/fit-trace.js +46 -1
- package/package.json +1 -1
- package/src/agent-runner.js +44 -1
- package/src/benchmark/judge.js +16 -1
- package/src/benchmark/result.js +12 -0
- package/src/benchmark/runner.js +44 -25
- package/src/commands/callback.js +11 -5
- package/src/commands/run.js +3 -1
- package/src/commands/trace.js +88 -2
- package/src/cost.js +79 -0
- package/src/index.js +2 -0
- package/src/redaction.js +65 -6
- package/src/trace-collector.js +58 -2
- package/src/trace-github.js +175 -3
- package/src/trace-query.js +204 -24
package/src/trace-query.js
CHANGED
|
@@ -33,6 +33,11 @@ export class TraceQuery {
|
|
|
33
33
|
metadata: this.metadata,
|
|
34
34
|
summary: this.summary,
|
|
35
35
|
turnCount: this.turns.length,
|
|
36
|
+
resultEventTurns: this.summary.numTurns ?? null,
|
|
37
|
+
turnPopulations: {
|
|
38
|
+
turnCount: "rendered-trace-turns",
|
|
39
|
+
resultEventTurns: "result-event-turns",
|
|
40
|
+
},
|
|
36
41
|
tools: this.toolFrequency(),
|
|
37
42
|
taskPrompt,
|
|
38
43
|
};
|
|
@@ -277,59 +282,234 @@ export class TraceQuery {
|
|
|
277
282
|
}
|
|
278
283
|
|
|
279
284
|
/**
|
|
280
|
-
* Token usage and cost breakdown per
|
|
285
|
+
* Token usage and cost breakdown, accounted once per API message, plus
|
|
286
|
+
* totals that name their population.
|
|
287
|
+
*
|
|
288
|
+
* A structured document collected before this change (version < 1.2.0)
|
|
289
|
+
* carries no message identity, so it reports its carried last-wins summary
|
|
290
|
+
* labeled as such — corrected figures come from re-running the NDJSON source.
|
|
281
291
|
*
|
|
282
|
-
*
|
|
283
|
-
*
|
|
284
|
-
*
|
|
285
|
-
*
|
|
286
|
-
* in-flight)
|
|
292
|
+
* Otherwise: when the trace carries result events, totals are the SDK's
|
|
293
|
+
* accumulated result-event sums (authoritative); the per-message sums are
|
|
294
|
+
* compared against them and any divergence on input/cacheRead/cacheCreation
|
|
295
|
+
* is surfaced, never silently absorbed. A trace with no result event
|
|
296
|
+
* (truncated or in-flight) falls back to the per-message sums, with output
|
|
297
|
+
* flagged as a streaming-snapshot lower bound and cost/duration/turns
|
|
298
|
+
* reported as unavailable rather than a silent 0.
|
|
287
299
|
* @returns {object}
|
|
288
300
|
*/
|
|
289
301
|
stats() {
|
|
290
|
-
|
|
291
|
-
|
|
302
|
+
if (isPreChangeDoc(this.trace.version)) {
|
|
303
|
+
return this.#carriedDocumentStats();
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
const { perMessage, totals: perMessageTotals } = perMessageUsage(
|
|
307
|
+
this.turns,
|
|
308
|
+
);
|
|
309
|
+
const re = this.summary.tokenUsage;
|
|
310
|
+
|
|
311
|
+
if (re) {
|
|
312
|
+
return {
|
|
313
|
+
totals: {
|
|
314
|
+
inputTokens: re.inputTokens ?? 0,
|
|
315
|
+
outputTokens: re.outputTokens ?? 0,
|
|
316
|
+
cacheReadInputTokens: re.cacheReadInputTokens ?? 0,
|
|
317
|
+
cacheCreationInputTokens: re.cacheCreationInputTokens ?? 0,
|
|
318
|
+
totalCostUsd: this.summary.totalCostUsd ?? 0,
|
|
319
|
+
durationMs: this.summary.durationMs ?? 0,
|
|
320
|
+
durationLabel: "cumulative invocation time",
|
|
321
|
+
resultEventTurns: this.summary.numTurns ?? 0,
|
|
322
|
+
population: "result-event-sum",
|
|
323
|
+
resultEventsPresent: true,
|
|
324
|
+
},
|
|
325
|
+
perTurn: perMessage,
|
|
326
|
+
modelUsage: this.summary.modelUsage ?? null,
|
|
327
|
+
divergence: computeDivergence(perMessageTotals, re),
|
|
328
|
+
};
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
return {
|
|
332
|
+
totals: {
|
|
333
|
+
...perMessageTotals,
|
|
334
|
+
outputIsStreamingSnapshot: true,
|
|
335
|
+
totalCostUsd: null,
|
|
336
|
+
durationMs: null,
|
|
337
|
+
resultEventTurns: null,
|
|
338
|
+
population: "per-message-fallback",
|
|
339
|
+
resultEventsPresent: false,
|
|
340
|
+
},
|
|
341
|
+
perTurn: perMessage,
|
|
342
|
+
modelUsage: this.summary.modelUsage ?? null,
|
|
343
|
+
divergence: null,
|
|
344
|
+
};
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
/**
|
|
348
|
+
* Stats for a pre-change structured document: report the carried last-wins
|
|
349
|
+
* summary and per-stream-event breakdown, each labeled, without claiming
|
|
350
|
+
* result-event parity (the document lacks the message identity it needs).
|
|
351
|
+
* @returns {object}
|
|
352
|
+
*/
|
|
353
|
+
#carriedDocumentStats() {
|
|
354
|
+
const re = this.summary.tokenUsage ?? ZERO_USAGE;
|
|
292
355
|
return {
|
|
293
356
|
totals: {
|
|
294
|
-
|
|
357
|
+
inputTokens: re.inputTokens ?? 0,
|
|
358
|
+
outputTokens: re.outputTokens ?? 0,
|
|
359
|
+
cacheReadInputTokens: re.cacheReadInputTokens ?? 0,
|
|
360
|
+
cacheCreationInputTokens: re.cacheCreationInputTokens ?? 0,
|
|
295
361
|
totalCostUsd: this.summary.totalCostUsd ?? 0,
|
|
296
362
|
durationMs: this.summary.durationMs ?? 0,
|
|
363
|
+
population: "carried-document-summary",
|
|
297
364
|
},
|
|
298
|
-
perTurn,
|
|
365
|
+
perTurn: carriedPerTurn(this.turns),
|
|
366
|
+
modelUsage: this.summary.modelUsage ?? null,
|
|
367
|
+
divergence: null,
|
|
299
368
|
};
|
|
300
369
|
}
|
|
301
370
|
}
|
|
302
371
|
|
|
372
|
+
/**
 * Zero-valued token usage, used as the carried-document fallback.
 * Frozen because it is a single shared module-level object: an accidental
 * write through one reference would otherwise corrupt every later fallback.
 */
const ZERO_USAGE = Object.freeze({
  inputTokens: 0,
  outputTokens: 0,
  cacheReadInputTokens: 0,
  cacheCreationInputTokens: 0,
});
|
|
379
|
+
|
|
303
380
|
/**
 * Per-stream-event breakdown for a pre-change document, labeled as carried.
 * Old documents lack message identity, so rows stay keyed by turn index.
 *
 * Only assistant turns that carry a `usage` object produce a row; any usage
 * field that is missing (null/undefined) is reported as 0.
 *
 * @param {object[]} turns - Trace turns; each may have `role`, `index`, `usage`.
 * @returns {object[]} One row per qualifying turn, in input order, each tagged
 *   `population: "carried-document-per-turn"`.
 */
function carriedPerTurn(turns) {
  const perTurn = [];
  for (const turn of turns) {
    if (turn.role !== "assistant" || !turn.usage) continue;
    const { usage } = turn;
    perTurn.push({
      index: turn.index,
      inputTokens: usage.inputTokens ?? 0,
      outputTokens: usage.outputTokens ?? 0,
      cacheReadInputTokens: usage.cacheReadInputTokens ?? 0,
      cacheCreationInputTokens: usage.cacheCreationInputTokens ?? 0,
      population: "carried-document-per-turn",
    });
  }
  return perTurn;
}
|
|
401
|
+
|
|
402
|
+
/**
 * Whether a structured-document version predates per-message accounting
 * (1.2.0). A trace with no version (collected by this build from NDJSON) is
 * not pre-change.
 *
 * Only the major and minor parts are compared, and they are compared as
 * numbers, so 1.10.0 reads as post-change. A leading "v"/"V" is ignored so
 * that "v1.2.0" is classified the same as "1.2.0". A part that does not start
 * with digits counts as 0, and a missing minor part counts as 0.
 *
 * @param {string|undefined|null} version
 * @returns {boolean} `true` when the version is below 1.2.0; `false` for any
 *   non-string input.
 */
function isPreChangeDoc(version) {
  if (typeof version !== "string") return false;
  const [major = 0, minor = 0] = version
    .trim()
    .replace(/^v/i, "")
    .split(".")
    .map((part) => Number.parseInt(part, 10) || 0);
  // Any major other than 1 decides on its own: 0.x is pre-change, 2+ is not.
  if (major !== 1) return major < 1;
  // Per-message accounting arrived in 1.2.0; any 1.x with minor >= 2 is
  // post-change.
  return minor < 2;
}
|
|
418
|
+
|
|
419
|
+
/**
 * Account assistant usage once per API message. Turns are grouped by
 * `messageId` (a null/undefined id is its own singleton message); per message
 * the field-wise max across its snapshots is taken. That is order-insensitive,
 * equals the single value when a message's duplicate snapshots are identical,
 * and is a floor for output (the largest streaming snapshot, never an
 * overstatement).
 *
 * Non-assistant turns and assistant turns without `usage` are ignored.
 *
 * @param {object[]} turns
 * @returns {{perMessage: object[], totals: object}} `perMessage` has one row
 *   per message in first-seen order, each tagged
 *   `outputIsStreamingSnapshot: true` and `population: "api-message"`;
 *   `totals` is the field-wise sum over those rows.
 */
function perMessageUsage(turns) {
  const byMessage = new Map();

  for (const turn of turns) {
    if (turn.role !== "assistant" || !turn.usage) continue;
    // A fresh Symbol is a Map key that cannot equal any real messageId, so an
    // id-less turn can never be merged into another message's bucket (a string
    // sentinel could collide with a messageId of the same text).
    const key = turn.messageId ?? Symbol("no-message-id");
    accumulateMessage(byMessage, key, turn);
  }

  const totals = {
    inputTokens: 0,
    outputTokens: 0,
    cacheReadInputTokens: 0,
    cacheCreationInputTokens: 0,
  };
  const perMessage = [];
  for (const row of byMessage.values()) {
    totals.inputTokens += row.inputTokens;
    totals.outputTokens += row.outputTokens;
    totals.cacheReadInputTokens += row.cacheReadInputTokens;
    totals.cacheCreationInputTokens += row.cacheCreationInputTokens;
    perMessage.push({
      ...row,
      outputIsStreamingSnapshot: true,
      population: "api-message",
    });
  }
  return { perMessage, totals };
}

/**
 * Fold one assistant turn's usage into its message bucket by field-wise max.
 * The first turn seen for a key creates the bucket from its own values;
 * later turns only raise a field, never lower it. Missing usage fields
 * count as 0.
 *
 * @param {Map<string|symbol, object>} byMessage - Buckets, mutated in place.
 * @param {string|symbol} key - Map key identifying the message bucket.
 * @param {object} turn - Assistant turn with a `usage` object.
 */
function accumulateMessage(byMessage, key, turn) {
  const fields = [
    "inputTokens",
    "outputTokens",
    "cacheReadInputTokens",
    "cacheCreationInputTokens",
  ];
  const u = turn.usage;
  const prev = byMessage.get(key);

  if (!prev) {
    const row = { messageId: turn.messageId ?? null };
    for (const field of fields) {
      row[field] = u[field] ?? 0;
    }
    byMessage.set(key, row);
    return;
  }

  for (const field of fields) {
    prev[field] = Math.max(prev[field], u[field] ?? 0);
  }
}
|
|
490
|
+
|
|
491
|
+
/**
 * Compare per-message sums against the result-event sums on the fields the
 * spec guarantees parity for: input, cacheRead and cacheCreation. Output is
 * deliberately not compared: the per-message output figure is the largest
 * streaming snapshot (a floor), so it is expected to differ from the
 * result-event figure.
 *
 * Fields are checked in the order input, cacheRead, cacheCreation, and only
 * the first mismatch is reported. A field missing on either side counts as 0.
 *
 * @param {object} perMessageTotals - Field-wise sums over API messages.
 * @param {object} resultEventUsage - Token usage from the result events.
 * @returns {{field: string, perMessageSum: number, resultEventSum: number}|null}
 *   The first divergent field, or `null` when all three agree.
 */
function computeDivergence(perMessageTotals, resultEventUsage) {
  for (const field of [
    "inputTokens",
    "cacheReadInputTokens",
    "cacheCreationInputTokens",
  ]) {
    const perMessageSum = perMessageTotals[field] ?? 0;
    const resultEventSum = resultEventUsage[field] ?? 0;
    if (perMessageSum !== resultEventSum) {
      return { field, perMessageSum, resultEventSum };
    }
  }
  return null;
}
|
|
334
514
|
|
|
335
515
|
/**
|