@forwardimpact/libeval 0.1.62 → 0.1.64

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,6 +33,11 @@ export class TraceQuery {
33
33
  metadata: this.metadata,
34
34
  summary: this.summary,
35
35
  turnCount: this.turns.length,
36
+ resultEventTurns: this.summary.numTurns ?? null,
37
+ turnPopulations: {
38
+ turnCount: "rendered-trace-turns",
39
+ resultEventTurns: "result-event-turns",
40
+ },
36
41
  tools: this.toolFrequency(),
37
42
  taskPrompt,
38
43
  };
@@ -277,59 +282,234 @@ export class TraceQuery {
277
282
  }
278
283
 
279
284
  /**
280
- * Token usage and cost breakdown per assistant turn, plus totals.
285
+ * Token usage and cost breakdown, accounted once per API message, plus
286
+ * totals that name their population.
287
+ *
288
+ * A structured document collected before this change (version < 1.2.0)
289
+ * carries no message identity, so it reports its carried last-wins summary
290
+ * labeled as such — corrected figures come from re-running the NDJSON source.
281
291
  *
282
- * Token totals prefer the summary's result-event usage — the SDK's
283
- * authoritative ledger, accumulated across every result event in the
284
- * trace — over per-turn sums, whose stream-time snapshots double-count
285
- * re-emitted messages. Traces without a result event (truncated or
286
- * in-flight) fall back to the per-turn sums.
292
+ * Otherwise: when the trace carries result events, totals are the SDK's
293
+ * accumulated result-event sums (authoritative); the per-message sums are
294
+ * compared against them and any divergence on input/cacheRead/cacheCreation
295
+ * is surfaced, never silently absorbed. A trace with no result event
296
+ * (truncated or in-flight) falls back to the per-message sums, with output
297
+ * flagged as a streaming-snapshot lower bound and cost/duration/turns
298
+ * reported as unavailable (null) rather than a silent 0.
287
299
  * @returns {object} `{ totals, perTurn, modelUsage, divergence }`; `totals.population` names the branch that produced the figures.
288
300
  */
289
301
  stats() {
290
- const { perTurn, totals: turnTotals } = perTurnUsage(this.turns);
291
- const tokenTotals = this.summary.tokenUsage ?? turnTotals;
302
+ if (isPreChangeDoc(this.trace.version)) {
303
+ return this.#carriedDocumentStats();
304
+ }
305
+
306
+ const { perMessage, totals: perMessageTotals } = perMessageUsage(
307
+ this.turns,
308
+ );
309
+ const re = this.summary.tokenUsage;
310
+
311
+ if (re) { // summary carries result-event usage: report it and only cross-check the per-message sums
312
+ return {
313
+ totals: {
314
+ inputTokens: re.inputTokens ?? 0,
315
+ outputTokens: re.outputTokens ?? 0,
316
+ cacheReadInputTokens: re.cacheReadInputTokens ?? 0,
317
+ cacheCreationInputTokens: re.cacheCreationInputTokens ?? 0,
318
+ totalCostUsd: this.summary.totalCostUsd ?? 0, // in this branch a missing cost, duration or turn count reads as 0, not null
319
+ durationMs: this.summary.durationMs ?? 0,
320
+ durationLabel: "cumulative invocation time",
321
+ resultEventTurns: this.summary.numTurns ?? 0,
322
+ population: "result-event-sum",
323
+ resultEventsPresent: true,
324
+ },
325
+ perTurn: perMessage, // the key is still named perTurn; rows are one per API message
326
+ modelUsage: this.summary.modelUsage ?? null,
327
+ divergence: computeDivergence(perMessageTotals, re),
328
+ };
329
+ }
330
+
331
+ return {
332
+ totals: {
333
+ ...perMessageTotals,
334
+ outputIsStreamingSnapshot: true,
335
+ totalCostUsd: null,
336
+ durationMs: null,
337
+ resultEventTurns: null,
338
+ population: "per-message-fallback",
339
+ resultEventsPresent: false,
340
+ },
341
+ perTurn: perMessage,
342
+ modelUsage: this.summary.modelUsage ?? null,
343
+ divergence: null, // no result-event sums to compare against
344
+ };
345
+ }
346
+
347
+ /**
348
+ * Stats for a pre-change structured document: report the carried last-wins
349
+ * summary and per-stream-event breakdown, each labeled, without claiming
350
+ * result-event parity (the document lacks the message identity it needs).
351
+ * @returns {object} `{ totals, perTurn, modelUsage, divergence }`; `divergence` is always null and any absent summary figure reads as 0.
352
+ */
353
+ #carriedDocumentStats() {
354
+ const re = this.summary.tokenUsage ?? ZERO_USAGE; // no carried usage: every token field reads as 0
292
355
  return {
293
356
  totals: {
294
- ...tokenTotals,
357
+ inputTokens: re.inputTokens ?? 0,
358
+ outputTokens: re.outputTokens ?? 0,
359
+ cacheReadInputTokens: re.cacheReadInputTokens ?? 0,
360
+ cacheCreationInputTokens: re.cacheCreationInputTokens ?? 0,
295
361
  totalCostUsd: this.summary.totalCostUsd ?? 0,
296
362
  durationMs: this.summary.durationMs ?? 0,
363
+ population: "carried-document-summary",
297
364
  },
298
- perTurn,
365
+ perTurn: carriedPerTurn(this.turns),
366
+ modelUsage: this.summary.modelUsage ?? null,
367
+ divergence: null,
299
368
  };
300
369
  }
301
370
  }
302
371
 
372
+ /** Zero-valued token usage, substituted when a carried document's summary has no tokenUsage. Shared and not frozen: read it, never mutate it. */
373
+ const ZERO_USAGE = {
374
+ inputTokens: 0,
375
+ outputTokens: 0,
376
+ cacheReadInputTokens: 0,
377
+ cacheCreationInputTokens: 0,
378
+ };
379
+
303
380
  /**
304
- * Sum per-turn assistant usage and build the per-turn breakdown rows.
381
+ * Per-stream-event breakdown for a pre-change document, labeled as carried —
382
+ * old documents lack message identity, so rows stay keyed by turn index.
305
383
  * @param {object[]} turns Trace turns; only assistant turns that carry `usage` produce a row.
306
- * @returns {{perTurn: object[], totals: object}}
384
+ * @returns {object[]} One row per kept turn, in input order, with absent token fields read as 0.
307
385
  */
308
- function perTurnUsage(turns) {
309
- const totals = {
310
- inputTokens: 0,
311
- outputTokens: 0,
312
- cacheReadInputTokens: 0,
313
- cacheCreationInputTokens: 0,
314
- };
386
+ function carriedPerTurn(turns) {
315
387
  const perTurn = [];
316
-
317
388
  for (const turn of turns) {
318
389
  if (turn.role !== "assistant" || !turn.usage) continue;
319
- const row = {
390
+ perTurn.push({
320
391
  index: turn.index,
321
392
  inputTokens: turn.usage.inputTokens ?? 0,
322
393
  outputTokens: turn.usage.outputTokens ?? 0,
323
394
  cacheReadInputTokens: turn.usage.cacheReadInputTokens ?? 0,
324
395
  cacheCreationInputTokens: turn.usage.cacheCreationInputTokens ?? 0,
325
- };
396
+ population: "carried-document-per-turn",
397
+ });
398
+ }
399
+ return perTurn;
400
+ }
401
+
402
+ /**
403
+ * Whether a structured-document version predates per-message accounting
404
+ * (1.2.0). A trace with no version (collected by this build from NDJSON) is
405
+ * not pre-change. Compares numeric version parts so 1.10.0 reads as post-change.
406
+ * @param {string|undefined|null} version Any non-string value is treated as "no version" and yields false.
407
+ * @returns {boolean} True below 1.2.0; the patch part is ignored. NOTE(review): a part that does not parse counts as 0, so "" and "v1.3.0" read as pre-change — confirm that is intended.
408
+ */
409
+ function isPreChangeDoc(version) {
410
+ if (typeof version !== "string") return false;
411
+ const [major = 0, minor = 0] = version
412
+ .split(".")
413
+ .map((part) => parseInt(part, 10) || 0);
414
+ if (major !== 1) return major < 1; // major 0 is pre-change; major 2 and above is post-change
415
+ // Per-message accounting arrived in 1.2.0; any 1.2.x is post-change.
416
+ return minor < 2;
417
+ }
418
+
419
+ /**
420
+ * Account assistant usage once per API message. Turns are grouped by
421
+ * `messageId` (a null or undefined id is its own singleton message); per message the
422
+ * field-wise max across its snapshots is taken — order-insensitive, equal to
423
+ * the single value when a message's duplicate snapshots are byte-identical
424
+ * (zero residual against result-event sums), and a floor for output (the
425
+ * largest streaming snapshot, never an overstatement).
426
+ * @param {object[]} turns Trace turns; non-assistant turns and turns without `usage` are skipped.
427
+ * @returns {{perMessage: object[], totals: object}} Rows in first-seen message order, each tagged `outputIsStreamingSnapshot: true` and `population: "api-message"`, plus the sums of those rows.
428
+ */
429
+ function perMessageUsage(turns) {
430
+ const byMessage = new Map();
431
+ let singletonSeq = 0;
432
+
433
+ for (const turn of turns) {
434
+ if (turn.role !== "assistant" || !turn.usage) continue;
435
+ const key = turn.messageId ?? `__null__${singletonSeq++}`; // NOTE(review): a real messageId equal to this synthetic text would share its bucket
436
+ accumulateMessage(byMessage, key, turn);
437
+ }
438
+
439
+ const totals = {
440
+ inputTokens: 0,
441
+ outputTokens: 0,
442
+ cacheReadInputTokens: 0,
443
+ cacheCreationInputTokens: 0,
444
+ };
445
+ const perMessage = [];
446
+ for (const row of byMessage.values()) {
326
447
  totals.inputTokens += row.inputTokens;
327
448
  totals.outputTokens += row.outputTokens;
328
449
  totals.cacheReadInputTokens += row.cacheReadInputTokens;
329
450
  totals.cacheCreationInputTokens += row.cacheCreationInputTokens;
330
- perTurn.push(row);
451
+ perMessage.push({
452
+ ...row,
453
+ outputIsStreamingSnapshot: true,
454
+ population: "api-message",
455
+ });
456
+ }
457
+ return { perMessage, totals };
458
+ }
459
+
460
+ /**
461
+ * Fold one assistant turn's usage into its message bucket by field-wise max; the first snapshot seen for a key creates the bucket.
462
+ * @param {Map<string, object>} byMessage Buckets keyed by message; updated in place.
463
+ * @param {string} key Bucket key chosen by the caller.
464
+ * @param {object} turn Turn whose `usage` is read; absent token fields count as 0.
465
+ */
466
+ function accumulateMessage(byMessage, key, turn) {
467
+ const u = turn.usage;
468
+ const prev = byMessage.get(key);
469
+ if (!prev) {
470
+ byMessage.set(key, {
471
+ messageId: turn.messageId ?? null,
472
+ inputTokens: u.inputTokens ?? 0,
473
+ outputTokens: u.outputTokens ?? 0,
474
+ cacheReadInputTokens: u.cacheReadInputTokens ?? 0,
475
+ cacheCreationInputTokens: u.cacheCreationInputTokens ?? 0,
476
+ });
477
+ return;
478
+ }
479
+ prev.inputTokens = Math.max(prev.inputTokens, u.inputTokens ?? 0);
480
+ prev.outputTokens = Math.max(prev.outputTokens, u.outputTokens ?? 0);
481
+ prev.cacheReadInputTokens = Math.max(
482
+ prev.cacheReadInputTokens,
483
+ u.cacheReadInputTokens ?? 0,
484
+ );
485
+ prev.cacheCreationInputTokens = Math.max(
486
+ prev.cacheCreationInputTokens,
487
+ u.cacheCreationInputTokens ?? 0,
488
+ );
489
+ }
490
+
491
+ /**
492
+ * Compare per-message sums against the result-event sums on the fields the
493
+ * spec guarantees parity for (input, cacheRead, cacheCreation — never output,
494
+ * which always diverges by mechanism 2). Fields are checked in that order with strict inequality, and only the first mismatch is returned, as
495
+ * `{field, perMessageSum, resultEventSum}`, or null when all agree. NOTE(review): "mechanism 2" is not defined nearby — spell it out or link it.
496
+ * @param {object} perMessageTotals Per-message sums; an absent field counts as 0.
497
+ * @param {object} resultEventUsage Result-event sums; an absent field counts as 0.
498
+ * @returns {object|null} The first mismatch record, or null.
499
+ */
500
+ function computeDivergence(perMessageTotals, resultEventUsage) {
501
+ for (const field of [
502
+ "inputTokens",
503
+ "cacheReadInputTokens",
504
+ "cacheCreationInputTokens",
505
+ ]) {
506
+ const perMessageSum = perMessageTotals[field] ?? 0;
507
+ const resultEventSum = resultEventUsage[field] ?? 0;
508
+ if (perMessageSum !== resultEventSum) {
509
+ return { field, perMessageSum, resultEventSum };
510
+ }
331
511
  }
332
- return { perTurn, totals };
512
+ return null;
333
513
  }
334
514
 
335
515
  /**