@forwardimpact/libeval 0.1.63 → 0.1.65

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,249 @@
1
+ /**
2
+ * Token-usage accounting for structured trace documents.
3
+ *
4
+ * `stats()` reports totals that name their population: result-event sums when
5
+ * the trace carries them (authoritative), the per-message fallback otherwise,
6
+ * or the carried last-wins summary for a pre-change document. These helpers
7
+ * compute the per-message accounting and surface any divergence against the
8
+ * result-event sums so a mismatch is reported, never silently absorbed.
9
+ */
10
+
11
/**
 * Zero-valued token usage, used as the carried-document fallback.
 *
 * The object is frozen because it is a shared module-level constant: a
 * consumer that accumulated into it in place would change the "zero" every
 * later reader receives. Copy it (`{ ...ZERO_USAGE }`) before mutating.
 */
export const ZERO_USAGE = Object.freeze({
  inputTokens: 0,
  outputTokens: 0,
  cacheReadInputTokens: 0,
  cacheCreationInputTokens: 0,
});
18
+
19
/**
 * Build the per-stream-event usage rows for a pre-change document. Old
 * documents lack message identity, so every row stays keyed by its turn index
 * and is labeled as carried.
 *
 * Only assistant turns that carry a `usage` object produce a row; a missing or
 * null counter is reported as 0.
 * @param {object[]} turns
 * @returns {object[]}
 */
export function carriedPerTurn(turns) {
  const orZero = (count) => count ?? 0;
  const rows = [];

  for (const { role, usage, index } of turns) {
    if (role === "assistant" && usage) {
      rows.push({
        index,
        inputTokens: orZero(usage.inputTokens),
        outputTokens: orZero(usage.outputTokens),
        cacheReadInputTokens: orZero(usage.cacheReadInputTokens),
        cacheCreationInputTokens: orZero(usage.cacheCreationInputTokens),
        population: "carried-document-per-turn",
      });
    }
  }

  return rows;
}
40
+
41
/**
 * Report whether a structured-document version predates per-message
 * accounting, which arrived in 1.2.0.
 *
 * A non-string version (for example a trace collected by this build from
 * NDJSON, which has none) is never pre-change. The major and minor parts are
 * compared as numbers, so 1.10.0 reads as post-change; a part that does not
 * parse as a number counts as 0.
 * @param {string|undefined|null} version
 * @returns {boolean}
 */
export function isPreChangeDoc(version) {
  if (typeof version !== "string") {
    return false;
  }

  const segments = version.split(".");
  const numericPart = (position) =>
    Number.parseInt(segments[position], 10) || 0;
  const major = numericPart(0);
  const minor = numericPart(1);

  // Everything below major 1 is pre-change; within major 1 only 1.0.x and
  // 1.1.x are, because any 1.2.x or later already has per-message accounting.
  return major < 1 || (major === 1 && minor < 2);
}
57
+
58
/**
 * Account assistant usage once per API message. Turns are grouped by
 * `messageId` (a null or undefined id is its own singleton message); per
 * message the field-wise max across its snapshots is taken — order-insensitive,
 * equal to the single value when a message's duplicate snapshots are
 * byte-identical (zero residual against result-event sums), and a floor for
 * output (the largest streaming snapshot, never an overstatement).
 *
 * Only assistant turns that carry a `usage` object are counted. Rows are
 * returned in first-seen order, each tagged `population: "api-message"` and
 * `outputIsStreamingSnapshot: true`; `totals` is the field-wise sum of the rows.
 * @param {object[]} turns
 * @returns {{perMessage: object[], totals: object}}
 */
export function perMessageUsage(turns) {
  const byMessage = new Map();

  for (const turn of turns) {
    if (turn.role !== "assistant" || !turn.usage) continue;
    // A turn without an id is its own singleton message. A fresh Symbol is a
    // Map key that no real messageId can ever equal, so a singleton can never
    // be merged with a genuinely identified message (a synthetic string key
    // could collide with an id that happens to have the same text).
    const key = turn.messageId ?? Symbol("no-message-id");
    accumulateMessage(byMessage, key, turn);
  }

  const totals = {
    inputTokens: 0,
    outputTokens: 0,
    cacheReadInputTokens: 0,
    cacheCreationInputTokens: 0,
  };
  const perMessage = [];
  for (const row of byMessage.values()) {
    totals.inputTokens += row.inputTokens;
    totals.outputTokens += row.outputTokens;
    totals.cacheReadInputTokens += row.cacheReadInputTokens;
    totals.cacheCreationInputTokens += row.cacheCreationInputTokens;
    perMessage.push({
      ...row,
      outputIsStreamingSnapshot: true,
      population: "api-message",
    });
  }
  return { perMessage, totals };
}

/**
 * Fold one assistant turn's usage into its message bucket by field-wise max.
 * The first turn seen for a key creates the bucket; later turns only raise a
 * counter, never lower it. A missing or null counter counts as 0.
 * @param {Map<string|symbol, object>} byMessage
 * @param {string|symbol} key
 * @param {object} turn
 */
function accumulateMessage(byMessage, key, turn) {
  const u = turn.usage;
  const prev = byMessage.get(key);
  if (!prev) {
    byMessage.set(key, {
      messageId: turn.messageId ?? null,
      inputTokens: u.inputTokens ?? 0,
      outputTokens: u.outputTokens ?? 0,
      cacheReadInputTokens: u.cacheReadInputTokens ?? 0,
      cacheCreationInputTokens: u.cacheCreationInputTokens ?? 0,
    });
    return;
  }
  for (const field of [
    "inputTokens",
    "outputTokens",
    "cacheReadInputTokens",
    "cacheCreationInputTokens",
  ]) {
    prev[field] = Math.max(prev[field], u[field] ?? 0);
  }
}
129
+
130
/**
 * Check per-message sums against the result-event sums on the fields the spec
 * guarantees parity for: input, cacheRead, and cacheCreation. Output is never
 * compared, because it always diverges by mechanism 2.
 *
 * A missing or null counter on either side counts as 0. Fields are checked in
 * the order inputTokens, cacheReadInputTokens, cacheCreationInputTokens, and
 * the first one that differs is reported.
 * @param {object} perMessageTotals
 * @param {object} resultEventUsage
 * @returns {object|null} `{field, perMessageSum, resultEventSum}` for the first
 *   divergent field, or null when all three agree.
 */
export function computeDivergence(perMessageTotals, resultEventUsage) {
  const parityFields = [
    "inputTokens",
    "cacheReadInputTokens",
    "cacheCreationInputTokens",
  ];

  const comparisons = parityFields.map((field) => ({
    field,
    perMessageSum: perMessageTotals[field] ?? 0,
    resultEventSum: resultEventUsage[field] ?? 0,
  }));

  const firstMismatch = comparisons.find(
    (entry) => entry.perMessageSum !== entry.resultEventSum,
  );
  return firstMismatch ?? null;
}
153
+
154
/** Sentinel bucket name for assistant turns that ran no tool call. */
const NO_TOOL = "(no-tool)";

/**
 * Attribute per-turn usage to per-tool buckets: each `tool_use` block gets an
 * equal share of its host turn's input and output tokens, keyed by the block's
 * `name`; assistant turns with no `tool_use` block contribute their full usage
 * to the `(no-tool)` bucket.
 *
 * Only assistant turns that carry a `usage` object are counted. A turn whose
 * `content` is missing or not an array is treated as having no `tool_use`
 * block, and null entries inside `content` are ignored, so one malformed turn
 * cannot abort the whole attribution.
 * @param {object[]} turns
 * @returns {{buckets: Map<string, {inputTokens: number, outputTokens: number}>, bucketTurns: Map<string, Set<number>>}}
 *   `buckets` holds the token sums per tool; `bucketTurns` holds the set of
 *   `turn.index` values that contributed to each tool.
 */
export function bucketUsageByTool(turns) {
  const buckets = new Map();
  const bucketTurns = new Map();
  // Create the bucket and its turn set together so both maps share key sets.
  const ensure = (name) => {
    if (!buckets.has(name)) {
      buckets.set(name, { inputTokens: 0, outputTokens: 0 });
      bucketTurns.set(name, new Set());
    }
    return buckets.get(name);
  };

  for (const turn of turns) {
    if (turn.role !== "assistant" || !turn.usage) continue;
    const input = turn.usage.inputTokens ?? 0;
    const output = turn.usage.outputTokens ?? 0;
    // Guard against absent or non-array content instead of throwing on it.
    const blocks = Array.isArray(turn.content) ? turn.content : [];
    const toolNames = blocks
      .filter((block) => block?.type === "tool_use")
      .map((block) => block.name);
    const targets = toolNames.length === 0 ? [NO_TOOL] : toolNames;
    const shareIn = input / targets.length;
    const shareOut = output / targets.length;
    for (const name of targets) {
      const bucket = ensure(name);
      bucket.inputTokens += shareIn;
      bucket.outputTokens += shareOut;
      bucketTurns.get(name).add(turn.index);
    }
  }
  return { buckets, bucketTurns };
}
193
+
194
/**
 * Scale per-tool buckets onto the headline totals so the input, output, and
 * `costShare` columns each sum to the corresponding `totals` value (and 1.0),
 * regardless of population (result-event-sum, per-message-fallback, or
 * carried-document). After sorting, the first row — the largest cost share —
 * absorbs the rounding residual on each axis (criterion-6 invariant).
 *
 * The bucket objects inside `buckets` are rescaled in place. Rows come back
 * ordered by descending `costShare`, ties broken by tool name. An axis whose
 * raw bucket sum is 0 is scaled by 0, so the first row receives that axis's
 * whole total.
 * @param {Map<string, {inputTokens: number, outputTokens: number}>} buckets
 * @param {Map<string, Set<number>>} bucketTurns
 * @param {object} totals
 * @returns {Array<{tool: string, turns: number, inputTokens: number, outputTokens: number, costShare: number}>}
 */
export function reconcileBucketsToTotals(buckets, bucketTurns, totals) {
  // Read lazily: `totals` is only consulted when an axis actually needs it.
  const headline = (field) => totals[field] ?? 0;
  const scaleFor = (field) => {
    const raw = sumField(buckets, field);
    return raw === 0 ? 0 : headline(field) / raw;
  };

  const scaleIn = scaleFor("inputTokens");
  const scaleOut = scaleFor("outputTokens");
  buckets.forEach((bucket) => {
    bucket.inputTokens *= scaleIn;
    bucket.outputTokens *= scaleOut;
  });

  const totalTokens =
    sumField(buckets, "inputTokens") + sumField(buckets, "outputTokens");

  const rows = [];
  for (const [tool, bucket] of buckets) {
    const { inputTokens, outputTokens } = bucket;
    rows.push({
      tool,
      turns: bucketTurns.get(tool).size,
      inputTokens,
      outputTokens,
      costShare:
        totalTokens === 0 ? 0 : (inputTokens + outputTokens) / totalTokens,
    });
  }

  rows.sort((left, right) => {
    const shareGap = right.costShare - left.costShare;
    return shareGap || left.tool.localeCompare(right.tool);
  });

  const [largest] = rows;
  if (largest !== undefined) {
    const columnSum = (field) => {
      let running = 0;
      for (const row of rows) running += row[field];
      return running;
    };
    // Push whatever each column is short (or over) onto the leading row.
    largest.inputTokens += headline("inputTokens") - columnSum("inputTokens");
    largest.outputTokens += headline("outputTokens") - columnSum("outputTokens");
    largest.costShare += 1 - columnSum("costShare");
  }
  return rows;
}

/**
 * Add up one numeric field over all bucket values, starting from 0.
 * @param {Map<string, object>} buckets
 * @param {string} field
 * @returns {number}
 */
function sumField(buckets, field) {
  return [...buckets.values()].reduce(
    (running, bucket) => running + bucket[field],
    0,
  );
}