@tangle-network/agent-eval 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +102 -0
  2. package/README.md +141 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  20. package/dist/chunk-6QDKWHLS.js.map +1 -0
  21. package/dist/chunk-I4MBDTY5.js +272 -0
  22. package/dist/chunk-I4MBDTY5.js.map +1 -0
  23. package/dist/chunk-K2TPS5LB.js +569 -0
  24. package/dist/chunk-K2TPS5LB.js.map +1 -0
  25. package/dist/chunk-KKHDIONI.js +414 -0
  26. package/dist/chunk-KKHDIONI.js.map +1 -0
  27. package/dist/chunk-KMPRBJK4.js +74 -0
  28. package/dist/chunk-KMPRBJK4.js.map +1 -0
  29. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  30. package/dist/chunk-KTGTIOFD.js.map +1 -0
  31. package/dist/chunk-LSH4MMOZ.js +838 -0
  32. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  33. package/dist/chunk-NG236HPC.js +57 -0
  34. package/dist/chunk-NG236HPC.js.map +1 -0
  35. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  36. package/dist/chunk-NLMNWKVM.js.map +1 -0
  37. package/dist/chunk-NU65VQ7M.js +99 -0
  38. package/dist/chunk-NU65VQ7M.js.map +1 -0
  39. package/dist/chunk-OHEPNJQN.js +554 -0
  40. package/dist/chunk-OHEPNJQN.js.map +1 -0
  41. package/dist/chunk-OWLAAMME.js +250 -0
  42. package/dist/chunk-OWLAAMME.js.map +1 -0
  43. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  44. package/dist/chunk-PC4UYEBM.js.map +1 -0
  45. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  46. package/dist/chunk-RAF443UI.js.map +1 -0
  47. package/dist/chunk-RZTMDUO7.js +49 -0
  48. package/dist/chunk-RZTMDUO7.js.map +1 -0
  49. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  50. package/dist/chunk-SESZDQPX.js.map +1 -0
  51. package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
  52. package/dist/chunk-SY6WAAAD.js.map +1 -0
  53. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  54. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  55. package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
  56. package/dist/chunk-VRJVTXRV.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +1866 -3151
  80. package/dist/index.js +5457 -7809
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +1 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +409 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-TDPn1cxq.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +22 -22
  125. package/dist/wire/index.js +4 -3
  126. package/package.json +35 -2
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-4W4NCYM2.js +0 -1945
  129. package/dist/chunk-4W4NCYM2.js.map +0 -1
  130. package/dist/chunk-5IIQKMD5.js.map +0 -1
  131. package/dist/chunk-6KQG5HAH.js.map +0 -1
  132. package/dist/chunk-6M774GY6.js.map +0 -1
  133. package/dist/chunk-7EAUOUQS.js.map +0 -1
  134. package/dist/chunk-AXHNWLIX.js.map +0 -1
  135. package/dist/chunk-EXGR4XEM.js.map +0 -1
  136. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  137. package/dist/chunk-KAO3Q65R.js.map +0 -1
  138. package/dist/chunk-LZKIOBG2.js +0 -2026
  139. package/dist/chunk-LZKIOBG2.js.map +0 -1
  140. package/dist/chunk-QBW3YBTR.js.map +0 -1
  141. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  142. package/dist/chunk-SQQLHODJ.js.map +0 -1
  143. package/dist/chunk-V5QSWN7L.js +0 -1310
  144. package/dist/chunk-V5QSWN7L.js.map +0 -1
  145. package/dist/chunk-VQQSPGSM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -0,0 +1,409 @@
1
+ import {
2
+ DEFAULT_RULES,
3
+ classifyFailure,
4
+ compareToBaseline,
5
+ computeToolUseMetrics
6
+ } from "../chunk-OHEPNJQN.js";
7
+ import {
8
+ buildTrajectory
9
+ } from "../chunk-RZTMDUO7.js";
10
+ import {
11
+ interRaterReliability
12
+ } from "../chunk-I4MBDTY5.js";
13
+ import {
14
+ aggregateLlm,
15
+ argHash,
16
+ llmSpans,
17
+ runFailureClass,
18
+ toolSpans
19
+ } from "../chunk-47X6LRCE.js";
20
+ import "../chunk-5BKGXME7.js";
21
+ import "../chunk-NG236HPC.js";
22
+ import "../chunk-PZ5AY32C.js";
23
+
24
+ // src/pipelines/budget-breach.ts
25
+ async function budgetBreachView(store, options = {}) {
26
+ const runs = await store.listRuns({
27
+ scenarioId: options.scenarioId,
28
+ variantId: options.variantId
29
+ });
30
+ const findings = [];
31
+ const byDimension = {};
32
+ const byScenario = {};
33
+ const byVariant = {};
34
+ for (const run of runs) {
35
+ const entries = await store.budget(run.runId);
36
+ for (const e of entries) {
37
+ if (!e.breached) continue;
38
+ const excessRatio = e.limit > 0 ? e.consumed / e.limit : Infinity;
39
+ findings.push({
40
+ runId: run.runId,
41
+ scenarioId: run.scenarioId,
42
+ variantId: run.variantId,
43
+ dimension: e.dimension,
44
+ limit: e.limit,
45
+ consumed: e.consumed,
46
+ excessRatio,
47
+ timestamp: e.timestamp
48
+ });
49
+ byDimension[e.dimension] = (byDimension[e.dimension] ?? 0) + 1;
50
+ byScenario[run.scenarioId] = (byScenario[run.scenarioId] ?? 0) + 1;
51
+ if (run.variantId) byVariant[run.variantId] = (byVariant[run.variantId] ?? 0) + 1;
52
+ }
53
+ }
54
+ const breachedRuns = new Set(findings.map((f) => f.runId));
55
+ return {
56
+ findings,
57
+ byDimension,
58
+ byScenario,
59
+ byVariant,
60
+ totalRuns: runs.length,
61
+ breachedRunRatio: runs.length > 0 ? breachedRuns.size / runs.length : 0
62
+ };
63
+ }
64
+
65
+ // src/pipelines/failure-cluster.ts
66
+ async function failureClusterView(store, options = {}) {
67
+ const rules = options.rules ?? DEFAULT_RULES;
68
+ const minSize = options.minClusterSize ?? 1;
69
+ const runs = await store.listRuns();
70
+ const clusters = /* @__PURE__ */ new Map();
71
+ let totalFailures = 0;
72
+ for (const run of runs) {
73
+ if (run.status === "completed" && run.outcome?.pass !== false) continue;
74
+ totalFailures++;
75
+ const spans = await store.spans({ runId: run.runId });
76
+ const events = await store.events({ runId: run.runId });
77
+ const cls = classifyFailure({ run, spans, events }, rules);
78
+ let toolName;
79
+ let argPrefix;
80
+ let dimension;
81
+ if (cls.triggerSpanId) {
82
+ const trig = spans.find((s) => s.spanId === cls.triggerSpanId);
83
+ if (trig?.kind === "tool") {
84
+ toolName = trig.toolName;
85
+ argPrefix = argHash(trig.args).slice(0, 16);
86
+ } else if (trig?.kind === "judge") {
87
+ dimension = trig.dimension;
88
+ }
89
+ }
90
+ if (!toolName) {
91
+ const ts = await toolSpans(store, run.runId);
92
+ const errored = ts.filter((t) => t.status === "error").pop();
93
+ if (errored) {
94
+ toolName = errored.toolName;
95
+ argPrefix = argHash(errored.args).slice(0, 16);
96
+ }
97
+ }
98
+ if (!dimension) {
99
+ const judge = spans.find((s) => s.kind === "judge" && typeof s.dimension === "string");
100
+ if (judge?.kind === "judge") dimension = judge.dimension;
101
+ }
102
+ const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
103
+ let cluster = clusters.get(key);
104
+ if (!cluster) {
105
+ cluster = {
106
+ failureClass: cls.failureClass,
107
+ toolName,
108
+ argPrefix,
109
+ dimension,
110
+ runCount: 0,
111
+ scenarioIds: [],
112
+ exampleRunId: run.runId,
113
+ exampleError: firstErrorMessage(spans) ?? cls.reason
114
+ };
115
+ clusters.set(key, cluster);
116
+ }
117
+ cluster.runCount++;
118
+ if (!cluster.scenarioIds.includes(run.scenarioId)) cluster.scenarioIds.push(run.scenarioId);
119
+ }
120
+ const arr = [...clusters.values()].filter((c) => c.runCount >= minSize).sort((a, b) => b.runCount - a.runCount);
121
+ return { clusters: arr, totalFailures, totalRuns: runs.length };
122
+ }
123
+ function firstErrorMessage(spans) {
124
+ const errored = spans.find((s) => s.status === "error");
125
+ return errored?.error;
126
+ }
127
+
128
+ // src/pipelines/first-divergence.ts
129
+ async function firstDivergenceView(store, runA, runB, options = {}) {
130
+ const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)]);
131
+ const eq = options.stepEquals ?? defaultStepEquals;
132
+ const minLen = Math.min(a.steps.length, b.steps.length);
133
+ for (let i = 0; i < minLen; i++) {
134
+ const aStep = a.steps[i];
135
+ const bStep = b.steps[i];
136
+ if (!eq(aStep, bStep)) {
137
+ return {
138
+ runA,
139
+ runB,
140
+ firstDivergenceIndex: i,
141
+ aStep,
142
+ bStep,
143
+ reason: describeDifference(aStep, bStep),
144
+ commonPrefixLen: i
145
+ };
146
+ }
147
+ }
148
+ if (a.steps.length === b.steps.length) {
149
+ return { runA, runB, firstDivergenceIndex: null, commonPrefixLen: minLen };
150
+ }
151
+ const longer = a.steps.length > b.steps.length ? a : b;
152
+ return {
153
+ runA,
154
+ runB,
155
+ firstDivergenceIndex: minLen,
156
+ aStep: a.steps[minLen],
157
+ bStep: b.steps[minLen],
158
+ reason: `one trajectory has ${longer.steps.length - minLen} more step(s) after index ${minLen - 1}`,
159
+ commonPrefixLen: minLen
160
+ };
161
+ }
162
+ function defaultStepEquals(a, b) {
163
+ if (a.span.kind !== b.span.kind) return false;
164
+ if (a.span.kind === "tool" && b.span.kind === "tool") return a.span.toolName === b.span.toolName;
165
+ if (a.span.kind === "llm" && b.span.kind === "llm") return a.span.model === b.span.model;
166
+ if (a.span.kind === "judge" && b.span.kind === "judge")
167
+ return a.span.dimension === b.span.dimension;
168
+ return a.span.name === b.span.name;
169
+ }
170
+ function describeDifference(a, b) {
171
+ if (a.span.kind !== b.span.kind) return `kind ${a.span.kind} vs ${b.span.kind}`;
172
+ if (a.span.kind === "tool" && b.span.kind === "tool" && a.span.toolName !== b.span.toolName) {
173
+ return `tool ${a.span.toolName} vs ${b.span.toolName}`;
174
+ }
175
+ if (a.span.kind === "llm" && b.span.kind === "llm" && a.span.model !== b.span.model) {
176
+ return `model ${a.span.model} vs ${b.span.model}`;
177
+ }
178
+ return `name "${a.span.name}" vs "${b.span.name}"`;
179
+ }
180
+
181
+ // src/pipelines/judge-agreement.ts
182
+ async function judgeAgreementView(store) {
183
+ const all = (await store.spans({ kind: "judge" })).filter(
184
+ (s) => s.kind === "judge"
185
+ );
186
+ if (all.length === 0) return { pairs: [], dimensions: [], judgeIds: [] };
187
+ const byDimension = /* @__PURE__ */ new Map();
188
+ for (const s of all) {
189
+ const arr = byDimension.get(s.dimension) ?? [];
190
+ arr.push(s);
191
+ byDimension.set(s.dimension, arr);
192
+ }
193
+ const judgeIds = [...new Set(all.map((s) => s.judgeId))].sort();
194
+ const pairs = [];
195
+ for (const [dim, spans] of byDimension) {
196
+ const byJudge = /* @__PURE__ */ new Map();
197
+ for (const s of spans) {
198
+ const m = byJudge.get(s.judgeId) ?? /* @__PURE__ */ new Map();
199
+ m.set(s.targetSpanId, s.score);
200
+ byJudge.set(s.judgeId, m);
201
+ }
202
+ const judgesHere = [...byJudge.keys()];
203
+ for (let i = 0; i < judgesHere.length; i++) {
204
+ for (let j = i + 1; j < judgesHere.length; j++) {
205
+ const judgeI = judgesHere[i];
206
+ const judgeJ = judgesHere[j];
207
+ const a = byJudge.get(judgeI);
208
+ const b = byJudge.get(judgeJ);
209
+ const common = [];
210
+ for (const [target, scoreA] of a) {
211
+ const scoreB = b.get(target);
212
+ if (scoreB !== void 0) common.push([scoreA, scoreB]);
213
+ }
214
+ if (common.length < 2) continue;
215
+ const judgeScores = common.map(
216
+ ([scoreA, scoreB]) => [
217
+ { judgeName: judgeI, dimension: dim, score: scoreA, reasoning: "" },
218
+ { judgeName: judgeJ, dimension: dim, score: scoreB, reasoning: "" }
219
+ ]
220
+ );
221
+ const k = interRaterReliability(
222
+ judgeScores[0].map((_, k2) => judgeScores.map((pair) => pair[k2]))
223
+ );
224
+ pairs.push({
225
+ judgeA: judgeI,
226
+ judgeB: judgeJ,
227
+ dimension: dim,
228
+ commonItems: common.length,
229
+ pearson: pearson(
230
+ common.map((c) => c[0]),
231
+ common.map((c) => c[1])
232
+ ),
233
+ krippendorff: k
234
+ });
235
+ }
236
+ }
237
+ }
238
+ return {
239
+ pairs: pairs.sort((a, b) => b.commonItems - a.commonItems),
240
+ dimensions: [...byDimension.keys()].sort(),
241
+ judgeIds
242
+ };
243
+ }
244
+ function pearson(a, b) {
245
+ if (a.length !== b.length || a.length < 2) return NaN;
246
+ const mA = a.reduce((s, v) => s + v, 0) / a.length;
247
+ const mB = b.reduce((s, v) => s + v, 0) / b.length;
248
+ let num = 0, denA = 0, denB = 0;
249
+ for (let i = 0; i < a.length; i++) {
250
+ const dA = a[i] - mA;
251
+ const dB = b[i] - mB;
252
+ num += dA * dB;
253
+ denA += dA * dA;
254
+ denB += dB * dB;
255
+ }
256
+ if (denA === 0 || denB === 0) return denA === 0 && denB === 0 ? 1 : 0;
257
+ return num / Math.sqrt(denA * denB);
258
+ }
259
+
260
+ // src/pipelines/regression.ts
261
+ async function regressionView(store, metrics, options) {
262
+ const baselineRuns = await store.listRuns(options.baseline);
263
+ const candidateRuns = await store.listRuns(options.candidate);
264
+ const samples = await Promise.all(
265
+ metrics.map(async (m) => {
266
+ const extract = m.extract ?? defaultExtract(m.metric);
267
+ const baseline = await extractAll(baselineRuns, extract, store);
268
+ const candidate = await extractAll(candidateRuns, extract, store);
269
+ return { metric: m.metric, higherIsBetter: m.higherIsBetter, baseline, candidate };
270
+ })
271
+ );
272
+ return compareToBaseline(samples, options);
273
+ }
274
+ async function extractAll(runs, extract, store) {
275
+ const out = [];
276
+ for (const r of runs) {
277
+ const v = await extract(r, store);
278
+ if (v !== null && Number.isFinite(v)) out.push(v);
279
+ }
280
+ return out;
281
+ }
282
+ function defaultExtract(metric) {
283
+ return async (run, store) => {
284
+ switch (metric) {
285
+ case "score":
286
+ case "overallScore":
287
+ return run.outcome?.score ?? null;
288
+ case "pass":
289
+ return run.outcome?.pass === true ? 1 : 0;
290
+ case "durationMs":
291
+ return run.endedAt && run.startedAt ? run.endedAt - run.startedAt : null;
292
+ case "costUsd": {
293
+ const llm = await llmSpans(store, run.runId);
294
+ return aggregateLlm(llm).costUsd;
295
+ }
296
+ case "inputTokens": {
297
+ const llm = await llmSpans(store, run.runId);
298
+ return aggregateLlm(llm).inputTokens;
299
+ }
300
+ case "outputTokens": {
301
+ const llm = await llmSpans(store, run.runId);
302
+ return aggregateLlm(llm).outputTokens;
303
+ }
304
+ case "failureClass": {
305
+ return runFailureClass(run) === "success" ? 1 : 0;
306
+ }
307
+ default:
308
+ return null;
309
+ }
310
+ };
311
+ }
312
+
313
+ // src/pipelines/stuck-loop.ts
314
+ async function stuckLoopView(store, options = {}) {
315
+ const minOccurrences = options.minOccurrences ?? 3;
316
+ const runs = options.runId ? [{ runId: options.runId }] : (await store.listRuns()).map((r) => ({ runId: r.runId }));
317
+ const findings = [];
318
+ for (const { runId } of runs) {
319
+ const tools = await toolSpans(store, runId);
320
+ const byKey = /* @__PURE__ */ new Map();
321
+ for (const t of tools) {
322
+ const h = argHash(t.args);
323
+ const key = `${t.toolName}|${h}`;
324
+ const bucket = byKey.get(key) ?? { spans: [], argHash: h };
325
+ bucket.spans.push(t);
326
+ byKey.set(key, bucket);
327
+ }
328
+ for (const [key, { spans, argHash: h }] of byKey) {
329
+ if (spans.length < minOccurrences) continue;
330
+ const sorted = [...spans].sort((a, b) => a.startedAt - b.startedAt);
331
+ const first = sorted[0].startedAt;
332
+ const last = sorted[sorted.length - 1].startedAt;
333
+ findings.push({
334
+ runId,
335
+ toolName: key.split("|")[0],
336
+ argHash: h,
337
+ occurrences: sorted.length,
338
+ spanIds: sorted.map((s) => s.spanId),
339
+ windowMs: last - first
340
+ });
341
+ }
342
+ }
343
+ const affectedRuns = new Set(findings.map((f) => f.runId));
344
+ return {
345
+ findings,
346
+ affectedRunRatio: runs.length > 0 ? affectedRuns.size / runs.length : 0,
347
+ totalRuns: runs.length
348
+ };
349
+ }
350
+
351
+ // src/pipelines/tool-waste.ts
352
+ async function toolWasteView(store, options = {}) {
353
+ const runs = options.runId ? [options.runId] : (await store.listRuns()).map((r) => r.runId);
354
+ const byRun = [];
355
+ let totalCalls = 0;
356
+ let totalWasted = 0;
357
+ for (const runId of runs) {
358
+ const tools = await toolSpans(store, runId);
359
+ if (tools.length === 0) {
360
+ byRun.push({ runId, wastedCalls: 0, totalCalls: 0, wasteRate: 0 });
361
+ continue;
362
+ }
363
+ const llms = await llmSpans(store, runId);
364
+ let wasted = 0;
365
+ for (const t of tools) {
366
+ if (t.status === "error") {
367
+ wasted++;
368
+ continue;
369
+ }
370
+ const laterLlm = llms.filter((l) => l.startedAt > t.startedAt);
371
+ if (options.usageOracle) {
372
+ if (!options.usageOracle(t, { llm: laterLlm })) wasted++;
373
+ } else {
374
+ const resultStr = stringify(t.result);
375
+ const used = laterLlm.some(
376
+ (l) => l.messages.some(
377
+ (m) => typeof m.content === "string" && resultStr && m.content.includes(resultStr.slice(0, 120))
378
+ )
379
+ );
380
+ if (!used) wasted++;
381
+ }
382
+ }
383
+ const wasteRate = wasted / tools.length;
384
+ byRun.push({ runId, wastedCalls: wasted, totalCalls: tools.length, wasteRate });
385
+ totalCalls += tools.length;
386
+ totalWasted += wasted;
387
+ }
388
+ return { byRun, overallWasteRate: totalCalls > 0 ? totalWasted / totalCalls : 0 };
389
+ }
390
+ function stringify(v) {
391
+ if (v === null || v === void 0) return "";
392
+ if (typeof v === "string") return v;
393
+ try {
394
+ return JSON.stringify(v);
395
+ } catch {
396
+ return String(v);
397
+ }
398
+ }
399
+ export {
400
+ budgetBreachView,
401
+ computeToolUseMetrics,
402
+ failureClusterView,
403
+ firstDivergenceView,
404
+ judgeAgreementView,
405
+ regressionView,
406
+ stuckLoopView,
407
+ toolWasteView
408
+ };
409
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/pipelines/budget-breach.ts","../../src/pipelines/failure-cluster.ts","../../src/pipelines/first-divergence.ts","../../src/pipelines/judge-agreement.ts","../../src/pipelines/regression.ts","../../src/pipelines/stuck-loop.ts","../../src/pipelines/tool-waste.ts"],"sourcesContent":["/**\n * BudgetBreachView — aggregates breach events across the corpus.\n *\n * Answers: which dimensions get hit most often? Which scenarios are\n * underbudgeted? Which variants trigger the most breaches?\n */\n\nimport type { BudgetSpec } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface BudgetBreachFinding {\n runId: string\n scenarioId: string\n variantId?: string\n dimension: keyof BudgetSpec\n limit: number\n consumed: number\n excessRatio: number\n timestamp: number\n}\n\nexport interface BudgetBreachReport {\n findings: BudgetBreachFinding[]\n byDimension: Record<string, number>\n byScenario: Record<string, number>\n byVariant: Record<string, number>\n totalRuns: number\n breachedRunRatio: number\n}\n\nexport async function budgetBreachView(\n store: TraceStore,\n options: { scenarioId?: string; variantId?: string } = {},\n): Promise<BudgetBreachReport> {\n const runs = await store.listRuns({\n scenarioId: options.scenarioId,\n variantId: options.variantId,\n })\n const findings: BudgetBreachFinding[] = []\n const byDimension: Record<string, number> = {}\n const byScenario: Record<string, number> = {}\n const byVariant: Record<string, number> = {}\n\n for (const run of runs) {\n const entries = await store.budget(run.runId)\n for (const e of entries) {\n if (!e.breached) continue\n const excessRatio = e.limit > 0 ? e.consumed / e.limit : Infinity\n findings.push({\n runId: run.runId,\n scenarioId: run.scenarioId,\n variantId: run.variantId,\n dimension: e.dimension,\n limit: e.limit,\n consumed: e.consumed,\n excessRatio,\n timestamp: e.timestamp,\n })\n byDimension[e.dimension] = (byDimension[e.dimension] ?? 0) + 1\n byScenario[run.scenarioId] = (byScenario[run.scenarioId] ?? 0) + 1\n if (run.variantId) byVariant[run.variantId] = (byVariant[run.variantId] ?? 0) + 1\n }\n }\n\n const breachedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n byDimension,\n byScenario,\n byVariant,\n totalRuns: runs.length,\n breachedRunRatio: runs.length > 0 ? breachedRuns.size / runs.length : 0,\n }\n}\n","/**\n * FailureClusterView — groups failed runs by (failureClass, triggerTool,\n * argHash-prefix) so weekly reviews can prioritize the top-N clusters.\n *\n * Each cluster includes: N runs, scenarios affected, representative\n * error message, a proposed mitigation hint (rule → action table).\n */\n\nimport { classifyFailure, DEFAULT_RULES, type FailureRule } from '../failure-taxonomy'\nimport { argHash, toolSpans } from '../trace/query'\nimport type { FailureClass, Span } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface FailureCluster {\n failureClass: FailureClass\n /** Tool name when the trigger was a tool span, else undefined. */\n toolName?: string\n /** First 16 chars of argHash — clusters similar args. */\n argPrefix?: string\n /**\n * Source dimension when the trigger was a judge span (e.g. `'format'`,\n * `'safety'`, `'correctness'`). Lets cross-template aggregators\n * group failures by the dimension that fired without overloading\n * `argPrefix`. Optional — legacy clusters without this field\n * deserialize cleanly.\n */\n dimension?: string\n runCount: number\n scenarioIds: string[]\n exampleError?: string\n exampleRunId: string\n}\n\nexport interface FailureClusterReport {\n clusters: FailureCluster[]\n totalFailures: number\n totalRuns: number\n}\n\nexport async function failureClusterView(\n store: TraceStore,\n options: { rules?: FailureRule[]; minClusterSize?: number } = {},\n): Promise<FailureClusterReport> {\n const rules = options.rules ?? DEFAULT_RULES\n const minSize = options.minClusterSize ?? 1\n const runs = await store.listRuns()\n\n type Key = string\n const clusters = new Map<Key, FailureCluster>()\n let totalFailures = 0\n\n for (const run of runs) {\n if (run.status === 'completed' && run.outcome?.pass !== false) continue\n totalFailures++\n const spans = await store.spans({ runId: run.runId })\n const events = await store.events({ runId: run.runId })\n const cls = classifyFailure({ run, spans, events }, rules)\n\n let toolName: string | undefined\n let argPrefix: string | undefined\n let dimension: string | undefined\n if (cls.triggerSpanId) {\n const trig = spans.find((s) => s.spanId === cls.triggerSpanId)\n if (trig?.kind === 'tool') {\n toolName = trig.toolName\n argPrefix = argHash(trig.args).slice(0, 16)\n } else if (trig?.kind === 'judge') {\n dimension = trig.dimension\n }\n }\n // Fallback: look at the last errored tool span\n if (!toolName) {\n const ts = await toolSpans(store, run.runId)\n const errored = ts.filter((t) => t.status === 'error').pop()\n if (errored) {\n toolName = errored.toolName\n argPrefix = argHash(errored.args).slice(0, 16)\n }\n }\n // Secondary signal: any judge span on the failed run carries a\n // dimension. Useful when the rule classified by judge score but\n // didn't surface the trigger span (or surfaced a non-judge span).\n if (!dimension) {\n const judge = spans.find((s) => s.kind === 'judge' && typeof s.dimension === 'string')\n if (judge?.kind === 'judge') dimension = judge.dimension\n }\n\n const key = `${cls.failureClass}|${toolName ?? ''}|${argPrefix ?? ''}|${dimension ?? ''}`\n let cluster = clusters.get(key)\n if (!cluster) {\n cluster = {\n failureClass: cls.failureClass,\n toolName,\n argPrefix,\n dimension,\n runCount: 0,\n scenarioIds: [],\n exampleRunId: run.runId,\n exampleError: firstErrorMessage(spans) ?? cls.reason,\n }\n clusters.set(key, cluster)\n }\n cluster.runCount++\n if (!cluster.scenarioIds.includes(run.scenarioId)) cluster.scenarioIds.push(run.scenarioId)\n }\n\n const arr = [...clusters.values()]\n .filter((c) => c.runCount >= minSize)\n .sort((a, b) => b.runCount - a.runCount)\n\n return { clusters: arr, totalFailures, totalRuns: runs.length }\n}\n\nfunction firstErrorMessage(spans: Span[]): string | undefined {\n const errored = spans.find((s) => s.status === 'error')\n return errored?.error\n}\n","/**\n * FirstDivergenceView — aligns two trajectories by step index, reports\n * the first step where they differ.\n *\n * \"Differ\" is configurable — default is (kind, toolName if tool, model\n * if llm). Use this view to attribute \"why is variant B better?\" to a\n * specific step rather than an aggregate mean delta.\n */\n\nimport type { TraceStore } from '../trace/store'\nimport { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory'\n\nexport interface DivergenceReport {\n runA: string\n runB: string\n firstDivergenceIndex: number | null\n aStep?: TrajectoryStep\n bStep?: TrajectoryStep\n reason?: string\n /** Common prefix length (steps that matched). */\n commonPrefixLen: number\n}\n\nexport interface DivergenceOptions {\n /** Returns true if two steps are considered equal. Default: kind + tool/model match. */\n stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean\n}\n\nexport async function firstDivergenceView(\n store: TraceStore,\n runA: string,\n runB: string,\n options: DivergenceOptions = {},\n): Promise<DivergenceReport> {\n const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)])\n const eq = options.stepEquals ?? defaultStepEquals\n const minLen = Math.min(a.steps.length, b.steps.length)\n for (let i = 0; i < minLen; i++) {\n const aStep = a.steps[i]!\n const bStep = b.steps[i]!\n if (!eq(aStep, bStep)) {\n return {\n runA,\n runB,\n firstDivergenceIndex: i,\n aStep,\n bStep,\n reason: describeDifference(aStep, bStep),\n commonPrefixLen: i,\n }\n }\n }\n if (a.steps.length === b.steps.length) {\n return { runA, runB, firstDivergenceIndex: null, commonPrefixLen: minLen }\n }\n const longer: Trajectory = a.steps.length > b.steps.length ? a : b\n return {\n runA,\n runB,\n firstDivergenceIndex: minLen,\n aStep: a.steps[minLen],\n bStep: b.steps[minLen],\n reason: `one trajectory has ${longer.steps.length - minLen} more step(s) after index ${minLen - 1}`,\n commonPrefixLen: minLen,\n }\n}\n\nfunction defaultStepEquals(a: TrajectoryStep, b: TrajectoryStep): boolean {\n if (a.span.kind !== b.span.kind) return false\n if (a.span.kind === 'tool' && b.span.kind === 'tool') return a.span.toolName === b.span.toolName\n if (a.span.kind === 'llm' && b.span.kind === 'llm') return a.span.model === b.span.model\n if (a.span.kind === 'judge' && b.span.kind === 'judge')\n return a.span.dimension === b.span.dimension\n return a.span.name === b.span.name\n}\n\nfunction describeDifference(a: TrajectoryStep, b: TrajectoryStep): string {\n if (a.span.kind !== b.span.kind) return `kind ${a.span.kind} vs ${b.span.kind}`\n if (a.span.kind === 'tool' && b.span.kind === 'tool' && a.span.toolName !== b.span.toolName) {\n return `tool ${a.span.toolName} vs ${b.span.toolName}`\n }\n if (a.span.kind === 'llm' && b.span.kind === 'llm' && a.span.model !== b.span.model) {\n return `model ${a.span.model} vs ${b.span.model}`\n }\n return `name \"${a.span.name}\" vs \"${b.span.name}\"`\n}\n","/**\n * JudgeAgreementView — pairwise agreement between judges across the\n * corpus, grouped by dimension.\n *\n * Output drives two workflows:\n * - Judge robustness audit: \"does Claude agree with GPT at κ ≥ 0.6?\"\n * - Calibration tracking: κ vs golden human labels over time (by\n * providing a `humanGoldenJudgeId`).\n */\n\nimport { interRaterReliability } from '../statistics'\nimport type { JudgeSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface JudgePair {\n judgeA: string\n judgeB: string\n dimension: string\n /** Number of (targetSpanId, dimension) tuples both judges scored. */\n commonItems: number\n pearson: number\n krippendorff: number\n}\n\nexport interface JudgeAgreementReport {\n pairs: JudgePair[]\n dimensions: string[]\n judgeIds: string[]\n}\n\nexport async function judgeAgreementView(store: TraceStore): Promise<JudgeAgreementReport> {\n const all = (await store.spans({ kind: 'judge' })).filter(\n (s): s is JudgeSpan => s.kind === 'judge',\n )\n if (all.length === 0) return { pairs: [], dimensions: [], judgeIds: [] }\n\n const byDimension = new Map<string, JudgeSpan[]>()\n for (const s of all) {\n const arr = byDimension.get(s.dimension) ?? []\n arr.push(s)\n byDimension.set(s.dimension, arr)\n }\n\n const judgeIds = [...new Set(all.map((s) => s.judgeId))].sort()\n const pairs: JudgePair[] = []\n for (const [dim, spans] of byDimension) {\n const byJudge = new Map<string, Map<string, number>>()\n for (const s of spans) {\n const m = byJudge.get(s.judgeId) ?? new Map<string, number>()\n m.set(s.targetSpanId, s.score)\n byJudge.set(s.judgeId, m)\n }\n const judgesHere = [...byJudge.keys()]\n for (let i = 0; i < judgesHere.length; i++) {\n for (let j = i + 1; j < judgesHere.length; j++) {\n const judgeI = judgesHere[i]!\n const judgeJ = judgesHere[j]!\n const a = byJudge.get(judgeI)!\n const b = byJudge.get(judgeJ)!\n const common: Array<[number, number]> = []\n for (const [target, scoreA] of a) {\n const scoreB = b.get(target)\n if (scoreB !== undefined) common.push([scoreA, scoreB])\n }\n if (common.length < 2) continue\n const judgeScores = common.map(\n ([scoreA, scoreB]) =>\n [\n { judgeName: judgeI, dimension: dim, score: scoreA, reasoning: '' },\n { judgeName: judgeJ, dimension: dim, score: scoreB, reasoning: '' },\n ] as const,\n )\n const k = interRaterReliability(\n judgeScores[0]!.map((_, k2) => judgeScores.map((pair) => pair[k2]!)),\n )\n pairs.push({\n judgeA: judgeI,\n judgeB: judgeJ,\n dimension: dim,\n commonItems: common.length,\n pearson: pearson(\n common.map((c) => c[0]),\n common.map((c) => c[1]),\n ),\n krippendorff: k,\n })\n }\n }\n }\n\n return {\n pairs: pairs.sort((a, b) => b.commonItems - a.commonItems),\n dimensions: [...byDimension.keys()].sort(),\n judgeIds,\n }\n}\n\nfunction pearson(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return NaN\n const mA = a.reduce((s, v) => s + v, 0) / a.length\n const mB = b.reduce((s, v) => s + v, 0) / b.length\n let num = 0,\n denA = 0,\n denB = 0\n for (let i = 0; i < a.length; i++) {\n const dA = a[i]! - mA\n const dB = b[i]! - mB\n num += dA * dB\n denA += dA * dA\n denB += dB * dB\n }\n if (denA === 0 || denB === 0) return denA === 0 && denB === 0 ? 1 : 0\n return num / Math.sqrt(denA * denB)\n}\n","/**\n * RegressionView — compares a candidate slice to a baseline slice on a\n * named metric. Delegates the statistics (Welch's t-test, Cohen's d,\n * IQR stability) to `baseline.ts`.\n *\n * This is the entry point for CI regression gates: \"given runs tagged\n * release=A and release=B, did any metric regress?\"\n */\n\nimport { type BaselineOptions, type BaselineReport, compareToBaseline } from '../baseline'\nimport { aggregateLlm, llmSpans, runFailureClass } from '../trace/query'\nimport type { Run } from '../trace/schema'\nimport type { RunFilter, TraceStore } from '../trace/store'\n\nexport interface RegressionSpec {\n metric: string\n higherIsBetter: boolean\n /** Extract a scalar from a run. Default extractors handle common metrics. */\n extract?: (run: Run, store: TraceStore) => Promise<number | null>\n}\n\nexport interface RegressionOptions extends BaselineOptions {\n baseline: RunFilter\n candidate: RunFilter\n}\n\nexport async function regressionView(\n store: TraceStore,\n metrics: RegressionSpec[],\n options: RegressionOptions,\n): Promise<BaselineReport> {\n const baselineRuns = await store.listRuns(options.baseline)\n const candidateRuns = await store.listRuns(options.candidate)\n const samples = await Promise.all(\n metrics.map(async (m) => {\n const extract = m.extract ?? defaultExtract(m.metric)\n const baseline = await extractAll(baselineRuns, extract, store)\n const candidate = await extractAll(candidateRuns, extract, store)\n return { metric: m.metric, higherIsBetter: m.higherIsBetter, baseline, candidate }\n }),\n )\n return compareToBaseline(samples, options)\n}\n\nasync function extractAll(\n runs: Run[],\n extract: (r: Run, s: TraceStore) => Promise<number | null>,\n store: TraceStore,\n): Promise<number[]> {\n const out: number[] = []\n for (const r of runs) {\n const v = await extract(r, store)\n if (v !== null && Number.isFinite(v)) out.push(v)\n }\n return out\n}\n\nfunction defaultExtract(metric: string): (run: Run, store: TraceStore) => Promise<number | null> {\n return async (run, store) => {\n switch (metric) {\n case 'score':\n case 'overallScore':\n return run.outcome?.score ?? null\n case 'pass':\n return run.outcome?.pass === true ? 1 : 0\n case 'durationMs':\n return run.endedAt && run.startedAt ? run.endedAt - run.startedAt : null\n case 'costUsd': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).costUsd\n }\n case 'inputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).inputTokens\n }\n case 'outputTokens': {\n const llm = await llmSpans(store, run.runId)\n return aggregateLlm(llm).outputTokens\n }\n case 'failureClass': {\n return runFailureClass(run) === 'success' ? 1 : 0\n }\n default:\n return null\n }\n }\n}\n","/**\n * StuckLoopView — detects when an agent calls the same tool with the\n * same (or structurally similar) arguments ≥ N times in a short window.\n *\n * Rationale: agents that loop are the number-one production failure\n * mode on long-horizon flows. The view returns (runId, toolName,\n * argHash, occurrences, windowMs) for each detected loop plus a\n * fraction of runs affected.\n */\n\nimport { argHash, toolSpans } from '../trace/query'\nimport type { TraceStore } from '../trace/store'\n\nexport interface StuckLoopFinding {\n runId: string\n toolName: string\n argHash: string\n occurrences: number\n spanIds: string[]\n /** Milliseconds between first and last call in the loop. */\n windowMs: number\n}\n\nexport interface StuckLoopReport {\n findings: StuckLoopFinding[]\n affectedRunRatio: number\n totalRuns: number\n}\n\nexport interface StuckLoopOptions {\n /** Minimum call count to flag a loop (default 3). */\n minOccurrences?: number\n /** Filter to a specific runId; omit to scan the entire corpus. */\n runId?: string\n}\n\nexport async function stuckLoopView(\n store: TraceStore,\n options: StuckLoopOptions = {},\n): Promise<StuckLoopReport> {\n const minOccurrences = options.minOccurrences ?? 3\n const runs = options.runId\n ? [{ runId: options.runId }]\n : (await store.listRuns()).map((r) => ({ runId: r.runId }))\n\n const findings: StuckLoopFinding[] = []\n for (const { runId } of runs) {\n const tools = await toolSpans(store, runId)\n const byKey = new Map<string, { spans: typeof tools; argHash: string }>()\n for (const t of tools) {\n const h = argHash(t.args)\n const key = `${t.toolName}|${h}`\n const bucket = byKey.get(key) ?? { spans: [], argHash: h }\n bucket.spans.push(t)\n byKey.set(key, bucket)\n }\n for (const [key, { spans, argHash: h }] of byKey) {\n if (spans.length < minOccurrences) continue\n const sorted = [...spans].sort((a, b) => a.startedAt - b.startedAt)\n const first = sorted[0]!.startedAt\n const last = sorted[sorted.length - 1]!.startedAt\n findings.push({\n runId,\n toolName: key.split('|')[0]!,\n argHash: h,\n occurrences: sorted.length,\n spanIds: sorted.map((s) => s.spanId),\n windowMs: last - first,\n })\n }\n }\n\n const affectedRuns = new Set(findings.map((f) => f.runId))\n return {\n findings,\n affectedRunRatio: runs.length > 0 ? affectedRuns.size / runs.length : 0,\n totalRuns: runs.length,\n }\n}\n","/**\n * ToolWasteView — fraction of tool calls whose results weren't used\n * downstream. Without a \"used\" signal we fall back to structural\n * proxies: error calls, duplicate calls, and tool calls followed by\n * zero subsequent LLM spans are all considered waste.\n *\n * Consumers can pass a `usageOracle` that inspects a tool span and\n * returns true iff the tool's result appears in a later LLM message,\n * artifact, or state mutation — that's the canonical definition; the\n * default heuristic is a reasonable fallback.\n */\n\nimport { computeToolUseMetrics } from '../tool-use-metrics'\nimport { llmSpans, toolSpans } from '../trace/query'\nimport type { ToolSpan } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\n\nexport interface ToolWasteFinding {\n runId: string\n wastedCalls: number\n totalCalls: number\n wasteRate: number\n}\n\nexport interface ToolWasteReport {\n byRun: ToolWasteFinding[]\n overallWasteRate: number\n}\n\nexport interface ToolWasteOptions {\n runId?: string\n usageOracle?: (tool: ToolSpan, later: { llm: Awaited<ReturnType<typeof llmSpans>> }) => boolean\n}\n\nexport async function toolWasteView(\n store: TraceStore,\n options: ToolWasteOptions = {},\n): Promise<ToolWasteReport> {\n const runs = options.runId ? [options.runId] : (await store.listRuns()).map((r) => r.runId)\n\n const byRun: ToolWasteFinding[] = []\n let totalCalls = 0\n let totalWasted = 0\n for (const runId of runs) {\n const tools = await toolSpans(store, runId)\n if (tools.length === 0) {\n byRun.push({ runId, wastedCalls: 0, totalCalls: 0, wasteRate: 0 })\n continue\n }\n const llms = await llmSpans(store, runId)\n let wasted = 0\n for (const t of tools) {\n if (t.status === 'error') {\n wasted++\n continue\n }\n const laterLlm = llms.filter((l) => l.startedAt > t.startedAt)\n if (options.usageOracle) {\n if (!options.usageOracle(t, { llm: laterLlm })) wasted++\n } else {\n // Default heuristic: a tool whose result is NOT mentioned in any\n // later LLM input message is likely wasted.\n const resultStr = stringify(t.result)\n const used = laterLlm.some((l) =>\n l.messages.some(\n (m) =>\n typeof m.content === 'string' &&\n resultStr &&\n m.content.includes(resultStr.slice(0, 120)),\n ),\n )\n if (!used) wasted++\n }\n }\n const wasteRate = wasted / tools.length\n byRun.push({ runId, wastedCalls: wasted, totalCalls: tools.length, wasteRate })\n totalCalls += tools.length\n totalWasted += wasted\n }\n return { byRun, overallWasteRate: totalCalls > 0 ? totalWasted / totalCalls : 0 }\n}\n\nfunction stringify(v: unknown): string {\n if (v === null || v === undefined) return ''\n if (typeof v === 'string') return v\n try {\n return JSON.stringify(v)\n } catch {\n return String(v)\n }\n}\n\n// Re-export for convenience in consumers that want both descriptive and usage metrics.\nexport { computeToolUseMetrics }\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;AA8BA,eAAsB,iBACpB,OACA,UAAuD,CAAC,GAC3B;AAC7B,QAAM,OAAO,MAAM,MAAM,SAAS;AAAA,IAChC,YAAY,QAAQ;AAAA,IACpB,WAAW,QAAQ;AAAA,EACrB,CAAC;AACD,QAAM,WAAkC,CAAC;AACzC,QAAM,cAAsC,CAAC;AAC7C,QAAM,aAAqC,CAAC;AAC5C,QAAM,YAAoC,CAAC;AAE3C,aAAW,OAAO,MAAM;AACtB,UAAM,UAAU,MAAM,MAAM,OAAO,IAAI,KAAK;AAC5C,eAAW,KAAK,SAAS;AACvB,UAAI,CAAC,EAAE,SAAU;AACjB,YAAM,cAAc,EAAE,QAAQ,IAAI,EAAE,WAAW,EAAE,QAAQ;AACzD,eAAS,KAAK;AAAA,QACZ,OAAO,IAAI;AAAA,QACX,YAAY,IAAI;AAAA,QAChB,WAAW,IAAI;AAAA,QACf,WAAW,EAAE;AAAA,QACb,OAAO,EAAE;AAAA,QACT,UAAU,EAAE;AAAA,QACZ;AAAA,QACA,WAAW,EAAE;AAAA,MACf,CAAC;AACD,kBAAY,EAAE,SAAS,KAAK,YAAY,EAAE,SAAS,KAAK,KAAK;AAC7D,iBAAW,IAAI,UAAU,KAAK,WAAW,IAAI,UAAU,KAAK,KAAK;AACjE,UAAI,IAAI,UAAW,WAAU,IAAI,SAAS,KAAK,UAAU,IAAI,SAAS,KAAK,KAAK;AAAA,IAClF;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,WAAW,KAAK;AAAA,IAChB,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,EACxE;AACF;;;AClCA,eAAsB,mBACpB,OACA,UAA8D,CAAC,GAChC;AAC/B,QAAM,QAAQ,QAAQ,SAAS;AAC/B,QAAM,UAAU,QAAQ,kBAAkB;AAC1C,QAAM,OAAO,MAAM,MAAM,SAAS;AAGlC,QAAM,WAAW,oBAAI,IAAyB;AAC9C,MAAI,gBAAgB;AAEpB,aAAW,OAAO,MAAM;AACtB,QAAI,IAAI,WAAW,eAAe,IAAI,SAAS,SAAS,MAAO;AAC/D;AACA,UAAM,QAAQ,MAAM,MAAM,MAAM,EAAE,OAAO,IAAI,MAAM,CAAC;AACpD,UAAM,SAAS,MAAM,MAAM,OAAO,EAAE,OAAO,IAAI,MAAM,CAAC;AACtD,UAAM,MAAM,gBAAgB,EAAE,KAAK,OAAO,OAAO,GAAG,KAAK;AAEzD,QAAI;AACJ,QAAI;AACJ,QAAI;AACJ,QAAI,IAAI,eAAe;AACrB,YAAM,OAAO,MAAM,KAAK,CAAC,MAAM,EAAE,WAAW,IAAI,aAAa;AAC7D,UAAI,MAAM,SAAS,QAAQ;AACzB,mBAAW,KAAK;AAChB,oBAAY,QAAQ,KAAK,IAAI,EAAE,MAAM,GAAG,EAAE;AAAA,MAC5C,WAAW,MAAM,SAAS,SAAS;AACjC,oBAAY,KAAK;AAAA,MACnB;AAAA,IACF;AAEA,QAAI,CAAC,UAAU;AACb,YAAM,KAAK,MAAM,UAAU,OAAO,IAAI,KAAK;AAC3C,YAAM,UAAU,GAAG,OAAO,CAAC,MAAM,EAAE,WAAW,OAAO,EAAE,IAAI;AAC3D,UAAI,SAAS;AACX,mBAAW,QAAQ;AACnB,oBAAY,QAAQ,QAAQ,IAAI,EAAE,MAAM,GAAG,EAAE;AAAA,MAC/C;AAAA,IACF;AAIA,QAAI,CAAC,WAAW;AACd,YAAM,QAAQ,MAAM,KAAK,CAAC,MAAM,EAAE,SAAS,WAAW,OAAO,EAAE,cAAc,QAAQ;AACrF,UAAI,OAAO,SAAS,QAAS,aAAY,MAAM;AAAA,IACjD;AAEA,UAAM,MAAM,GAAG,IAAI,YAAY,IAAI,YAAY,EAAE,IAAI,aAAa,EAAE,IAAI,aAAa,EAAE;AACvF,QAAI,UAAU,SAAS,IAAI,GAAG;AAC9B,QAAI,CAAC,SAAS;AACZ,gBAAU;AAAA,QACR,cAAc,IAAI;AAAA,QAClB;AAAA,QACA;AAAA,QACA;AAAA,QACA,UAAU;AAAA,QACV,aAAa,CAAC;AAAA,QACd,cAAc,IAAI;AAAA,QAClB,cAAc,kBAAkB,KAAK,KAAK,IAAI;AAAA,MAChD;AACA,eAAS,IAAI,KAAK,OAAO;AAAA,IAC3B;AACA,YAAQ;AACR,QAAI,CAAC,QAAQ,YAAY,SAAS,IAAI,UAAU,EAAG,SAAQ,YAAY,KAAK,IAAI,UAAU;AAAA,EAC5F;AAEA,QAAM,MAAM,CAAC,GAAG,SAAS,OAAO,CAAC,EAC9B,OAAO,CAAC,MAAM,EAAE,YAAY,OAAO,EACnC,KAAK,CAAC,GAAG,MAAM,EAAE,WAAW,EAAE,QAAQ;AAEzC,SAAO,EAAE,UAAU,KAAK,eAAe,WAAW,KAAK,OAAO;AAChE;AAEA,SAAS,kBAAkB,OAAmC;AAC5D,QAAM,UAAU,MAAM,KAAK,CAAC,MAAM,EAAE,WAAW,OAAO;AACtD,SAAO,SAAS;AAClB;;;ACxFA,eAAsB,oBACpB,OACA,MACA,MACA,UAA6B,CAAC,GACH;AAC3B,QAAM,CAAC,GAAG,CAAC,IAAI,MAAM,QAAQ,IAAI,CAAC,gBAAgB,OAAO,IAAI,GAAG,gBAAgB,OAAO,IAAI,CAAC,CAAC;AAC7F,QAAM,KAAK,QAAQ,cAAc;AACjC,QAAM,SAAS,KAAK,IAAI,EAAE,MAAM,QAAQ,EAAE,MAAM,MAAM;AACtD,WAAS,IAAI,GAAG,IAAI,QAAQ,KAAK;AAC/B,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,UAAM,QAAQ,EAAE,MAAM,CAAC;AACvB,QAAI,CAAC,GAAG,OAAO,KAAK,GAAG;AACrB,aAAO;AAAA,QACL;AAAA,QACA;AAAA,QACA,sBAAsB;AAAA,QACtB;AAAA,QACA;AAAA,QACA,QAAQ,mBAAmB,OAAO,KAAK;AAAA,QACvC,iBAAiB;AAAA,MACnB;AAAA,IACF;AAAA,EACF;AACA,MAAI,EAAE,MAAM,WAAW,EAAE,MAAM,QAAQ;AACrC,WAAO,EAAE,MAAM,MAAM,sBAAsB,MAAM,iBAAiB,OAAO;AAAA,EAC3E;AACA,QAAM,SAAqB,EAAE,MAAM,SAAS,EAAE,MAAM,SAAS,IAAI;AACjE,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,sBAAsB;AAAA,IACtB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,OAAO,EAAE,MAAM,MAAM;AAAA,IACrB,QAAQ,sBAAsB,OAAO,MAAM,SAAS,MAAM,6BAA6B,SAAS,CAAC;AAAA,IACjG,iBAAiB;AAAA,EACnB;AACF;AAEA,SAAS,kBAAkB,GAAmB,GAA4B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO;AACxC,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,OAAQ,QAAO,EAAE,KAAK,aAAa,EAAE,KAAK;AACxF,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,MAAO,QAAO,EAAE,KAAK,UAAU,EAAE,KAAK;AACnF,MAAI,EAAE,KAAK,SAAS,WAAW,EAAE,KAAK,SAAS;AAC7C,WAAO,EAAE,KAAK,cAAc,EAAE,KAAK;AACrC,SAAO,EAAE,KAAK,SAAS,EAAE,KAAK;AAChC;AAEA,SAAS,mBAAmB,GAAmB,GAA2B;AACxE,MAAI,EAAE,KAAK,SAAS,EAAE,KAAK,KAAM,QAAO,QAAQ,EAAE,KAAK,IAAI,OAAO,EAAE,KAAK,IAAI;AAC7E,MAAI,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,SAAS,UAAU,EAAE,KAAK,aAAa,EAAE,KAAK,UAAU;AAC3F,WAAO,QAAQ,EAAE,KAAK,QAAQ,OAAO,EAAE,KAAK,QAAQ;AAAA,EACtD;AACA,MAAI,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,SAAS,SAAS,EAAE,KAAK,UAAU,EAAE,KAAK,OAAO;AACnF,WAAO,SAAS,EAAE,KAAK,KAAK,OAAO,EAAE,KAAK,KAAK;AAAA,EACjD;AACA,SAAO,SAAS,EAAE,KAAK,IAAI,SAAS,EAAE,KAAK,IAAI;AACjD;;;ACvDA,eAAsB,mBAAmB,OAAkD;AACzF,QAAM,OAAO,MAAM,MAAM,MAAM,EAAE,MAAM,QAAQ,CAAC,GAAG;AAAA,IACjD,CAAC,MAAsB,EAAE,SAAS;AAAA,EACpC;AACA,MAAI,IAAI,WAAW,EAAG,QAAO,EAAE,OAAO,CAAC,GAAG,YAAY,CAAC,GAAG,UAAU,CAAC,EAAE;AAEvE,QAAM,cAAc,oBAAI,IAAyB;AACjD,aAAW,KAAK,KAAK;AACnB,UAAM,MAAM,YAAY,IAAI,EAAE,SAAS,KAAK,CAAC;AAC7C,QAAI,KAAK,CAAC;AACV,gBAAY,IAAI,EAAE,WAAW,GAAG;AAAA,EAClC;AAEA,QAAM,WAAW,CAAC,GAAG,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,EAAE,KAAK;AAC9D,QAAM,QAAqB,CAAC;AAC5B,aAAW,CAAC,KAAK,KAAK,KAAK,aAAa;AACtC,UAAM,UAAU,oBAAI,IAAiC;AACrD,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,IAAI,EAAE,OAAO,KAAK,oBAAI,IAAoB;AAC5D,QAAE,IAAI,EAAE,cAAc,EAAE,KAAK;AAC7B,cAAQ,IAAI,EAAE,SAAS,CAAC;AAAA,IAC1B;AACA,UAAM,aAAa,CAAC,GAAG,QAAQ,KAAK,CAAC;AACrC,aAAS,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC1C,eAAS,IAAI,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC9C,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,SAAS,WAAW,CAAC;AAC3B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,IAAI,QAAQ,IAAI,MAAM;AAC5B,cAAM,SAAkC,CAAC;AACzC,mBAAW,CAAC,QAAQ,MAAM,KAAK,GAAG;AAChC,gBAAM,SAAS,EAAE,IAAI,MAAM;AAC3B,cAAI,WAAW,OAAW,QAAO,KAAK,CAAC,QAAQ,MAAM,CAAC;AAAA,QACxD;AACA,YAAI,OAAO,SAAS,EAAG;AACvB,cAAM,cAAc,OAAO;AAAA,UACzB,CAAC,CAAC,QAAQ,MAAM,MACd;AAAA,YACE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,YAClE,EAAE,WAAW,QAAQ,WAAW,KAAK,OAAO,QAAQ,WAAW,GAAG;AAAA,UACpE;AAAA,QACJ;AACA,cAAM,IAAI;AAAA,UACR,YAAY,CAAC,EAAG,IAAI,CAAC,GAAG,OAAO,YAAY,IAAI,CAAC,SAAS,KAAK,EAAE,CAAE,CAAC;AAAA,QACrE;AACA,cAAM,KAAK;AAAA,UACT,QAAQ;AAAA,UACR,QAAQ;AAAA,UACR,WAAW;AAAA,UACX,aAAa,OAAO;AAAA,UACpB,SAAS;AAAA,YACP,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,YACtB,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC;AAAA,UACxB;AAAA,UACA,cAAc;AAAA,QAChB,CAAC;AAAA,MACH;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,OAAO,MAAM,KAAK,CAAC,GAAG,MAAM,EAAE,cAAc,EAAE,WAAW;AAAA,IACzD,YAAY,CAAC,GAAG,YAAY,KAAK,CAAC,EAAE,KAAK;AAAA,IACzC;AAAA,EACF;AACF;AAEA,SAAS,QAAQ,GAAa,GAAqB;AACjD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO;AAClD,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,MAAI,MAAM,GACR,OAAO,GACP,OAAO;AACT,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,WAAO,KAAK;AACZ,YAAQ,KAAK;AACb,YAAQ,KAAK;AAAA,EACf;AACA,MAAI,SAAS,KAAK,SAAS,EAAG,QAAO,SAAS,KAAK,SAAS,IAAI,IAAI;AACpE,SAAO,MAAM,KAAK,KAAK,OAAO,IAAI;AACpC;;;ACvFA,eAAsB,eACpB,OACA,SACA,SACyB;AACzB,QAAM,eAAe,MAAM,MAAM,SAAS,QAAQ,QAAQ;AAC1D,QAAM,gBAAgB,MAAM,MAAM,SAAS,QAAQ,SAAS;AAC5D,QAAM,UAAU,MAAM,QAAQ;AAAA,IAC5B,QAAQ,IAAI,OAAO,MAAM;AACvB,YAAM,UAAU,EAAE,WAAW,eAAe,EAAE,MAAM;AACpD,YAAM,WAAW,MAAM,WAAW,cAAc,SAAS,KAAK;AAC9D,YAAM,YAAY,MAAM,WAAW,eAAe,SAAS,KAAK;AAChE,aAAO,EAAE,QAAQ,EAAE,QAAQ,gBAAgB,EAAE,gBAAgB,UAAU,UAAU;AAAA,IACnF,CAAC;AAAA,EACH;AACA,SAAO,kBAAkB,SAAS,OAAO;AAC3C;AAEA,eAAe,WACb,MACA,SACA,OACmB;AACnB,QAAM,MAAgB,CAAC;AACvB,aAAW,KAAK,MAAM;AACpB,UAAM,IAAI,MAAM,QAAQ,GAAG,KAAK;AAChC,QAAI,MAAM,QAAQ,OAAO,SAAS,CAAC,EAAG,KAAI,KAAK,CAAC;AAAA,EAClD;AACA,SAAO;AACT;AAEA,SAAS,eAAe,QAAyE;AAC/F,SAAO,OAAO,KAAK,UAAU;AAC3B,YAAQ,QAAQ;AAAA,MACd,KAAK;AAAA,MACL,KAAK;AACH,eAAO,IAAI,SAAS,SAAS;AAAA,MAC/B,KAAK;AACH,eAAO,IAAI,SAAS,SAAS,OAAO,IAAI;AAAA,MAC1C,KAAK;AACH,eAAO,IAAI,WAAW,IAAI,YAAY,IAAI,UAAU,IAAI,YAAY;AAAA,MACtE,KAAK,WAAW;AACd,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,eAAe;AAClB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,cAAM,MAAM,MAAM,SAAS,OAAO,IAAI,KAAK;AAC3C,eAAO,aAAa,GAAG,EAAE;AAAA,MAC3B;AAAA,MACA,KAAK,gBAAgB;AACnB,eAAO,gBAAgB,GAAG,MAAM,YAAY,IAAI;AAAA,MAClD;AAAA,MACA;AACE,eAAO;AAAA,IACX;AAAA,EACF;AACF;;;AClDA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,iBAAiB,QAAQ,kBAAkB;AACjD,QAAM,OAAO,QAAQ,QACjB,CAAC,EAAE,OAAO,QAAQ,MAAM,CAAC,KACxB,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE;AAE5D,QAAM,WAA+B,CAAC;AACtC,aAAW,EAAE,MAAM,KAAK,MAAM;AAC5B,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,UAAM,QAAQ,oBAAI,IAAsD;AACxE,eAAW,KAAK,OAAO;AACrB,YAAM,IAAI,QAAQ,EAAE,IAAI;AACxB,YAAM,MAAM,GAAG,EAAE,QAAQ,IAAI,CAAC;AAC9B,YAAM,SAAS,MAAM,IAAI,GAAG,KAAK,EAAE,OAAO,CAAC,GAAG,SAAS,EAAE;AACzD,aAAO,MAAM,KAAK,CAAC;AACnB,YAAM,IAAI,KAAK,MAAM;AAAA,IACvB;AACA,eAAW,CAAC,KAAK,EAAE,OAAO,SAAS,EAAE,CAAC,KAAK,OAAO;AAChD,UAAI,MAAM,SAAS,eAAgB;AACnC,YAAM,SAAS,CAAC,GAAG,KAAK,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS;AAClE,YAAM,QAAQ,OAAO,CAAC,EAAG;AACzB,YAAM,OAAO,OAAO,OAAO,SAAS,CAAC,EAAG;AACxC,eAAS,KAAK;AAAA,QACZ;AAAA,QACA,UAAU,IAAI,MAAM,GAAG,EAAE,CAAC;AAAA,QAC1B,SAAS;AAAA,QACT,aAAa,OAAO;AAAA,QACpB,SAAS,OAAO,IAAI,CAAC,MAAM,EAAE,MAAM;AAAA,QACnC,UAAU,OAAO;AAAA,MACnB,CAAC;AAAA,IACH;AAAA,EACF;AAEA,QAAM,eAAe,IAAI,IAAI,SAAS,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC;AACzD,SAAO;AAAA,IACL;AAAA,IACA,kBAAkB,KAAK,SAAS,IAAI,aAAa,OAAO,KAAK,SAAS;AAAA,IACtE,WAAW,KAAK;AAAA,EAClB;AACF;;;AC5CA,eAAsB,cACpB,OACA,UAA4B,CAAC,GACH;AAC1B,QAAM,OAAO,QAAQ,QAAQ,CAAC,QAAQ,KAAK,KAAK,MAAM,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,EAAE,KAAK;AAE1F,QAAM,QAA4B,CAAC;AACnC,MAAI,aAAa;AACjB,MAAI,cAAc;AAClB,aAAW,SAAS,MAAM;AACxB,UAAM,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1C,QAAI,MAAM,WAAW,GAAG;AACtB,YAAM,KAAK,EAAE,OAAO,aAAa,GAAG,YAAY,GAAG,WAAW,EAAE,CAAC;AACjE;AAAA,IACF;AACA,UAAM,OAAO,MAAM,SAAS,OAAO,KAAK;AACxC,QAAI,SAAS;AACb,eAAW,KAAK,OAAO;AACrB,UAAI,EAAE,WAAW,SAAS;AACxB;AACA;AAAA,MACF;AACA,YAAM,WAAW,KAAK,OAAO,CAAC,MAAM,EAAE,YAAY,EAAE,SAAS;AAC7D,UAAI,QAAQ,aAAa;AACvB,YAAI,CAAC,QAAQ,YAAY,GAAG,EAAE,KAAK,SAAS,CAAC,EAAG;AAAA,MAClD,OAAO;AAGL,cAAM,YAAY,UAAU,EAAE,MAAM;AACpC,cAAM,OAAO,SAAS;AAAA,UAAK,CAAC,MAC1B,EAAE,SAAS;AAAA,YACT,CAAC,MACC,OAAO,EAAE,YAAY,YACrB,aACA,EAAE,QAAQ,SAAS,UAAU,MAAM,GAAG,GAAG,CAAC;AAAA,UAC9C;AAAA,QACF;AACA,YAAI,CAAC,KAAM;AAAA,MACb;AAAA,IACF;AACA,UAAM,YAAY,SAAS,MAAM;AACjC,UAAM,KAAK,EAAE,OAAO,aAAa,QAAQ,YAAY,MAAM,QAAQ,UAAU,CAAC;AAC9E,kBAAc,MAAM;AACpB,mBAAe;AAAA,EACjB;AACA,SAAO,EAAE,OAAO,kBAAkB,aAAa,IAAI,cAAc,aAAa,EAAE;AAClF;AAEA,SAAS,UAAU,GAAoB;AACrC,MAAI,MAAM,QAAQ,MAAM,OAAW,QAAO;AAC1C,MAAI,OAAO,MAAM,SAAU,QAAO;AAClC,MAAI;AACF,WAAO,KAAK,UAAU,CAAC;AAAA,EACzB,QAAQ;AACN,WAAO,OAAO,CAAC;AAAA,EACjB;AACF;","names":[]}
@@ -0,0 +1,99 @@
1
+ import { P as PrmGradedTrace, S as StepRubric, a as PrmGrader } from '../rubric-D5tjHNJQ.js';
2
+ export { G as GradedStep, b as StepContext, i as isPrmVerdict } from '../rubric-D5tjHNJQ.js';
3
+ import { S as Span, T as TraceStore } from '../store-Db2Bv8Cf.js';
4
+ import '../trajectory-CnoBo-JY.js';
5
+
6
+ /**
7
+ * Export PRM-graded traces as training data for downstream reward-model
8
+ * fine-tuning. Canonical format is NDJSON of
9
+ * `{ trajectory_text, step_index, rubric, score }` so a small model can
10
+ * learn to predict step rewards from step context.
11
+ *
12
+ * The framework doesn't train the model — we emit the data; callers
13
+ * plug it into their preferred trainer (TRL, Unsloth, custom).
14
+ */
15
+
16
+ interface PrmTrainingSample {
17
+ runId: string;
18
+ spanId: string;
19
+ rubricId: string;
20
+ score: number;
21
+ /** Serialized step context — step + surrounding conversation. */
22
+ context: {
23
+ priorTurns: Array<{
24
+ role: string;
25
+ content: string;
26
+ }>;
27
+ step: {
28
+ kind: Span['kind'];
29
+ text: string;
30
+ };
31
+ };
32
+ /** Optional evidence + rationale for auditability. */
33
+ rationale?: string;
34
+ evidence?: string;
35
+ }
36
+ declare function exportTrainingData(store: TraceStore, graded: PrmGradedTrace[], options?: {
37
+ contextWindow?: number;
38
+ }): Promise<PrmTrainingSample[]>;
39
+ /** NDJSON serialization — write to file or stream directly to a trainer. */
40
+ declare function toNdjson(samples: PrmTrainingSample[]): string;
41
+
42
+ /**
43
+ * Built-in reference rubrics. Consumers combine these with domain
44
+ * rubrics. All are deterministic, rule-based — cheap to run + easy
45
+ * to unit-test. LLM-based rubrics are trivially authored by
46
+ * following the StepRubric contract.
47
+ */
48
+
49
+ /** Penalize very short or very long assistant outputs. */
50
+ declare function outputLengthRubric(args?: {
51
+ minChars?: number;
52
+ maxChars?: number;
53
+ weight?: number;
54
+ }): StepRubric;
55
+ /** Reward tool calls that succeeded (status='ok') with an informative result. */
56
+ declare function toolSuccessRubric(args?: {
57
+ weight?: number;
58
+ }): StepRubric;
59
+ /** Penalize tool calls that duplicate a prior call with identical args. */
60
+ declare function toolNonRedundantRubric(args?: {
61
+ weight?: number;
62
+ }): StepRubric;
63
+ /** Penalize LLM outputs that contain common refusal markers when a refusal
64
+ * is NOT expected (caller inverts weight for scenarios where refusal IS expected). */
65
+ declare function nonRefusalRubric(args?: {
66
+ markers?: RegExp[];
67
+ weight?: number;
68
+ }): StepRubric;
69
+ /** Reward outputs that invoke the next-step tool the trajectory actually uses
70
+ * (i.e. the LLM span announced "I will call X" and the following tool span IS X). */
71
+ declare function toolIntentAlignmentRubric(args?: {
72
+ weight?: number;
73
+ }): StepRubric;
74
+
75
+ /**
76
+ * Inference-time PRM scoring — pick the best of N candidate trajectories
77
+ * using a trained reward model (or a rule-based PRM as a proxy).
78
+ *
79
+ * The canonical Best-of-N pattern: generate N completions, score each
80
+ * with a PRM, pick the winner. Here the scoring loop is framework-agnostic
81
+ * — supply a TraceStore + PrmGrader + N run IDs → get ranking + winner.
82
+ */
83
+
84
+ interface BestOfNResult {
85
+ winner: PrmGradedTrace;
86
+ ranked: PrmGradedTrace[];
87
+ /** Standard deviation of aggregate scores — small = candidates were homogenous. */
88
+ stdDev: number;
89
+ }
90
+ declare function prmBestOfN(store: TraceStore, grader: PrmGrader, runIds: string[]): Promise<BestOfNResult>;
91
+ /**
92
+ * Weighted vote across multiple graders — use when you want a PRM ensemble
93
+ * (e.g. rule-based + LLM-based + trained model). Each grader produces its
94
+ * own ranking; we aggregate via rank-sum (Borda count) so no single grader
95
+ * dominates via a different score scale.
96
+ */
97
+ declare function prmEnsembleBestOfN(store: TraceStore, graders: PrmGrader[], runIds: string[]): Promise<BestOfNResult>;
98
+
99
+ export { type BestOfNResult, PrmGradedTrace, PrmGrader, type PrmTrainingSample, StepRubric, exportTrainingData, nonRefusalRubric, outputLengthRubric, prmBestOfN, prmEnsembleBestOfN, toNdjson, toolIntentAlignmentRubric, toolNonRedundantRubric, toolSuccessRubric };