@tangle-network/agent-eval 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. package/CHANGELOG.md +145 -0
  2. package/README.md +212 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
  20. package/dist/chunk-5LBB5B3Z.js.map +1 -0
  21. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  22. package/dist/chunk-6QDKWHLS.js.map +1 -0
  23. package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
  24. package/dist/chunk-EDUKQ5AM.js.map +1 -0
  25. package/dist/chunk-I4MBDTY5.js +272 -0
  26. package/dist/chunk-I4MBDTY5.js.map +1 -0
  27. package/dist/chunk-JLZQWFV3.js +618 -0
  28. package/dist/chunk-JLZQWFV3.js.map +1 -0
  29. package/dist/chunk-K2TPS5LB.js +569 -0
  30. package/dist/chunk-K2TPS5LB.js.map +1 -0
  31. package/dist/chunk-KKHDIONI.js +414 -0
  32. package/dist/chunk-KKHDIONI.js.map +1 -0
  33. package/dist/chunk-KMPRBJK4.js +74 -0
  34. package/dist/chunk-KMPRBJK4.js.map +1 -0
  35. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  36. package/dist/chunk-KTGTIOFD.js.map +1 -0
  37. package/dist/chunk-LSH4MMOZ.js +838 -0
  38. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  39. package/dist/chunk-NG236HPC.js +57 -0
  40. package/dist/chunk-NG236HPC.js.map +1 -0
  41. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  42. package/dist/chunk-NLMNWKVM.js.map +1 -0
  43. package/dist/chunk-NU65VQ7M.js +99 -0
  44. package/dist/chunk-NU65VQ7M.js.map +1 -0
  45. package/dist/chunk-OWLAAMME.js +250 -0
  46. package/dist/chunk-OWLAAMME.js.map +1 -0
  47. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  48. package/dist/chunk-PC4UYEBM.js.map +1 -0
  49. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  50. package/dist/chunk-RAF443UI.js.map +1 -0
  51. package/dist/chunk-RZTMDUO7.js +49 -0
  52. package/dist/chunk-RZTMDUO7.js.map +1 -0
  53. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  54. package/dist/chunk-SESZDQPX.js.map +1 -0
  55. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  56. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +2018 -3003
  80. package/dist/index.js +7443 -9102
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +491 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +345 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-BNgMdqPF.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +369 -25
  125. package/dist/wire/index.js +22 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -0,0 +1,618 @@
1
+ import {
2
+ cohensD
3
+ } from "./chunk-I4MBDTY5.js";
4
+ import {
5
+ argHash,
6
+ groupBy,
7
+ toolSpans
8
+ } from "./chunk-47X6LRCE.js";
9
+
10
+ // src/failure-taxonomy.ts
11
/**
 * Returns the first event of kind "custom" whose payload satisfies
 * `payloadMatches`. The payload is only inspected for "custom" events.
 */
function findCustomEvent(events, payloadMatches) {
  return events.find((event) => event.kind === "custom" && payloadMatches(event.payload));
}

/**
 * Wraps a triggering event into a classification result, or yields `null`
 * when no triggering event was found.
 */
function eventVerdict(event, failureClass, reason) {
  if (!event) return null;
  return { failureClass, reason, triggerEventId: event.eventId };
}

/**
 * Built-in failure classification rules.
 *
 * Each rule has an `id` and a `match(ctx)` function that receives
 * `{ run, spans, events }` and returns either `null` (no opinion) or an object
 * with `failureClass`, `reason` and, where a trigger was identified,
 * `triggerEventId` or `triggerSpanId`. `classifyFailure` walks the list in
 * array order and stops at the first non-null result, so earlier rules take
 * precedence over later ones.
 */
var DEFAULT_RULES = [
  // An outcome that already names its failure class wins over every heuristic.
  {
    id: "explicit-outcome",
    match({ run }) {
      const declared = run.outcome?.failureClass;
      if (!declared || declared === "unknown") return null;
      return { failureClass: declared, reason: "outcome.failureClass set explicitly" };
    },
  },
  {
    id: "knowledge-readiness-blocked",
    match({ events }) {
      const blocked = findCustomEvent(
        events,
        (payload) => payload.kind === "readiness_scored" && payload.passed === false,
      );
      return eventVerdict(
        blocked,
        "knowledge_readiness_blocked",
        "knowledge readiness report blocked execution",
      );
    },
  },
  {
    id: "bad-integration-manifest",
    match({ events }) {
      const invalid = findCustomEvent(events, (payload) => {
        if (payload.kind === "integration_manifest_validated") return payload.valid === false;
        return payload.kind === "integration_invoke_failed" && payload.code === "manifest_invalid";
      });
      return eventVerdict(
        invalid,
        "bad_integration_manifest",
        "integration manifest validation failed before launch",
      );
    },
  },
  {
    id: "missing-integration-connection",
    match({ events }) {
      const unresolved = findCustomEvent(
        events,
        (payload) =>
          payload.kind === "integration_manifest_resolved" &&
          hasResolutionStatus(payload, "missing_connection"),
      );
      return eventVerdict(
        unresolved,
        "missing_integration_connection",
        "required integration connection was missing",
      );
    },
  },
  {
    id: "missing-integration-scope",
    match({ events }) {
      const underScoped = findCustomEvent(events, (payload) => {
        if (payload.kind === "integration_manifest_resolved") return hasMissingScopes(payload);
        return payload.kind === "integration_invoke_failed" && payload.code === "scope_denied";
      });
      return eventVerdict(
        underScoped,
        "missing_integration_scope",
        "integration grant or connection lacks required scopes",
      );
    },
  },
  {
    id: "integration-approval-required",
    match({ events }) {
      const awaitingApproval = findCustomEvent(events, (payload) => {
        switch (payload.kind) {
          case "integration_invoke":
            return payload.status === "approval_required";
          case "integration_invoke_failed":
            return payload.code === "approval_required";
          case "integration_approval_required":
            return true;
          default:
            return false;
        }
      });
      return eventVerdict(
        awaitingApproval,
        "integration_approval_required",
        "integration write paused for user approval",
      );
    },
  },
  {
    id: "integration-auth-expired",
    match({ events }) {
      const expiryCodes = ["auth_expired", "connection_not_active", "capability_expired"];
      const expired = findCustomEvent(
        events,
        (payload) =>
          payload.kind === "integration_invoke_failed" &&
          (expiryCodes.includes(payload.code) || payload.status === "expired"),
      );
      return eventVerdict(
        expired,
        "integration_auth_expired",
        "integration connection or capability expired",
      );
    },
  },
  {
    id: "unsafe-integration-write-denied",
    match({ events }) {
      const denialCodes = ["unsafe_write_denied", "policy_denied", "action_denied"];
      const denied = findCustomEvent(
        events,
        (payload) =>
          payload.kind === "integration_invoke_failed" && denialCodes.includes(payload.code),
      );
      return eventVerdict(
        denied,
        "unsafe_integration_write_denied",
        "integration write was denied by policy or capability scope",
      );
    },
  },
  {
    id: "integration-provider-failure",
    match({ events }) {
      // Codes that the more specific integration rules above already claim;
      // any other invoke failure is attributed to the provider.
      const claimedCodes = [
        "scope_denied",
        "approval_required",
        "auth_expired",
        "connection_not_active",
        "capability_expired",
        "unsafe_write_denied",
        "policy_denied",
        "action_denied",
        "manifest_invalid",
      ];
      const providerFailure = findCustomEvent(
        events,
        (payload) =>
          payload.kind === "integration_invoke_failed" &&
          !claimedCodes.includes(String(payload.code)),
      );
      return eventVerdict(
        providerFailure,
        "integration_provider_failure",
        "integration provider invocation failed",
      );
    },
  },
  {
    id: "missing-credentials",
    match({ events }) {
      const gap = findCustomEvent(
        events,
        (payload) => payload.kind === "knowledge_gap" && payload.category === "credential_or_secret",
      );
      return eventVerdict(gap, "missing_credentials", "required credential or secret was missing");
    },
  },
  {
    id: "bad-retrieval",
    match({ run, spans }) {
      if (run.outcome?.pass !== false) return null;
      const uselessRetrieval = spans.find((span) => {
        if (span.kind !== "retrieval") return false;
        return span.hits.length === 0 || span.hits.every((hit) => hit.score <= 0);
      });
      if (!uselessRetrieval) return null;
      return {
        failureClass: "bad_retrieval",
        reason: "retrieval returned no useful hits for a failed run",
        triggerSpanId: uselessRetrieval.spanId,
      };
    },
  },
  {
    id: "insufficient-evidence",
    match({ events }) {
      const gap = findCustomEvent(
        events,
        (payload) => payload.kind === "knowledge_gap" && payload.reason === "insufficient_evidence",
      );
      return eventVerdict(
        gap,
        "insufficient_evidence",
        "task proceeded with insufficient supporting evidence",
      );
    },
  },
  {
    id: "contradictory-evidence",
    match({ events }) {
      const gap = findCustomEvent(
        events,
        (payload) => payload.kind === "knowledge_gap" && payload.reason === "contradictory_evidence",
      );
      return eventVerdict(gap, "contradictory_evidence", "supporting evidence contradicted itself");
    },
  },
  // Any budget_breach event classifies the run as over budget.
  {
    id: "budget-breach",
    match({ events }) {
      const breach = events.find((event) => event.kind === "budget_breach");
      if (!breach) return null;
      return {
        failureClass: "budget_exceeded",
        reason: `budget breached on ${breach.payload.dimension ?? "unknown dimension"}`,
        triggerEventId: breach.eventId,
      };
    },
  },
  // Any policy_violation event classifies the run as a policy violation.
  {
    id: "policy-violation",
    match({ events }) {
      const violation = events.find((event) => event.kind === "policy_violation");
      return eventVerdict(violation, "policy_violation", "policy_violation event emitted");
    },
  },
  // A sandbox span that reports a numeric, non-zero exit code.
  {
    id: "sandbox-failure",
    match({ spans }) {
      const crashed = spans.find(
        (span) => span.kind === "sandbox" && typeof span.exitCode === "number" && span.exitCode !== 0,
      );
      if (!crashed) return null;
      return {
        failureClass: "sandbox_failure",
        reason: `sandbox exited ${crashed.exitCode}`,
        triggerSpanId: crashed.spanId,
      };
    },
  },
  // Aborted run with a timeout hint in an error event or in the outcome notes.
  {
    id: "timeout",
    match({ run, events }) {
      if (run.status !== "aborted") return null;
      const timeoutErrorSeen = events.some((event) => {
        if (event.kind !== "error") return false;
        return String(event.payload.reason ?? "").toLowerCase().includes("timeout");
      });
      const notes = (run.outcome?.notes ?? "").toLowerCase();
      const signalled = timeoutErrorSeen || notes.includes("timeout") || notes.includes("deadline");
      return signalled ? { failureClass: "timeout", reason: "timeout signal observed" } : null;
    },
  },
  // A tool that was called at least three times and errored on every call.
  {
    id: "tool-recovery-failure",
    match({ spans }) {
      const callsByTool = new Map();
      for (const span of spans) {
        if (span.kind !== "tool") continue;
        const calls = callsByTool.get(span.toolName);
        if (calls) {
          calls.push(span);
        } else {
          callsByTool.set(span.toolName, [span]);
        }
      }
      for (const [toolName, calls] of callsByTool) {
        const failures = calls.filter((call) => call.status === "error");
        if (failures.length < 3 || failures.length !== calls.length) continue;
        return {
          failureClass: "tool_recovery_failure",
          reason: `${failures.length} consecutive errors on tool "${toolName}"`,
          triggerSpanId: failures[failures.length - 1].spanId,
        };
      }
      return null;
    },
  },
  // Failed run in which an agent span advertised tools but no tool span exists.
  {
    id: "tool-selection-error",
    match({ run, spans }) {
      if (run.outcome?.pass !== false) return null;
      const toolsWereOffered = spans.some((span) => {
        if (span.kind !== "agent") return false;
        const offered = span.attributes?.toolsAvailable;
        return offered !== undefined && offered > 0;
      });
      const toolCallCount = spans.filter((span) => span.kind === "tool").length;
      if (!toolsWereOffered || toolCallCount !== 0) return null;
      return {
        failureClass: "tool_selection_error",
        reason: "tools were available but none were called",
      };
    },
  },
  // A judge span on the "format" dimension that scored below 0.5.
  {
    id: "format-drift",
    match({ spans }) {
      const lowFormatScore = spans.find(
        (span) => span.kind === "judge" && span.dimension === "format" && span.score < 0.5,
      );
      if (!lowFormatScore) return null;
      return {
        failureClass: "format_drift",
        reason: "format judge scored below 0.5",
        triggerSpanId: lowFormatScore.spanId,
      };
    },
  },
];
301
/**
 * Tells whether a manifest-resolution payload reports the given status.
 *
 * For the "missing_connection" status a non-empty `payload.missingConnections`
 * string list is sufficient on its own; otherwise any resolution item whose
 * `status` equals `status` counts.
 *
 * @param {object} payload - Payload of an "integration_manifest_resolved" event.
 * @param {string} status - Status value to look for.
 * @returns {boolean} Whether the status is reported.
 */
function hasResolutionStatus(payload, status) {
  const connectionsListed =
    status === "missing_connection" && stringArray(payload.missingConnections).length > 0;
  if (connectionsListed) {
    return true;
  }
  for (const entry of resolutionItems(payload)) {
    if (entry.status === status) {
      return true;
    }
  }
  return false;
}
306
/**
 * Tells whether a manifest-resolution payload reports missing scopes, either
 * through a non-empty top-level `missingScopes` string list or through any
 * resolution item carrying a non-empty `missingScopes` array.
 *
 * @param {object} payload - Payload of an "integration_manifest_resolved" event.
 * @returns {boolean} Whether any missing scope is reported.
 */
function hasMissingScopes(payload) {
  const topLevelScopes = stringArray(payload.missingScopes);
  if (topLevelScopes.length !== 0) {
    return true;
  }
  return resolutionItems(payload).some(
    ({ missingScopes }) => Array.isArray(missingScopes) && missingScopes.length > 0,
  );
}
312
/**
 * Collects the object entries of `payload.missing`, `payload.optionalMissing`
 * and `payload.ready`, in that order, into one new array.
 *
 * @param {object} payload - Manifest-resolution payload.
 * @returns {object[]} Concatenated entries; empty when none of the lists exist.
 */
function resolutionItems(payload) {
  const { missing, optionalMissing, ready } = payload;
  return records(missing).concat(records(optionalMissing), records(ready));
}
319
/**
 * Keeps only the non-null, non-array object entries of `value`.
 *
 * @param {unknown} value - Candidate list; anything that is not an array yields `[]`.
 * @returns {object[]} A new array holding the object entries in original order.
 */
function records(value) {
  if (!Array.isArray(value)) {
    return [];
  }
  const plainObjects = [];
  for (const entry of value) {
    const isObject = entry !== null && typeof entry === "object";
    if (isObject && !Array.isArray(entry)) {
      plainObjects.push(entry);
    }
  }
  return plainObjects;
}
325
/**
 * Keeps only the string entries of `value`.
 *
 * @param {unknown} value - Candidate list; anything that is not an array yields `[]`.
 * @returns {string[]} A new array with the string entries in original order.
 */
function stringArray(value) {
  if (!Array.isArray(value)) {
    return [];
  }
  return value.filter((entry) => typeof entry === "string");
}
328
/**
 * Classifies a run by applying `rules` in order and returning the first match.
 *
 * A run whose status is "completed" and whose outcome is not explicitly
 * `pass === false` is reported as "success" without consulting any rule. When
 * no rule matches, the class is "unknown".
 *
 * @param {{run: object, spans: object[], events: object[]}} ctx - Classification context handed to every rule.
 * @param {Iterable<{id: string, match: Function}>} [rules=DEFAULT_RULES] - Rules to evaluate, highest priority first.
 * @returns {{failureClass: string, reason: string}} The classification (a rule hit may carry extra trigger fields).
 */
function classifyFailure(ctx, rules = DEFAULT_RULES) {
  const { run } = ctx;
  const explicitlyFailed = run.outcome?.pass === false;
  if (!explicitlyFailed && run.status === "completed") {
    return {
      failureClass: "success",
      reason: "run completed with pass=true (or no explicit fail)",
    };
  }

  for (const rule of rules) {
    const verdict = rule.match(ctx);
    if (verdict) {
      return verdict;
    }
  }

  return {
    failureClass: "unknown",
    reason: "no rule matched; run failed for unclassified reason",
  };
}
338
+
339
+ // src/pipelines/failure-cluster.ts
340
/**
 * Groups the failed runs of a store into clusters.
 *
 * A run counts as failed unless its status is "completed" and its outcome is
 * not explicitly `pass === false`. Each failed run is classified with
 * `classifyFailure` and keyed by failure class, tool name, the first 16
 * characters of the tool-argument hash, and judge dimension. Tool name and
 * dimension come from the triggering span when the classification names one;
 * otherwise they fall back to the last errored tool span and to the first
 * judge span with a string dimension.
 *
 * @param {object} store - Must provide async `listRuns()`, `spans({ runId })` and `events({ runId })`.
 * @param {{rules?: object[], minClusterSize?: number}} [options] - Rule override and minimum run count per reported cluster (default 1).
 * @returns {Promise<{clusters: object[], totalFailures: number, totalRuns: number}>}
 *   Clusters sorted by descending run count, plus failure and run totals.
 */
async function failureClusterView(store, options = {}) {
  const rules = options.rules ?? DEFAULT_RULES;
  const minClusterSize = options.minClusterSize ?? 1;
  const runs = await store.listRuns();
  const clustersByKey = new Map();
  let totalFailures = 0;

  for (const run of runs) {
    const passed = run.status === "completed" && run.outcome?.pass !== false;
    if (passed) continue;
    totalFailures += 1;

    const spans = await store.spans({ runId: run.runId });
    const events = await store.events({ runId: run.runId });
    const verdict = classifyFailure({ run, spans, events }, rules);

    let toolName;
    let argPrefix;
    let dimension;

    // Prefer whatever span the classification itself pointed at.
    const trigger = verdict.triggerSpanId
      ? spans.find((span) => span.spanId === verdict.triggerSpanId)
      : undefined;
    switch (trigger?.kind) {
      case "tool":
        toolName = trigger.toolName;
        argPrefix = argHash(trigger.args).slice(0, 16);
        break;
      case "judge":
        dimension = trigger.dimension;
        break;
      default:
        break;
    }

    // Fallback: the most recent tool call that ended in an error.
    if (!toolName) {
      const erroredCalls = (await toolSpans(store, run.runId)).filter(
        (call) => call.status === "error",
      );
      const lastErrored = erroredCalls[erroredCalls.length - 1];
      if (lastErrored) {
        toolName = lastErrored.toolName;
        argPrefix = argHash(lastErrored.args).slice(0, 16);
      }
    }

    // Fallback: the first judge span that carries a string dimension.
    if (!dimension) {
      const judged = spans.find(
        (span) => span.kind === "judge" && typeof span.dimension === "string",
      );
      if (judged) dimension = judged.dimension;
    }

    const key = `${verdict.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
    let cluster = clustersByKey.get(key);
    if (cluster === undefined) {
      cluster = {
        failureClass: verdict.failureClass,
        toolName,
        argPrefix,
        dimension,
        runCount: 0,
        scenarioIds: [],
        exampleRunId: run.runId,
        exampleError: firstErrorMessage(spans) ?? verdict.reason,
      };
      clustersByKey.set(key, cluster);
    }
    cluster.runCount += 1;
    if (!cluster.scenarioIds.includes(run.scenarioId)) {
      cluster.scenarioIds.push(run.scenarioId);
    }
  }

  const clusters = Array.from(clustersByKey.values())
    .filter((cluster) => cluster.runCount >= minClusterSize)
    .sort((left, right) => right.runCount - left.runCount);
  return { clusters, totalFailures, totalRuns: runs.length };
}
397
/**
 * Returns the `error` field of the first span whose status is "error".
 *
 * @param {object[]} spans - Spans to scan in order.
 * @returns {*} That span's `error` value, or `undefined` when no span errored
 *   (or the first errored span carries no `error`).
 */
function firstErrorMessage(spans) {
  for (const span of spans) {
    if (span.status === "error") {
      return span.error;
    }
  }
  return undefined;
}
401
+
402
+ // src/tool-use-metrics.ts
403
/**
 * Summarises how one run used its tools.
 *
 * Tool spans are fetched through `toolSpans(store, runId)` and processed in
 * ascending `startedAt` order. Reported figures:
 * - `byTool[name]`: `calls`, `errors`, `avgLatencyMs`, `duplicates` per tool.
 *   `avgLatencyMs` averages only the calls that reported a numeric
 *   `latencyMs` (0 when none did), so spans without a latency no longer drag
 *   the average towards zero.
 * - `errorRate`: errored calls / all calls.
 * - `duplicateRate`: calls repeating an earlier (tool, argument-hash) pair / all calls.
 * - `retryRate`: share of errored calls that were followed by another call to
 *   the same tool.
 * - `selectionAccuracy`: only when `options.selectionLabels` is given; share of
 *   labelled spans whose label is truthy (`undefined` if no span is labelled).
 *
 * @param {object} store - Trace store handed through to `toolSpans`.
 * @param {string} runId - Run to analyse.
 * @param {{selectionLabels?: Record<string, boolean>}} [options] - Optional per-span labels keyed by `spanId`.
 * @returns {Promise<object>} The metrics object described above.
 */
async function computeToolUseMetrics(store, runId, options = {}) {
  const tools = await toolSpans(store, runId);
  if (tools.length === 0) {
    return { runId, totalCalls: 0, byTool: {}, errorRate: 0, duplicateRate: 0, retryRate: 0 };
  }
  const sortedTools = [...tools].sort((a, b) => a.startedAt - b.startedAt);

  // Tallies live in a Map so that tool names colliding with Object.prototype
  // members (e.g. "constructor", "toString") cannot pick up inherited values.
  const tallies = new Map();
  const seenSignatures = new Set();
  let totalErrors = 0;
  let totalDuplicates = 0;
  for (const span of sortedTools) {
    let tally = tallies.get(span.toolName);
    if (!tally) {
      tally = { calls: 0, errors: 0, latencySum: 0, latencySamples: 0, duplicates: 0 };
      tallies.set(span.toolName, tally);
    }
    tally.calls += 1;
    if (span.status === "error") {
      tally.errors += 1;
      totalErrors += 1;
    }
    if (typeof span.latencyMs === "number") {
      tally.latencySum += span.latencyMs;
      tally.latencySamples += 1;
    }
    const signature = `${span.toolName}|${argHash(span.args)}`;
    if (seenSignatures.has(signature)) {
      tally.duplicates += 1;
      totalDuplicates += 1;
    }
    seenSignatures.add(signature);
  }

  // Object.fromEntries defines own data properties, so the public result stays
  // a plain object keyed by tool name in first-seen order.
  const byTool = Object.fromEntries(
    [...tallies].map(([toolName, tally]) => [
      toolName,
      {
        calls: tally.calls,
        errors: tally.errors,
        avgLatencyMs: tally.latencySamples > 0 ? tally.latencySum / tally.latencySamples : 0,
        duplicates: tally.duplicates,
      },
    ]),
  );

  // An errored call counts as "retried" when any later call to the same tool exists.
  let retryOpportunities = 0;
  let retriesFollowed = 0;
  for (const [, calls] of groupBy(sortedTools, (t) => t.toolName)) {
    for (let i = 0; i < calls.length; i++) {
      if (calls[i].status !== "error") continue;
      retryOpportunities += 1;
      if (calls[i + 1]) retriesFollowed += 1;
    }
  }
  const retryRate = retryOpportunities > 0 ? retriesFollowed / retryOpportunities : 0;

  let selectionAccuracy;
  if (options.selectionLabels) {
    const labeled = sortedTools.filter((t) => t.spanId in options.selectionLabels);
    if (labeled.length > 0) {
      selectionAccuracy =
        labeled.filter((t) => options.selectionLabels[t.spanId]).length / labeled.length;
    }
  }

  return {
    runId,
    totalCalls: sortedTools.length,
    byTool,
    errorRate: totalErrors / sortedTools.length,
    duplicateRate: totalDuplicates / sortedTools.length,
    retryRate,
    selectionAccuracy,
  };
}
458
+
459
+ // src/baseline.ts
460
/**
 * Compares candidate samples with baseline samples, metric by metric.
 *
 * For each entry the function reports both means, their difference
 * (candidate - baseline), Cohen's d, Welch's t-test results, and a verdict:
 * - "unstable" when either side's IQR divided by |mean| exceeds `unstableCvThreshold`;
 * - "improved" / "regressed" when p < `alpha` and |d| >= `effectThreshold`,
 *   the direction being decided by `higherIsBetter` and the sign of the delta;
 * - "stable" otherwise.
 *
 * @param {Array<{metric: string, baseline: number[], candidate: number[], higherIsBetter?: boolean}>} samples
 * @param {{effectThreshold?: number, alpha?: number, unstableCvThreshold?: number}} [options]
 *   Defaults: 0.5, 0.05 and 0.3 respectively.
 * @returns {{metrics: object[], hasRegression: boolean, hasUnstable: boolean}}
 * @throws {Error} When any entry has fewer than two samples on either side.
 */
function compareToBaseline(samples, options = {}) {
  const minEffect = options.effectThreshold ?? 0.5;
  const significance = options.alpha ?? 0.05;
  const maxDispersion = options.unstableCvThreshold ?? 0.3;

  // Relative spread check; the 1e-9 floor avoids dividing by a zero mean.
  const withinDispersion = (spread, center) =>
    spread / Math.max(Math.abs(center), 1e-9) <= maxDispersion;

  const evaluate = (sample) => {
    const { baseline, candidate } = sample;
    if (baseline.length < 2 || candidate.length < 2) {
      throw new Error(`compareToBaseline: need \u22652 samples per side for "${sample.metric}"`);
    }

    const baselineMean = mean(baseline);
    const candidateMean = mean(candidate);
    const delta = candidateMean - baselineMean;
    const effect = cohensD(baseline, candidate);
    const welch = welchsTTest(baseline, candidate);
    const baselineSpread = iqr(baseline);
    const candidateSpread = iqr(candidate);
    const baselineSteady = withinDispersion(baselineSpread, baselineMean);
    const candidateSteady = withinDispersion(candidateSpread, candidateMean);
    const stable = baselineSteady && candidateSteady;

    let verdict = "stable";
    if (!stable) {
      verdict = "unstable";
    } else if (welch.p < significance && Math.abs(effect) >= minEffect) {
      const candidateIsBetter = sample.higherIsBetter ? delta > 0 : delta < 0;
      verdict = candidateIsBetter ? "improved" : "regressed";
    }

    return {
      metric: sample.metric,
      baselineMean,
      candidateMean,
      delta,
      cohensD: effect,
      welchT: welch.t,
      welchDf: welch.df,
      welchP: welch.p,
      stable,
      iqr: Math.max(baselineSpread, candidateSpread),
      verdict,
    };
  };

  const metrics = samples.map((sample) => evaluate(sample));
  const anyWithVerdict = (wanted) => metrics.some((entry) => entry.verdict === wanted);
  return {
    metrics,
    hasRegression: anyWithVerdict("regressed"),
    hasUnstable: anyWithVerdict("unstable"),
  };
}
508
/**
 * Arithmetic mean of `xs`.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} Sum divided by length; `NaN` for an empty array (0 / 0).
 */
function mean(xs) {
  let total = 0;
  xs.forEach((value) => {
    total += value;
  });
  return total / xs.length;
}
511
/**
 * Interquartile range (Q3 - Q1) of `xs`, with quartiles obtained by linear
 * interpolation between the neighbouring order statistics. The input array is
 * not modified.
 *
 * @param {number[]} xs - Sample values in any order.
 * @returns {number} The interquartile range; 0 for an empty array.
 */
function iqr(xs) {
  if (xs.length === 0) {
    return 0;
  }
  const ascending = [...xs];
  ascending.sort((left, right) => left - right);
  const lastIndex = ascending.length - 1;

  const quantile = (fraction) => {
    const position = fraction * lastIndex;
    const below = Math.floor(position);
    const above = Math.ceil(position);
    return ascending[below] + (ascending[above] - ascending[below]) * (position - below);
  };

  const upperQuartile = quantile(0.75);
  const lowerQuartile = quantile(0.25);
  return upperQuartile - lowerQuartile;
}
522
/**
 * Welch's unequal-variance t-test for two independent samples.
 *
 * The statistic is signed as `mean(b) - mean(a)`, so `t` is positive when the
 * second sample has the larger mean. Degrees of freedom follow the
 * Welch-Satterthwaite formula and `p` is two-sided.
 *
 * Degenerate inputs:
 * - fewer than two values on either side: `{ t: 0, df: 0, p: 1 }`;
 * - both samples have zero variance: `t` is 0 with `p = 1` when the means are
 *   equal, otherwise `t` is `+Infinity` or `-Infinity` (matching the sign
 *   convention above) with `p = 0`; `df` is 0 in both cases.
 *
 * @param {number[]} a - First sample (e.g. baseline).
 * @param {number[]} b - Second sample (e.g. candidate).
 * @returns {{t: number, df: number, p: number}} Test statistic, degrees of freedom, two-sided p-value.
 */
function welchsTTest(a, b) {
  if (a.length < 2 || b.length < 2) return { t: 0, df: 0, p: 1 };

  const meanA = mean(a);
  const meanB = mean(b);
  const scaledVarA = variance(a, meanA) / a.length;
  const scaledVarB = variance(b, meanB) / b.length;
  const seSquared = scaledVarA + scaledVarB;

  if (seSquared === 0) {
    if (meanA === meanB) return { t: 0, df: 0, p: 1 };
    // Limit of (meanB - meanA) / se as se -> 0 keeps the sign of the difference.
    return { t: meanB > meanA ? Infinity : -Infinity, df: 0, p: 0 };
  }

  const t = (meanB - meanA) / Math.sqrt(seSquared);
  const df =
    (seSquared * seSquared) /
    (scaledVarA ** 2 / (a.length - 1) + scaledVarB ** 2 / (b.length - 1));
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
  return { t, df, p };
}
535
/**
 * Unbiased sample variance of `xs` around a precomputed mean `m`
 * (sum of squared deviations divided by n - 1).
 *
 * @param {number[]} xs - Sample values.
 * @param {number} m - Mean of `xs`, supplied by the caller.
 * @returns {number} The sample variance; `NaN` for a single value (0 / 0).
 */
function variance(xs, m) {
  let squaredDeviations = 0;
  xs.forEach((value) => {
    squaredDeviations += (value - m) ** 2;
  });
  return squaredDeviations / (xs.length - 1);
}
538
/**
 * Cumulative distribution function of Student's t distribution.
 *
 * Non-positive `df` yields 0.5; above 100 degrees of freedom the normal CDF is
 * used instead; otherwise the value is derived from the regularized incomplete
 * beta function evaluated at df / (df + t^2) with parameters (df / 2, 1 / 2).
 *
 * @param {number} t - Point at which to evaluate the CDF.
 * @param {number} df - Degrees of freedom.
 * @returns {number} P(T <= t).
 */
function studentTCdf(t, df) {
  if (df <= 0) {
    return 0.5;
  }
  if (df > 100) {
    return normalCdf(t);
  }
  const betaArgument = df / (df + t * t);
  const tailMass = 0.5 * incompleteBeta(betaArgument, df / 2, 0.5);
  if (t >= 0) {
    return 1 - tailMass;
  }
  return tailMass;
}
545
/**
 * Regularized incomplete beta function I_x(a, b).
 *
 * Evaluated with a continued fraction (modified Lentz iteration, at most 200
 * steps, relative step tolerance 3e-7). That fraction converges quickly only
 * for x <= (a + 1) / (a + b + 2); beyond that point the symmetry
 * I_x(a, b) = 1 - I_{1-x}(b, a) is used so the fraction is always evaluated in
 * its fast-converging region instead of being truncated by the iteration cap.
 *
 * @param {number} x - Upper integration limit; values <= 0 give 0 and values >= 1 give 1.
 * @param {number} a - First shape parameter (expected to be positive).
 * @param {number} b - Second shape parameter (expected to be positive).
 * @returns {number} I_x(a, b), between 0 and 1 for valid parameters.
 */
function incompleteBeta(x, a, b) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;
  // After the swap, 1 - x < (b + 1) / (a + b + 2), so this recurses at most once.
  if (x > (a + 1) / (a + b + 2)) return 1 - incompleteBeta(1 - x, b, a);

  const TINY = 1e-30; // floor that keeps the Lentz denominators away from zero
  const clamp = (value) => (Math.abs(value) < TINY ? TINY : value);

  const lnBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
  const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta) / a;

  let c = 1;
  let d = 1 / clamp(1 - ((a + b) * x) / (a + 1));
  let f = d;
  for (let m = 1; m <= 200; m++) {
    const m2 = 2 * m;

    // Even-indexed term of the continued fraction.
    let num = (m * (b - m) * x) / ((a + m2 - 1) * (a + m2));
    d = 1 / clamp(1 + num * d);
    c = clamp(1 + num / c);
    f *= d * c;

    // Odd-indexed term of the continued fraction.
    num = -((a + m) * (a + b + m) * x) / ((a + m2) * (a + m2 + 1));
    d = 1 / clamp(1 + num * d);
    c = clamp(1 + num / c);
    const delta = d * c;
    f *= delta;

    if (Math.abs(delta - 1) < 3e-7) break;
  }
  return front * f;
}
576
/**
 * Natural logarithm of the gamma function, via a nine-term Lanczos series
 * (shift 7.5). Arguments below 0.5 are mapped onto the series' domain with the
 * reflection formula ln Γ(z) = ln(π / sin(πz)) - ln Γ(1 - z).
 *
 * @param {number} z - Argument of the gamma function.
 * @returns {number} ln Γ(z).
 */
function lnGamma(z) {
  if (z < 0.5) {
    return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z);
  }

  const LANCZOS = [
    0.9999999999998099,
    676.5203681218851,
    -1259.1392167224028,
    771.3234287776531,
    -176.6150291621406,
    12.507343278686905,
    -0.13857109526572012,
    9.984369578019572e-6,
    1.5056327351493116e-7,
  ];

  const shifted = z - 1;
  let series = LANCZOS[0];
  for (let k = 1; k < LANCZOS.length; k += 1) {
    series += LANCZOS[k] / (shifted + k);
  }
  const base = shifted + 7.5;
  return 0.5 * Math.log(2 * Math.PI) + (shifted + 0.5) * Math.log(base) - base + Math.log(series);
}
595
/**
 * Standard normal cumulative distribution function Φ(x).
 *
 * Uses Φ(x) = (1 + erf(x / √2)) / 2 with the Abramowitz & Stegun 7.1.26
 * rational approximation of erf (absolute error below 1.5e-7). The
 * approximation's auxiliary variable must be built from the erf argument
 * |x| / √2; building it from |x| directly, while keeping the exponent at
 * -x²/2, skews the result (e.g. Φ(1.96) came out near 0.981 instead of 0.975).
 *
 * @param {number} x - Point at which to evaluate the CDF.
 * @returns {number} P(Z <= x) for a standard normal Z.
 */
function normalCdf(x) {
  // A&S 7.1.26 coefficients for erf(z) ≈ 1 - (a1 t + ... + a5 t^5) e^{-z²}, t = 1 / (1 + p z).
  const a1 = 0.254829592;
  const a2 = -0.284496736;
  const a3 = 1.421413741;
  const a4 = -1.453152027;
  const a5 = 1.061405429;
  const p = 0.3275911;

  const sign = x < 0 ? -1 : 1;
  const z = Math.abs(x) / Math.SQRT2; // erf argument
  const t = 1 / (1 + p * z);
  const polynomial = ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t;
  const erfOfZ = 1 - polynomial * Math.exp(-z * z);
  return 0.5 * (1 + sign * erfOfZ);
}
608
+
609
+ export {
610
+ DEFAULT_RULES,
611
+ classifyFailure,
612
+ failureClusterView,
613
+ computeToolUseMetrics,
614
+ compareToBaseline,
615
+ iqr,
616
+ welchsTTest
617
+ };
618
+ //# sourceMappingURL=chunk-JLZQWFV3.js.map