@tangle-network/agent-eval 0.23.1 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +212 -79
- package/dist/baseline-4R5deP0N.d.ts +108 -0
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/benchmarks/index.js +1 -1
- package/dist/builder-eval/index.d.ts +249 -0
- package/dist/builder-eval/index.js +391 -0
- package/dist/builder-eval/index.js.map +1 -0
- package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
- package/dist/chunk-2A5XJB43.js.map +1 -0
- package/dist/chunk-47X6LRCE.js +76 -0
- package/dist/chunk-47X6LRCE.js.map +1 -0
- package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
- package/dist/chunk-4F5DQN55.js.map +1 -0
- package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
- package/dist/chunk-4S4BM3QQ.js.map +1 -0
- package/dist/chunk-5BKGXME7.js +65 -0
- package/dist/chunk-5BKGXME7.js.map +1 -0
- package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
- package/dist/chunk-5LBB5B3Z.js.map +1 -0
- package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
- package/dist/chunk-6QDKWHLS.js.map +1 -0
- package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
- package/dist/chunk-EDUKQ5AM.js.map +1 -0
- package/dist/chunk-I4MBDTY5.js +272 -0
- package/dist/chunk-I4MBDTY5.js.map +1 -0
- package/dist/chunk-JLZQWFV3.js +618 -0
- package/dist/chunk-JLZQWFV3.js.map +1 -0
- package/dist/chunk-K2TPS5LB.js +569 -0
- package/dist/chunk-K2TPS5LB.js.map +1 -0
- package/dist/chunk-KKHDIONI.js +414 -0
- package/dist/chunk-KKHDIONI.js.map +1 -0
- package/dist/chunk-KMPRBJK4.js +74 -0
- package/dist/chunk-KMPRBJK4.js.map +1 -0
- package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
- package/dist/chunk-KTGTIOFD.js.map +1 -0
- package/dist/chunk-LSH4MMOZ.js +838 -0
- package/dist/chunk-LSH4MMOZ.js.map +1 -0
- package/dist/chunk-NG236HPC.js +57 -0
- package/dist/chunk-NG236HPC.js.map +1 -0
- package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
- package/dist/chunk-NLMNWKVM.js.map +1 -0
- package/dist/chunk-NU65VQ7M.js +99 -0
- package/dist/chunk-NU65VQ7M.js.map +1 -0
- package/dist/chunk-OWLAAMME.js +250 -0
- package/dist/chunk-OWLAAMME.js.map +1 -0
- package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
- package/dist/chunk-PC4UYEBM.js.map +1 -0
- package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
- package/dist/chunk-RAF443UI.js.map +1 -0
- package/dist/chunk-RZTMDUO7.js +49 -0
- package/dist/chunk-RZTMDUO7.js.map +1 -0
- package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
- package/dist/chunk-SESZDQPX.js.map +1 -0
- package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
- package/dist/chunk-TVVP3ZZQ.js.map +1 -0
- package/dist/chunk-WWYCWKUM.js +196 -0
- package/dist/chunk-WWYCWKUM.js.map +1 -0
- package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
- package/dist/chunk-YRZ4M5GS.js.map +1 -0
- package/dist/chunk-ZN274SWR.js +613 -0
- package/dist/chunk-ZN274SWR.js.map +1 -0
- package/dist/cli.js +10 -6
- package/dist/cli.js.map +1 -1
- package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
- package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
- package/dist/control.d.ts +8 -6
- package/dist/control.js +10 -7
- package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
- package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/errors-BZ9sTdz7.d.ts +70 -0
- package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
- package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
- package/dist/governance/index.d.ts +5 -0
- package/dist/governance/index.js +18 -0
- package/dist/governance/index.js.map +1 -0
- package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
- package/dist/index-Oj9fAPPN.d.ts +270 -0
- package/dist/index.d.ts +2018 -3003
- package/dist/index.js +7443 -9102
- package/dist/index.js.map +1 -1
- package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
- package/dist/knowledge/index.d.ts +102 -0
- package/dist/knowledge/index.js +18 -0
- package/dist/knowledge/index.js.map +1 -0
- package/dist/meta-eval/index.d.ts +99 -0
- package/dist/meta-eval/index.js +324 -0
- package/dist/meta-eval/index.js.map +1 -0
- package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
- package/dist/openapi.json +491 -1
- package/dist/optimization.d.ts +11 -8
- package/dist/optimization.js +11 -9
- package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
- package/dist/pipelines/index.d.ts +172 -0
- package/dist/pipelines/index.js +345 -0
- package/dist/pipelines/index.js.map +1 -0
- package/dist/prm/index.d.ts +99 -0
- package/dist/prm/index.js +222 -0
- package/dist/prm/index.js.map +1 -0
- package/dist/query-DODUYdPg.d.ts +30 -0
- package/dist/release-report-BNgMdqPF.d.ts +292 -0
- package/dist/replay-BL96gCEP.d.ts +226 -0
- package/dist/reporting.d.ts +10 -295
- package/dist/reporting.js +10 -6
- package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
- package/dist/rl.d.ts +1762 -8
- package/dist/rl.js +2035 -58
- package/dist/rl.js.map +1 -1
- package/dist/rubric-D5tjHNJQ.d.ts +72 -0
- package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
- package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
- package/dist/sequential-Dgz1n51-.d.ts +139 -0
- package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
- package/dist/telemetry/file.js +4 -1
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +57 -57
- package/dist/telemetry/index.js.map +1 -1
- package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
- package/dist/traces.d.ts +142 -387
- package/dist/traces.js +1302 -40
- package/dist/traces.js.map +1 -1
- package/dist/trajectory-CnoBo-JY.d.ts +32 -0
- package/dist/wire/index.d.ts +369 -25
- package/dist/wire/index.js +22 -3
- package/package.json +44 -18
- package/dist/chunk-42I2QC2L.js.map +0 -1
- package/dist/chunk-5IIQKMD5.js.map +0 -1
- package/dist/chunk-6KQG5HAH.js.map +0 -1
- package/dist/chunk-6M774GY6.js.map +0 -1
- package/dist/chunk-7EAUOUQS.js.map +0 -1
- package/dist/chunk-AXHNWLIX.js.map +0 -1
- package/dist/chunk-EXGR4XEM.js.map +0 -1
- package/dist/chunk-IOXMGMHQ.js.map +0 -1
- package/dist/chunk-KAO3Q65R.js.map +0 -1
- package/dist/chunk-LZKIOBG2.js +0 -2026
- package/dist/chunk-LZKIOBG2.js.map +0 -1
- package/dist/chunk-QBW3YBTR.js.map +0 -1
- package/dist/chunk-QUKKGHTZ.js.map +0 -1
- package/dist/chunk-SQQLHODJ.js.map +0 -1
- package/dist/chunk-V5QSWN7L.js +0 -1310
- package/dist/chunk-V5QSWN7L.js.map +0 -1
- package/dist/chunk-VQQSPGSM.js.map +0 -1
- package/dist/chunk-XPHOZPOM.js +0 -1947
- package/dist/chunk-XPHOZPOM.js.map +0 -1
- package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
- package/dist/index-ekBXweiQ.d.ts +0 -1894
- package/dist/sequential-DgU2mFsE.d.ts +0 -304
|
@@ -0,0 +1,618 @@
|
|
|
1
|
+
import {
|
|
2
|
+
cohensD
|
|
3
|
+
} from "./chunk-I4MBDTY5.js";
|
|
4
|
+
import {
|
|
5
|
+
argHash,
|
|
6
|
+
groupBy,
|
|
7
|
+
toolSpans
|
|
8
|
+
} from "./chunk-47X6LRCE.js";
|
|
9
|
+
|
|
10
|
+
// src/failure-taxonomy.ts
|
|
11
|
+
/**
 * Builds a rule that fires on the first event of kind "custom" whose payload
 * satisfies `payloadMatches`.
 *
 * @param {string} id - Rule identifier.
 * @param {string} failureClass - Failure class reported when the rule fires.
 * @param {string} reason - Human-readable explanation reported with the match.
 * @param {(payload: object) => boolean} payloadMatches - Predicate applied to
 *   the payload of each "custom" event, in event order.
 * @returns {{ id: string, match: Function }} Rule whose `match` returns
 *   `{ failureClass, reason, triggerEventId }` or `null`.
 */
function customEventRule(id, failureClass, reason, payloadMatches) {
  return {
    id,
    match: ({ events }) => {
      const hit = events.find((evt) => evt.kind === "custom" && payloadMatches(evt.payload));
      if (!hit) return null;
      return { failureClass, reason, triggerEventId: hit.eventId };
    }
  };
}

/**
 * Ordered failure-classification rules. Each rule exposes `id` and
 * `match(ctx)`, where `ctx` carries `run`, `spans` and `events`; `match`
 * returns a classification object or `null`. Order matters: consumers that
 * walk the list take the first non-null result.
 */
var DEFAULT_RULES = [
  // A failure class already recorded on the outcome (other than "unknown") is respected.
  {
    id: "explicit-outcome",
    match: ({ run }) => {
      const declared = run.outcome?.failureClass;
      if (!declared || declared === "unknown") return null;
      return { failureClass: declared, reason: "outcome.failureClass set explicitly" };
    }
  },
  customEventRule(
    "knowledge-readiness-blocked",
    "knowledge_readiness_blocked",
    "knowledge readiness report blocked execution",
    (payload) => payload.kind === "readiness_scored" && payload.passed === false
  ),
  customEventRule(
    "bad-integration-manifest",
    "bad_integration_manifest",
    "integration manifest validation failed before launch",
    (payload) => {
      switch (payload.kind) {
        case "integration_manifest_validated":
          return payload.valid === false;
        case "integration_invoke_failed":
          return payload.code === "manifest_invalid";
        default:
          return false;
      }
    }
  ),
  customEventRule(
    "missing-integration-connection",
    "missing_integration_connection",
    "required integration connection was missing",
    (payload) => payload.kind === "integration_manifest_resolved" && hasResolutionStatus(payload, "missing_connection")
  ),
  customEventRule(
    "missing-integration-scope",
    "missing_integration_scope",
    "integration grant or connection lacks required scopes",
    (payload) => {
      switch (payload.kind) {
        case "integration_manifest_resolved":
          return hasMissingScopes(payload);
        case "integration_invoke_failed":
          return payload.code === "scope_denied";
        default:
          return false;
      }
    }
  ),
  customEventRule(
    "integration-approval-required",
    "integration_approval_required",
    "integration write paused for user approval",
    (payload) => {
      switch (payload.kind) {
        case "integration_invoke":
          return payload.status === "approval_required";
        case "integration_invoke_failed":
          return payload.code === "approval_required";
        case "integration_approval_required":
          return true;
        default:
          return false;
      }
    }
  ),
  customEventRule(
    "integration-auth-expired",
    "integration_auth_expired",
    "integration connection or capability expired",
    (payload) => {
      if (payload.kind !== "integration_invoke_failed") return false;
      const expiryCodes = ["auth_expired", "connection_not_active", "capability_expired"];
      return expiryCodes.includes(payload.code) || payload.status === "expired";
    }
  ),
  customEventRule(
    "unsafe-integration-write-denied",
    "unsafe_integration_write_denied",
    "integration write was denied by policy or capability scope",
    (payload) => {
      if (payload.kind !== "integration_invoke_failed") return false;
      return ["unsafe_write_denied", "policy_denied", "action_denied"].includes(payload.code);
    }
  ),
  // Any other invoke failure: codes claimed by the more specific rules are excluded.
  customEventRule(
    "integration-provider-failure",
    "integration_provider_failure",
    "integration provider invocation failed",
    (payload) => {
      if (payload.kind !== "integration_invoke_failed") return false;
      const claimedElsewhere = [
        "scope_denied",
        "approval_required",
        "auth_expired",
        "connection_not_active",
        "capability_expired",
        "unsafe_write_denied",
        "policy_denied",
        "action_denied",
        "manifest_invalid"
      ];
      return !claimedElsewhere.includes(String(payload.code));
    }
  ),
  customEventRule(
    "missing-credentials",
    "missing_credentials",
    "required credential or secret was missing",
    (payload) => payload.kind === "knowledge_gap" && payload.category === "credential_or_secret"
  ),
  // Only for runs whose outcome is an explicit fail: a retrieval span with no hits,
  // or whose hits all score <= 0.
  {
    id: "bad-retrieval",
    match: ({ run, spans }) => {
      if (run.outcome?.pass !== false) return null;
      const isUseless = (hits) => hits.length === 0 || hits.every((hit) => hit.score <= 0);
      const retrieval = spans.find((span) => span.kind === "retrieval" && isUseless(span.hits));
      if (!retrieval) return null;
      return {
        failureClass: "bad_retrieval",
        reason: "retrieval returned no useful hits for a failed run",
        triggerSpanId: retrieval.spanId
      };
    }
  },
  customEventRule(
    "insufficient-evidence",
    "insufficient_evidence",
    "task proceeded with insufficient supporting evidence",
    (payload) => payload.kind === "knowledge_gap" && payload.reason === "insufficient_evidence"
  ),
  customEventRule(
    "contradictory-evidence",
    "contradictory_evidence",
    "supporting evidence contradicted itself",
    (payload) => payload.kind === "knowledge_gap" && payload.reason === "contradictory_evidence"
  ),
  // Budget breach events
  {
    id: "budget-breach",
    match: ({ events }) => {
      const breach = events.find((evt) => evt.kind === "budget_breach");
      if (!breach) return null;
      return {
        failureClass: "budget_exceeded",
        reason: `budget breached on ${breach.payload.dimension ?? "unknown dimension"}`,
        triggerEventId: breach.eventId
      };
    }
  },
  // Policy violations
  {
    id: "policy-violation",
    match: ({ events }) => {
      const violation = events.find((evt) => evt.kind === "policy_violation");
      if (!violation) return null;
      return {
        failureClass: "policy_violation",
        reason: "policy_violation event emitted",
        triggerEventId: violation.eventId
      };
    }
  },
  // Sandbox span with a numeric, non-zero exit code
  {
    id: "sandbox-failure",
    match: ({ spans }) => {
      const failed = spans.find(
        (span) => span.kind === "sandbox" && typeof span.exitCode === "number" && span.exitCode !== 0
      );
      return failed ? {
        failureClass: "sandbox_failure",
        reason: `sandbox exited ${failed.exitCode}`,
        triggerSpanId: failed.spanId
      } : null;
    }
  },
  // Timeout: aborted run plus a timeout hint in an error event or in the outcome notes
  {
    id: "timeout",
    match: ({ run, events }) => {
      if (run.status !== "aborted") return null;
      const timeoutInEvents = events.some((evt) => {
        if (evt.kind !== "error") return false;
        return String(evt.payload.reason ?? "").toLowerCase().includes("timeout");
      });
      const notes = (run.outcome?.notes ?? "").toLowerCase();
      const timeoutInNotes = notes.includes("timeout") || notes.includes("deadline");
      if (!timeoutInEvents && !timeoutInNotes) return null;
      return { failureClass: "timeout", reason: "timeout signal observed" };
    }
  },
  // Tool recovery failure: some tool was called at least 3 times and every call errored
  {
    id: "tool-recovery-failure",
    match: ({ spans }) => {
      const callsByTool = /* @__PURE__ */ new Map();
      for (const span of spans) {
        if (span.kind !== "tool") continue;
        const calls = callsByTool.get(span.toolName) ?? [];
        calls.push(span);
        callsByTool.set(span.toolName, calls);
      }
      for (const [toolName, calls] of callsByTool) {
        const failures = calls.filter((call) => call.status === "error");
        const neverSucceeded = failures.length === calls.length;
        if (neverSucceeded && failures.length >= 3) {
          return {
            failureClass: "tool_recovery_failure",
            reason: `${failures.length} consecutive errors on tool "${toolName}"`,
            triggerSpanId: failures[failures.length - 1].spanId
          };
        }
      }
      return null;
    }
  },
  // Tool selection error: explicit fail, an agent span advertised tools, yet no tool span exists
  {
    id: "tool-selection-error",
    match: ({ run, spans }) => {
      if (run.outcome?.pass !== false) return null;
      const toolsAdvertised = spans.some((span) => {
        if (span.kind !== "agent") return false;
        return span.attributes?.toolsAvailable !== void 0 && span.attributes?.toolsAvailable > 0;
      });
      const toolCalls = spans.filter((span) => span.kind === "tool");
      if (!toolsAdvertised || toolCalls.length !== 0) return null;
      return {
        failureClass: "tool_selection_error",
        reason: "tools were available but none were called"
      };
    }
  },
  // Format drift: a judge span on dimension "format" scored below 0.5
  {
    id: "format-drift",
    match: ({ spans }) => {
      const lowFormatJudge = spans.find(
        (span) => span.kind === "judge" && span.dimension === "format" && span.score < 0.5
      );
      if (!lowFormatJudge) return null;
      return {
        failureClass: "format_drift",
        reason: "format judge scored below 0.5",
        triggerSpanId: lowFormatJudge.spanId
      };
    }
  }
];
|
|
301
|
+
/**
 * Tells whether a manifest-resolution payload reports the given status.
 *
 * For the status "missing_connection" a non-empty `payload.missingConnections`
 * string list is enough on its own; otherwise any resolution item whose
 * `status` equals the requested one counts.
 *
 * @param {object} payload - Event payload to inspect.
 * @param {string} status - Status value to look for.
 * @returns {boolean}
 */
function hasResolutionStatus(payload, status) {
  const wantsMissingConnection = status === "missing_connection";
  if (wantsMissingConnection && stringArray(payload.missingConnections).length > 0) {
    return true;
  }
  for (const entry of resolutionItems(payload)) {
    if (entry.status === status) return true;
  }
  return false;
}
|
|
306
|
+
/**
 * Tells whether a manifest-resolution payload reports missing scopes, either
 * through a non-empty top-level `missingScopes` string list or through any
 * resolution item carrying a non-empty `missingScopes` array.
 *
 * @param {object} payload - Event payload to inspect.
 * @returns {boolean}
 */
function hasMissingScopes(payload) {
  const topLevelScopes = stringArray(payload.missingScopes);
  if (topLevelScopes.length > 0) {
    return true;
  }
  for (const entry of resolutionItems(payload)) {
    if (Array.isArray(entry.missingScopes) && entry.missingScopes.length > 0) {
      return true;
    }
  }
  return false;
}
|
|
312
|
+
/**
 * Collects the object entries of `payload.missing`, `payload.optionalMissing`
 * and `payload.ready` (in that order) into a single flat array.
 *
 * @param {object} payload - Event payload to read the three lists from.
 * @returns {object[]}
 */
function resolutionItems(payload) {
  const sections = [payload.missing, payload.optionalMissing, payload.ready];
  const collected = [];
  for (const section of sections) {
    collected.push(...records(section));
  }
  return collected;
}
|
|
319
|
+
/**
 * Keeps only the plain-record-like entries of a list: non-null values whose
 * `typeof` is "object" and which are not arrays. Anything that is not an
 * array yields an empty list.
 *
 * @param {unknown} value - Candidate list.
 * @returns {object[]}
 */
function records(value) {
  const kept = [];
  if (Array.isArray(value)) {
    for (const entry of value) {
      const isRecord = entry !== null && typeof entry === "object" && !Array.isArray(entry);
      if (isRecord) kept.push(entry);
    }
  }
  return kept;
}
|
|
325
|
+
/**
 * Returns the string entries of `value` when it is an array, otherwise an
 * empty array.
 *
 * @param {unknown} value - Candidate list.
 * @returns {string[]}
 */
function stringArray(value) {
  if (!Array.isArray(value)) {
    return [];
  }
  const isString = (entry) => typeof entry === "string";
  return value.filter(isString);
}
|
|
328
|
+
/**
 * Classifies a run by walking an ordered rule list.
 *
 * A run whose status is "completed" and whose outcome is not an explicit
 * `pass === false` is reported as "success" without consulting any rule.
 * Otherwise the first rule whose `match(ctx)` returns a truthy value decides;
 * if none does, the class is "unknown".
 *
 * @param {{ run: object }} ctx - Context handed to every rule's `match`.
 * @param {Iterable<{ match: Function }>} [rules=DEFAULT_RULES] - Rules to try, in order.
 * @returns {object} Classification carrying at least `failureClass` and `reason`.
 */
function classifyFailure(ctx, rules = DEFAULT_RULES) {
  const { run } = ctx;
  const explicitlyFailed = run.outcome?.pass === false;
  if (!explicitlyFailed && run.status === "completed") {
    return { failureClass: "success", reason: "run completed with pass=true (or no explicit fail)" };
  }
  for (const rule of rules) {
    const verdict = rule.match(ctx);
    if (verdict) {
      return verdict;
    }
  }
  return { failureClass: "unknown", reason: "no rule matched; run failed for unclassified reason" };
}
|
|
338
|
+
|
|
339
|
+
// src/pipelines/failure-cluster.ts
|
|
340
|
+
/**
 * Groups failed runs into clusters keyed by failure class, tool name, a
 * 16-character argument-hash prefix and judge dimension.
 *
 * A run counts as failed unless its status is "completed" and its outcome is
 * not an explicit `pass === false`. For each failed run the spans and events
 * are loaded from the store and classified; the tool/dimension facets are
 * taken from the classification's trigger span when it is a tool or judge
 * span, and otherwise fall back to the last errored tool span and the first
 * judge span with a string dimension.
 *
 * @param {object} store - Must provide `listRuns()`, `spans({ runId })` and `events({ runId })`.
 * @param {{ rules?: object[], minClusterSize?: number }} [options]
 * @returns {Promise<{ clusters: object[], totalFailures: number, totalRuns: number }>}
 *   Clusters with at least `minClusterSize` runs (default 1), largest first.
 */
async function failureClusterView(store, options = {}) {
  const rules = options.rules ?? DEFAULT_RULES;
  const minSize = options.minClusterSize ?? 1;
  const runs = await store.listRuns();
  const clustersByKey = /* @__PURE__ */ new Map();
  let totalFailures = 0;

  for (const run of runs) {
    const succeeded = run.status === "completed" && run.outcome?.pass !== false;
    if (succeeded) continue;
    totalFailures += 1;

    const spans = await store.spans({ runId: run.runId });
    const events = await store.events({ runId: run.runId });
    const verdict = classifyFailure({ run, spans, events }, rules);

    let toolName;
    let argPrefix;
    let dimension;

    // Prefer facets from the span that triggered the classification.
    const trigger = verdict.triggerSpanId
      ? spans.find((span) => span.spanId === verdict.triggerSpanId)
      : void 0;
    if (trigger?.kind === "tool") {
      toolName = trigger.toolName;
      argPrefix = argHash(trigger.args).slice(0, 16);
    } else if (trigger?.kind === "judge") {
      dimension = trigger.dimension;
    }

    // Fallback: the last errored tool span of the run.
    if (!toolName) {
      const runToolSpans = await toolSpans(store, run.runId);
      const failedCalls = runToolSpans.filter((call) => call.status === "error");
      const lastFailed = failedCalls[failedCalls.length - 1];
      if (lastFailed) {
        toolName = lastFailed.toolName;
        argPrefix = argHash(lastFailed.args).slice(0, 16);
      }
    }

    // Fallback: the first judge span that carries a string dimension.
    if (!dimension) {
      const firstJudge = spans.find((span) => span.kind === "judge" && typeof span.dimension === "string");
      if (firstJudge?.kind === "judge") dimension = firstJudge.dimension;
    }

    const key = `${verdict.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
    if (!clustersByKey.has(key)) {
      clustersByKey.set(key, {
        failureClass: verdict.failureClass,
        toolName,
        argPrefix,
        dimension,
        runCount: 0,
        scenarioIds: [],
        exampleRunId: run.runId,
        exampleError: firstErrorMessage(spans) ?? verdict.reason
      });
    }
    const cluster = clustersByKey.get(key);
    cluster.runCount += 1;
    if (!cluster.scenarioIds.includes(run.scenarioId)) {
      cluster.scenarioIds.push(run.scenarioId);
    }
  }

  const clusters = Array.from(clustersByKey.values())
    .filter((cluster) => cluster.runCount >= minSize)
    .sort((left, right) => right.runCount - left.runCount);
  return { clusters, totalFailures, totalRuns: runs.length };
}
|
|
397
|
+
/**
 * Returns the `error` field of the first span whose status is "error", or
 * `undefined` when no span errored.
 *
 * @param {object[]} spans - Spans to scan, in order.
 * @returns {*} The first errored span's `error` value, if any.
 */
function firstErrorMessage(spans) {
  for (const span of spans) {
    if (span.status === "error") {
      return span.error;
    }
  }
  return void 0;
}
|
|
401
|
+
|
|
402
|
+
// src/tool-use-metrics.ts
|
|
403
|
+
/**
 * Computes per-run tool-usage metrics from the run's tool spans.
 *
 * Spans are processed in ascending `startedAt` order. A call is a duplicate
 * when an earlier call had the same tool name and argument hash. A retry
 * opportunity is any errored call; it counts as followed when a later call to
 * the same tool exists.
 *
 * Fixes over the previous version:
 * - `avgLatencyMs` is averaged over the calls that actually reported a numeric
 *   `latencyMs`; calls without a latency no longer drag the average down.
 * - Per-tool tallies are accumulated in a Map, so tool names that collide with
 *   inherited object properties (e.g. "constructor") are counted correctly.
 *
 * @param {object} store - Store handed to `toolSpans`.
 * @param {string} runId - Run whose tool spans are analysed.
 * @param {{ selectionLabels?: Record<string, boolean> }} [options] - Optional
 *   per-span labels (keyed by `spanId`); a truthy label counts as a correct selection.
 * @returns {Promise<object>} `{ runId, totalCalls, byTool, errorRate,
 *   duplicateRate, retryRate, selectionAccuracy }`; `selectionAccuracy` is
 *   `undefined` when no labelled span exists. With no tool spans all rates are 0
 *   and `selectionAccuracy` is omitted.
 */
async function computeToolUseMetrics(store, runId, options = {}) {
  const tools = await toolSpans(store, runId);
  if (tools.length === 0) {
    return { runId, totalCalls: 0, byTool: {}, errorRate: 0, duplicateRate: 0, retryRate: 0 };
  }

  const sortedTools = [...tools].sort((a, b) => a.startedAt - b.startedAt);
  const tallies = /* @__PURE__ */ new Map();
  const seenSignatures = /* @__PURE__ */ new Set();
  let totalErrors = 0;
  let totalDuplicates = 0;

  for (const t of sortedTools) {
    let tally = tallies.get(t.toolName);
    if (!tally) {
      tally = { calls: 0, errors: 0, latencySum: 0, latencySamples: 0, duplicates: 0 };
      tallies.set(t.toolName, tally);
    }
    tally.calls += 1;
    if (t.status === "error") {
      tally.errors += 1;
      totalErrors += 1;
    }
    if (typeof t.latencyMs === "number") {
      tally.latencySum += t.latencyMs;
      tally.latencySamples += 1;
    }
    const signature = `${t.toolName}|${argHash(t.args)}`;
    if (seenSignatures.has(signature)) {
      tally.duplicates += 1;
      totalDuplicates += 1;
    }
    seenSignatures.add(signature);
  }

  // Object.fromEntries defines own data properties, so any tool name is a safe key.
  const byTool = Object.fromEntries(
    [...tallies].map(([toolName, tally]) => [
      toolName,
      {
        calls: tally.calls,
        errors: tally.errors,
        avgLatencyMs: tally.latencySamples > 0 ? tally.latencySum / tally.latencySamples : 0,
        duplicates: tally.duplicates
      }
    ])
  );

  let retryOpportunities = 0;
  let retriesFollowed = 0;
  for (const [, calls] of groupBy(sortedTools, (t) => t.toolName)) {
    calls.forEach((call, index) => {
      if (call.status !== "error") return;
      retryOpportunities += 1;
      if (calls[index + 1]) retriesFollowed += 1;
    });
  }
  const retryRate = retryOpportunities > 0 ? retriesFollowed / retryOpportunities : 0;

  let selectionAccuracy;
  const labels = options.selectionLabels;
  if (labels) {
    const labeled = sortedTools.filter((t) => t.spanId in labels);
    if (labeled.length > 0) {
      selectionAccuracy = labeled.filter((t) => labels[t.spanId]).length / labeled.length;
    }
  }

  return {
    runId,
    totalCalls: sortedTools.length,
    byTool,
    errorRate: totalErrors / sortedTools.length,
    duplicateRate: totalDuplicates / sortedTools.length,
    retryRate,
    selectionAccuracy
  };
}
|
|
458
|
+
|
|
459
|
+
// src/baseline.ts
|
|
460
|
+
/**
 * Compares candidate samples against baseline samples, metric by metric.
 *
 * For every metric it reports both means, their difference (candidate minus
 * baseline), Cohen's d, Welch's t-test, the larger of the two IQRs and a
 * verdict:
 * - "unstable" when either side's IQR divided by max(|mean|, 1e-9) exceeds
 *   `unstableCvThreshold`;
 * - otherwise "improved"/"regressed" when p < `alpha` and |d| >= `effectThreshold`,
 *   with the direction judged by `higherIsBetter`;
 * - otherwise "stable".
 *
 * @param {Array<{ metric: string, baseline: number[], candidate: number[], higherIsBetter?: boolean }>} samples
 * @param {{ effectThreshold?: number, alpha?: number, unstableCvThreshold?: number }} [options]
 *   Defaults: 0.5, 0.05 and 0.3 respectively.
 * @returns {{ metrics: object[], hasRegression: boolean, hasUnstable: boolean }}
 * @throws {Error} When any metric has fewer than two samples on either side.
 */
function compareToBaseline(samples, options = {}) {
  const effectThreshold = options.effectThreshold ?? 0.5;
  const alpha = options.alpha ?? 0.05;
  const cvThreshold = options.unstableCvThreshold ?? 0.3;

  // Spread relative to the magnitude of the centre; the floor avoids division by zero.
  const withinSpreadLimit = (spread, centre) => spread / Math.max(Math.abs(centre), 1e-9) <= cvThreshold;

  const evaluate = (s) => {
    if (s.baseline.length < 2 || s.candidate.length < 2) {
      throw new Error(`compareToBaseline: need \u22652 samples per side for "${s.metric}"`);
    }
    const baselineMean = mean(s.baseline);
    const candidateMean = mean(s.candidate);
    const delta = candidateMean - baselineMean;
    const effect = cohensD(s.baseline, s.candidate);
    const welch = welchsTTest(s.baseline, s.candidate);
    const baselineIqr = iqr(s.baseline);
    const candidateIqr = iqr(s.candidate);
    const stable = withinSpreadLimit(baselineIqr, baselineMean) && withinSpreadLimit(candidateIqr, candidateMean);

    let verdict = "stable";
    if (!stable) {
      verdict = "unstable";
    } else if (welch.p < alpha && Math.abs(effect) >= effectThreshold) {
      const candidateIsBetter = s.higherIsBetter ? delta > 0 : delta < 0;
      verdict = candidateIsBetter ? "improved" : "regressed";
    }

    return {
      metric: s.metric,
      baselineMean,
      candidateMean,
      delta,
      cohensD: effect,
      welchT: welch.t,
      welchDf: welch.df,
      welchP: welch.p,
      stable,
      iqr: Math.max(baselineIqr, candidateIqr),
      verdict
    };
  };

  const metrics = samples.map((s) => evaluate(s));
  const anyWithVerdict = (wanted) => metrics.some((m) => m.verdict === wanted);
  return {
    metrics,
    hasRegression: anyWithVerdict("regressed"),
    hasUnstable: anyWithVerdict("unstable")
  };
}
|
|
508
|
+
/**
 * Arithmetic mean of a list of numbers. An empty list yields NaN (0 / 0).
 *
 * @param {number[]} xs
 * @returns {number}
 */
function mean(xs) {
  let total = 0;
  for (const value of xs) {
    total = total + value;
  }
  return total / xs.length;
}
|
|
511
|
+
/**
 * Interquartile range (75th minus 25th percentile) of a list of numbers.
 * Percentiles are linearly interpolated between the two neighbouring order
 * statistics at position p * (n - 1). An empty list yields 0; the input is
 * not mutated.
 *
 * @param {number[]} xs
 * @returns {number}
 */
function iqr(xs) {
  if (xs.length === 0) {
    return 0;
  }
  const ordered = [...xs].sort((left, right) => left - right);
  const lastIndex = ordered.length - 1;
  function quantile(fraction) {
    const position = fraction * lastIndex;
    const below = Math.floor(position);
    const above = Math.ceil(position);
    return ordered[below] + (ordered[above] - ordered[below]) * (position - below);
  }
  const upper = quantile(0.75);
  const lower = quantile(0.25);
  return upper - lower;
}
|
|
522
|
+
/**
 * Welch's unequal-variance t-test for two independent samples.
 *
 * `t` is signed as (mean(b) - mean(a)) / standard error, `df` follows the
 * Welch-Satterthwaite formula and `p` is the two-sided p-value.
 *
 * Degenerate cases:
 * - fewer than two observations on either side: `{ t: 0, df: 0, p: 1 }`;
 * - both samples have zero variance: equal means give `{ t: 0, df: 0, p: 1 }`,
 *   different means give `p: 0` and an infinite `t` whose sign matches the
 *   direction of the difference (previously always +Infinity, which
 *   contradicted the signed `t` of the regular path).
 *
 * @param {number[]} a - First sample.
 * @param {number[]} b - Second sample.
 * @returns {{ t: number, df: number, p: number }}
 */
function welchsTTest(a, b) {
  if (a.length < 2 || b.length < 2) return { t: 0, df: 0, p: 1 };
  const mA = mean(a);
  const mB = mean(b);
  const seA = variance(a, mA) / a.length;
  const seB = variance(b, mB) / b.length;
  const seSquared = seA + seB;
  if (seSquared === 0) {
    if (mA === mB) return { t: 0, df: 0, p: 1 };
    return { t: mB > mA ? Infinity : -Infinity, df: 0, p: 0 };
  }
  const t = (mB - mA) / Math.sqrt(seSquared);
  const df = seSquared * seSquared / (seA ** 2 / (a.length - 1) + seB ** 2 / (b.length - 1));
  const rawP = 2 * (1 - studentTCdf(Math.abs(t), df));
  // Guard against tiny numerical excursions outside the probability range.
  const p = Number.isNaN(rawP) ? rawP : Math.min(1, Math.max(0, rawP));
  return { t, df, p };
}
|
|
535
|
+
/**
 * Sample variance (n - 1 denominator) of `xs` around the supplied mean `m`.
 *
 * @param {number[]} xs - Observations.
 * @param {number} m - Mean to measure deviations from.
 * @returns {number}
 */
function variance(xs, m) {
  let squaredDeviations = 0;
  for (const value of xs) {
    squaredDeviations = squaredDeviations + (value - m) ** 2;
  }
  return squaredDeviations / (xs.length - 1);
}
|
|
538
|
+
/**
 * Cumulative distribution function of Student's t distribution.
 *
 * Non-positive `df` returns 0.5. Above 100 degrees of freedom the result is
 * delegated to `normalCdf`. Otherwise the value is derived from
 * `incompleteBeta(df / (df + t^2), df / 2, 0.5)`.
 *
 * @param {number} t - Point at which to evaluate the CDF.
 * @param {number} df - Degrees of freedom.
 * @returns {number}
 */
function studentTCdf(t, df) {
  if (df <= 0) {
    return 0.5;
  }
  if (df > 100) {
    return normalCdf(t);
  }
  const betaArgument = df / (df + t * t);
  const tailMass = 0.5 * incompleteBeta(betaArgument, df / 2, 0.5);
  if (t >= 0) {
    return 1 - tailMass;
  }
  return tailMass;
}
|
|
545
|
+
/**
 * Regularized incomplete beta function I_x(a, b), evaluated with a
 * modified-Lentz continued fraction.
 *
 * The continued fraction converges quickly only for x below
 * (a + 1) / (a + b + 2). Above that point the symmetry relation
 * I_x(a, b) = 1 - I_{1-x}(b, a) is applied, so the fast-converging side is
 * always the one evaluated. The previous version ran the fraction directly for
 * every x and silently stopped after 200 iterations when it had not converged.
 *
 * @param {number} x - Upper integration limit; values <= 0 return 0, values >= 1 return 1.
 * @param {number} a - First shape parameter.
 * @param {number} b - Second shape parameter.
 * @returns {number}
 */
function incompleteBeta(x, a, b) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;

  // Prefactor times continued fraction for I_xx(aa, bb); no symmetry handling here.
  const viaContinuedFraction = (xx, aa, bb) => {
    const lnBeta = lnGamma(aa) + lnGamma(bb) - lnGamma(aa + bb);
    const front = Math.exp(Math.log(xx) * aa + Math.log(1 - xx) * bb - lnBeta) / aa;
    const TINY = 1e-30; // keeps the Lentz recurrences away from division by zero
    const floorTiny = (value) => (Math.abs(value) < TINY ? TINY : value);
    let c = 1;
    let d = 1 / floorTiny(1 - (aa + bb) * xx / (aa + 1));
    let f = d;
    for (let m = 1; m <= 200; m++) {
      const m2 = 2 * m;
      // Even step of the recurrence.
      let num = m * (bb - m) * xx / ((aa + m2 - 1) * (aa + m2));
      d = 1 / floorTiny(1 + num * d);
      c = floorTiny(1 + num / c);
      f *= d * c;
      // Odd step of the recurrence.
      num = -((aa + m) * (aa + bb + m) * xx) / ((aa + m2) * (aa + m2 + 1));
      d = 1 / floorTiny(1 + num * d);
      c = floorTiny(1 + num / c);
      const delta = d * c;
      f *= delta;
      if (Math.abs(delta - 1) < 3e-7) break;
    }
    return front * f;
  };

  // Chosen once, without recursion, so rounding near the switch point cannot loop.
  if (x > (a + 1) / (a + b + 2)) {
    return 1 - viaContinuedFraction(1 - x, b, a);
  }
  return viaContinuedFraction(x, a, b);
}
|
|
576
|
+
/**
 * Natural logarithm of the gamma function, via a nine-coefficient Lanczos
 * series. Arguments below 0.5 are mapped through the reflection formula
 * ln(pi / sin(pi z)) - lnGamma(1 - z).
 *
 * @param {number} z
 * @returns {number}
 */
function lnGamma(z) {
  if (z < 0.5) {
    return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z);
  }
  const lanczos = [
    0.9999999999998099,
    676.5203681218851,
    -1259.1392167224028,
    771.3234287776531,
    -176.6150291621406,
    12.507343278686905,
    -0.13857109526572012,
    9984369578019572e-21,
    15056327351493116e-23
  ];
  const shifted = z - 1;
  let series = lanczos[0];
  let k = 1;
  while (k < 9) {
    series += lanczos[k] / (shifted + k);
    k += 1;
  }
  const base = shifted + 7.5;
  return 0.5 * Math.log(2 * Math.PI) + (shifted + 0.5) * Math.log(base) - base + Math.log(series);
}
|
|
595
|
+
/**
 * Standard normal cumulative distribution function, computed as
 * 0.5 * (1 + erf(x / sqrt(2))) with the Abramowitz-Stegun 7.1.26 rational
 * approximation of erf.
 *
 * Fix: the approximation must be evaluated at z = |x| / sqrt(2) throughout.
 * The previous version used x^2 / 2 in the exponent but built the auxiliary
 * variable t from |x| instead of z, which gave e.g. 0.981 instead of 0.975 at
 * x = 1.96.
 *
 * @param {number} x
 * @returns {number} Probability that a standard normal variable is <= x.
 */
function normalCdf(x) {
  const a1 = 0.254829592;
  const a2 = -0.284496736;
  const a3 = 1.421413741;
  const a4 = -1.453152027;
  const a5 = 1.061405429;
  const p = 0.3275911;
  const sign = x < 0 ? -1 : 1;
  const z = Math.abs(x) / Math.SQRT2;
  const t = 1 / (1 + p * z);
  // erf(z) for z >= 0; the sign is re-applied below because erf is odd.
  const erfZ = 1 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-z * z);
  return 0.5 * (1 + sign * erfZ);
}
|
|
608
|
+
|
|
609
|
+
export {
|
|
610
|
+
DEFAULT_RULES,
|
|
611
|
+
classifyFailure,
|
|
612
|
+
failureClusterView,
|
|
613
|
+
computeToolUseMetrics,
|
|
614
|
+
compareToBaseline,
|
|
615
|
+
iqr,
|
|
616
|
+
welchsTTest
|
|
617
|
+
};
|
|
618
|
+
//# sourceMappingURL=chunk-JLZQWFV3.js.map
|