@agent-inspect/eval 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +733 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +129 -0
- package/dist/index.d.ts +129 -0
- package/dist/index.mjs +729 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +45 -0
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,733 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var node_url = require('node:url');
var readers = require('agent-inspect/readers');
|
|
4
|
+
|
|
5
|
+
// packages/eval/src/index.ts
|
|
6
|
+
/**
 * Duck-type check for an already-loaded trace read result.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is a non-null object exposing the
 *   `runs`, `events` and `format` properties (own or inherited).
 */
function isTraceReadResult(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return ["runs", "events", "format"].every((key) => key in value);
}
|
|
9
|
+
/**
 * Builds the `{ type: "file", path }` descriptor handed to `readers.openTrace`.
 *
 * `URL.pathname` is percent-encoded (a space becomes `%20`) and is not a
 * valid filesystem path on every platform, so `file:` URLs are converted with
 * `fileURLToPath`. Other URL schemes keep the previous behavior and pass
 * `pathname` through unchanged.
 *
 * @param {string | URL} value - Filesystem path or URL pointing at the trace.
 * @returns {{ type: "file", path: string }} Reader input descriptor.
 * @throws {TypeError} When a `file:` URL cannot be converted to a local path
 *   (for example, a non-local host on POSIX).
 */
function traceInputFrom(value) {
  if (!(value instanceof URL)) {
    return { type: "file", path: value };
  }
  const path = value.protocol === "file:" ? node_url.fileURLToPath(value) : value.pathname;
  return { type: "file", path };
}
|
|
12
|
+
/**
 * Turns the caller's `input` into a loaded trace plus the run id to evaluate.
 *
 * Accepted inputs, checked in this order:
 *  1. a string, opened through `readers.openTrace`;
 *  2. an object that already looks like a trace read result;
 *  3. an object whose `trace` property is either a read result or something
 *     `traceInputFrom` accepts, with optional `runId` / `format` properties.
 *
 * `options.runId` and `options.format` take precedence over the same fields
 * on `input`. A format of `"auto"` (or none) is not forwarded to the reader.
 *
 * @param {*} input - Trace source as described above.
 * @param {{ format?: string, runId?: string }} options - Caller options.
 * @returns {Promise<object>} `{ read, runId, diagnostics: [] }` on success, or
 *   `{ diagnostics: [diagnostic] }` when anything thrown while resolving.
 */
async function resolveRead(input, options) {
  // Only an explicit, non-"auto" format is forwarded to the reader.
  const readerOptions = (format) => (format !== void 0 && format !== "auto" ? { format } : {});
  try {
    if (typeof input === "string") {
      const opened = await readers.openTrace(traceInputFrom(input), readerOptions(options.format));
      return { read: opened, runId: options.runId, diagnostics: [] };
    }
    if (isTraceReadResult(input)) {
      return { read: input, runId: options.runId, diagnostics: [] };
    }
    if (isTraceReadResult(input.trace)) {
      return { read: input.trace, runId: options.runId ?? input.runId, diagnostics: [] };
    }
    const requestedFormat = options.format ?? input.format;
    const loaded = await readers.openTrace(traceInputFrom(input.trace), readerOptions(requestedFormat));
    return { read: loaded, runId: options.runId ?? input.runId, diagnostics: [] };
  } catch (error) {
    return { diagnostics: [diagnosticFromError(error)] };
  }
}
|
|
35
|
+
/**
 * Converts a thrown value into an error-severity eval diagnostic.
 *
 * `readers.TraceReadError` instances map their `code` onto a dedicated
 * diagnostic code for unsupported and ambiguous formats; every other case
 * (including non-Error throwables) is reported as `AI_EVAL_TRACE_UNREADABLE`.
 *
 * @param {unknown} error - Value caught while reading a trace.
 * @returns {{ code: string, severity: "error", message: string }} Diagnostic.
 */
function diagnosticFromError(error) {
  if (!(error instanceof readers.TraceReadError)) {
    const message = error instanceof Error ? error.message : String(error);
    return { code: "AI_EVAL_TRACE_UNREADABLE", severity: "error", message };
  }
  let code;
  switch (error.code) {
    case "unsupported_format":
      code = "AI_EVAL_UNSUPPORTED_FORMAT";
      break;
    case "ambiguous_format":
      code = "AI_EVAL_AMBIGUOUS_FORMAT";
      break;
    default:
      code = "AI_EVAL_TRACE_UNREADABLE";
  }
  return { code, severity: "error", message: error.message };
}
|
|
46
|
+
/**
 * Flattens a forest of nodes depth-first, each parent ahead of its children.
 *
 * @param {Array<{ children: Array }>} nodes - Root nodes; each node exposes a
 *   `children` array of the same shape.
 * @returns {Array} New array with every node in pre-order.
 */
function flatten(nodes) {
  const ordered = [];
  for (const node of nodes) {
    ordered.push(node, ...flatten(node.children));
  }
  return ordered;
}
|
|
49
|
+
/**
 * Picks the run to evaluate from a loaded trace.
 *
 * With an explicit `runId` the matching run is returned, or a diagnostic when
 * it is missing. Without one, the trace must contain exactly one run.
 * Every failure uses the `AI_EVAL_RUN_SELECTION_REQUIRED` code.
 *
 * @param {{ runs: Array<{ runId: string }> }} read - Loaded trace.
 * @param {string} [runId] - Requested run id, if any.
 * @returns {{ run?: object, diagnostics: Array<object> }} The selected run
 *   with no diagnostics, or diagnostics without a `run` property.
 */
function selectRun(read, runId) {
  const selectionError = (message) => ({
    diagnostics: [{ code: "AI_EVAL_RUN_SELECTION_REQUIRED", severity: "error", message }]
  });

  if (runId !== void 0) {
    const match = read.runs.find((candidate) => candidate.runId === runId);
    return match === void 0
      ? selectionError(`Run not found: ${runId}.`)
      : { run: match, diagnostics: [] };
  }

  switch (read.runs.length) {
    case 1:
      return { run: read.runs[0], diagnostics: [] };
    case 0:
      return selectionError("No runs are available for eval.");
    default:
      return selectionError("Multiple runs are available; select a run before eval.");
  }
}
|
|
86
|
+
/**
 * Builds the result object returned when evaluation could not complete.
 *
 * @param {string} format - Trace format to report.
 * @param {Array<{ severity: string }>} diagnostics - Diagnostics to attach;
 *   the array is copied, not shared.
 * @param {string} [runId] - Run id; the property is omitted when undefined.
 * @returns {object} Result with `ok: false`, `status: "error"`, no findings,
 *   and `summary.errors` equal to the number of error-severity diagnostics.
 */
function errorResult(format, diagnostics, runId) {
  let errorCount = 0;
  diagnostics.forEach((item) => {
    if (item.severity === "error") {
      errorCount += 1;
    }
  });

  const result = { ok: false, status: "error", format };
  if (runId !== void 0) {
    result.runId = runId;
  }
  result.summary = { passed: 0, failed: 0, warnings: 0, errors: errorCount };
  result.findings = [];
  result.diagnostics = [...diagnostics];
  return result;
}
|
|
98
|
+
/**
 * Fills in defaults on a finding returned by a rule.
 *
 * - `ruleId` falls back to the rule's id when missing or empty.
 * - `severity` falls back to the rule's severity, then to `"error"`.
 * - `evidence` is copied so later changes to the rule's array do not leak
 *   into the result. A finding without `evidence` gets an empty list rather
 *   than throwing while being spread.
 *
 * @param {{ id: string, severity?: string }} rule - Rule that produced the finding.
 * @param {object} finding - Raw finding returned by `rule.evaluate`.
 * @returns {object} New finding object; the input is not mutated.
 */
function normalizeFinding(rule, finding) {
  return {
    ...finding,
    ruleId: finding.ruleId || rule.id,
    severity: finding.severity ?? rule.severity ?? "error",
    evidence: [...(finding.evidence ?? [])]
  };
}
|
|
106
|
+
/**
 * Sort comparator for findings.
 *
 * Compares, in order: rule id, then the first evidence entry's `runId`,
 * `eventId` and `path` (each treated as "" when absent), then the message.
 * The first non-zero `localeCompare` result wins.
 *
 * @param {object} a - Left finding.
 * @param {object} b - Right finding.
 * @returns {number} Negative, zero or positive per `Array.prototype.sort`.
 */
function compareFindings(a, b) {
  const sortKeys = (finding) => {
    const primary = finding.evidence[0];
    return [
      finding.ruleId,
      primary?.runId ?? "",
      primary?.eventId ?? "",
      primary?.path ?? "",
      finding.message
    ];
  };
  const left = sortKeys(a);
  const right = sortKeys(b);
  for (let index = 0; index < left.length; index += 1) {
    const order = left[index].localeCompare(right[index]);
    if (order !== 0) {
      return order;
    }
  }
  return 0;
}
|
|
111
|
+
/**
 * Aggregates findings into the result summary.
 *
 * `failed` and `warnings` count findings by status and `errors` counts
 * findings with error severity. `passed` counts rules: it is `ruleCount`
 * minus the number of distinct rule ids that reported at least one failing or
 * warning finding. Subtracting finding counts instead would under-report
 * passed rules whenever one rule emits several findings.
 *
 * @param {Array<{ ruleId: string, status: string, severity: string }>} findings
 *   Normalized findings.
 * @param {number} ruleCount - Number of rules that were evaluated.
 * @returns {{ passed: number, failed: number, warnings: number, errors: number }}
 */
function summarize(findings, ruleCount) {
  const unmetRuleIds = new Set();
  let failed = 0;
  let warnings = 0;
  let errors = 0;
  for (const item of findings) {
    if (item.status === "fail") {
      failed += 1;
      unmetRuleIds.add(item.ruleId);
    } else if (item.status === "warning") {
      warnings += 1;
      unmetRuleIds.add(item.ruleId);
    }
    if (item.severity === "error") {
      errors += 1;
    }
  }
  return {
    // Clamp in case findings carry rule ids outside the evaluated rule set.
    passed: Math.max(0, ruleCount - unmetRuleIds.size),
    failed,
    warnings,
    errors
  };
}
|
|
122
|
+
/**
 * Rules applied when the caller does not pass `options.checks`.
 *
 * @returns {Array<object>} A single-element list holding the
 *   `checks.requireSuccess()` rule.
 */
function defaultRules() {
  const successRule = checks.requireSuccess();
  return [successRule];
}
|
|
125
|
+
/**
 * Evaluates one run of a trace against a set of rules.
 *
 * Steps: resolve the trace, select the run, evaluate the rules in rule-id
 * order, then sort and summarize the findings. A rule that throws is recorded
 * as an `AI_EVAL_RULE_FAILED` diagnostic; if any rule threw, an error result
 * is returned and the findings collected so far are not reported.
 *
 * @param {*} input - Trace source accepted by `resolveRead`.
 * @param {{ checks?: Iterable<object>, runId?: string, format?: string }} [options]
 *   Rules to run (defaults to `defaultRules()`) plus run/format selection.
 * @returns {Promise<object>} Result with `ok`, `status`, `format`, `runId`,
 *   `summary`, `findings` and `diagnostics`.
 */
async function evalRun(input, options = {}) {
  const resolved = await resolveRead(input, options);
  const trace = resolved.read;
  if (trace === void 0) {
    return errorResult("unknown", resolved.diagnostics);
  }

  const selection = selectRun(trace, resolved.runId);
  const run = selection.run;
  if (run === void 0) {
    return errorResult(trace.format, selection.diagnostics, resolved.runId);
  }

  // Evaluate in rule-id order regardless of how the caller ordered the rules.
  const rules = [...(options.checks ?? defaultRules())];
  rules.sort((left, right) => left.id.localeCompare(right.id));

  const context = {
    format: trace.format,
    run,
    nodes: flatten(run.children),
    events: trace.events.filter((event) => event.runId === run.runId)
  };

  const ruleFailures = [];
  const findings = [];
  for (const rule of rules) {
    try {
      const produced = rule.evaluate(context).map((finding) => normalizeFinding(rule, finding));
      findings.push(...produced);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      ruleFailures.push({
        code: "AI_EVAL_RULE_FAILED",
        severity: "error",
        ruleId: rule.id,
        message: `Eval rule ${rule.id} failed: ${reason}`
      });
    }
  }
  if (ruleFailures.length > 0) {
    return errorResult(trace.format, ruleFailures, run.runId);
  }

  findings.sort(compareFindings);
  const summary = summarize(findings, rules.length);
  const passedAll = summary.failed <= 0;
  return {
    ok: passedAll,
    status: passedAll ? "pass" : "fail",
    format: trace.format,
    runId: run.runId,
    summary,
    findings,
    diagnostics: []
  };
}
|
|
173
|
+
/**
 * Builds a single-entry evidence list that points at a run.
 *
 * @param {{ runId: string }} run - Run being referenced.
 * @param {string} [path] - Field path within the run; omitted when undefined.
 * @returns {Array<{ runId: string, path?: string }>} One evidence entry.
 */
function evidenceForRun(run, path) {
  const entry = { runId: run.runId };
  if (path !== void 0) {
    entry.path = path;
  }
  return [entry];
}
|
|
176
|
+
/**
 * Builds a single-entry evidence list that points at an event.
 *
 * @param {{ runId: string, eventId: string, parentId?: string, kind: string, name: string }} event
 *   Event being referenced.
 * @param {string} [path] - Field path within the event; omitted when undefined.
 * @returns {Array<object>} One entry carrying the event's identifying fields;
 *   `parentId` is included only when the event defines it.
 */
function evidenceForEvent(event, path) {
  const entry = { runId: event.runId, eventId: event.eventId };
  if (event.parentId !== void 0) {
    entry.parentId = event.parentId;
  }
  entry.kind = event.kind;
  entry.name = event.name;
  if (path !== void 0) {
    entry.path = path;
  }
  return [entry];
}
|
|
188
|
+
/**
 * Creates a failing finding with error severity.
 *
 * @param {string} ruleId - Id of the rule reporting the failure.
 * @param {string} message - Human-readable description.
 * @param {Array<object>} evidence - Evidence entries (stored by reference).
 * @param {*} [expected] - Expected value; omitted when undefined.
 * @param {*} [actual] - Observed value; omitted when undefined.
 * @returns {object} Finding with `status: "fail"` and `severity: "error"`.
 */
function fail(ruleId, message, evidence, expected, actual) {
  const finding = { ruleId, status: "fail", severity: "error", message };
  if (expected !== void 0) {
    finding.expected = expected;
  }
  if (actual !== void 0) {
    finding.actual = actual;
  }
  finding.evidence = evidence;
  return finding;
}
|
|
199
|
+
/**
 * Lists the event names of all nodes of one kind, sorted with `localeCompare`.
 *
 * @param {Array<{ event: { kind: string, name: string } }>} nodes - Nodes to scan.
 * @param {string} kind - Event kind to keep.
 * @returns {string[]} Sorted names; duplicates are preserved.
 */
function nodeNames(nodes, kind) {
  const names = [];
  nodes.forEach(({ event }) => {
    if (event.kind === kind) {
      names.push(event.name);
    }
  });
  names.sort((left, right) => left.localeCompare(right));
  return names;
}
|
|
202
|
+
/**
 * Tells whether a node's event defines `key` as an own attribute.
 *
 * @param {{ event: { attributes?: object } }} node - Node to inspect.
 * @param {string} key - Attribute name.
 * @returns {boolean} False when the event has no attributes or `key` is not
 *   an own property; inherited properties do not count.
 */
function hasAttribute(node, key) {
  const { attributes } = node.event;
  if (attributes === void 0) {
    return false;
  }
  return Object.prototype.hasOwnProperty.call(attributes, key);
}
|
|
205
|
+
/**
 * Reads the first finite numeric attribute among several candidate keys.
 *
 * @param {{ event: { attributes?: object } }} node - Node to inspect.
 * @param {Iterable<string>} keys - Attribute names, in priority order.
 * @returns {number | undefined} The first attribute value that is a finite
 *   number, or undefined when there are no attributes or none qualifies.
 */
function numericAttribute(node, keys) {
  const bag = node.event.attributes;
  if (bag === void 0) {
    return void 0;
  }
  const candidates = Array.from(keys, (key) => bag[key]);
  return candidates.find((value) => typeof value === "number" && Number.isFinite(value));
}
|
|
214
|
+
/**
 * Sums token usage across events.
 *
 * For each event, `tokenUsage.total` is used when it is defined; otherwise
 * `input` and `output` are added, each counting as 0 when absent. Events
 * without `tokenUsage` contribute nothing.
 *
 * @param {Array<{ tokenUsage?: { total?: number, input?: number, output?: number } }>} events
 * @returns {number} Accumulated token count (0 for an empty list).
 */
function totalTokenCount(events) {
  let sum = 0;
  for (const event of events) {
    const usage = event.tokenUsage;
    if (usage?.total !== void 0) {
      sum = sum + usage.total;
    } else {
      sum = sum + (usage?.input ?? 0) + (usage?.output ?? 0);
    }
  }
  return sum;
}
|
|
221
|
+
/**
 * Assembles a rule object for the built-in checks.
 *
 * @param {string} id - Rule identifier.
 * @param {string} category - Rule category label.
 * @param {(context: object) => Array<object>} evaluate - Returns the rule's findings.
 * @returns {{ id: string, category: string, severity: "error", evaluate: Function }}
 *   Rule whose severity is always `"error"`.
 */
function createRule(id, category, evaluate) {
  const rule = { id, category };
  rule.severity = "error";
  rule.evaluate = evaluate;
  return rule;
}
|
|
224
|
+
// Attribute names searched for answer text when a check is not given `answerKeys`.
var DEFAULT_ANSWER_KEYS = [
  "answer", "finalAnswer", "final", "response", "result",
  "output", "outputPreview", "completion", "text"
];

// Attribute names searched for context text when a check is not given `contextKeys`.
var DEFAULT_CONTEXT_KEYS = [
  "context", "contexts", "document", "documents", "retrieved", "retrieval",
  "chunks", "chunk", "source", "sources", "sourceText"
];

// Attribute names treated as citation fields when a check is not given `citationKeys`.
var DEFAULT_CITATION_KEYS = [
  "citation", "citations", "reference", "references",
  "sourceId", "sourceIds", "source_id", "source_ids"
];

// Source-id lookup uses every citation key plus generic id-style names.
var DEFAULT_SOURCE_ID_KEYS = DEFAULT_CITATION_KEYS.concat([
  "id", "ids", "documentId", "documentIds", "docId", "docIds"
]);

// Lower-case phrases matched by `checks.bannedUnsupportedPhrases` by default.
var DEFAULT_BANNED_UNSUPPORTED_PHRASES = [
  "i don't have enough information",
  "i do not have enough information",
  "not enough context",
  "cannot determine from the context",
  "unable to determine from the provided context"
];

// Words dropped by `tokenize`.
var STOP_WORDS = /* @__PURE__ */ new Set(
  "a an and are but for from have into not that the their this was were with you your".split(" ")
);
|
|
295
|
+
/**
 * Canonicalizes an attribute name for case- and punctuation-insensitive
 * matching: lower-cases it and removes everything except `a-z` and `0-9`.
 *
 * @param {string} key - Attribute name, e.g. `"source_ids"`.
 * @returns {string} Normalized name, e.g. `"sourceids"`.
 */
function normalizeKey(key) {
  const lowered = key.toLowerCase();
  // Splitting on runs of disallowed characters and re-joining strips them.
  return lowered.split(/[^a-z0-9]+/).join("");
}
|
|
298
|
+
/**
 * Builds a lookup set of normalized attribute names.
 *
 * @param {string[]} keys - Attribute names in any casing or punctuation.
 * @returns {Set<string>} Set of `normalizeKey` results.
 */
function keySet(keys) {
  const normalized = new Set();
  for (const key of keys) {
    normalized.add(normalizeKey(key));
  }
  return normalized;
}
|
|
301
|
+
/**
 * Collects the primitive leaves of a value as strings.
 *
 * Strings are trimmed and dropped when empty; numbers and booleans are
 * stringified; arrays and plain objects are walked recursively. Anything else
 * (null, undefined, functions, symbols, bigints) yields nothing.
 *
 * @param {*} value - Value to walk.
 * @param {number} [depth=0] - Current nesting level; values deeper than
 *   level 5 are ignored.
 * @returns {string[]} Leaf strings in traversal order.
 */
function valuesAsStrings(value, depth = 0) {
  if (depth > 5) {
    return [];
  }
  switch (typeof value) {
    case "string": {
      const text = value.trim();
      return text.length > 0 ? [text] : [];
    }
    case "number":
    case "boolean":
      return [String(value)];
    case "object": {
      if (value === null) {
        return [];
      }
      const children = Array.isArray(value) ? value : Object.values(value);
      return children.flatMap((child) => valuesAsStrings(child, depth + 1));
    }
    default:
      return [];
  }
}
|
|
318
|
+
/**
 * Gathers text from node attributes whose names match `keys`.
 *
 * Attribute names are compared after `normalizeKey`, and each matching value
 * is expanded with `valuesAsStrings`. Nodes are visited with preferred kinds
 * first and, within each group, in `eventId` order (`localeCompare`).
 *
 * @param {Array<object>} nodes - Nodes whose `event.attributes` are scanned.
 * @param {string[]} keys - Attribute names to collect.
 * @param {string[]} [preferredKinds=[]] - Event kinds to visit first.
 * @returns {Array<{ text: string, node: object, path: string }>} One entry
 *   per extracted string, with `path` of the form `attributes.<key>`.
 */
function collectTextFields(nodes, keys, preferredKinds = []) {
  const wantedKeys = keySet(keys);
  const preferredKindSet = new Set(preferredKinds);
  const rank = (node) => (preferredKindSet.has(node.event.kind) ? 0 : 1);

  const ordered = Array.from(nodes).sort(
    (left, right) => rank(left) - rank(right) || left.event.eventId.localeCompare(right.event.eventId)
  );

  return ordered.flatMap((node) => {
    const attributes = node.event.attributes;
    if (attributes === void 0) {
      return [];
    }
    return Object.entries(attributes)
      .filter(([key]) => wantedKeys.has(normalizeKey(key)))
      .flatMap(([key, value]) =>
        valuesAsStrings(value).map((text) => ({ text, node, path: `attributes.${key}` }))
      );
  });
}
|
|
339
|
+
/**
 * Splits text into lower-case terms for overlap and length checks.
 *
 * A candidate starts with a letter or digit followed by at least two more
 * letters, digits, apostrophes or hyphens. Leading and trailing apostrophes
 * and hyphens are stripped, then terms shorter than three characters and
 * entries of `STOP_WORDS` are discarded.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Terms in order of appearance (duplicates kept).
 */
function tokenize(text) {
  const tokens = [];
  for (const [raw] of text.toLowerCase().matchAll(/[a-z0-9][a-z0-9'-]{2,}/g)) {
    const token = raw.replace(/^['-]+|['-]+$/g, "");
    if (token.length > 2 && !STOP_WORDS.has(token)) {
      tokens.push(token);
    }
  }
  return tokens;
}
|
|
342
|
+
/**
 * Chooses evidence for a text-based finding.
 *
 * @param {Array<{ node: { event: object }, path: string }>} fields - Collected text fields.
 * @param {object} run - Run used when no field is available.
 * @param {string} [path] - Path reported with the run-level evidence.
 * @returns {Array<object>} Evidence for the first field's event (using that
 *   field's own path), or run-level evidence with `path` when `fields` has
 *   no first entry.
 */
function firstEvidence(fields, run, path) {
  const head = fields[0];
  if (head !== void 0) {
    return evidenceForEvent(head.node.event, head.path);
  }
  return evidenceForRun(run, path);
}
|
|
346
|
+
/**
 * Collects the distinct source ids found in node attributes.
 *
 * Attribute names are matched after `normalizeKey`; matching values are
 * expanded with `valuesAsStrings`, and only non-empty candidates of at most
 * 128 characters are kept.
 *
 * @param {Array<object>} nodes - Nodes whose `event.attributes` are scanned.
 * @param {string[]} keys - Attribute names that may hold source ids.
 * @returns {string[]} Unique ids sorted with `localeCompare`.
 */
function collectSourceIds(nodes, keys) {
  const wantedKeys = keySet(keys);
  const unique = new Set();
  for (const { event } of nodes) {
    if (event.attributes === void 0) {
      continue;
    }
    Object.entries(event.attributes)
      .filter(([key]) => wantedKeys.has(normalizeKey(key)))
      .flatMap(([, value]) => valuesAsStrings(value))
      .map((candidate) => candidate.trim())
      .filter((id) => id.length > 0 && id.length <= 128)
      .forEach((id) => unique.add(id));
  }
  return Array.from(unique).sort((left, right) => left.localeCompare(right));
}
|
|
362
|
+
/**
 * Counts citations attached to an answer.
 *
 * Inline markers are bracketed spans of 1-40 characters (`[1]`, `[doc-3]`)
 * and parenthesized identifiers that start with a letter (`(Smith)`); each
 * entry of `citationFields` counts as one more citation.
 *
 * @param {string} answer - Answer text to scan for inline markers.
 * @param {Array} citationFields - Citation fields collected from attributes.
 * @returns {number} Inline marker count plus `citationFields.length`.
 */
function citationCount(answer, citationFields) {
  const inlinePattern = /\[[^\]\n]{1,40}\]|\([A-Za-z][A-Za-z0-9_-]{0,39}\)/g;
  const inlineMatches = answer.match(inlinePattern);
  const inlineTotal = inlineMatches === null ? 0 : inlineMatches.length;
  return inlineTotal + citationFields.length;
}
|
|
366
|
+
/**
 * Extracts quoted passages from text.
 *
 * Recognizes single-line spans wrapped in straight double quotes, straight
 * single quotes, or curly double quotes. Snippets are trimmed, filtered by
 * minimum length, de-duplicated and sorted with `localeCompare`.
 *
 * @param {string} text - Text to scan.
 * @param {number} minLength - Minimum trimmed snippet length to keep.
 * @returns {string[]} Unique snippets in sorted order.
 */
function quotedSnippets(text, minLength) {
  const pattern = /"([^"\n]+)"|'([^'\n]+)'|“([^”\n]+)”/g;
  const found = Array.from(text.matchAll(pattern), ([, straightDouble, straightSingle, curlyDouble]) =>
    (straightDouble ?? straightSingle ?? curlyDouble ?? "").trim()
  );
  const longEnough = found.filter((snippet) => snippet.length >= minLength);
  return Array.from(new Set(longEnough)).sort((left, right) => left.localeCompare(right));
}
|
|
375
|
+
/**
 * Counts the terms `tokenize` extracts from `text`. This is not a raw
 * whitespace word count: whatever `tokenize` discards is not counted.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Number of tokens.
 */
function wordCount(text) {
  const tokens = tokenize(text);
  return tokens.length;
}
|
|
378
|
+
/**
 * Built-in rule factories.
 *
 * Every method returns a rule built with `createRule`. The rule's
 * `evaluate(context)` receives `{ format, run, nodes, events }` and returns an
 * array of `fail(...)` findings, which is empty when the check passes.
 */
var checks = {
  /** Fails unless the run's status is `"ok"`. */
  requireSuccess() {
    return createRule(
      "eval.requireSuccess",
      "run",
      (context) => context.run.status === "ok" ? [] : [
        fail(
          "eval.requireSuccess",
          "Run did not complete successfully.",
          evidenceForRun(context.run, "status"),
          "ok",
          context.run.status ?? "unknown"
        )
      ]
    );
  },

  /**
   * Emits one failure per required tool name that has no TOOL node.
   * @param {Iterable<string>} required - Tool names that must appear.
   */
  requiredTools(required) {
    const expected = [...required].sort((a, b) => a.localeCompare(b));
    return createRule("eval.requiredTools", "tool", (context) => {
      const tools = new Set(nodeNames(context.nodes, "TOOL"));
      return expected.filter((name) => !tools.has(name)).map(
        (name) => fail(
          "eval.requiredTools",
          `Required tool ${name} did not appear.`,
          evidenceForRun(context.run, "children"),
          name,
          [...tools].sort((a, b) => a.localeCompare(b))
        )
      );
    });
  },

  /**
   * Emits one failure per TOOL node whose name is forbidden.
   * @param {Iterable<string>} forbidden - Tool names that must not appear.
   */
  forbiddenTools(forbidden) {
    // Set lookup avoids a linear scan of the forbidden list for every node.
    const blocked = new Set(forbidden);
    return createRule(
      "eval.forbiddenTools",
      "tool",
      (context) => context.nodes.filter((node) => node.event.kind === "TOOL" && blocked.has(node.event.name)).map(
        (node) => fail(
          "eval.forbiddenTools",
          `Forbidden tool ${node.event.name} appeared.`,
          evidenceForEvent(node.event, "name"),
          "tool absent",
          node.event.name
        )
      )
    );
  },

  /**
   * Fails when the run reports a `durationMs` above the limit. Runs without a
   * duration pass.
   * @param {number} maxDurationMs - Upper bound in milliseconds.
   */
  maxDurationMs(maxDurationMs) {
    return createRule(
      "eval.maxDurationMs",
      "run",
      (context) => context.run.durationMs !== void 0 && context.run.durationMs > maxDurationMs ? [
        fail(
          "eval.maxDurationMs",
          `Run duration exceeded ${maxDurationMs}ms.`,
          evidenceForRun(context.run, "durationMs"),
          { maxDurationMs },
          context.run.durationMs
        )
      ] : []
    );
  },

  /**
   * Fails when the deepest node's `depth` exceeds the limit.
   * @param {number} maxDepth - Largest allowed node depth.
   */
  maxDepth(maxDepth) {
    return createRule("eval.maxDepth", "structure", (context) => {
      const deepest = context.nodes.reduce(
        (current, node) => current === void 0 || node.depth > current.depth ? node : current,
        void 0
      );
      return deepest !== void 0 && deepest.depth > maxDepth ? [
        fail(
          "eval.maxDepth",
          `Run tree depth exceeded ${maxDepth}.`,
          evidenceForEvent(deepest.event, "depth"),
          { maxDepth },
          deepest.depth
        )
      ] : [];
    });
  },

  /**
   * Emits one failure per node whose retry attribute exceeds the limit. The
   * first finite numeric value among `retryCount`, `retries` and `attempt`
   * is used, and the evidence path names the attribute that actually matched.
   * @param {number} maxRetries - Largest allowed retry count.
   */
  maxRetries(maxRetries) {
    const retryKeys = ["retryCount", "retries", "attempt"];
    return createRule(
      "eval.maxRetries",
      "structure",
      (context) => context.nodes.flatMap((node) => {
        const matchedKey = retryKeys.find((key) => numericAttribute(node, [key]) !== void 0);
        if (matchedKey === void 0) return [];
        const retries = numericAttribute(node, [matchedKey]);
        return retries > maxRetries ? [
          fail(
            "eval.maxRetries",
            `Retry count exceeded ${maxRetries}.`,
            evidenceForEvent(node.event, `attributes.${matchedKey}`),
            { maxRetries },
            retries
          )
        ] : [];
      })
    );
  },

  /**
   * Fails when the summed token usage of the run's events exceeds the limit.
   * @param {number} maxTotalTokens - Largest allowed token total.
   */
  maxTotalTokens(maxTotalTokens) {
    return createRule("eval.maxTotalTokens", "llm", (context) => {
      const total = totalTokenCount(context.events);
      return total > maxTotalTokens ? [
        fail(
          "eval.maxTotalTokens",
          `Total token usage exceeded ${maxTotalTokens}.`,
          evidenceForRun(context.run, "tokenUsage.total"),
          { maxTotalTokens },
          total
        )
      ] : [];
    });
  },

  /** Emits one failure per node with status `"error"` or kind `"ERROR"`. */
  noFailedSteps() {
    return createRule(
      "eval.noFailedSteps",
      "run",
      (context) => context.nodes.filter((node) => node.event.status === "error" || node.event.kind === "ERROR").map(
        (node) => fail(
          "eval.noFailedSteps",
          "Run contains a failed step or error node.",
          evidenceForEvent(node.event, "status"),
          "no failed nodes",
          node.event.status ?? node.event.kind
        )
      )
    );
  },

  /**
   * Fails when no RETRIEVER node precedes the first LLM node in
   * `context.nodes`. Runs without an LLM node pass.
   */
  requiredRetrievalBeforeGeneration() {
    return createRule("eval.requiredRetrievalBeforeGeneration", "retrieval", (context) => {
      const firstLlmIndex = context.nodes.findIndex((node) => node.event.kind === "LLM");
      if (firstLlmIndex === -1) return [];
      const retrievalIndex = context.nodes.findIndex(
        (node, index) => index < firstLlmIndex && node.event.kind === "RETRIEVER"
      );
      return retrievalIndex === -1 ? [
        fail(
          "eval.requiredRetrievalBeforeGeneration",
          "No retrieval step appeared before the first LLM generation.",
          evidenceForEvent(context.nodes[firstLlmIndex].event, "kind"),
          "RETRIEVER before LLM",
          "LLM before RETRIEVER"
        )
      ] : [];
    });
  },

  /**
   * Requires every DECISION node to define each listed attribute; also fails
   * when the run has no DECISION node at all.
   * @param {Iterable<string>} keys - Attribute names every decision must carry.
   */
  requiredDecisionMetadata(keys) {
    const required = [...keys].sort((a, b) => a.localeCompare(b));
    return createRule("eval.requiredDecisionMetadata", "structure", (context) => {
      const decisions = context.nodes.filter((node) => node.event.kind === "DECISION");
      if (decisions.length === 0) {
        return [
          fail(
            "eval.requiredDecisionMetadata",
            "No decision node is available for required metadata.",
            evidenceForRun(context.run, "children"),
            { decisionMetadata: required },
            "no decision nodes"
          )
        ];
      }
      return decisions.flatMap(
        (node) => required.filter((key) => !hasAttribute(node, key)).map(
          (key) => fail(
            "eval.requiredDecisionMetadata",
            `Decision metadata ${key} is missing.`,
            evidenceForEvent(node.event, `attributes.${key}`),
            key,
            "missing"
          )
        )
      );
    });
  },

  /**
   * Compares distinct answer terms with distinct context terms. Fails when
   * answer or context text is missing, when fewer than `minSharedTerms` terms
   * are shared, or when the shared fraction of answer terms is below
   * `minOverlap`.
   * @param {object} [options] - `minOverlap` (default 0.1), `minSharedTerms`
   *   (default 1), `answerKeys`, `contextKeys`.
   */
  contextOverlap(options = {}) {
    const minOverlap = options.minOverlap ?? 0.1;
    const minSharedTerms = options.minSharedTerms ?? 1;
    const answerKeys = options.answerKeys ?? DEFAULT_ANSWER_KEYS;
    const contextKeys = options.contextKeys ?? DEFAULT_CONTEXT_KEYS;
    return createRule("eval.contextOverlap", "retrieval", (context) => {
      const answers = collectTextFields(context.nodes, answerKeys, ["RESULT", "LLM", "AGENT"]);
      const contexts = collectTextFields(context.nodes, contextKeys, ["RETRIEVER", "TOOL"]);
      if (answers.length === 0 || contexts.length === 0) {
        return [
          fail(
            "eval.contextOverlap",
            "Answer and context text are required for overlap evaluation.",
            firstEvidence(answers.length > 0 ? answers : contexts, context.run, "children"),
            { answer: "present", context: "present" },
            { answerFields: answers.length, contextFields: contexts.length }
          )
        ];
      }
      const answerTerms = new Set(tokenize(answers.map((field) => field.text).join(" ")));
      const contextTerms = new Set(tokenize(contexts.map((field) => field.text).join(" ")));
      const sharedTerms = [...answerTerms].filter((term) => contextTerms.has(term)).length;
      const overlap = answerTerms.size === 0 ? 0 : sharedTerms / answerTerms.size;
      return sharedTerms < minSharedTerms || overlap < minOverlap ? [
        fail(
          "eval.contextOverlap",
          "Answer text did not sufficiently overlap retrieved context.",
          firstEvidence(answers, context.run, "attributes.answer"),
          { minOverlap, minSharedTerms },
          {
            answerTerms: answerTerms.size,
            contextTerms: contextTerms.size,
            sharedTerms,
            overlap: Number(overlap.toFixed(4))
          }
        )
      ] : [];
    });
  },

  /**
   * Requires quoted passages in the answer to occur (case-insensitively) in
   * the context text. With `requireQuote` (default true) an answer without
   * any qualifying quote also fails.
   * @param {object} [options] - `answerKeys`, `contextKeys`, `minQuoteLength`
   *   (default 6), `requireQuote` (default true).
   */
  quoteOverlap(options = {}) {
    const answerKeys = options.answerKeys ?? DEFAULT_ANSWER_KEYS;
    const contextKeys = options.contextKeys ?? DEFAULT_CONTEXT_KEYS;
    const minQuoteLength = options.minQuoteLength ?? 6;
    const requireQuote = options.requireQuote ?? true;
    return createRule("eval.quoteOverlap", "retrieval", (context) => {
      const answers = collectTextFields(context.nodes, answerKeys, ["RESULT", "LLM", "AGENT"]);
      const contexts = collectTextFields(context.nodes, contextKeys, ["RETRIEVER", "TOOL"]);
      const answerText = answers.map((field) => field.text).join(" ");
      const contextText = contexts.map((field) => field.text).join(" ").toLowerCase();
      const quotes = quotedSnippets(answerText, minQuoteLength);
      if (quotes.length === 0) {
        return requireQuote ? [
          fail(
            "eval.quoteOverlap",
            "Answer did not contain a quote for overlap evaluation.",
            firstEvidence(answers, context.run, "attributes.answer"),
            { quotedText: "present" },
            { quoteCount: 0 }
          )
        ] : [];
      }
      const missing = quotes.filter((quote) => !contextText.includes(quote.toLowerCase()));
      return missing.length > 0 ? [
        fail(
          "eval.quoteOverlap",
          "Quoted answer text did not appear in retrieved context.",
          firstEvidence(answers, context.run, "attributes.answer"),
          { allQuotesInContext: true },
          { quoteCount: quotes.length, missingQuotes: missing.length }
        )
      ] : [];
    });
  },

  /**
   * Fails when the answer has no inline citation marker and no citation
   * attribute is present on any node.
   * @param {object} [options] - `answerKeys`, `citationKeys`.
   */
  citationPresence(options = {}) {
    const answerKeys = options.answerKeys ?? DEFAULT_ANSWER_KEYS;
    const citationKeys = options.citationKeys ?? DEFAULT_CITATION_KEYS;
    return createRule("eval.citationPresence", "retrieval", (context) => {
      const answers = collectTextFields(context.nodes, answerKeys, ["RESULT", "LLM", "AGENT"]);
      const citations = collectTextFields(context.nodes, citationKeys);
      const count = citationCount(answers.map((field) => field.text).join(" "), citations);
      return count === 0 ? [
        fail(
          "eval.citationPresence",
          "Answer did not include citations or source references.",
          firstEvidence(answers, context.run, "attributes.answer"),
          { citationCount: ">= 1" },
          { citationCount: 0 }
        )
      ] : [];
    });
  },

  /**
   * Fails when any required source id is absent from the ids collected from
   * node attributes. At most 20 available ids are echoed in the finding.
   * @param {Iterable<string>} requiredIds - Source ids that must be present.
   * @param {object} [options] - `sourceIdKeys`.
   */
  requiredSourceIds(requiredIds, options = {}) {
    const expected = [...requiredIds].sort((a, b) => a.localeCompare(b));
    const sourceIdKeys = options.sourceIdKeys ?? DEFAULT_SOURCE_ID_KEYS;
    return createRule("eval.requiredSourceIds", "retrieval", (context) => {
      const available = collectSourceIds(context.nodes, sourceIdKeys);
      const availableSet = new Set(available);
      const missing = expected.filter((id) => !availableSet.has(id));
      return missing.length > 0 ? [
        fail(
          "eval.requiredSourceIds",
          "Required source IDs were not present in trace context or citations.",
          evidenceForRun(context.run, "children"),
          { sourceIds: expected },
          { missingSourceIds: missing, availableSourceIds: available.slice(0, 20) }
        )
      ] : [];
    });
  },

  /**
   * Fails when the answer is empty or falls outside the configured character
   * or word bounds. Words are counted with `wordCount`.
   * @param {object} [options] - `minCharacters`, `maxCharacters`, `minWords`,
   *   `maxWords`, `answerKeys`. Defaults to `{}` like the sibling factories,
   *   in which case only an empty answer fails.
   */
  answerLengthBounds(options = {}) {
    const answerKeys = options.answerKeys ?? DEFAULT_ANSWER_KEYS;
    return createRule("eval.answerLengthBounds", "llm", (context) => {
      const answers = collectTextFields(context.nodes, answerKeys, ["RESULT", "LLM", "AGENT"]);
      const answer = answers.map((field) => field.text).join(" ").trim();
      const characters = answer.length;
      const words = wordCount(answer);
      const tooShort = options.minCharacters !== void 0 && characters < options.minCharacters || options.minWords !== void 0 && words < options.minWords;
      const tooLong = options.maxCharacters !== void 0 && characters > options.maxCharacters || options.maxWords !== void 0 && words > options.maxWords;
      return answer.length === 0 || tooShort || tooLong ? [
        fail(
          "eval.answerLengthBounds",
          "Answer length fell outside required bounds.",
          firstEvidence(answers, context.run, "attributes.answer"),
          {
            minCharacters: options.minCharacters,
            maxCharacters: options.maxCharacters,
            minWords: options.minWords,
            maxWords: options.maxWords
          },
          { characters, words }
        )
      ] : [];
    });
  },

  /**
   * Fails when the lower-cased answer contains any banned phrase. Only counts
   * are reported, not the matched phrases themselves.
   * @param {Iterable<string>} [phrases] - Phrases to ban; defaults to
   *   `DEFAULT_BANNED_UNSUPPORTED_PHRASES`.
   * @param {object} [options] - `answerKeys`.
   */
  bannedUnsupportedPhrases(phrases = DEFAULT_BANNED_UNSUPPORTED_PHRASES, options = {}) {
    const answerKeys = options.answerKeys ?? DEFAULT_ANSWER_KEYS;
    const banned = [...phrases].map((phrase) => phrase.toLowerCase()).sort();
    return createRule("eval.bannedUnsupportedPhrases", "safety", (context) => {
      const answers = collectTextFields(context.nodes, answerKeys, ["RESULT", "LLM", "AGENT"]);
      const answer = answers.map((field) => field.text).join(" ").toLowerCase();
      const matches = banned.filter((phrase) => answer.includes(phrase));
      return matches.length > 0 ? [
        fail(
          "eval.bannedUnsupportedPhrases",
          "Answer contained banned unsupported-answer phrasing.",
          firstEvidence(answers, context.run, "attributes.answer"),
          { bannedPhraseCount: banned.length },
          { matchedPhraseCount: matches.length }
        )
      ] : [];
    });
  }
};
|
|
703
|
+
/**
 * Renders an eval result as a Markdown report.
 *
 * The report has a title, status/format/run/summary lines, then optional
 * "Diagnostics" and "Findings" sections, each listed one bullet per entry.
 * A finding's bullet ends with the first evidence entry's path in
 * parentheses when that path is truthy.
 *
 * @param {object} result - Result with `status`, `format`, optional `runId`,
 *   `summary`, `diagnostics` and `findings`.
 * @returns {string} Markdown text terminated by a single newline.
 */
function renderEvalMarkdown(result) {
  const { summary } = result;
  const out = [
    "# AgentInspect Eval",
    "",
    `Status: ${result.status}`,
    `Format: ${result.format}`
  ];
  if (result.runId !== void 0) {
    out.push(`Run: ${result.runId}`);
  }
  out.push(
    `Summary: ${summary.passed} passed, ${summary.failed} failed, ${summary.warnings} warnings, ${summary.errors} errors`
  );

  if (result.diagnostics.length > 0) {
    out.push(
      "",
      "## Diagnostics",
      ...result.diagnostics.map((diagnostic) => `- ${diagnostic.code}: ${diagnostic.message}`)
    );
  }

  if (result.findings.length > 0) {
    out.push("", "## Findings");
    result.findings.forEach((finding) => {
      const location = finding.evidence[0]?.path;
      const suffix = location ? ` (${location})` : "";
      out.push(`- ${finding.ruleId}: ${finding.message}${suffix}`);
    });
  }

  return `${out.join("\n")}\n`;
}
|
|
728
|
+
|
|
729
|
+
exports.checks = checks;
|
|
730
|
+
exports.evalRun = evalRun;
|
|
731
|
+
exports.renderEvalMarkdown = renderEvalMarkdown;
|
|
732
|
+
//# sourceMappingURL=index.cjs.map
|
|
733
|
+
//# sourceMappingURL=index.cjs.map
|