@kat-ai/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +114 -0
- package/dist/agent/index.cjs +468 -0
- package/dist/agent/index.cjs.map +1 -0
- package/dist/agent/index.d.cts +170 -0
- package/dist/agent/index.d.ts +170 -0
- package/dist/agent/index.js +466 -0
- package/dist/agent/index.js.map +1 -0
- package/dist/index.cjs +1043 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +93 -0
- package/dist/index.d.ts +93 -0
- package/dist/index.js +1032 -0
- package/dist/index.js.map +1 -0
- package/dist/introspection/index.cjs +476 -0
- package/dist/introspection/index.cjs.map +1 -0
- package/dist/introspection/index.d.cts +107 -0
- package/dist/introspection/index.d.ts +107 -0
- package/dist/introspection/index.js +474 -0
- package/dist/introspection/index.js.map +1 -0
- package/dist/retrieval/index.cjs +312 -0
- package/dist/retrieval/index.cjs.map +1 -0
- package/dist/retrieval/index.d.cts +98 -0
- package/dist/retrieval/index.d.ts +98 -0
- package/dist/retrieval/index.js +310 -0
- package/dist/retrieval/index.js.map +1 -0
- package/dist/types-BJjlqNhg.d.cts +112 -0
- package/dist/types-BJjlqNhg.d.ts +112 -0
- package/package.json +79 -0
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,1043 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var pinecone = require('@pinecone-database/pinecone');
|
|
4
|
+
var openai = require('@ai-sdk/openai');
|
|
5
|
+
var ai = require('ai');
|
|
6
|
+
var zod = require('zod');
|
|
7
|
+
var core = require('@kat/core');
|
|
8
|
+
|
|
9
|
+
// src/introspection/index.ts
|
|
10
|
+
// Zod schema for a single-criterion LLM grade: a 0-100 score, the grader's
// free-text justification, and optional concrete examples that drove the score.
// Passed to ai.generateObject so the model's output is validated/structured.
var GradeSchema = zod.z.object({
  score: zod.z.number().min(0).max(100).describe("Score from 0-100"),
  reasoning: zod.z.string().describe("Explanation for the score"),
  examples: zod.z.array(zod.z.string()).optional().describe("Specific examples that influenced the score")
});
|
|
15
|
+
// Zod schema for grading one piece of content against several criteria in a
// single model call: one { criterion, score, reasoning } entry per criterion,
// plus an overall justification covering the whole evaluation.
var MultiCriteriaGradeSchema = zod.z.object({
  scores: zod.z.array(zod.z.object({
    criterion: zod.z.string(),
    score: zod.z.number().min(0).max(100),
    reasoning: zod.z.string()
  })),
  overallReasoning: zod.z.string()
});
|
|
23
|
+
/**
 * Build an LLM-backed grader around the AI SDK's structured-output API.
 *
 * @param {object} [config] - { openaiApiKey?, model?, temperature? }.
 * @returns {object} Grader with `grade`, `gradeMultiple`, and `gradeRelevance`.
 * @throws {Error} When no OpenAI API key is available.
 */
function createLLMGrader(config = {}) {
  // Explicit key wins over the environment; fail fast when neither is set.
  const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error("OPENAI_API_KEY is required for LLM grading");
  }
  const openai$1 = openai.createOpenAI({ apiKey });
  const model = config.model || core.resolveDefaultOpenAiChatModelId();
  // ?? (not ||) so an explicit temperature of 0 is respected; the low
  // default keeps grading near-deterministic.
  const temperature = config.temperature ?? 0.1;
  return {
    // Grade `content` on one criterion; `rubric` is optional extra guidance
    // that is interpolated into the prompt only when provided.
    async grade(content, criterion, rubric) {
      const prompt = `You are an expert evaluator. Grade the following content on the criterion "${criterion}".

${rubric ? `Rubric: ${rubric}
` : ""}
Content to evaluate:
"""
${content}
"""

Provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      return result.object;
    },
    // Grade against several weighted criteria in a single model call and
    // return one { criterion, score, reasoning } record per criterion.
    async gradeMultiple(content, criteria) {
      const criteriaDescription = criteria.map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`).join("\n");
      const prompt = `You are an expert evaluator. Grade the following content on multiple criteria.

Criteria:
${criteriaDescription}

Content to evaluate:
"""
${content}
"""

For each criterion, provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: MultiCriteriaGradeSchema,
        prompt,
        temperature
      });
      return result.object.scores.map((s) => ({
        criterion: s.criterion,
        score: s.score,
        reasoning: s.reasoning
      }));
    },
    // Grade how well `content` answers `query`; returns { score, reasoning }
    // (drops the optional `examples` field from the schema).
    async gradeRelevance(query, content) {
      const prompt = `You are an expert evaluator. Grade how relevant the following content is to the given query.

Query: "${query}"

Content:
"""
${content}
"""

A score of 100 means the content directly and completely answers the query.
A score of 0 means the content is completely irrelevant.

Provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      return {
        score: result.object.score,
        reasoning: result.object.reasoning
      };
    }
  };
}
|
|
103
|
+
/**
 * One-shot convenience wrapper: build a grader from `config` and grade
 * `content` on a single `criterion` (no rubric).
 *
 * @param {string} content - Text to evaluate.
 * @param {string} criterion - Criterion name to grade against.
 * @param {object} [config] - Grader configuration (see createLLMGrader).
 * @returns {Promise<object>} The grade object ({ score, reasoning, examples? }).
 */
async function gradeWithLLM(content, criterion, config = {}) {
  return createLLMGrader(config).grade(content, criterion);
}
|
|
107
|
+
|
|
108
|
+
// src/utils/metrics.ts
|
|
109
|
+
/**
 * Combine { score, weight } entries into a single weighted-average score,
 * rounded to the nearest integer. Returns 0 for an empty list or when the
 * weights sum to zero (avoids division by zero).
 *
 * @param {Array<{score: number, weight: number}>} scores
 * @returns {number} Rounded weighted average.
 */
function calculateWeightedScore(scores) {
  if (scores.length === 0) return 0;
  let weightTotal = 0;
  let weightedTotal = 0;
  for (const { score, weight } of scores) {
    weightTotal += weight;
    weightedTotal += score * weight;
  }
  if (weightTotal === 0) return 0;
  return Math.round(weightedTotal / weightTotal);
}
|
|
116
|
+
/**
 * Percentage of expected items that were found, rounded to the nearest
 * integer. When nothing was expected the result is a vacuous 100%.
 *
 * @param {number} found - Count of items found.
 * @param {number} expected - Count of items expected.
 * @returns {number} Rounded percentage.
 */
function calculatePercentage(found, expected) {
  return expected === 0 ? 100 : Math.round((found / expected) * 100);
}
|
|
120
|
+
/**
 * Arithmetic mean of a list of numbers; 0 for an empty list.
 *
 * @param {number[]} numbers
 * @returns {number}
 */
function average(numbers) {
  const count = numbers.length;
  if (count === 0) return 0;
  let total = 0;
  for (const value of numbers) {
    total += value;
  }
  return total / count;
}
|
|
124
|
+
/**
 * Restrict `value` to the inclusive range [min, max].
 *
 * @param {number} value
 * @param {number} min - Lower bound.
 * @param {number} max - Upper bound.
 * @returns {number} The clamped value.
 */
function clamp(value, min, max) {
  if (value < min) return min;
  if (value > max) return max;
  return value;
}
|
|
127
|
+
/**
 * Round a raw score and clamp it into the canonical 0-100 range.
 *
 * @param {number} score - Possibly fractional / out-of-range score.
 * @returns {number} Integer in [0, 100].
 */
function normalizeScore(score) {
  const rounded = Math.round(score);
  return clamp(rounded, 0, 100);
}
|
|
130
|
+
|
|
131
|
+
// src/utils/reporters.ts
|
|
132
|
+
/**
 * Render an eval result as a human-readable console report: ANSI-colored
 * pass/fail banner, per-metric progress bars, and (optionally) the evidence
 * entries with up to 3 examples each.
 *
 * @param {object} result - { passed, overallScore, scores, summary, duration, evidence }.
 * @param {object} [options] - { includeEvidence?: boolean } (default true).
 * @returns {string} Multi-line report joined with "\n".
 */
function formatConsoleReport(result, options = {}) {
  const lines = [];
  const { includeEvidence = true } = options;
  // Green check for pass, red cross for fail (raw ANSI escapes).
  const status = result.passed ? "\u2713 PASSED" : "\u2717 FAILED";
  const statusColor = result.passed ? "\x1B[32m" : "\x1B[31m";
  const reset = "\x1B[0m";
  lines.push("");
  lines.push("\u2550".repeat(60));
  lines.push(`${statusColor}${status}${reset} - Overall Score: ${result.overallScore}/100`);
  lines.push("\u2550".repeat(60));
  lines.push("");
  lines.push(`Summary: ${result.summary}`);
  lines.push(`Duration: ${result.duration}ms`);
  lines.push("");
  lines.push("Scores:");
  // One row per metric: padded name, 20-char bar, numeric score.
  for (const [name, score] of Object.entries(result.scores)) {
    const bar = createProgressBar(score, 20);
    const formattedName = formatScoreName(name);
    lines.push(` ${formattedName.padEnd(20)} ${bar} ${score}/100`);
  }
  if (includeEvidence && result.evidence.length > 0) {
    lines.push("");
    lines.push("Evidence:");
    for (const evidence of result.evidence) {
      lines.push(` [${evidence.criterion}] (${evidence.score}/100)`);
      lines.push(` ${evidence.reasoning}`);
      // Cap examples at 3 per criterion to keep the report short.
      if (evidence.examples && evidence.examples.length > 0) {
        for (const example of evidence.examples.slice(0, 3)) {
          lines.push(` - ${example}`);
        }
      }
    }
  }
  lines.push("");
  lines.push("\u2500".repeat(60));
  return lines.join("\n");
}
|
|
169
|
+
/**
 * Render a fixed-width unicode progress bar for a 0-100 value, e.g. [█████░░░░░].
 *
 * Fix: values outside 0-100 previously produced a filled count below 0 or
 * above `width`, so String.prototype.repeat received a negative argument and
 * threw a RangeError. The filled width is now clamped to [0, width].
 *
 * @param {number} value - Score on a 0-100 scale (out-of-range values are clamped).
 * @param {number} width - Total number of bar cells.
 * @returns {string} Bracketed bar of exactly `width` cells.
 */
function createProgressBar(value, width) {
  const filled = Math.min(width, Math.max(0, Math.round(value / 100 * width)));
  const empty = width - filled;
  return `[${"\u2588".repeat(filled)}${"\u2591".repeat(empty)}]`;
}
|
|
174
|
+
/**
 * Turn a camelCase metric key into a display label,
 * e.g. "entityCoverage" -> "Entity Coverage".
 *
 * @param {string} name - camelCase identifier.
 * @returns {string} Space-separated, leading-capitalized label.
 */
function formatScoreName(name) {
  const spaced = name.replace(/([A-Z])/g, " $1");
  const capitalized = spaced.replace(/^./, (first) => first.toUpperCase());
  return capitalized.trim();
}
|
|
177
|
+
/**
 * Serialize an eval result to pretty-printed JSON. The well-known fields are
 * emitted first in a fixed order; evidence is included unless disabled; any
 * extra fields on the result are appended, skipping bulky raw data unless
 * `includeRawData` is set.
 *
 * @param {object} result - Eval result object.
 * @param {object} [options] - { includeEvidence?: boolean, includeRawData?: boolean }.
 * @returns {string} JSON string (2-space indent).
 */
function formatJsonReport(result, options = {}) {
  const { includeEvidence = true, includeRawData = false } = options;
  const knownKeys = ["passed", "overallScore", "scores", "evidence", "summary", "duration"];
  const output = {
    passed: result.passed,
    overallScore: result.overallScore,
    scores: result.scores,
    summary: result.summary,
    duration: result.duration
  };
  if (includeEvidence) {
    output.evidence = result.evidence;
  }
  for (const [key, value] of Object.entries(result)) {
    if (knownKeys.includes(key)) continue;
    if (!includeRawData && isRawData(value)) continue;
    output[key] = value;
  }
  return JSON.stringify(output, null, 2);
}
|
|
196
|
+
/**
 * Heuristic for "bulky raw data" that should be dropped from compact JSON
 * reports: arrays longer than 10 items, or objects with more than 20 keys.
 *
 * @param {*} value - Any candidate value.
 * @returns {boolean} True when the value looks like raw bulk data.
 */
function isRawData(value) {
  const isBigArray = Array.isArray(value) && value.length > 10;
  if (isBigArray) {
    return true;
  }
  const isObject = value !== null && typeof value === "object";
  return isObject && Object.keys(value).length > 20;
}
|
|
204
|
+
/**
 * Produce a one-line English summary of a metric-name -> score map: an
 * overall verdict based on the unweighted average, plus lists of strong and
 * weak metrics.
 *
 * Fix: an empty `scores` object previously divided by zero, yielding a NaN
 * average that fell through to the "below acceptable thresholds" branch and
 * misreported missing data as failing performance. It now returns an
 * explicit "No scores available." message.
 *
 * @param {Record<string, number>} scores - Metric name -> 0-100 score.
 * @param {{good: number, acceptable: number}} [thresholds] - Verdict cutoffs.
 * @returns {string} Sentence-joined summary ending with a period.
 */
function generateSummary(scores, thresholds = { good: 80, acceptable: 60 }) {
  const entries = Object.entries(scores);
  if (entries.length === 0) {
    return "No scores available.";
  }
  const avgScore = entries.reduce((sum, [, score]) => sum + score, 0) / entries.length;
  const goodMetrics = entries.filter(([, score]) => score >= thresholds.good).map(([name]) => name);
  const poorMetrics = entries.filter(([, score]) => score < thresholds.acceptable).map(([name]) => name);
  const parts = [];
  if (avgScore >= thresholds.good) {
    parts.push("Strong overall performance");
  } else if (avgScore >= thresholds.acceptable) {
    parts.push("Acceptable performance with room for improvement");
  } else {
    parts.push("Performance below acceptable thresholds");
  }
  if (goodMetrics.length > 0) {
    parts.push(`Strong: ${goodMetrics.map(formatScoreName).join(", ")}`);
  }
  if (poorMetrics.length > 0) {
    parts.push(`Needs improvement: ${poorMetrics.map(formatScoreName).join(", ")}`);
  }
  return parts.join(". ") + ".";
}
|
|
225
|
+
|
|
226
|
+
// src/introspection/index.ts
|
|
227
|
+
/**
 * Top-level introspection eval: queries a Pinecone assistant about itself
 * and scores how well its answers line up with the supplied manifest across
 * four dimensions (entity coverage, slot accuracy, scope precision,
 * capability match).
 *
 * @param {object} config - { assistantName, manifest, pineconeApiKey?, openaiApiKey?, groundTruth? }.
 * @returns {Promise<object>} Eval result with passed/overallScore/scores/evidence/details.
 * @throws {Error} When no Pinecone API key is available.
 */
async function evaluateIntrospection(config) {
  const startTime = Date.now();
  // Explicit key wins over the environment; fail fast when neither is set.
  const apiKey = config.pineconeApiKey || process.env.PINECONE_API_KEY;
  if (!apiKey) {
    throw new Error("PINECONE_API_KEY is required for introspection eval");
  }
  const pinecone$1 = new pinecone.Pinecone({ apiKey });
  const assistant = pinecone$1.assistant(config.assistantName);
  // The four sub-evaluations are independent of each other, so run them
  // in parallel.
  const [entityResult, slotResult, scopeResult, capabilityResult] = await Promise.all([
    evaluateEntityCoverage(assistant, config.manifest, config),
    evaluateSlotAccuracy(assistant, config.manifest, config),
    evaluateScopePrecision(assistant, config.manifest, config),
    evaluateCapabilityMatch(assistant, config.manifest, config)
  ]);
  const scores = {
    entityCoverage: entityResult.score,
    slotAccuracy: slotResult.score,
    scopePrecision: scopeResult.score,
    capabilityMatch: capabilityResult.score
  };
  // Fixed weights summing to 1.0; slot accuracy is weighted heaviest.
  const overallScore = calculateWeightedScore([
    { score: scores.entityCoverage, weight: 0.25 },
    { score: scores.slotAccuracy, weight: 0.3 },
    { score: scores.scopePrecision, weight: 0.25 },
    { score: scores.capabilityMatch, weight: 0.2 }
  ]);
  const evidence = [
    ...entityResult.evidence,
    ...slotResult.evidence,
    ...scopeResult.evidence,
    ...capabilityResult.evidence
  ];
  return {
    // 70 is the pass threshold for the combined introspection score.
    passed: overallScore >= 70,
    overallScore,
    scores,
    evidence,
    summary: generateSummary(scores),
    duration: Date.now() - startTime,
    details: {
      missingEntities: entityResult.missing,
      incorrectSlots: slotResult.incorrect,
      scopeMisclassifications: scopeResult.misclassified,
      mismatchedCapabilities: capabilityResult.mismatched
    },
    manifest: config.manifest
  };
}
|
|
275
|
+
/**
 * Ask the assistant open-ended discovery questions, extract the entities it
 * claims to know about, and score what fraction of those discovered entities
 * appear in the manifest.
 *
 * NOTE(review): despite the names, `missing` holds entities the assistant
 * mentioned that are NOT in the manifest (extras), and the score is the
 * fraction of discovered entities matched by the manifest — a precision-like
 * metric rather than coverage of the manifest. Confirm this is intended.
 *
 * @param {object} assistant - Pinecone assistant handle.
 * @param {object} manifest - KB manifest (slots/domain/capabilities).
 * @param {object} config - { openaiApiKey?, groundTruth? }.
 * @returns {Promise<object>} { score, evidence, found, missing }.
 */
async function evaluateEntityCoverage(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const manifestEntities = extractEntitiesFromManifest(manifest);
  // Generic prompts used to get the assistant to enumerate its own content.
  const discoveryQueries = [
    "What are the main topics you can help with?",
    "What products or entities do you have information about?",
    "List the categories of information you contain."
  ];
  const discoveredEntities = /* @__PURE__ */ new Set();
  for (const query of discoveryQueries) {
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: query }]
      });
      const entities = await extractEntitiesWithLLM(grader, response.message?.content || "");
      // Lowercase for case-insensitive matching against the manifest below.
      entities.forEach((e) => discoveredEntities.add(e.toLowerCase()));
    } catch {
      // Best-effort discovery: a failed chat just contributes no entities.
    }
  }
  // Ground-truth expected entities (when provided) are treated as discovered.
  if (config.groundTruth) {
    for (const gt of config.groundTruth) {
      if (gt.expectedEntities) {
        gt.expectedEntities.forEach((e) => discoveredEntities.add(e.toLowerCase()));
      }
    }
  }
  const found = [];
  const missing = [];
  for (const entity of discoveredEntities) {
    // Fuzzy match: either string containing the other counts as a hit.
    if (manifestEntities.some((me) => me.toLowerCase().includes(entity) || entity.includes(me.toLowerCase()))) {
      found.push(entity);
    } else {
      missing.push(entity);
    }
  }
  // Nothing discovered at all scores a vacuous 100.
  const score = discoveredEntities.size === 0 ? 100 : normalizeScore(found.length / discoveredEntities.size * 100);
  return {
    score,
    evidence: [
      {
        criterion: "entityCoverage",
        score,
        reasoning: `Found ${found.length}/${discoveredEntities.size} expected entities in the manifest.`,
        examples: missing.slice(0, 5)
      }
    ],
    found,
    missing
  };
}
|
|
325
|
+
/**
 * For each slot declared in the manifest, ask the assistant whether that
 * slot matters for its domain and have the LLM grader judge whether the
 * slot looks relevant to the KB content. Score = fraction of slots judged
 * relevant (grade >= 60).
 *
 * @param {object} assistant - Pinecone assistant handle.
 * @param {object} manifest - KB manifest; `slots` and `domain` are read.
 * @param {object} config - { openaiApiKey? }.
 * @returns {Promise<object>} { score, evidence, correct, incorrect }.
 */
async function evaluateSlotAccuracy(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const slots = manifest.slots || [];
  if (slots.length === 0) {
    return {
      score: 50,
      // Penalize but don't fail for no slots
      evidence: [
        {
          criterion: "slotAccuracy",
          score: 50,
          reasoning: "No slots defined in manifest. Consider adding slots for common query parameters."
        }
      ],
      correct: [],
      incorrect: []
    };
  }
  const correct = [];
  const incorrect = [];
  for (const slot of slots) {
    const testQuery = `To answer questions about ${manifest.domain || "this topic"}, do I need to know the ${slot.name}?`;
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: testQuery }]
      });
      const evaluation = await grader.grade(
        `Slot: ${slot.name}
Description: ${slot.description || "N/A"}
KB Response: ${response.message?.content}`,
        "slot relevance",
        "Score 100 if the slot seems relevant to the KB content, 0 if completely irrelevant."
      );
      // 60 is the relevance cutoff for counting a slot as correct.
      if (evaluation.score >= 60) {
        correct.push(slot.name);
      } else {
        incorrect.push(slot.name);
      }
    } catch {
      // Benefit of the doubt: a chat/grading failure counts the slot as correct.
      correct.push(slot.name);
    }
  }
  const score = normalizeScore(correct.length / slots.length * 100);
  return {
    score,
    evidence: [
      {
        criterion: "slotAccuracy",
        score,
        reasoning: `${correct.length}/${slots.length} slots appear relevant to the KB content.`,
        examples: incorrect
      }
    ],
    correct,
    incorrect
  };
}
|
|
382
|
+
/**
 * Test whether the assistant answers in-scope queries substantively and
 * declines out-of-scope queries, using the manifest's scope examples plus
 * any ground-truth cases. A case is classified "in scope" when the grader's
 * answerability score is >= 50.
 *
 * Fix: only the first 10 test cases are actually classified, but the score
 * (and the reported totals) previously divided by the FULL testCases.length,
 * unfairly capping the score whenever more than 10 scope examples exist.
 * Score and totals now use the number of cases actually sampled.
 *
 * @param {object} assistant - Pinecone assistant handle.
 * @param {object} manifest - KB manifest; `scope` is read.
 * @param {object} config - { openaiApiKey?, groundTruth? }.
 * @returns {Promise<object>} { score, evidence, correctClassifications, totalClassifications, misclassified }.
 */
async function evaluateScopePrecision(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const scope = manifest.scope;
  if (!scope) {
    return {
      score: 60,
      // Penalize but don't fail
      evidence: [
        {
          criterion: "scopePrecision",
          score: 60,
          reasoning: "No scope definition in manifest. Consider defining in-scope and out-of-scope examples."
        }
      ],
      correctClassifications: 0,
      totalClassifications: 0,
      misclassified: []
    };
  }
  // Build labelled test cases from scope examples and ground truth.
  const testCases = [];
  if (scope.inScopeExamples) {
    scope.inScopeExamples.forEach((q) => testCases.push({ query: q, expectedInScope: true }));
  }
  if (scope.outOfScopeExamples) {
    scope.outOfScopeExamples.forEach((q) => testCases.push({ query: q, expectedInScope: false }));
  }
  if (config.groundTruth) {
    config.groundTruth.filter((gt) => gt.shouldBeInScope !== void 0).forEach((gt) => testCases.push({ query: gt.query, expectedInScope: gt.shouldBeInScope }));
  }
  if (testCases.length === 0) {
    return {
      score: 70,
      evidence: [
        {
          criterion: "scopePrecision",
          score: 70,
          reasoning: "No scope test cases available. Add in-scope and out-of-scope examples to test."
        }
      ],
      correctClassifications: 0,
      totalClassifications: 0,
      misclassified: []
    };
  }
  // Cap API/LLM cost: classify at most 10 cases, and score against the
  // number actually classified (previously divided by testCases.length).
  const sampledCases = testCases.slice(0, 10);
  let correctClassifications = 0;
  const misclassified = [];
  for (const testCase of sampledCases) {
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: testCase.query }]
      });
      const evaluation = await grader.grade(
        `Query: ${testCase.query}
KB Response: ${response.message?.content}`,
        "answerability",
        "Score 100 if the KB provided a substantive, on-topic answer. Score 0 if it said it cannot help or gave an off-topic response."
      );
      const actuallyInScope = evaluation.score >= 50;
      if (actuallyInScope === testCase.expectedInScope) {
        correctClassifications++;
      } else {
        misclassified.push(`"${testCase.query}" (expected ${testCase.expectedInScope ? "in-scope" : "out-of-scope"})`);
      }
    } catch {
      // A failed chat/grade is skipped: counted neither correct nor misclassified.
    }
  }
  const score = normalizeScore(correctClassifications / sampledCases.length * 100);
  return {
    score,
    evidence: [
      {
        criterion: "scopePrecision",
        score,
        reasoning: `${correctClassifications}/${sampledCases.length} scope classifications were correct.`,
        examples: misclassified.slice(0, 3)
      }
    ],
    correctClassifications,
    totalClassifications: sampledCases.length,
    misclassified
  };
}
|
|
464
|
+
/**
 * Check whether the capabilities the manifest claims are actually backed by
 * KB content: ask the assistant to help with each capability and have the
 * grader judge fulfillment. A capability counts as matched when the grade
 * is >= 60; chat/grade failures are given the benefit of the doubt.
 *
 * Fix: only the first 5 capabilities are actually tested, but the score
 * (and the reasoning text) previously divided by the FULL capability count,
 * capping the score at 5/N whenever more than 5 capabilities are declared.
 * Score and reporting now use the number of capabilities actually sampled.
 *
 * @param {object} assistant - Pinecone assistant handle.
 * @param {object} manifest - KB manifest; `capabilities` is read (string or { text }).
 * @param {object} config - { openaiApiKey? }.
 * @returns {Promise<object>} { score, evidence, matched, mismatched }.
 */
async function evaluateCapabilityMatch(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  // Capabilities may be plain strings or { text } objects; normalize to text.
  const capabilities = (manifest.capabilities || []).map(
    (c) => typeof c === "string" ? c : c.text
  );
  if (capabilities.length === 0) {
    return {
      score: 50,
      evidence: [
        {
          criterion: "capabilityMatch",
          score: 50,
          reasoning: "No capabilities defined in manifest."
        }
      ],
      matched: [],
      mismatched: []
    };
  }
  const matched = [];
  const mismatched = [];
  // Cap API/LLM cost: test at most 5 capabilities, and score against the
  // number actually tested (previously divided by capabilities.length).
  const sampledCapabilities = capabilities.slice(0, 5);
  for (const capability of sampledCapabilities) {
    const testQuery = `Can you help me with: ${capability}`;
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: testQuery }]
      });
      const evaluation = await grader.grade(
        `Capability: ${capability}
KB Response: ${response.message?.content}`,
        "capability fulfillment",
        "Score 100 if the KB demonstrated it can help with this capability. Score 0 if it cannot."
      );
      if (evaluation.score >= 60) {
        matched.push(capability);
      } else {
        mismatched.push(capability);
      }
    } catch {
      // Benefit of the doubt: a chat/grading failure counts as matched.
      matched.push(capability);
    }
  }
  const score = normalizeScore(matched.length / sampledCapabilities.length * 100);
  return {
    score,
    evidence: [
      {
        criterion: "capabilityMatch",
        score,
        reasoning: `${matched.length}/${sampledCapabilities.length} stated capabilities match actual KB content.`,
        examples: mismatched
      }
    ],
    matched,
    mismatched
  };
}
|
|
521
|
+
/**
 * Collect a deduplicated list of entity strings from a manifest: all slot
 * example values, the domain name, and capitalized words longer than 3
 * characters from capability text (lowercased).
 *
 * @param {object} manifest - { slots?, domain?, capabilities? }.
 * @returns {string[]} Unique entities in first-seen order.
 */
function extractEntitiesFromManifest(manifest) {
  const entities = /* @__PURE__ */ new Set();
  if (manifest.slots) {
    for (const slot of manifest.slots) {
      if (slot.examples) {
        for (const example of slot.examples) {
          entities.add(example);
        }
      }
    }
  }
  if (manifest.domain) {
    entities.add(manifest.domain);
  }
  if (manifest.capabilities) {
    for (const capability of manifest.capabilities) {
      const text = typeof capability === "string" ? capability : capability.text;
      for (const word of text.split(/\s+/)) {
        // Keep only capitalized words longer than 3 chars (likely proper nouns).
        if (word.length > 3 && /^[A-Z]/.test(word)) {
          entities.add(word.toLowerCase());
        }
      }
    }
  }
  return [...entities];
}
|
|
542
|
+
/**
 * Use the LLM grader as an entity extractor: the rubric asks for a
 * comma-separated entity list, which comes back in the grade's `reasoning`
 * field and is split, trimmed, lowercased, and filtered to names longer
 * than 2 characters. Any failure yields an empty list.
 *
 * @param {object} grader - Grader with an async `grade(text, criterion, rubric)`.
 * @param {string} text - Text to extract entities from.
 * @returns {Promise<string[]>} Lowercased entity names (possibly empty).
 */
async function extractEntitiesWithLLM(grader, text) {
  const rubric = "List the main entities (products, topics, categories) mentioned. Return just the entity names separated by commas.";
  try {
    const { reasoning } = await grader.grade(text, "entity extraction", rubric);
    const names = reasoning.split(",");
    return names
      .map((name) => name.trim().toLowerCase())
      .filter((name) => name.length > 2);
  } catch {
    return [];
  }
}
|
|
554
|
+
/**
 * Retrieval eval: run each test query through the assistant's context API,
 * grade the retrieved chunks, and aggregate relevance / recall / precision /
 * noise into a weighted overall score.
 *
 * @param {object} config - { assistantName, queries, topK?, pineconeApiKey?, openaiApiKey?, verbose? }.
 * @returns {Promise<object>} { passed, overallScore, scores, evidence, summary, duration, queryResults }.
 * @throws {Error} When no queries are given or no Pinecone API key is available.
 */
async function evaluateRetrieval(config) {
  const startTime = Date.now();
  if (config.queries.length === 0) {
    throw new Error("At least one query is required for retrieval eval");
  }
  // Explicit key wins over the environment; fail fast when neither is set.
  const apiKey = config.pineconeApiKey || process.env.PINECONE_API_KEY;
  if (!apiKey) {
    throw new Error("PINECONE_API_KEY is required for retrieval eval");
  }
  const pinecone$1 = new pinecone.Pinecone({ apiKey });
  const assistant = pinecone$1.assistant(config.assistantName);
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const topK = config.topK || 5;
  // Queries are evaluated sequentially (each one issues several LLM calls).
  const queryResults = [];
  for (const testQuery of config.queries) {
    const result = await evaluateQueryRetrieval(
      assistant,
      grader,
      testQuery,
      topK,
      config.verbose
    );
    queryResults.push(result);
  }
  // Relevance: mean of per-query average chunk relevance grades.
  const relevanceScores = queryResults.map((r) => r.relevanceScore);
  const relevance = normalizeScore(average(relevanceScores));
  // Recall: expected topics actually found across all queries.
  const totalExpected = queryResults.reduce(
    (sum, r) => sum + r.foundTopics.length + r.missingTopics.length,
    0
  );
  const totalFound = queryResults.reduce((sum, r) => sum + r.foundTopics.length, 0);
  const recall = totalExpected === 0 ? 100 : normalizeScore(totalFound / totalExpected * 100);
  // Precision: chunks whose relevance grade reached the 50-point cutoff.
  const allChunks = queryResults.flatMap((r) => r.chunks);
  const relevantChunks = allChunks.filter((c) => c.relevanceGrade >= 50);
  const precision = allChunks.length === 0 ? 100 : normalizeScore(relevantChunks.length / allChunks.length * 100);
  // Noise: declared-irrelevant topics that nonetheless showed up.
  const totalIrrelevant = queryResults.reduce(
    (sum, r) => sum + r.noiseTopics.length,
    0
  );
  const totalIrrelevantExpected = config.queries.reduce(
    (sum, q) => sum + (q.irrelevantTopics?.length || 0),
    0
  );
  const noiseRatio = totalIrrelevantExpected === 0 ? 0 : normalizeScore(totalIrrelevant / totalIrrelevantExpected * 100);
  const scores = { relevance, recall, precision, noiseRatio };
  const overallScore = calculateWeightedScore([
    { score: relevance, weight: 0.35 },
    { score: recall, weight: 0.3 },
    { score: precision, weight: 0.25 },
    { score: 100 - noiseRatio, weight: 0.1 }
    // Invert noise ratio
  ]);
  const evidence = [
    {
      criterion: "relevance",
      score: relevance,
      reasoning: `Average relevance of retrieved chunks across ${queryResults.length} queries.`
    },
    {
      criterion: "recall",
      score: recall,
      reasoning: `Found ${totalFound}/${totalExpected} expected topics in retrieved content.`,
      examples: queryResults.flatMap((r) => r.missingTopics).slice(0, 3)
    },
    {
      criterion: "precision",
      score: precision,
      reasoning: `${relevantChunks.length}/${allChunks.length} retrieved chunks were relevant.`
    },
    {
      criterion: "noiseRatio",
      score: noiseRatio,
      reasoning: `${totalIrrelevant} irrelevant topics appeared in retrieved content.`,
      examples: queryResults.flatMap((r) => r.noiseTopics).slice(0, 3)
    }
  ];
  return {
    // Passing requires both a 70+ overall score and noise at or under 30.
    passed: overallScore >= 70 && noiseRatio <= 30,
    overallScore,
    scores,
    evidence,
    // noiseRatio is inverted for the summary so higher is better everywhere.
    summary: generateSummary({ relevance, recall, precision, noiseRatio: 100 - noiseRatio }),
    duration: Date.now() - startTime,
    queryResults
  };
}
|
|
640
|
+
/**
 * Evaluate retrieval for a single test query: fetch topK context snippets,
 * grade each snippet's relevance with the LLM grader, and check expected /
 * irrelevant topics against the concatenated retrieved text.
 *
 * @param {object} assistant - Pinecone assistant handle (context API).
 * @param {object} grader - LLM grader (gradeRelevance is used).
 * @param {object} testQuery - { query, expectedTopics?, irrelevantTopics? }.
 * @param {number} topK - Number of snippets to request.
 * @param {boolean} verbose - When true, log per-query details to the console.
 * @returns {Promise<object>} { query, chunks, relevanceScore, foundTopics, missingTopics, noiseTopics }.
 */
async function evaluateQueryRetrieval(assistant, grader, testQuery, topK, verbose) {
  let contextResult;
  try {
    contextResult = await assistant.context({
      query: testQuery.query,
      topK
    });
  } catch (error) {
    // A failed retrieval scores 0 and reports every expected topic missing.
    return {
      query: testQuery.query,
      chunks: [],
      relevanceScore: 0,
      foundTopics: [],
      missingTopics: testQuery.expectedTopics || [],
      noiseTopics: []
    };
  }
  const snippets = contextResult.snippets || [];
  const chunks = [];
  // Grade snippets sequentially; each grade is an LLM call.
  for (const snippet of snippets) {
    const relevanceResult = await grader.gradeRelevance(testQuery.query, snippet.content);
    const ref = snippet.reference;
    // Reference shape varies; fall back through file name, then plain name.
    const sourceFile = ref?.file?.name || ref?.name || "unknown";
    chunks.push({
      content: snippet.content.slice(0, 500),
      // Truncate for storage
      score: snippet.score,
      sourceFile,
      relevanceGrade: relevanceResult.score,
      reasoning: relevanceResult.reasoning
    });
  }
  const relevanceScore = chunks.length === 0 ? 0 : average(chunks.map((c) => c.relevanceGrade));
  // Topic matching is a case-insensitive substring check over the truncated
  // chunk text joined together.
  const allContent = chunks.map((c) => c.content).join(" ").toLowerCase();
  const topicResult = checkTopics(
    allContent,
    testQuery.expectedTopics || [],
    testQuery.irrelevantTopics || []
  );
  if (verbose) {
    console.log(`Query: "${testQuery.query}"`);
    console.log(` Relevance: ${relevanceScore.toFixed(1)}/100`);
    console.log(` Topics found: ${topicResult.found.join(", ") || "none"}`);
    console.log(` Topics missing: ${topicResult.missing.join(", ") || "none"}`);
    console.log(` Noise: ${topicResult.noise.join(", ") || "none"}`);
  }
  return {
    query: testQuery.query,
    chunks,
    relevanceScore,
    foundTopics: topicResult.found,
    missingTopics: topicResult.missing,
    noiseTopics: topicResult.noise
  };
}
|
|
695
|
+
/**
 * Case-insensitive-by-convention topic check: `content` is expected to be
 * lowercased already; each topic is lowercased before the substring test.
 *
 * @param {string} content - Lowercased text to search in.
 * @param {string[]} expectedTopics - Topics that should appear.
 * @param {string[]} irrelevantTopics - Topics that should NOT appear.
 * @returns {{found: string[], missing: string[], noise: string[]}} Topics kept in their original casing.
 */
function checkTopics(content, expectedTopics, irrelevantTopics) {
  const found = [];
  const missing = [];
  for (const topic of expectedTopics) {
    const target = content.includes(topic.toLowerCase()) ? found : missing;
    target.push(topic);
  }
  const noise = irrelevantTopics.filter((topic) => content.includes(topic.toLowerCase()));
  return { found, missing, noise };
}
|
|
713
|
+
|
|
714
|
+
// src/agent/index.ts
|
|
715
|
+
/**
 * Run every configured scenario against the agent and aggregate the results
 * into a single weighted report.
 *
 * Scenarios execute sequentially (each run shares the grader and may hit
 * rate-limited LLM endpoints). Accuracy is the pass rate across scenarios;
 * relevance/completeness/helpfulness are averages of the per-scenario grader
 * evidence with matching criterion names, or 0 when no scenario was graded.
 *
 * @param {object} config - Agent eval config (scenarios, agentEndpoint,
 *   openaiApiKey, optional graderConfig/maxTurns/timeout).
 * @returns {Promise<object>} Report with passed flag (overall >= 70), scores,
 *   evidence, summary, duration, and per-scenario results.
 * @throws {Error} If `config.scenarios` is empty.
 */
async function evaluateAgent(config) {
  const begunAt = Date.now();
  if (config.scenarios.length === 0) {
    throw new Error("At least one scenario is required for agent eval");
  }
  const grader = createLLMGrader({
    openaiApiKey: config.openaiApiKey,
    model: config.graderConfig?.model,
    temperature: config.graderConfig?.temperature
  });
  // Sequential on purpose: scenario runs call out to the agent and grader.
  const results = [];
  for (const scenario of config.scenarios) {
    results.push(await runScenario(scenario, config, grader));
  }
  const passedCount = results.filter((r) => r.passed).length;
  const accuracy = normalizeScore((passedCount / results.length) * 100);
  // Only scenarios that produced grader evidence contribute to LLM criteria.
  const graded = results.filter((r) => r.evaluation.evidence.length > 0);
  const scoresFor = (criterion) =>
    graded.flatMap(
      (r) => r.evaluation.evidence.filter((e) => e.criterion === criterion).map((e) => e.score)
    );
  const avgOrZero = (xs) => (xs.length > 0 ? normalizeScore(average(xs)) : 0);
  const relevance = avgOrZero(scoresFor("relevance"));
  const completeness = avgOrZero(scoresFor("completeness"));
  const helpfulness = avgOrZero(scoresFor("helpfulness"));
  const scores = { accuracy, relevance, completeness, helpfulness };
  const overallScore = calculateWeightedScore([
    { score: accuracy, weight: 0.3 },
    { score: relevance, weight: 0.25 },
    { score: completeness, weight: 0.25 },
    { score: helpfulness, weight: 0.2 }
  ]);
  const evidence = [
    {
      criterion: "accuracy",
      score: accuracy,
      reasoning: `${passedCount}/${results.length} scenarios passed.`,
      // Name up to three failing scenarios as concrete examples.
      examples: results.filter((r) => !r.passed).map((r) => r.scenario.name).slice(0, 3)
    },
    {
      criterion: "relevance",
      score: relevance,
      reasoning: `Average answer relevance across ${graded.length} graded scenarios.`
    },
    {
      criterion: "completeness",
      score: completeness,
      reasoning: `Average answer completeness across ${graded.length} graded scenarios.`
    },
    {
      criterion: "helpfulness",
      score: helpfulness,
      reasoning: `Average helpfulness across ${graded.length} graded scenarios.`
    }
  ];
  return {
    passed: overallScore >= 70,
    overallScore,
    scores,
    evidence,
    summary: generateSummary(scores),
    duration: Date.now() - begunAt,
    scenarioResults: results
  };
}
|
|
785
|
+
/**
 * Drive a single multi-turn conversation scenario against the agent, then
 * grade the final response.
 *
 * Loops up to maxTurns, sending the current user message, appending the
 * exchange to `conversation`, and threading context (previous context/intent
 * plus a role-tagged history) into the next call. The loop stops early on an
 * "answer", "blocked", or "out_of_scope" outcome, or on a "follow_up" with no
 * matching scripted reply in the scenario.
 *
 * @param {object} scenario - Scenario definition (initialQuery, optional
 *   maxTurns, expectedOutcome, followUpResponses, evaluation rules).
 * @param {object} config - Eval config (agentEndpoint, maxTurns, timeout).
 * @param {object} grader - LLM grader passed through to evaluateScenarioResult.
 * @returns {Promise<object>} Per-scenario result; on any thrown error a
 *   synthetic failed result with finalOutcome "error" is returned instead of
 *   rethrowing, so one bad scenario cannot abort the whole eval run.
 */
async function runScenario(scenario, config, grader) {
  const startTime = Date.now();
  // Scenario-level turn cap wins over config; default 5 turns, 60s timeout.
  const maxTurns = scenario.maxTurns || config.maxTurns || 5;
  const timeout = config.timeout || 6e4;
  const conversation = [];
  let currentMessage = scenario.initialQuery;
  let context = {
    sessionId: `eval_${Date.now()}`,
    conversationHistory: []
  };
  let lastResponse = null;
  let turn = 0;
  try {
    while (turn < maxTurns) {
      turn++;
      const response = await callAgent(config.agentEndpoint, currentMessage, context, timeout);
      lastResponse = response;
      conversation.push({
        turn,
        userMessage: currentMessage,
        agentResponse: response
      });
      // Rebuild context immutably: carry the agent's returned context/intent
      // forward and append this exchange to the running chat history.
      context = {
        ...context,
        previousContext: response.context,
        previousIntent: response.intent,
        conversationHistory: [
          ...context.conversationHistory || [],
          { role: "user", content: currentMessage },
          { role: "assistant", content: response.answer || response.followUpQuestion || "" }
        ]
      };
      if (response.outcome === "answer") {
        break;
      }
      if (response.outcome === "blocked" || response.outcome === "out_of_scope") {
        break;
      }
      if (response.outcome === "follow_up") {
        const followUpQuestion = response.followUpQuestion || "";
        const responseToFollowUp = generateFollowUpResponse(followUpQuestion, scenario);
        if (!responseToFollowUp) {
          // No scripted reply matches the agent's question; end the scenario.
          break;
        }
        currentMessage = responseToFollowUp;
      }
      // NOTE(review): any other outcome loops again with the SAME message,
      // bounded only by maxTurns — presumably intentional retry behavior.
    }
    const evaluation = await evaluateScenarioResult(
      scenario,
      lastResponse,
      conversation,
      grader
    );
    // Outcome check only applies when the scenario declares an expectation.
    const outcomeMatch = scenario.expectedOutcome ? lastResponse?.outcome === scenario.expectedOutcome : true;
    const passed = outcomeMatch && evaluation.passed;
    return {
      scenario,
      passed,
      turns: turn,
      finalOutcome: lastResponse?.outcome || "error",
      finalAnswer: lastResponse?.outcome === "answer" ? lastResponse.answer || null : null,
      evaluation,
      conversation,
      duration: Date.now() - startTime
    };
  } catch (error) {
    // Convert failures (agent unreachable, timeout abort, grader error) into
    // a failed result so the surrounding eval loop keeps going.
    return {
      scenario,
      passed: false,
      turns: turn,
      finalOutcome: "error",
      finalAnswer: null,
      evaluation: {
        passed: false,
        score: 0,
        evidence: [
          {
            criterion: "error",
            score: 0,
            reasoning: error instanceof Error ? error.message : String(error)
          }
        ]
      },
      conversation,
      duration: Date.now() - startTime,
      error: error instanceof Error ? error.message : String(error)
    };
  }
}
|
|
874
|
+
/**
 * Send one user message to the agent under test.
 *
 * Supports two endpoint kinds: a plain function (invoked in-process with
 * (message, context); the timeout does not apply) or a URL string POSTed to
 * via fetch with an AbortController-based timeout.
 *
 * @param {Function|string} endpoint - In-process agent function or HTTP URL.
 * @param {string} message - User message for this turn.
 * @param {object} context - Session state (sessionId, previousContext,
 *   previousIntent, conversationHistory).
 * @param {number} timeout - Milliseconds before the HTTP request is aborted.
 * @returns {Promise<object>} Normalized agent response.
 * @throws {Error} On non-2xx HTTP status, or an abort error on timeout.
 */
async function callAgent(endpoint, message, context, timeout) {
  // In-process agents are called directly, bypassing HTTP entirely.
  if (typeof endpoint === "function") {
    return endpoint(message, context);
  }
  const aborter = new AbortController();
  const timer = setTimeout(() => aborter.abort(), timeout);
  try {
    const payload = {
      message,
      session_id: context.sessionId,
      previous_context: context.previousContext,
      previous_intent: context.previousIntent,
      conversation_history: context.conversationHistory
    };
    const res = await fetch(endpoint, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload),
      signal: aborter.signal
    });
    if (!res.ok) {
      throw new Error(`Agent returned ${res.status}: ${res.statusText}`);
    }
    const data = await res.json();
    // Accept both camelCase and snake_case field names from the agent.
    return {
      outcome: data.outcome || "answer",
      answer: data.answer,
      followUpQuestion: data.followUpQuestion || data.follow_up_question,
      options: data.options,
      context: data.context,
      intent: data.intent,
      trace: data.trace,
      sessionId: data.session_id || data.sessionId
    };
  } finally {
    // Always release the timer, whether the request succeeded or threw.
    clearTimeout(timer);
  }
}
|
|
911
|
+
/**
 * Pick the scripted user reply for an agent follow-up question.
 *
 * Scans `scenario.followUpResponses` (pattern -> reply) in definition order
 * and returns the reply of the first pattern found, case-insensitively, as a
 * substring of the question.
 *
 * @param {string} question - The agent's follow-up question.
 * @param {object} scenario - Scenario possibly carrying followUpResponses.
 * @returns {string|null} The matching scripted reply, or null when the
 *   scenario has no responses or nothing matches.
 */
function generateFollowUpResponse(question, scenario) {
  const scripted = scenario.followUpResponses;
  if (!scripted) {
    return null;
  }
  const needle = question.toLowerCase();
  const hit = Object.entries(scripted).find(
    ([pattern]) => needle.includes(pattern.toLowerCase())
  );
  return hit ? hit[1] : null;
}
|
|
922
|
+
/**
 * Grade the final agent response for one scenario.
 *
 * Non-answer outcomes short-circuit: if the scenario expected a non-answer
 * outcome, pass/fail is a pure outcome comparison; otherwise it is an
 * automatic failure. Answers are scored as the plain average of: optional
 * mustContain / mustNotContain term checks, plus three LLM-graded criteria
 * (relevance, completeness, helpfulness), with a 70-point pass threshold.
 *
 * @param {object} scenario - Scenario with evaluation rules and initialQuery.
 * @param {object|null} response - Final agent response (may be null).
 * @param {Array} conversation - Full turn log (currently unused here).
 * @param {object} grader - LLM grader with an async grade(prompt, criterion,
 *   description) method returning { score, reasoning }.
 * @returns {Promise<{passed: boolean, score: number, evidence: Array}>}
 */
async function evaluateScenarioResult(scenario, response, conversation, grader) {
  const evidence = [];
  const gotAnswer = Boolean(response && response.outcome === "answer" && response.answer);
  if (!gotAnswer) {
    if (scenario.expectedOutcome && scenario.expectedOutcome !== "answer") {
      // A non-answer outcome was expected; grade purely on the outcome match.
      const matched = response?.outcome === scenario.expectedOutcome;
      const outcomeScore = matched ? 100 : 0;
      return {
        passed: matched,
        score: outcomeScore,
        evidence: [
          {
            criterion: "outcomeMatch",
            score: outcomeScore,
            reasoning: `Expected ${scenario.expectedOutcome}, got ${response?.outcome || "no response"}.`
          }
        ]
      };
    }
    // An answer was expected but never produced: automatic failure.
    return {
      passed: false,
      score: 0,
      evidence: [
        {
          criterion: "noAnswer",
          score: 0,
          reasoning: `Expected an answer but got ${response?.outcome || "no response"}.`
        }
      ]
    };
  }
  const answer = response.answer;
  const lowered = answer.toLowerCase();
  let scoreSum = 0;
  let scoreCount = 0;
  if (scenario.evaluation.mustContain) {
    const required = scenario.evaluation.mustContain;
    const present = required.filter((s) => lowered.includes(s.toLowerCase()));
    const absent = required.filter((s) => !lowered.includes(s.toLowerCase()));
    const termScore = normalizeScore((present.length / required.length) * 100);
    scoreSum += termScore;
    scoreCount++;
    evidence.push({
      criterion: "mustContain",
      score: termScore,
      reasoning: `Found ${present.length}/${required.length} required terms.`,
      examples: absent
    });
  }
  if (scenario.evaluation.mustNotContain) {
    const forbidden = scenario.evaluation.mustNotContain;
    const violations = forbidden.filter((s) => lowered.includes(s.toLowerCase()));
    const cleanScore = normalizeScore(((forbidden.length - violations.length) / forbidden.length) * 100);
    scoreSum += cleanScore;
    scoreCount++;
    evidence.push({
      criterion: "mustNotContain",
      score: cleanScore,
      reasoning: `Found ${violations.length} forbidden terms.`,
      examples: violations
    });
  }
  const gradingPrompt = `
Query: ${scenario.initialQuery}
Answer: ${answer}
${scenario.evaluation.rubric ? `Rubric: ${scenario.evaluation.rubric}` : ""}
`;
  // The three LLM criteria, graded sequentially in a fixed order so the
  // evidence array is deterministic.
  const llmCriteria = [
    ["relevance", "How relevant is the answer to the query? 100 = directly and completely addresses the query."],
    ["completeness", "How complete is the answer? 100 = fully addresses all aspects of the query."],
    ["helpfulness", "How helpful and actionable is the answer? 100 = provides clear, actionable guidance."]
  ];
  for (const [criterion, description] of llmCriteria) {
    const graded = await grader.grade(gradingPrompt, criterion, description);
    evidence.push({
      criterion,
      score: graded.score,
      reasoning: graded.reasoning
    });
    scoreSum += graded.score;
    scoreCount++;
  }
  const meanScore = scoreCount > 0 ? scoreSum / scoreCount : 0;
  return {
    passed: meanScore >= 70,
    score: normalizeScore(meanScore),
    evidence
  };
}
|
|
1031
|
+
|
|
1032
|
+
// Public API of the CommonJS bundle: scoring helpers, the LLM grader,
// the three evaluators (agent / introspection / retrieval), and reporters.
exports.average = average;
exports.calculatePercentage = calculatePercentage;
exports.calculateWeightedScore = calculateWeightedScore;
exports.createLLMGrader = createLLMGrader;
exports.evaluateAgent = evaluateAgent;
exports.evaluateIntrospection = evaluateIntrospection;
exports.evaluateRetrieval = evaluateRetrieval;
exports.formatConsoleReport = formatConsoleReport;
exports.formatJsonReport = formatJsonReport;
exports.gradeWithLLM = gradeWithLLM;
// NOTE(review): the source-map directive appears twice below — a bundler
// artifact; tooling uses the last occurrence. Left as generated.
//# sourceMappingURL=index.cjs.map
//# sourceMappingURL=index.cjs.map
|