@kat-ai/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,1043 @@
1
+ 'use strict';
2
+
3
+ var pinecone = require('@pinecone-database/pinecone');
4
+ var openai = require('@ai-sdk/openai');
5
+ var ai = require('ai');
6
+ var zod = require('zod');
7
+ var core = require('@kat/core');
8
+
9
+ // src/introspection/index.ts
10
// Structured-output schemas consumed by ai.generateObject in the LLM grader.
// Single-criterion grade: 0-100 score, free-text justification, and optional
// concrete examples that influenced the score.
var GradeSchema = zod.z.object({
  score: zod.z.number().min(0).max(100).describe("Score from 0-100"),
  reasoning: zod.z.string().describe("Explanation for the score"),
  examples: zod.z.array(zod.z.string()).optional().describe("Specific examples that influenced the score")
});
// Multi-criterion grade: one {criterion, score, reasoning} entry per criterion
// plus an overall narrative (overallReasoning is not surfaced by gradeMultiple).
var MultiCriteriaGradeSchema = zod.z.object({
  scores: zod.z.array(zod.z.object({
    criterion: zod.z.string(),
    score: zod.z.number().min(0).max(100),
    reasoning: zod.z.string()
  })),
  overallReasoning: zod.z.string()
});
23
/**
 * Build an LLM-as-judge grader backed by OpenAI structured output.
 *
 * config.openaiApiKey - API key; falls back to OPENAI_API_KEY env var (throws if neither is set).
 * config.model        - chat model id; falls back to the @kat/core default.
 * config.temperature  - sampling temperature; defaults to 0.1 for reproducible grades.
 *
 * Returns an object with three async methods:
 *   grade(content, criterion, rubric?) -> {score, reasoning, examples?}
 *   gradeMultiple(content, criteria)   -> [{criterion, score, reasoning}]
 *   gradeRelevance(query, content)     -> {score, reasoning}
 */
function createLLMGrader(config = {}) {
  const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error("OPENAI_API_KEY is required for LLM grading");
  }
  const openai$1 = openai.createOpenAI({ apiKey });
  const model = config.model || core.resolveDefaultOpenAiChatModelId();
  // ?? (not ||) so an explicit temperature of 0 is respected.
  const temperature = config.temperature ?? 0.1;
  return {
    // Grade `content` on one named criterion; `rubric` is optional extra
    // guidance inlined into the prompt when provided.
    async grade(content, criterion, rubric) {
      const prompt = `You are an expert evaluator. Grade the following content on the criterion "${criterion}".

${rubric ? `Rubric: ${rubric}
` : ""}
Content to evaluate:
"""
${content}
"""

Provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      return result.object;
    },
    // Grade on several weighted criteria in a single call; resolves to the
    // per-criterion scores (the schema's overallReasoning is dropped).
    async gradeMultiple(content, criteria) {
      const criteriaDescription = criteria.map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`).join("\n");
      const prompt = `You are an expert evaluator. Grade the following content on multiple criteria.

Criteria:
${criteriaDescription}

Content to evaluate:
"""
${content}
"""

For each criterion, provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: MultiCriteriaGradeSchema,
        prompt,
        temperature
      });
      return result.object.scores.map((s) => ({
        criterion: s.criterion,
        score: s.score,
        reasoning: s.reasoning
      }));
    },
    // Grade how relevant `content` is to `query` on a 0-100 scale.
    async gradeRelevance(query, content) {
      const prompt = `You are an expert evaluator. Grade how relevant the following content is to the given query.

Query: "${query}"

Content:
"""
${content}
"""

A score of 100 means the content directly and completely answers the query.
A score of 0 means the content is completely irrelevant.

Provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      return {
        score: result.object.score,
        reasoning: result.object.reasoning
      };
    }
  };
}
103
/**
 * One-shot convenience wrapper: build a grader from `config` and grade
 * `content` on a single `criterion` (no rubric).
 */
async function gradeWithLLM(content, criterion, config = {}) {
  return createLLMGrader(config).grade(content, criterion);
}
107
+
108
+ // src/utils/metrics.ts
109
/**
 * Weighted mean of {score, weight} entries, rounded to the nearest integer.
 * Returns 0 for an empty list or when the weights sum to zero.
 */
function calculateWeightedScore(scores) {
  if (!scores.length) return 0;
  let weightTotal = 0;
  let weightedTotal = 0;
  for (const { score, weight } of scores) {
    weightTotal += weight;
    weightedTotal += score * weight;
  }
  if (weightTotal === 0) return 0;
  return Math.round(weightedTotal / weightTotal);
}
116
/**
 * Percentage of `expected` covered by `found`, rounded to an integer.
 * Vacuously 100 when nothing was expected.
 */
function calculatePercentage(found, expected) {
  return expected === 0 ? 100 : Math.round((found / expected) * 100);
}
120
/** Arithmetic mean of a list of numbers; 0 for an empty list. */
function average(numbers) {
  if (numbers.length === 0) return 0;
  let total = 0;
  for (const n of numbers) {
    total += n;
  }
  return total / numbers.length;
}
124
/** Constrain `value` to the inclusive [min, max] range. */
function clamp(value, min, max) {
  const upperBounded = Math.min(max, value);
  return Math.max(min, upperBounded);
}
127
/** Round and clamp a raw score into the canonical 0-100 integer range. */
function normalizeScore(score) {
  const rounded = Math.round(score);
  return Math.max(0, Math.min(100, rounded));
}
130
+
131
+ // src/utils/reporters.ts
132
/**
 * Render an eval result as a human-readable ANSI console report:
 * pass/fail banner, summary line, per-metric progress bars, and (optionally)
 * the grader's evidence entries with up to 3 examples each.
 *
 * result  - eval result with passed/overallScore/scores/summary/duration/evidence
 * options.includeEvidence - toggle the evidence section (default true)
 * Returns the report as a single newline-joined string.
 */
function formatConsoleReport(result, options = {}) {
  const lines = [];
  const { includeEvidence = true } = options;
  const status = result.passed ? "\u2713 PASSED" : "\u2717 FAILED";
  // ANSI green for pass, red for fail; reset restores the default color.
  const statusColor = result.passed ? "\x1B[32m" : "\x1B[31m";
  const reset = "\x1B[0m";
  lines.push("");
  lines.push("\u2550".repeat(60));
  lines.push(`${statusColor}${status}${reset} - Overall Score: ${result.overallScore}/100`);
  lines.push("\u2550".repeat(60));
  lines.push("");
  lines.push(`Summary: ${result.summary}`);
  lines.push(`Duration: ${result.duration}ms`);
  lines.push("");
  lines.push("Scores:");
  for (const [name, score] of Object.entries(result.scores)) {
    const bar = createProgressBar(score, 20);
    const formattedName = formatScoreName(name);
    lines.push(` ${formattedName.padEnd(20)} ${bar} ${score}/100`);
  }
  if (includeEvidence && result.evidence.length > 0) {
    lines.push("");
    lines.push("Evidence:");
    for (const evidence of result.evidence) {
      lines.push(` [${evidence.criterion}] (${evidence.score}/100)`);
      lines.push(` ${evidence.reasoning}`);
      if (evidence.examples && evidence.examples.length > 0) {
        // Cap examples at 3 to keep the console report compact.
        for (const example of evidence.examples.slice(0, 3)) {
          lines.push(` - ${example}`);
        }
      }
    }
  }
  lines.push("");
  lines.push("\u2500".repeat(60));
  return lines.join("\n");
}
169
/**
 * Render `value` (0-100) as a fixed-width unicode bar, e.g. "[████░░░░]".
 * `width` is the number of bar cells inside the brackets.
 */
function createProgressBar(value, width) {
  const filledCount = Math.round((value / 100) * width);
  const emptyCount = width - filledCount;
  const segments = ["[", "\u2588".repeat(filledCount), "\u2591".repeat(emptyCount), "]"];
  return segments.join("");
}
174
/** Convert a camelCase metric name to Title Case: "entityCoverage" -> "Entity Coverage". */
function formatScoreName(name) {
  const spaced = name.replace(/([A-Z])/g, " $1");
  const capitalized = spaced.replace(/^./, (first) => first.toUpperCase());
  return capitalized.trim();
}
177
/**
 * Serialize an eval result as pretty-printed JSON.
 * Standard fields always appear; evidence is included unless disabled via
 * options.includeEvidence. Any extra keys on the result are copied through,
 * except bulky-looking raw data when options.includeRawData is off.
 */
function formatJsonReport(result, options = {}) {
  const { includeEvidence = true, includeRawData = false } = options;
  const standardKeys = ["passed", "overallScore", "scores", "evidence", "summary", "duration"];
  const output = {
    passed: result.passed,
    overallScore: result.overallScore,
    scores: result.scores,
    summary: result.summary,
    duration: result.duration
  };
  if (includeEvidence) {
    output.evidence = result.evidence;
  }
  for (const [key, value] of Object.entries(result)) {
    if (standardKeys.includes(key)) continue;
    if (!includeRawData && isRawData(value)) continue;
    output[key] = value;
  }
  return JSON.stringify(output, null, 2);
}
196
/**
 * Heuristic for "bulky raw data" worth omitting from reports:
 * arrays with more than 10 items or objects with more than 20 keys.
 */
function isRawData(value) {
  if (Array.isArray(value) && value.length > 10) {
    return true;
  }
  if (value !== null && typeof value === "object") {
    return Object.keys(value).length > 20;
  }
  return false;
}
204
/**
 * Compose a one-line narrative summary from a {metricName: score} map:
 * an overall verdict based on the mean score, then the standout metrics
 * (>= thresholds.good) and lagging metrics (< thresholds.acceptable).
 */
function generateSummary(scores, thresholds = { good: 80, acceptable: 60 }) {
  const entries = Object.entries(scores);
  let total = 0;
  for (const [, score] of entries) {
    total += score;
  }
  const avgScore = total / entries.length;
  const goodMetrics = [];
  const poorMetrics = [];
  for (const [name, score] of entries) {
    if (score >= thresholds.good) goodMetrics.push(name);
    if (score < thresholds.acceptable) poorMetrics.push(name);
  }
  let verdict;
  if (avgScore >= thresholds.good) {
    verdict = "Strong overall performance";
  } else if (avgScore >= thresholds.acceptable) {
    verdict = "Acceptable performance with room for improvement";
  } else {
    verdict = "Performance below acceptable thresholds";
  }
  const parts = [verdict];
  if (goodMetrics.length > 0) {
    parts.push(`Strong: ${goodMetrics.map(formatScoreName).join(", ")}`);
  }
  if (poorMetrics.length > 0) {
    parts.push(`Needs improvement: ${poorMetrics.map(formatScoreName).join(", ")}`);
  }
  return parts.join(". ") + ".";
}
225
+
226
+ // src/introspection/index.ts
227
/**
 * Introspection eval: measures how well a KB manifest describes the actual
 * Pinecone assistant behind it, across four sub-evals run concurrently:
 * entity coverage, slot accuracy, scope precision, and capability match.
 *
 * Requires PINECONE_API_KEY (config.pineconeApiKey or env var); throws otherwise.
 * Passes when the weighted overall score is >= 70.
 * Returns scores, merged evidence, a generated summary, per-dimension
 * failure details, and echoes the manifest under test.
 */
async function evaluateIntrospection(config) {
  const startTime = Date.now();
  const apiKey = config.pineconeApiKey || process.env.PINECONE_API_KEY;
  if (!apiKey) {
    throw new Error("PINECONE_API_KEY is required for introspection eval");
  }
  const pinecone$1 = new pinecone.Pinecone({ apiKey });
  const assistant = pinecone$1.assistant(config.assistantName);
  // The four sub-evals are independent, so run them in parallel.
  const [entityResult, slotResult, scopeResult, capabilityResult] = await Promise.all([
    evaluateEntityCoverage(assistant, config.manifest, config),
    evaluateSlotAccuracy(assistant, config.manifest, config),
    evaluateScopePrecision(assistant, config.manifest, config),
    evaluateCapabilityMatch(assistant, config.manifest, config)
  ]);
  const scores = {
    entityCoverage: entityResult.score,
    slotAccuracy: slotResult.score,
    scopePrecision: scopeResult.score,
    capabilityMatch: capabilityResult.score
  };
  // Slot accuracy weighted heaviest; weights sum to 1.
  const overallScore = calculateWeightedScore([
    { score: scores.entityCoverage, weight: 0.25 },
    { score: scores.slotAccuracy, weight: 0.3 },
    { score: scores.scopePrecision, weight: 0.25 },
    { score: scores.capabilityMatch, weight: 0.2 }
  ]);
  const evidence = [
    ...entityResult.evidence,
    ...slotResult.evidence,
    ...scopeResult.evidence,
    ...capabilityResult.evidence
  ];
  return {
    passed: overallScore >= 70,
    overallScore,
    scores,
    evidence,
    summary: generateSummary(scores),
    duration: Date.now() - startTime,
    details: {
      missingEntities: entityResult.missing,
      incorrectSlots: slotResult.incorrect,
      scopeMisclassifications: scopeResult.misclassified,
      mismatchedCapabilities: capabilityResult.mismatched
    },
    manifest: config.manifest
  };
}
275
/**
 * Entity coverage sub-eval: ask the assistant open discovery questions,
 * extract the entities it claims to know (plus any ground-truth expected
 * entities), and check each against entities derivable from the manifest.
 *
 * Score: fraction of discovered entities matched by a manifest entity via
 * case-insensitive substring in either direction; vacuously 100 when nothing
 * was discovered. `missing` lists entities the KB surfaced but the manifest
 * does not cover.
 */
async function evaluateEntityCoverage(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const manifestEntities = extractEntitiesFromManifest(manifest);
  const discoveryQueries = [
    "What are the main topics you can help with?",
    "What products or entities do you have information about?",
    "List the categories of information you contain."
  ];
  const discoveredEntities = /* @__PURE__ */ new Set();
  for (const query of discoveryQueries) {
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: query }]
      });
      const entities = await extractEntitiesWithLLM(grader, response.message?.content || "");
      entities.forEach((e) => discoveredEntities.add(e.toLowerCase()));
    } catch {
      // Best-effort: a failed discovery query simply contributes no entities.
    }
  }
  if (config.groundTruth) {
    // Ground-truth expected entities are treated as if the KB surfaced them.
    for (const gt of config.groundTruth) {
      if (gt.expectedEntities) {
        gt.expectedEntities.forEach((e) => discoveredEntities.add(e.toLowerCase()));
      }
    }
  }
  const found = [];
  const missing = [];
  for (const entity of discoveredEntities) {
    // Loose match: either string containing the other counts as coverage.
    if (manifestEntities.some((me) => me.toLowerCase().includes(entity) || entity.includes(me.toLowerCase()))) {
      found.push(entity);
    } else {
      missing.push(entity);
    }
  }
  const score = discoveredEntities.size === 0 ? 100 : normalizeScore(found.length / discoveredEntities.size * 100);
  return {
    score,
    evidence: [
      {
        criterion: "entityCoverage",
        score,
        reasoning: `Found ${found.length}/${discoveredEntities.size} expected entities in the manifest.`,
        examples: missing.slice(0, 5)
      }
    ],
    found,
    missing
  };
}
325
/**
 * Slot accuracy sub-eval: for each manifest slot, ask the assistant whether
 * the slot matters for its domain, then have the LLM grader judge (0-100)
 * whether the slot looks relevant to the KB's reply; >= 60 counts as correct.
 *
 * Returns score 50 with advisory evidence when the manifest defines no slots.
 * A failed chat/grade call gives the slot the benefit of the doubt.
 */
async function evaluateSlotAccuracy(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const slots = manifest.slots || [];
  if (slots.length === 0) {
    return {
      score: 50,
      // Penalize but don't fail for no slots
      evidence: [
        {
          criterion: "slotAccuracy",
          score: 50,
          reasoning: "No slots defined in manifest. Consider adding slots for common query parameters."
        }
      ],
      correct: [],
      incorrect: []
    };
  }
  const correct = [];
  const incorrect = [];
  for (const slot of slots) {
    const testQuery = `To answer questions about ${manifest.domain || "this topic"}, do I need to know the ${slot.name}?`;
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: testQuery }]
      });
      const evaluation = await grader.grade(
        `Slot: ${slot.name}
Description: ${slot.description || "N/A"}
KB Response: ${response.message?.content}`,
        "slot relevance",
        "Score 100 if the slot seems relevant to the KB content, 0 if completely irrelevant."
      );
      if (evaluation.score >= 60) {
        correct.push(slot.name);
      } else {
        incorrect.push(slot.name);
      }
    } catch {
      // API failure: assume the slot is fine rather than penalizing it.
      correct.push(slot.name);
    }
  }
  const score = normalizeScore(correct.length / slots.length * 100);
  return {
    score,
    evidence: [
      {
        criterion: "slotAccuracy",
        score,
        reasoning: `${correct.length}/${slots.length} slots appear relevant to the KB content.`,
        examples: incorrect
      }
    ],
    correct,
    incorrect
  };
}
382
/**
 * Scope precision sub-eval: check whether the assistant's answer behavior
 * matches the manifest's declared scope. Test cases are built from the
 * manifest's in/out-of-scope examples plus ground-truth entries that set
 * `shouldBeInScope`; at most 10 cases are executed to bound API cost.
 *
 * Fix: the score is now computed against the cases actually executed (the
 * first 10) instead of the full test-case list — previously a suite with
 * more than 10 cases could never reach 100 even if every executed case was
 * classified correctly.
 *
 * Returns { score, evidence, correctClassifications, totalClassifications, misclassified }.
 * Advisory scores of 60 / 70 are returned when no scope or no test cases exist.
 */
async function evaluateScopePrecision(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const scope = manifest.scope;
  if (!scope) {
    // No scope declared: penalize mildly rather than failing outright.
    return {
      score: 60,
      evidence: [
        {
          criterion: "scopePrecision",
          score: 60,
          reasoning: "No scope definition in manifest. Consider defining in-scope and out-of-scope examples."
        }
      ],
      correctClassifications: 0,
      totalClassifications: 0,
      misclassified: []
    };
  }
  const testCases = [];
  if (scope.inScopeExamples) {
    scope.inScopeExamples.forEach((q) => testCases.push({ query: q, expectedInScope: true }));
  }
  if (scope.outOfScopeExamples) {
    scope.outOfScopeExamples.forEach((q) => testCases.push({ query: q, expectedInScope: false }));
  }
  if (config.groundTruth) {
    config.groundTruth.filter((gt) => gt.shouldBeInScope !== void 0).forEach((gt) => testCases.push({ query: gt.query, expectedInScope: gt.shouldBeInScope }));
  }
  if (testCases.length === 0) {
    return {
      score: 70,
      evidence: [
        {
          criterion: "scopePrecision",
          score: 70,
          reasoning: "No scope test cases available. Add in-scope and out-of-scope examples to test."
        }
      ],
      correctClassifications: 0,
      totalClassifications: 0,
      misclassified: []
    };
  }
  // Cap live queries at 10; the score is relative to what was actually run.
  const evaluatedCases = testCases.slice(0, 10);
  let correctClassifications = 0;
  const misclassified = [];
  for (const testCase of evaluatedCases) {
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: testCase.query }]
      });
      const evaluation = await grader.grade(
        `Query: ${testCase.query}
KB Response: ${response.message?.content}`,
        "answerability",
        "Score 100 if the KB provided a substantive, on-topic answer. Score 0 if it said it cannot help or gave an off-topic response."
      );
      // >= 50 answerability is treated as "the KB considered this in scope".
      const actuallyInScope = evaluation.score >= 50;
      if (actuallyInScope === testCase.expectedInScope) {
        correctClassifications++;
      } else {
        misclassified.push(`"${testCase.query}" (expected ${testCase.expectedInScope ? "in-scope" : "out-of-scope"})`);
      }
    } catch {
      // Best-effort: a failed case counts as neither correct nor misclassified,
      // which still lowers the score (conservative).
    }
  }
  const score = normalizeScore(correctClassifications / evaluatedCases.length * 100);
  return {
    score,
    evidence: [
      {
        criterion: "scopePrecision",
        score,
        reasoning: `${correctClassifications}/${evaluatedCases.length} scope classifications were correct.`,
        examples: misclassified.slice(0, 3)
      }
    ],
    correctClassifications,
    totalClassifications: evaluatedCases.length,
    misclassified
  };
}
464
/**
 * Capability match sub-eval: for each capability the manifest claims, ask the
 * assistant "Can you help me with: <capability>" and have the LLM grader
 * judge whether the reply demonstrates that capability (>= 60 is a match).
 * At most 5 capabilities are probed to bound API cost.
 *
 * Fix: the score is now computed against the capabilities actually tested
 * (the first 5), not the full list — previously a manifest declaring more
 * than 5 capabilities could never reach 100 even if every probe succeeded.
 *
 * Returns { score, evidence, matched, mismatched }; score 50 with advisory
 * evidence when the manifest declares no capabilities. A failed chat/grade
 * call gives the capability the benefit of the doubt.
 */
async function evaluateCapabilityMatch(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  // Capabilities may be plain strings or { text } objects.
  const capabilities = (manifest.capabilities || []).map(
    (c) => typeof c === "string" ? c : c.text
  );
  if (capabilities.length === 0) {
    return {
      score: 50,
      evidence: [
        {
          criterion: "capabilityMatch",
          score: 50,
          reasoning: "No capabilities defined in manifest."
        }
      ],
      matched: [],
      mismatched: []
    };
  }
  // Probe only the first 5; the score is relative to what was actually tested.
  const testedCapabilities = capabilities.slice(0, 5);
  const matched = [];
  const mismatched = [];
  for (const capability of testedCapabilities) {
    const testQuery = `Can you help me with: ${capability}`;
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: testQuery }]
      });
      const evaluation = await grader.grade(
        `Capability: ${capability}
KB Response: ${response.message?.content}`,
        "capability fulfillment",
        "Score 100 if the KB demonstrated it can help with this capability. Score 0 if it cannot."
      );
      if (evaluation.score >= 60) {
        matched.push(capability);
      } else {
        mismatched.push(capability);
      }
    } catch {
      // API failure: assume the capability is supported rather than penalizing it.
      matched.push(capability);
    }
  }
  const score = normalizeScore(matched.length / testedCapabilities.length * 100);
  return {
    score,
    evidence: [
      {
        criterion: "capabilityMatch",
        score,
        reasoning: `${matched.length}/${testedCapabilities.length} stated capabilities match actual KB content.`,
        examples: mismatched
      }
    ],
    matched,
    mismatched
  };
}
521
/**
 * Collect candidate entity names from a manifest: slot example values, the
 * domain name, and capitalized words longer than 3 characters pulled from
 * capability text (lowercased). Returns a de-duplicated array in
 * first-seen order.
 */
function extractEntitiesFromManifest(manifest) {
  const entities = new Set();
  for (const slot of manifest.slots || []) {
    for (const example of slot.examples || []) {
      entities.add(example);
    }
  }
  if (manifest.domain) {
    entities.add(manifest.domain);
  }
  for (const capability of manifest.capabilities || []) {
    const text = typeof capability === "string" ? capability : capability.text;
    for (const word of text.split(/\s+/)) {
      // Capitalized, non-trivial words are likely proper nouns / entity names.
      if (word.length > 3 && /^[A-Z]/.test(word)) {
        entities.add(word.toLowerCase());
      }
    }
  }
  return [...entities];
}
542
/**
 * Extract entity names from free text by asking the grader and parsing its
 * free-text `reasoning` field as a comma-separated list. Returns lowercase
 * names longer than 2 characters, or [] on any failure.
 */
async function extractEntitiesWithLLM(grader, text) {
  try {
    const { reasoning } = await grader.grade(
      text,
      "entity extraction",
      "List the main entities (products, topics, categories) mentioned. Return just the entity names separated by commas."
    );
    const names = reasoning.split(",").map((piece) => piece.trim().toLowerCase());
    return names.filter((name) => name.length > 2);
  } catch {
    return [];
  }
}
554
/**
 * Retrieval eval: run each test query through the assistant's context API,
 * grade every retrieved snippet with the LLM grader, and aggregate four
 * metrics:
 *   relevance  - mean LLM relevance grade of retrieved chunks
 *   recall     - expected topics found vs. expected topics overall
 *   precision  - chunks graded >= 50 relevance vs. all chunks
 *   noiseRatio - known-irrelevant topics that leaked into results (lower is better)
 * Pass requires weighted overall >= 70 AND noiseRatio <= 30.
 * Throws when config.queries is empty or PINECONE_API_KEY is unavailable.
 */
async function evaluateRetrieval(config) {
  const startTime = Date.now();
  if (config.queries.length === 0) {
    throw new Error("At least one query is required for retrieval eval");
  }
  const apiKey = config.pineconeApiKey || process.env.PINECONE_API_KEY;
  if (!apiKey) {
    throw new Error("PINECONE_API_KEY is required for retrieval eval");
  }
  const pinecone$1 = new pinecone.Pinecone({ apiKey });
  const assistant = pinecone$1.assistant(config.assistantName);
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const topK = config.topK || 5;
  const queryResults = [];
  // Queries run sequentially to avoid hammering the retrieval/grader APIs.
  for (const testQuery of config.queries) {
    const result = await evaluateQueryRetrieval(
      assistant,
      grader,
      testQuery,
      topK,
      config.verbose
    );
    queryResults.push(result);
  }
  const relevanceScores = queryResults.map((r) => r.relevanceScore);
  const relevance = normalizeScore(average(relevanceScores));
  const totalExpected = queryResults.reduce(
    (sum, r) => sum + r.foundTopics.length + r.missingTopics.length,
    0
  );
  const totalFound = queryResults.reduce((sum, r) => sum + r.foundTopics.length, 0);
  // Vacuous 100 when no expected topics were declared anywhere.
  const recall = totalExpected === 0 ? 100 : normalizeScore(totalFound / totalExpected * 100);
  const allChunks = queryResults.flatMap((r) => r.chunks);
  const relevantChunks = allChunks.filter((c) => c.relevanceGrade >= 50);
  const precision = allChunks.length === 0 ? 100 : normalizeScore(relevantChunks.length / allChunks.length * 100);
  const totalIrrelevant = queryResults.reduce(
    (sum, r) => sum + r.noiseTopics.length,
    0
  );
  const totalIrrelevantExpected = config.queries.reduce(
    (sum, q) => sum + (q.irrelevantTopics?.length || 0),
    0
  );
  // Fraction of declared-irrelevant topics that leaked into results.
  const noiseRatio = totalIrrelevantExpected === 0 ? 0 : normalizeScore(totalIrrelevant / totalIrrelevantExpected * 100);
  const scores = { relevance, recall, precision, noiseRatio };
  const overallScore = calculateWeightedScore([
    { score: relevance, weight: 0.35 },
    { score: recall, weight: 0.3 },
    { score: precision, weight: 0.25 },
    { score: 100 - noiseRatio, weight: 0.1 }
    // Invert noise ratio
  ]);
  const evidence = [
    {
      criterion: "relevance",
      score: relevance,
      reasoning: `Average relevance of retrieved chunks across ${queryResults.length} queries.`
    },
    {
      criterion: "recall",
      score: recall,
      reasoning: `Found ${totalFound}/${totalExpected} expected topics in retrieved content.`,
      examples: queryResults.flatMap((r) => r.missingTopics).slice(0, 3)
    },
    {
      criterion: "precision",
      score: precision,
      reasoning: `${relevantChunks.length}/${allChunks.length} retrieved chunks were relevant.`
    },
    {
      criterion: "noiseRatio",
      score: noiseRatio,
      reasoning: `${totalIrrelevant} irrelevant topics appeared in retrieved content.`,
      examples: queryResults.flatMap((r) => r.noiseTopics).slice(0, 3)
    }
  ];
  return {
    passed: overallScore >= 70 && noiseRatio <= 30,
    overallScore,
    scores,
    evidence,
    // Summary inverts noiseRatio so that higher is uniformly better.
    summary: generateSummary({ relevance, recall, precision, noiseRatio: 100 - noiseRatio }),
    duration: Date.now() - startTime,
    queryResults
  };
}
640
/**
 * Retrieve context for one test query and grade each returned snippet for
 * relevance. Also scans the (truncated) retrieved text for the query's
 * expected and known-irrelevant topics.
 *
 * Returns { query, chunks, relevanceScore, foundTopics, missingTopics, noiseTopics }.
 * A retrieval failure yields a zero-relevance result with every expected
 * topic counted as missing.
 */
async function evaluateQueryRetrieval(assistant, grader, testQuery, topK, verbose) {
  let contextResult;
  try {
    contextResult = await assistant.context({
      query: testQuery.query,
      topK
    });
  } catch (error) {
    return {
      query: testQuery.query,
      chunks: [],
      relevanceScore: 0,
      foundTopics: [],
      missingTopics: testQuery.expectedTopics || [],
      noiseTopics: []
    };
  }
  const snippets = contextResult.snippets || [];
  const chunks = [];
  for (const snippet of snippets) {
    const relevanceResult = await grader.gradeRelevance(testQuery.query, snippet.content);
    // Source attribution shape varies; fall back through file name -> name -> unknown.
    const ref = snippet.reference;
    const sourceFile = ref?.file?.name || ref?.name || "unknown";
    chunks.push({
      content: snippet.content.slice(0, 500),
      // Truncate for storage
      score: snippet.score,
      sourceFile,
      relevanceGrade: relevanceResult.score,
      reasoning: relevanceResult.reasoning
    });
  }
  const relevanceScore = chunks.length === 0 ? 0 : average(chunks.map((c) => c.relevanceGrade));
  // Topic checks run on the truncated, lowercased concatenation of chunks.
  const allContent = chunks.map((c) => c.content).join(" ").toLowerCase();
  const topicResult = checkTopics(
    allContent,
    testQuery.expectedTopics || [],
    testQuery.irrelevantTopics || []
  );
  if (verbose) {
    console.log(`Query: "${testQuery.query}"`);
    console.log(` Relevance: ${relevanceScore.toFixed(1)}/100`);
    console.log(` Topics found: ${topicResult.found.join(", ") || "none"}`);
    console.log(` Topics missing: ${topicResult.missing.join(", ") || "none"}`);
    console.log(` Noise: ${topicResult.noise.join(", ") || "none"}`);
  }
  return {
    query: testQuery.query,
    chunks,
    relevanceScore,
    foundTopics: topicResult.found,
    missingTopics: topicResult.missing,
    noiseTopics: topicResult.noise
  };
}
695
/**
 * Substring scan of `content` (caller supplies it lowercased) for expected
 * topics (split into found/missing) and known-irrelevant topics (noise).
 * Topic strings are lowercased before matching; original casing is returned.
 */
function checkTopics(content, expectedTopics, irrelevantTopics) {
  const found = [];
  const missing = [];
  for (const topic of expectedTopics) {
    const bucket = content.includes(topic.toLowerCase()) ? found : missing;
    bucket.push(topic);
  }
  const noise = irrelevantTopics.filter((topic) => content.includes(topic.toLowerCase()));
  return { found, missing, noise };
}
713
+
714
+ // src/agent/index.ts
715
/**
 * Agent eval: drive every scenario through the agent endpoint, then combine
 * accuracy (scenario pass rate) with LLM-graded relevance, completeness and
 * helpfulness extracted from each scenario's evidence entries.
 * Pass requires weighted overall >= 70. Throws when no scenarios are given.
 */
async function evaluateAgent(config) {
  const startTime = Date.now();
  if (config.scenarios.length === 0) {
    throw new Error("At least one scenario is required for agent eval");
  }
  const grader = createLLMGrader({
    openaiApiKey: config.openaiApiKey,
    model: config.graderConfig?.model,
    temperature: config.graderConfig?.temperature
  });
  const scenarioResults = [];
  // Scenarios run sequentially; each is a multi-turn conversation.
  for (const scenario of config.scenarios) {
    const result = await runScenario(scenario, config, grader);
    scenarioResults.push(result);
  }
  const passedScenarios = scenarioResults.filter((r) => r.passed);
  const accuracy = normalizeScore(passedScenarios.length / scenarioResults.length * 100);
  // Only scenarios that produced evidence contribute to the graded metrics.
  const gradedResults = scenarioResults.filter((r) => r.evaluation.evidence.length > 0);
  const relevanceScores = gradedResults.flatMap(
    (r) => r.evaluation.evidence.filter((e) => e.criterion === "relevance").map((e) => e.score)
  );
  const completenessScores = gradedResults.flatMap(
    (r) => r.evaluation.evidence.filter((e) => e.criterion === "completeness").map((e) => e.score)
  );
  const helpfulnessScores = gradedResults.flatMap(
    (r) => r.evaluation.evidence.filter((e) => e.criterion === "helpfulness").map((e) => e.score)
  );
  // No samples for a metric scores 0 rather than being excluded.
  const relevance = relevanceScores.length > 0 ? normalizeScore(average(relevanceScores)) : 0;
  const completeness = completenessScores.length > 0 ? normalizeScore(average(completenessScores)) : 0;
  const helpfulness = helpfulnessScores.length > 0 ? normalizeScore(average(helpfulnessScores)) : 0;
  const scores = { accuracy, relevance, completeness, helpfulness };
  // Weights sum to 1; accuracy weighted heaviest.
  const overallScore = calculateWeightedScore([
    { score: accuracy, weight: 0.3 },
    { score: relevance, weight: 0.25 },
    { score: completeness, weight: 0.25 },
    { score: helpfulness, weight: 0.2 }
  ]);
  const evidence = [
    {
      criterion: "accuracy",
      score: accuracy,
      reasoning: `${passedScenarios.length}/${scenarioResults.length} scenarios passed.`,
      examples: scenarioResults.filter((r) => !r.passed).map((r) => r.scenario.name).slice(0, 3)
    },
    {
      criterion: "relevance",
      score: relevance,
      reasoning: `Average answer relevance across ${gradedResults.length} graded scenarios.`
    },
    {
      criterion: "completeness",
      score: completeness,
      reasoning: `Average answer completeness across ${gradedResults.length} graded scenarios.`
    },
    {
      criterion: "helpfulness",
      score: helpfulness,
      reasoning: `Average helpfulness across ${gradedResults.length} graded scenarios.`
    }
  ];
  return {
    passed: overallScore >= 70,
    overallScore,
    scores,
    evidence,
    summary: generateSummary(scores),
    duration: Date.now() - startTime,
    scenarioResults
  };
}
785
/**
 * Run one conversational scenario against the agent, up to maxTurns turns.
 * Conversation state (history, previous context/intent) is threaded back to
 * the agent on each turn. The loop stops on a terminal outcome ("answer",
 * "blocked", "out_of_scope") or when the agent asks a follow-up question the
 * scenario has no scripted reply for. Any thrown error produces a failed
 * result with outcome "error" instead of propagating.
 */
async function runScenario(scenario, config, grader) {
  const startTime = Date.now();
  // Per-scenario maxTurns overrides the eval-wide default (5).
  const maxTurns = scenario.maxTurns || config.maxTurns || 5;
  const timeout = config.timeout || 6e4;
  const conversation = [];
  let currentMessage = scenario.initialQuery;
  let context = {
    sessionId: `eval_${Date.now()}`,
    conversationHistory: []
  };
  let lastResponse = null;
  let turn = 0;
  try {
    while (turn < maxTurns) {
      turn++;
      const response = await callAgent(config.agentEndpoint, currentMessage, context, timeout);
      lastResponse = response;
      conversation.push({
        turn,
        userMessage: currentMessage,
        agentResponse: response
      });
      // Carry the agent's context/intent forward and append both sides to history.
      context = {
        ...context,
        previousContext: response.context,
        previousIntent: response.intent,
        conversationHistory: [
          ...context.conversationHistory || [],
          { role: "user", content: currentMessage },
          { role: "assistant", content: response.answer || response.followUpQuestion || "" }
        ]
      };
      if (response.outcome === "answer") {
        break;
      }
      if (response.outcome === "blocked" || response.outcome === "out_of_scope") {
        break;
      }
      if (response.outcome === "follow_up") {
        const followUpQuestion = response.followUpQuestion || "";
        const responseToFollowUp = generateFollowUpResponse(followUpQuestion, scenario);
        if (!responseToFollowUp) {
          // No scripted reply matches the follow-up: end the conversation here.
          break;
        }
        currentMessage = responseToFollowUp;
      }
    }
    const evaluation = await evaluateScenarioResult(
      scenario,
      lastResponse,
      conversation,
      grader
    );
    // Outcome check is vacuously true when the scenario declares no expectation.
    const outcomeMatch = scenario.expectedOutcome ? lastResponse?.outcome === scenario.expectedOutcome : true;
    const passed = outcomeMatch && evaluation.passed;
    return {
      scenario,
      passed,
      turns: turn,
      finalOutcome: lastResponse?.outcome || "error",
      finalAnswer: lastResponse?.outcome === "answer" ? lastResponse.answer || null : null,
      evaluation,
      conversation,
      duration: Date.now() - startTime
    };
  } catch (error) {
    // Any failure (agent call, grading) is reported as a failed scenario
    // with the error captured, never thrown to the caller.
    return {
      scenario,
      passed: false,
      turns: turn,
      finalOutcome: "error",
      finalAnswer: null,
      evaluation: {
        passed: false,
        score: 0,
        evidence: [
          {
            criterion: "error",
            score: 0,
            reasoning: error instanceof Error ? error.message : String(error)
          }
        ]
      },
      conversation,
      duration: Date.now() - startTime,
      error: error instanceof Error ? error.message : String(error)
    };
  }
}
874
/**
 * Invoke the agent under test for one turn.
 *
 * `endpoint` is either an in-process function (called directly; the timeout
 * does not apply) or an HTTP URL that receives a JSON POST with the message
 * and conversation state. HTTP calls are aborted after `timeout` ms and the
 * response is normalized to a single shape, accepting both snake_case and
 * camelCase field names from the server.
 */
async function callAgent(endpoint, message, context, timeout) {
  if (typeof endpoint === "function") {
    return endpoint(message, context);
  }
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    const payload = {
      message,
      session_id: context.sessionId,
      previous_context: context.previousContext,
      previous_intent: context.previousIntent,
      conversation_history: context.conversationHistory
    };
    const response = await fetch(endpoint, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload),
      signal: controller.signal
    });
    if (!response.ok) {
      throw new Error(`Agent returned ${response.status}: ${response.statusText}`);
    }
    const data = await response.json();
    return {
      outcome: data.outcome || "answer",
      answer: data.answer,
      followUpQuestion: data.followUpQuestion || data.follow_up_question,
      options: data.options,
      context: data.context,
      intent: data.intent,
      trace: data.trace,
      sessionId: data.session_id || data.sessionId
    };
  } finally {
    clearTimeout(timeoutId);
  }
}
911
/**
 * Choose a canned user reply for an agent follow-up question.
 *
 * Each key of scenario.followUpResponses is treated as a case-insensitive
 * substring pattern; the first key (in object-entry order) found inside the
 * question selects its response.
 *
 * @param {string} question - The follow-up question asked by the agent.
 * @param {object} scenario - Scenario that may define followUpResponses.
 * @returns {string|null} The matched canned response, or null when the
 *   scenario defines none or nothing matches.
 */
function generateFollowUpResponse(question, scenario) {
  const responses = scenario.followUpResponses;
  if (!responses) {
    return null;
  }
  const haystack = question.toLowerCase();
  const hit = Object.entries(responses).find(
    ([pattern]) => haystack.includes(pattern.toLowerCase())
  );
  return hit ? hit[1] : null;
}
922
/**
 * Grade a completed scenario run.
 *
 * Combines deterministic string checks (mustContain / mustNotContain) with
 * three LLM-graded criteria (relevance, completeness, helpfulness). All
 * criterion scores are averaged with equal weight; the run passes when the
 * average is >= 70.
 *
 * When the final response is not a usable answer, the result is decided
 * purely from the outcome: a match against a non-"answer" expectedOutcome
 * scores 100, anything else scores 0.
 *
 * Fixes over the previous version:
 *  - mustContain/mustNotContain are guarded on .length, not mere truthiness;
 *    an empty array previously produced 0/0 = NaN and poisoned the average.
 *  - The three independent LLM grading calls now run in parallel.
 *
 * @param {object} scenario - Scenario definition (initialQuery, expectedOutcome, evaluation rules).
 * @param {object|null} response - Final agent response, or null if none arrived.
 * @param {Array} conversation - Full transcript (kept for interface stability; unused here).
 * @param {object} grader - LLM grader exposing async grade(content, criterion, rubric).
 * @returns {Promise<{passed: boolean, score: number, evidence: Array}>}
 */
async function evaluateScenarioResult(scenario, response, conversation, grader) {
  const evidence = [];
  if (!response || response.outcome !== "answer" || !response.answer) {
    if (scenario.expectedOutcome && scenario.expectedOutcome !== "answer") {
      const matched = response?.outcome === scenario.expectedOutcome;
      return {
        passed: matched,
        score: matched ? 100 : 0,
        evidence: [
          {
            criterion: "outcomeMatch",
            score: matched ? 100 : 0,
            reasoning: `Expected ${scenario.expectedOutcome}, got ${response?.outcome || "no response"}.`
          }
        ]
      };
    }
    return {
      passed: false,
      score: 0,
      evidence: [
        {
          criterion: "noAnswer",
          score: 0,
          reasoning: `Expected an answer but got ${response?.outcome || "no response"}.`
        }
      ]
    };
  }
  const answer = response.answer;
  const answerLower = answer.toLowerCase();
  let totalScore = 0;
  let criteriaCount = 0;
  // Guard on .length: an empty list contributes no criterion (and avoids 0/0).
  if (scenario.evaluation.mustContain?.length) {
    const required = scenario.evaluation.mustContain;
    const found = required.filter((s) => answerLower.includes(s.toLowerCase()));
    const score = normalizeScore(found.length / required.length * 100);
    totalScore += score;
    criteriaCount++;
    evidence.push({
      criterion: "mustContain",
      score,
      reasoning: `Found ${found.length}/${required.length} required terms.`,
      // Report the terms that were missing, to ease debugging.
      examples: required.filter((s) => !answerLower.includes(s.toLowerCase()))
    });
  }
  if (scenario.evaluation.mustNotContain?.length) {
    const forbidden = scenario.evaluation.mustNotContain;
    const found = forbidden.filter((s) => answerLower.includes(s.toLowerCase()));
    const score = normalizeScore((forbidden.length - found.length) / forbidden.length * 100);
    totalScore += score;
    criteriaCount++;
    evidence.push({
      criterion: "mustNotContain",
      score,
      reasoning: `Found ${found.length} forbidden terms.`,
      examples: found
    });
  }
  const gradingPrompt = `
Query: ${scenario.initialQuery}
Answer: ${answer}
${scenario.evaluation.rubric ? `Rubric: ${scenario.evaluation.rubric}` : ""}
`;
  // The three LLM criteria are independent of each other: grade in parallel.
  const [relevanceResult, completenessResult, helpfulnessResult] = await Promise.all([
    grader.grade(
      gradingPrompt,
      "relevance",
      "How relevant is the answer to the query? 100 = directly and completely addresses the query."
    ),
    grader.grade(
      gradingPrompt,
      "completeness",
      "How complete is the answer? 100 = fully addresses all aspects of the query."
    ),
    grader.grade(
      gradingPrompt,
      "helpfulness",
      "How helpful and actionable is the answer? 100 = provides clear, actionable guidance."
    )
  ]);
  for (const [criterion, result] of [
    ["relevance", relevanceResult],
    ["completeness", completenessResult],
    ["helpfulness", helpfulnessResult]
  ]) {
    evidence.push({ criterion, score: result.score, reasoning: result.reasoning });
    totalScore += result.score;
    criteriaCount++;
  }
  const avgScore = criteriaCount > 0 ? totalScore / criteriaCount : 0;
  return {
    passed: avgScore >= 70,
    score: normalizeScore(avgScore),
    evidence
  };
}
1031
+
1032
// Public API of the bundle. Names are defined earlier in this file.
exports.average = average;
exports.calculatePercentage = calculatePercentage;
exports.calculateWeightedScore = calculateWeightedScore;
exports.createLLMGrader = createLLMGrader;
exports.evaluateAgent = evaluateAgent;
exports.evaluateIntrospection = evaluateIntrospection;
exports.evaluateRetrieval = evaluateRetrieval;
exports.formatConsoleReport = formatConsoleReport;
exports.formatJsonReport = formatJsonReport;
exports.gradeWithLLM = gradeWithLLM;
// Fix: the sourceMappingURL directive was emitted twice; keep a single one.
//# sourceMappingURL=index.cjs.map