@kat-ai/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1032 @@
1
+ import { Pinecone } from '@pinecone-database/pinecone';
2
+ import { createOpenAI } from '@ai-sdk/openai';
3
+ import { generateObject } from 'ai';
4
+ import { z } from 'zod';
5
+ import { resolveDefaultOpenAiChatModelId } from '@kat/core';
6
+
7
+ // src/introspection/index.ts
8
// src/introspection/index.ts
// Zod schema for a single-criterion LLM grade: a 0-100 numeric score, a
// free-text rationale, and optional concrete examples backing the score.
// Used as the structured-output schema for generateObject calls below.
var GradeSchema = z.object({
  score: z.number().min(0).max(100).describe("Score from 0-100"),
  reasoning: z.string().describe("Explanation for the score"),
  examples: z.array(z.string()).optional().describe("Specific examples that influenced the score")
});
13
// Zod schema for grading one piece of content against several criteria at
// once: one {criterion, score, reasoning} entry per criterion plus an
// overall rationale. Consumed by createLLMGrader().gradeMultiple.
var MultiCriteriaGradeSchema = z.object({
  scores: z.array(z.object({
    criterion: z.string(),
    score: z.number().min(0).max(100),
    reasoning: z.string()
  })),
  overallReasoning: z.string()
});
21
/**
 * Build an LLM-backed grader around the OpenAI chat API.
 *
 * @param {object} [config]
 * @param {string} [config.openaiApiKey] - falls back to process.env.OPENAI_API_KEY
 * @param {string} [config.model] - falls back to resolveDefaultOpenAiChatModelId()
 * @param {number} [config.temperature] - defaults to 0.1 (low, for grading stability);
 *   `??` is used so an explicit 0 is honored
 * @returns an object with three async grading methods:
 *   - grade(content, criterion, rubric?): single-criterion grade (GradeSchema)
 *   - gradeMultiple(content, criteria): per-criterion grades (MultiCriteriaGradeSchema),
 *     returned without the overallReasoning field
 *   - gradeRelevance(query, content): relevance of content to a query ({score, reasoning})
 * @throws {Error} when no API key is available.
 */
function createLLMGrader(config = {}) {
  const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error("OPENAI_API_KEY is required for LLM grading");
  }
  const openai = createOpenAI({ apiKey });
  const model = config.model || resolveDefaultOpenAiChatModelId();
  const temperature = config.temperature ?? 0.1;
  return {
    // Grade `content` on one named criterion; `rubric` is optional extra guidance.
    async grade(content, criterion, rubric) {
      const prompt = `You are an expert evaluator. Grade the following content on the criterion "${criterion}".

${rubric ? `Rubric: ${rubric}
` : ""}
Content to evaluate:
"""
${content}
"""

Provide a score from 0-100 and explain your reasoning.`;
      const result = await generateObject({
        model: openai(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      return result.object;
    },
    // Grade `content` on several weighted criteria in a single LLM call.
    async gradeMultiple(content, criteria) {
      const criteriaDescription = criteria.map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`).join("\n");
      const prompt = `You are an expert evaluator. Grade the following content on multiple criteria.

Criteria:
${criteriaDescription}

Content to evaluate:
"""
${content}
"""

For each criterion, provide a score from 0-100 and explain your reasoning.`;
      const result = await generateObject({
        model: openai(model),
        schema: MultiCriteriaGradeSchema,
        prompt,
        temperature
      });
      // Project down to the per-criterion entries; overallReasoning is dropped.
      return result.object.scores.map((s) => ({
        criterion: s.criterion,
        score: s.score,
        reasoning: s.reasoning
      }));
    },
    // Grade how relevant `content` is to `query` (100 = direct, complete answer).
    async gradeRelevance(query, content) {
      const prompt = `You are an expert evaluator. Grade how relevant the following content is to the given query.

Query: "${query}"

Content:
"""
${content}
"""

A score of 100 means the content directly and completely answers the query.
A score of 0 means the content is completely irrelevant.

Provide a score from 0-100 and explain your reasoning.`;
      const result = await generateObject({
        model: openai(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      // Only score + reasoning are returned; any `examples` field is discarded.
      return {
        score: result.object.score,
        reasoning: result.object.reasoning
      };
    }
  };
}
101
/**
 * One-shot convenience wrapper: build a grader from `config` and grade
 * `content` on a single `criterion` (no rubric).
 * @throws {Error} when no OpenAI API key is available (from createLLMGrader).
 */
async function gradeWithLLM(content, criterion, config = {}) {
  return createLLMGrader(config).grade(content, criterion);
}
105
+
106
+ // src/utils/metrics.ts
107
// src/utils/metrics.ts
/**
 * Weighted mean of {score, weight} entries, rounded to the nearest integer.
 * Returns 0 for an empty list or when the weights sum to zero.
 */
function calculateWeightedScore(scores) {
  let weightTotal = 0;
  let weightedTotal = 0;
  for (const entry of scores) {
    weightTotal += entry.weight;
    weightedTotal += entry.score * entry.weight;
  }
  if (weightTotal === 0) return 0;
  return Math.round(weightedTotal / weightTotal);
}
114
/**
 * Rounded percentage of `found` out of `expected`.
 * Treats "nothing expected" as full marks (100) rather than dividing by zero.
 */
function calculatePercentage(found, expected) {
  return expected === 0 ? 100 : Math.round((found / expected) * 100);
}
118
/** Arithmetic mean of a number list; 0 for an empty list. */
function average(numbers) {
  if (numbers.length === 0) return 0;
  let total = 0;
  for (const value of numbers) {
    total += value;
  }
  return total / numbers.length;
}
122
/** Restrict `value` to the inclusive range [min, max]. */
function clamp(value, min, max) {
  const capped = Math.min(max, value);
  return Math.max(min, capped);
}
125
/** Round a raw score and clamp it into the canonical 0-100 range. */
function normalizeScore(score) {
  return Math.max(0, Math.min(100, Math.round(score)));
}
128
+
129
+ // src/utils/reporters.ts
130
// src/utils/reporters.ts
/**
 * Render an eval result as a colorized, human-readable console report:
 * a pass/fail banner, summary/duration, one progress bar per metric, and
 * (optionally) the grading evidence with up to 3 supporting examples each.
 * @param result - eval result with passed/overallScore/scores/summary/duration/evidence
 * @param options - { includeEvidence?: boolean } (default true)
 * @returns the full report as one newline-joined string
 */
function formatConsoleReport(result, options = {}) {
  const lines = [];
  const { includeEvidence = true } = options;
  const status = result.passed ? "\u2713 PASSED" : "\u2717 FAILED";
  // ANSI escape codes: green banner on pass, red on fail.
  const statusColor = result.passed ? "\x1B[32m" : "\x1B[31m";
  const reset = "\x1B[0m";
  lines.push("");
  lines.push("\u2550".repeat(60)); // double-line rule
  lines.push(`${statusColor}${status}${reset} - Overall Score: ${result.overallScore}/100`);
  lines.push("\u2550".repeat(60));
  lines.push("");
  lines.push(`Summary: ${result.summary}`);
  lines.push(`Duration: ${result.duration}ms`);
  lines.push("");
  lines.push("Scores:");
  for (const [name, score] of Object.entries(result.scores)) {
    const bar = createProgressBar(score, 20);
    const formattedName = formatScoreName(name); // camelCase -> "Title Case"
    lines.push(` ${formattedName.padEnd(20)} ${bar} ${score}/100`);
  }
  if (includeEvidence && result.evidence.length > 0) {
    lines.push("");
    lines.push("Evidence:");
    for (const evidence of result.evidence) {
      lines.push(` [${evidence.criterion}] (${evidence.score}/100)`);
      lines.push(` ${evidence.reasoning}`);
      if (evidence.examples && evidence.examples.length > 0) {
        // Cap at 3 examples per criterion to keep the report compact.
        for (const example of evidence.examples.slice(0, 3)) {
          lines.push(` - ${example}`);
        }
      }
    }
  }
  lines.push("");
  lines.push("\u2500".repeat(60)); // single-line rule footer
  return lines.join("\n");
}
167
/**
 * Render `value` (0-100) as a fixed-width bracketed bar of full/light
 * block characters, e.g. createProgressBar(50, 4) -> "[██░░]".
 */
function createProgressBar(value, width) {
  const filledCount = Math.round((value / 100) * width);
  const filledPart = "\u2588".repeat(filledCount);
  const emptyPart = "\u2591".repeat(width - filledCount);
  return `[${filledPart}${emptyPart}]`;
}
172
/**
 * Turn a camelCase metric name into a display label:
 * "entityCoverage" -> "Entity Coverage".
 */
function formatScoreName(name) {
  const spaced = name.replace(/([A-Z])/g, " $1");
  const capitalized = spaced.replace(/^./, (ch) => ch.toUpperCase());
  return capitalized.trim();
}
175
/**
 * Serialize an eval result to pretty-printed JSON.
 * Core fields are always emitted; evidence is optional; any extra top-level
 * fields on the result are passed through unless they look like bulky raw
 * data (per isRawData) and includeRawData is false.
 * @param options - { includeEvidence?: boolean (default true),
 *                    includeRawData?: boolean (default false) }
 */
function formatJsonReport(result, options = {}) {
  const { includeEvidence = true, includeRawData = false } = options;
  const coreKeys = ["passed", "overallScore", "scores", "evidence", "summary", "duration"];
  const output = {
    passed: result.passed,
    overallScore: result.overallScore,
    scores: result.scores,
    summary: result.summary,
    duration: result.duration
  };
  if (includeEvidence) {
    output.evidence = result.evidence;
  }
  for (const [key, value] of Object.entries(result)) {
    if (coreKeys.includes(key)) continue;
    if (!includeRawData && isRawData(value)) continue;
    output[key] = value;
  }
  return JSON.stringify(output, null, 2);
}
194
/**
 * Heuristic for "bulky raw data" worth omitting from reports:
 * arrays with more than 10 items, or objects with more than 20 keys.
 */
function isRawData(value) {
  if (Array.isArray(value)) {
    return value.length > 10;
  }
  if (value !== null && typeof value === "object") {
    return Object.keys(value).length > 20;
  }
  return false;
}
202
/**
 * Produce a one-sentence performance summary from a metric-name -> score map.
 * Buckets the average into strong / acceptable / poor, then appends the
 * metrics that stand out on either end.
 *
 * @param {Record<string, number>} scores - metric name -> 0-100 score
 * @param {{good?: number, acceptable?: number}} [thresholds] - band boundaries.
 *   FIX: defaults are now applied per-field, so a partial object such as
 *   `{ good: 90 }` no longer silently drops the `acceptable` default (the old
 *   whole-object default made `score < undefined` always false).
 * @returns {string} summary sentence(s), period-terminated
 */
function generateSummary(scores, thresholds = {}) {
  const { good = 80, acceptable = 60 } = thresholds;
  const entries = Object.entries(scores);
  const avgScore = entries.reduce((sum, [, score]) => sum + score, 0) / entries.length;
  const goodMetrics = entries.filter(([, score]) => score >= good).map(([name]) => name);
  const poorMetrics = entries.filter(([, score]) => score < acceptable).map(([name]) => name);
  const parts = [];
  if (avgScore >= good) {
    parts.push("Strong overall performance");
  } else if (avgScore >= acceptable) {
    parts.push("Acceptable performance with room for improvement");
  } else {
    // Also reached when `scores` is empty (avgScore is NaN) — intentional fallback.
    parts.push("Performance below acceptable thresholds");
  }
  if (goodMetrics.length > 0) {
    parts.push(`Strong: ${goodMetrics.map(formatScoreName).join(", ")}`);
  }
  if (poorMetrics.length > 0) {
    parts.push(`Needs improvement: ${poorMetrics.map(formatScoreName).join(", ")}`);
  }
  return parts.join(". ") + ".";
}
223
+
224
+ // src/introspection/index.ts
225
// src/introspection/index.ts
/**
 * Evaluate how well a Pinecone assistant's actual knowledge base matches its
 * declared manifest, across four weighted dimensions: entity coverage, slot
 * accuracy, scope precision, and capability match. Pass threshold is a fixed
 * overall score of 70.
 *
 * @param config - { assistantName, manifest, pineconeApiKey?, openaiApiKey?, groundTruth? }
 *   pineconeApiKey falls back to process.env.PINECONE_API_KEY
 * @returns result with passed/overallScore/scores/evidence/summary/duration,
 *   a `details` breakdown per dimension, and the manifest echoed back
 * @throws {Error} when no Pinecone API key is available
 */
async function evaluateIntrospection(config) {
  const startTime = Date.now();
  const apiKey = config.pineconeApiKey || process.env.PINECONE_API_KEY;
  if (!apiKey) {
    throw new Error("PINECONE_API_KEY is required for introspection eval");
  }
  const pinecone = new Pinecone({ apiKey });
  const assistant = pinecone.assistant(config.assistantName);
  // The four sub-evaluations are independent, so they run concurrently.
  const [entityResult, slotResult, scopeResult, capabilityResult] = await Promise.all([
    evaluateEntityCoverage(assistant, config.manifest, config),
    evaluateSlotAccuracy(assistant, config.manifest, config),
    evaluateScopePrecision(assistant, config.manifest, config),
    evaluateCapabilityMatch(assistant, config.manifest, config)
  ]);
  const scores = {
    entityCoverage: entityResult.score,
    slotAccuracy: slotResult.score,
    scopePrecision: scopeResult.score,
    capabilityMatch: capabilityResult.score
  };
  // Weights sum to 1.0; slot accuracy carries the largest weight.
  const overallScore = calculateWeightedScore([
    { score: scores.entityCoverage, weight: 0.25 },
    { score: scores.slotAccuracy, weight: 0.3 },
    { score: scores.scopePrecision, weight: 0.25 },
    { score: scores.capabilityMatch, weight: 0.2 }
  ]);
  const evidence = [
    ...entityResult.evidence,
    ...slotResult.evidence,
    ...scopeResult.evidence,
    ...capabilityResult.evidence
  ];
  return {
    passed: overallScore >= 70,
    overallScore,
    scores,
    evidence,
    summary: generateSummary(scores),
    duration: Date.now() - startTime,
    details: {
      missingEntities: entityResult.missing,
      incorrectSlots: slotResult.incorrect,
      scopeMisclassifications: scopeResult.misclassified,
      mismatchedCapabilities: capabilityResult.mismatched
    },
    manifest: config.manifest
  };
}
273
/**
 * Estimate how well the manifest covers the entities the assistant actually
 * talks about. Entities are discovered by asking the assistant three
 * open-ended discovery questions and LLM-extracting entity names from its
 * answers (plus any groundTruth expectedEntities), then matched against
 * manifest entities by case-insensitive substring in either direction.
 *
 * NOTE(review): `missing` holds entities discovered in the KB that are NOT in
 * the manifest (i.e. manifest gaps), despite the reasoning wording — confirm
 * downstream consumers of details.missingEntities expect that meaning.
 */
async function evaluateEntityCoverage(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const manifestEntities = extractEntitiesFromManifest(manifest);
  const discoveryQueries = [
    "What are the main topics you can help with?",
    "What products or entities do you have information about?",
    "List the categories of information you contain."
  ];
  const discoveredEntities = /* @__PURE__ */ new Set();
  for (const query of discoveryQueries) {
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: query }]
      });
      const entities = await extractEntitiesWithLLM(grader, response.message?.content || "");
      entities.forEach((e) => discoveredEntities.add(e.toLowerCase()));
    } catch {
      // Best effort: a failed discovery query simply contributes no entities.
    }
  }
  // Ground-truth expected entities are merged into the discovered set.
  if (config.groundTruth) {
    for (const gt of config.groundTruth) {
      if (gt.expectedEntities) {
        gt.expectedEntities.forEach((e) => discoveredEntities.add(e.toLowerCase()));
      }
    }
  }
  const found = [];
  const missing = [];
  for (const entity of discoveredEntities) {
    // Loose bidirectional substring match against manifest entities.
    if (manifestEntities.some((me) => me.toLowerCase().includes(entity) || entity.includes(me.toLowerCase()))) {
      found.push(entity);
    } else {
      missing.push(entity);
    }
  }
  // No discovered entities at all -> nothing to contradict the manifest -> 100.
  const score = discoveredEntities.size === 0 ? 100 : normalizeScore(found.length / discoveredEntities.size * 100);
  return {
    score,
    evidence: [
      {
        criterion: "entityCoverage",
        score,
        reasoning: `Found ${found.length}/${discoveredEntities.size} expected entities in the manifest.`,
        examples: missing.slice(0, 5)
      }
    ],
    found,
    missing
  };
}
323
/**
 * Check that each manifest slot is actually relevant to the KB: asks the
 * assistant whether the slot matters for the domain, then LLM-grades the
 * slot + response for "slot relevance" (score >= 60 counts as relevant).
 * Returns a fixed 50 when the manifest declares no slots.
 *
 * NOTE(review): a failed chat/grade call pushes the slot into `correct`
 * (benefit of the doubt), so network errors inflate rather than deflate
 * the score — confirm this is the intended bias.
 */
async function evaluateSlotAccuracy(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const slots = manifest.slots || [];
  if (slots.length === 0) {
    return {
      score: 50,
      // Penalize but don't fail for no slots
      evidence: [
        {
          criterion: "slotAccuracy",
          score: 50,
          reasoning: "No slots defined in manifest. Consider adding slots for common query parameters."
        }
      ],
      correct: [],
      incorrect: []
    };
  }
  const correct = [];
  const incorrect = [];
  for (const slot of slots) {
    const testQuery = `To answer questions about ${manifest.domain || "this topic"}, do I need to know the ${slot.name}?`;
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: testQuery }]
      });
      const evaluation = await grader.grade(
        `Slot: ${slot.name}
Description: ${slot.description || "N/A"}
KB Response: ${response.message?.content}`,
        "slot relevance",
        "Score 100 if the slot seems relevant to the KB content, 0 if completely irrelevant."
      );
      if (evaluation.score >= 60) {
        correct.push(slot.name);
      } else {
        incorrect.push(slot.name);
      }
    } catch {
      // On error, count the slot as correct (see NOTE above).
      correct.push(slot.name);
    }
  }
  const score = normalizeScore(correct.length / slots.length * 100);
  return {
    score,
    evidence: [
      {
        criterion: "slotAccuracy",
        score,
        reasoning: `${correct.length}/${slots.length} slots appear relevant to the KB content.`,
        examples: incorrect
      }
    ],
    correct,
    incorrect
  };
}
380
/**
 * Test whether the assistant correctly treats queries as in- or out-of-scope.
 * Test cases come from the manifest scope's inScopeExamples / outOfScopeExamples
 * plus any groundTruth entries carrying shouldBeInScope. Each case is sent to
 * the live assistant and the response is LLM-graded for "answerability";
 * a grade >= 50 means the KB treated the query as in-scope.
 *
 * Fallbacks: score 60 when the manifest has no scope, 70 when a scope exists
 * but yields no test cases.
 *
 * NOTE: a failed chat/grade call counts as an incorrect classification
 * (it never increments correctClassifications).
 */
async function evaluateScopePrecision(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const scope = manifest.scope;
  if (!scope) {
    return {
      score: 60,
      // Penalize but don't fail
      evidence: [
        {
          criterion: "scopePrecision",
          score: 60,
          reasoning: "No scope definition in manifest. Consider defining in-scope and out-of-scope examples."
        }
      ],
      correctClassifications: 0,
      totalClassifications: 0,
      misclassified: []
    };
  }
  const testCases = [];
  if (scope.inScopeExamples) {
    scope.inScopeExamples.forEach((q) => testCases.push({ query: q, expectedInScope: true }));
  }
  if (scope.outOfScopeExamples) {
    scope.outOfScopeExamples.forEach((q) => testCases.push({ query: q, expectedInScope: false }));
  }
  if (config.groundTruth) {
    config.groundTruth.filter((gt) => gt.shouldBeInScope !== void 0).forEach((gt) => testCases.push({ query: gt.query, expectedInScope: gt.shouldBeInScope }));
  }
  if (testCases.length === 0) {
    return {
      score: 70,
      evidence: [
        {
          criterion: "scopePrecision",
          score: 70,
          reasoning: "No scope test cases available. Add in-scope and out-of-scope examples to test."
        }
      ],
      correctClassifications: 0,
      totalClassifications: 0,
      misclassified: []
    };
  }
  let correctClassifications = 0;
  const misclassified = [];
  // Cap live calls at 10 cases to bound cost and latency.
  const evaluatedCases = testCases.slice(0, 10);
  for (const testCase of evaluatedCases) {
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: testCase.query }]
      });
      const evaluation = await grader.grade(
        `Query: ${testCase.query}
KB Response: ${response.message?.content}`,
        "answerability",
        "Score 100 if the KB provided a substantive, on-topic answer. Score 0 if it said it cannot help or gave an off-topic response."
      );
      const actuallyInScope = evaluation.score >= 50;
      if (actuallyInScope === testCase.expectedInScope) {
        correctClassifications++;
      } else {
        misclassified.push(`"${testCase.query}" (expected ${testCase.expectedInScope ? "in-scope" : "out-of-scope"})`);
      }
    } catch {
      // Errors fall through as incorrect classifications.
    }
  }
  // BUG FIX: score against the cases actually evaluated. Previously this
  // divided by testCases.length while only the first 10 were run, so any
  // suite with more than 10 cases was penalized for cases never executed.
  const score = normalizeScore(correctClassifications / evaluatedCases.length * 100);
  return {
    score,
    evidence: [
      {
        criterion: "scopePrecision",
        score,
        reasoning: `${correctClassifications}/${evaluatedCases.length} scope classifications were correct.`,
        examples: misclassified.slice(0, 3)
      }
    ],
    correctClassifications,
    totalClassifications: evaluatedCases.length,
    misclassified
  };
}
462
/**
 * Probe whether the assistant can actually deliver the capabilities the
 * manifest claims. Each capability (string or {text}) is turned into a
 * "Can you help me with: ..." query, and the response is LLM-graded for
 * "capability fulfillment" (score >= 60 counts as matched).
 * Returns a fixed 50 when the manifest declares no capabilities.
 *
 * NOTE: a failed chat/grade call counts the capability as matched
 * (benefit of the doubt), mirroring evaluateSlotAccuracy.
 */
async function evaluateCapabilityMatch(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const capabilities = (manifest.capabilities || []).map(
    (c) => typeof c === "string" ? c : c.text
  );
  if (capabilities.length === 0) {
    return {
      score: 50,
      evidence: [
        {
          criterion: "capabilityMatch",
          score: 50,
          reasoning: "No capabilities defined in manifest."
        }
      ],
      matched: [],
      mismatched: []
    };
  }
  const matched = [];
  const mismatched = [];
  // Cap live probes at 5 capabilities to bound cost and latency.
  const testedCapabilities = capabilities.slice(0, 5);
  for (const capability of testedCapabilities) {
    const testQuery = `Can you help me with: ${capability}`;
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: testQuery }]
      });
      const evaluation = await grader.grade(
        `Capability: ${capability}
KB Response: ${response.message?.content}`,
        "capability fulfillment",
        "Score 100 if the KB demonstrated it can help with this capability. Score 0 if it cannot."
      );
      if (evaluation.score >= 60) {
        matched.push(capability);
      } else {
        mismatched.push(capability);
      }
    } catch {
      // On error, count the capability as matched (see NOTE above).
      matched.push(capability);
    }
  }
  // BUG FIX: score against the capabilities actually probed. Previously this
  // divided by capabilities.length while only the first 5 were tested, so a
  // manifest with more than 5 capabilities could never reach 100 even when
  // every probe succeeded.
  const score = normalizeScore(matched.length / testedCapabilities.length * 100);
  return {
    score,
    evidence: [
      {
        criterion: "capabilityMatch",
        score,
        reasoning: `${matched.length}/${testedCapabilities.length} stated capabilities match actual KB content.`,
        examples: mismatched
      }
    ],
    matched,
    mismatched
  };
}
519
/**
 * Pull a coarse entity list out of a manifest: slot example values (as-is),
 * the domain string (as-is), and capitalized words longer than 3 characters
 * taken from capability text (lowercased). Deduplicated via a Set.
 */
function extractEntitiesFromManifest(manifest) {
  const entities = new Set();
  const slots = manifest.slots || [];
  for (const slot of slots) {
    for (const example of slot.examples || []) {
      entities.add(example);
    }
  }
  if (manifest.domain) {
    entities.add(manifest.domain);
  }
  const capabilities = manifest.capabilities || [];
  for (const capability of capabilities) {
    const text = typeof capability === "string" ? capability : capability.text;
    for (const word of text.split(/\s+/)) {
      // Keep likely proper nouns only: >3 chars and leading capital.
      if (word.length > 3 && /^[A-Z]/.test(word)) {
        entities.add(word.toLowerCase());
      }
    }
  }
  return Array.from(entities);
}
540
/**
 * Ask the grader to name the entities mentioned in `text`, then parse its
 * comma-separated `reasoning` field into trimmed, lowercased entity strings
 * of at least 3 characters. Returns [] when the grading call fails
 * (best effort — never throws).
 */
async function extractEntitiesWithLLM(grader, text) {
  try {
    const result = await grader.grade(
      text,
      "entity extraction",
      "List the main entities (products, topics, categories) mentioned. Return just the entity names separated by commas."
    );
    const names = result.reasoning.split(",");
    return names.map((name) => name.trim().toLowerCase()).filter((name) => name.length > 2);
  } catch {
    return [];
  }
}
552
/**
 * Evaluate retrieval quality for a Pinecone assistant over a set of test
 * queries. Each query's retrieved snippets are graded for relevance, and
 * topic keyword checks yield recall (expected topics found), precision
 * (share of chunks graded >= 50), and noiseRatio (irrelevant topics that
 * leaked in; higher is worse — it is inverted when aggregated).
 * Pass requires overall >= 70 AND noiseRatio <= 30.
 *
 * @param config - { assistantName, queries, pineconeApiKey?, openaiApiKey?, topK?, verbose? }
 * @throws {Error} on empty query list or missing Pinecone API key
 */
async function evaluateRetrieval(config) {
  const startTime = Date.now();
  if (config.queries.length === 0) {
    throw new Error("At least one query is required for retrieval eval");
  }
  const apiKey = config.pineconeApiKey || process.env.PINECONE_API_KEY;
  if (!apiKey) {
    throw new Error("PINECONE_API_KEY is required for retrieval eval");
  }
  const pinecone = new Pinecone({ apiKey });
  const assistant = pinecone.assistant(config.assistantName);
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const topK = config.topK || 5;
  const queryResults = [];
  // Queries run sequentially; each one triggers per-snippet LLM grading.
  for (const testQuery of config.queries) {
    const result = await evaluateQueryRetrieval(
      assistant,
      grader,
      testQuery,
      topK,
      config.verbose
    );
    queryResults.push(result);
  }
  const relevanceScores = queryResults.map((r) => r.relevanceScore);
  const relevance = normalizeScore(average(relevanceScores));
  // Recall: expected topics found vs. all expected topics across queries.
  const totalExpected = queryResults.reduce(
    (sum, r) => sum + r.foundTopics.length + r.missingTopics.length,
    0
  );
  const totalFound = queryResults.reduce((sum, r) => sum + r.foundTopics.length, 0);
  const recall = totalExpected === 0 ? 100 : normalizeScore(totalFound / totalExpected * 100);
  // Precision: share of retrieved chunks whose relevance grade is >= 50.
  const allChunks = queryResults.flatMap((r) => r.chunks);
  const relevantChunks = allChunks.filter((c) => c.relevanceGrade >= 50);
  const precision = allChunks.length === 0 ? 100 : normalizeScore(relevantChunks.length / allChunks.length * 100);
  // Noise: declared-irrelevant topics that nevertheless appeared in results.
  const totalIrrelevant = queryResults.reduce(
    (sum, r) => sum + r.noiseTopics.length,
    0
  );
  const totalIrrelevantExpected = config.queries.reduce(
    (sum, q) => sum + (q.irrelevantTopics?.length || 0),
    0
  );
  const noiseRatio = totalIrrelevantExpected === 0 ? 0 : normalizeScore(totalIrrelevant / totalIrrelevantExpected * 100);
  const scores = { relevance, recall, precision, noiseRatio };
  const overallScore = calculateWeightedScore([
    { score: relevance, weight: 0.35 },
    { score: recall, weight: 0.3 },
    { score: precision, weight: 0.25 },
    { score: 100 - noiseRatio, weight: 0.1 }
    // Invert noise ratio
  ]);
  const evidence = [
    {
      criterion: "relevance",
      score: relevance,
      reasoning: `Average relevance of retrieved chunks across ${queryResults.length} queries.`
    },
    {
      criterion: "recall",
      score: recall,
      reasoning: `Found ${totalFound}/${totalExpected} expected topics in retrieved content.`,
      examples: queryResults.flatMap((r) => r.missingTopics).slice(0, 3)
    },
    {
      criterion: "precision",
      score: precision,
      reasoning: `${relevantChunks.length}/${allChunks.length} retrieved chunks were relevant.`
    },
    {
      criterion: "noiseRatio",
      score: noiseRatio,
      reasoning: `${totalIrrelevant} irrelevant topics appeared in retrieved content.`,
      examples: queryResults.flatMap((r) => r.noiseTopics).slice(0, 3)
    }
  ];
  return {
    passed: overallScore >= 70 && noiseRatio <= 30,
    overallScore,
    scores,
    evidence,
    // The summary inverts noiseRatio so that "higher is better" holds for all metrics.
    summary: generateSummary({ relevance, recall, precision, noiseRatio: 100 - noiseRatio }),
    duration: Date.now() - startTime,
    queryResults
  };
}
638
/**
 * Retrieve context for one test query and grade it.
 * Fetches up to `topK` snippets via assistant.context, LLM-grades each
 * snippet's relevance to the query (sequentially), then keyword-checks the
 * combined lowercased content for expected / irrelevant topics.
 * A failed context call yields a zero-score result with every expected
 * topic marked missing (the error itself is discarded).
 */
async function evaluateQueryRetrieval(assistant, grader, testQuery, topK, verbose) {
  let contextResult;
  try {
    contextResult = await assistant.context({
      query: testQuery.query,
      topK
    });
  } catch (error) {
    // Retrieval failure: report an empty, zero-relevance result.
    return {
      query: testQuery.query,
      chunks: [],
      relevanceScore: 0,
      foundTopics: [],
      missingTopics: testQuery.expectedTopics || [],
      noiseTopics: []
    };
  }
  const snippets = contextResult.snippets || [];
  const chunks = [];
  // One LLM relevance grade per snippet, in order.
  for (const snippet of snippets) {
    const relevanceResult = await grader.gradeRelevance(testQuery.query, snippet.content);
    const ref = snippet.reference;
    const sourceFile = ref?.file?.name || ref?.name || "unknown";
    chunks.push({
      content: snippet.content.slice(0, 500),
      // Truncate for storage
      score: snippet.score,
      sourceFile,
      relevanceGrade: relevanceResult.score,
      reasoning: relevanceResult.reasoning
    });
  }
  const relevanceScore = chunks.length === 0 ? 0 : average(chunks.map((c) => c.relevanceGrade));
  // Topic checks run against the truncated chunk text, joined and lowercased.
  const allContent = chunks.map((c) => c.content).join(" ").toLowerCase();
  const topicResult = checkTopics(
    allContent,
    testQuery.expectedTopics || [],
    testQuery.irrelevantTopics || []
  );
  if (verbose) {
    console.log(`Query: "${testQuery.query}"`);
    console.log(` Relevance: ${relevanceScore.toFixed(1)}/100`);
    console.log(` Topics found: ${topicResult.found.join(", ") || "none"}`);
    console.log(` Topics missing: ${topicResult.missing.join(", ") || "none"}`);
    console.log(` Noise: ${topicResult.noise.join(", ") || "none"}`);
  }
  return {
    query: testQuery.query,
    chunks,
    relevanceScore,
    foundTopics: topicResult.found,
    missingTopics: topicResult.missing,
    noiseTopics: topicResult.noise
  };
}
693
/**
 * Partition topic keywords by substring presence in `content`.
 * Topics are lowercased before matching; `content` is expected to already
 * be lowercased by the caller. Original topic casing is preserved in the
 * returned lists.
 * @returns {{found: string[], missing: string[], noise: string[]}}
 */
function checkTopics(content, expectedTopics, irrelevantTopics) {
  const found = [];
  const missing = [];
  for (const topic of expectedTopics) {
    const bucket = content.includes(topic.toLowerCase()) ? found : missing;
    bucket.push(topic);
  }
  const noise = irrelevantTopics.filter((topic) => content.includes(topic.toLowerCase()));
  return { found, missing, noise };
}
711
+
712
+ // src/agent/index.ts
713
// src/agent/index.ts
/**
 * Run every configured scenario against the agent under test and aggregate
 * into four scores: accuracy (share of passed scenarios) plus LLM-graded
 * relevance, completeness, and helpfulness averaged over scenarios that
 * produced grading evidence. Pass threshold is an overall score of 70.
 *
 * @param config - { scenarios, agentEndpoint, openaiApiKey?, graderConfig?, maxTurns?, timeout? }
 * @throws {Error} when no scenarios are configured
 */
async function evaluateAgent(config) {
  const startTime = Date.now();
  if (config.scenarios.length === 0) {
    throw new Error("At least one scenario is required for agent eval");
  }
  const grader = createLLMGrader({
    openaiApiKey: config.openaiApiKey,
    model: config.graderConfig?.model,
    temperature: config.graderConfig?.temperature
  });
  // Scenarios run sequentially so agent-side state/session handling is not raced.
  const scenarioResults = [];
  for (const scenario of config.scenarios) {
    const result = await runScenario(scenario, config, grader);
    scenarioResults.push(result);
  }
  const passedScenarios = scenarioResults.filter((r) => r.passed);
  const accuracy = normalizeScore(passedScenarios.length / scenarioResults.length * 100);
  // Only scenarios that produced any evidence contribute to the graded averages.
  const gradedResults = scenarioResults.filter((r) => r.evaluation.evidence.length > 0);
  const relevanceScores = gradedResults.flatMap(
    (r) => r.evaluation.evidence.filter((e) => e.criterion === "relevance").map((e) => e.score)
  );
  const completenessScores = gradedResults.flatMap(
    (r) => r.evaluation.evidence.filter((e) => e.criterion === "completeness").map((e) => e.score)
  );
  const helpfulnessScores = gradedResults.flatMap(
    (r) => r.evaluation.evidence.filter((e) => e.criterion === "helpfulness").map((e) => e.score)
  );
  // Missing evidence for a criterion scores 0 (it drags the weighted overall down).
  const relevance = relevanceScores.length > 0 ? normalizeScore(average(relevanceScores)) : 0;
  const completeness = completenessScores.length > 0 ? normalizeScore(average(completenessScores)) : 0;
  const helpfulness = helpfulnessScores.length > 0 ? normalizeScore(average(helpfulnessScores)) : 0;
  const scores = { accuracy, relevance, completeness, helpfulness };
  // Weights sum to 1.0; accuracy carries the largest weight.
  const overallScore = calculateWeightedScore([
    { score: accuracy, weight: 0.3 },
    { score: relevance, weight: 0.25 },
    { score: completeness, weight: 0.25 },
    { score: helpfulness, weight: 0.2 }
  ]);
  const evidence = [
    {
      criterion: "accuracy",
      score: accuracy,
      reasoning: `${passedScenarios.length}/${scenarioResults.length} scenarios passed.`,
      examples: scenarioResults.filter((r) => !r.passed).map((r) => r.scenario.name).slice(0, 3)
    },
    {
      criterion: "relevance",
      score: relevance,
      reasoning: `Average answer relevance across ${gradedResults.length} graded scenarios.`
    },
    {
      criterion: "completeness",
      score: completeness,
      reasoning: `Average answer completeness across ${gradedResults.length} graded scenarios.`
    },
    {
      criterion: "helpfulness",
      score: helpfulness,
      reasoning: `Average helpfulness across ${gradedResults.length} graded scenarios.`
    }
  ];
  return {
    passed: overallScore >= 70,
    overallScore,
    scores,
    evidence,
    summary: generateSummary(scores),
    duration: Date.now() - startTime,
    scenarioResults
  };
}
783
/**
 * Drive one multi-turn conversation scenario against the agent.
 * Starting from scenario.initialQuery, each turn calls the agent, records
 * the exchange, and threads context (previousContext/previousIntent plus a
 * growing conversationHistory) into the next call. The loop ends on an
 * "answer", "blocked", or "out_of_scope" outcome, on a "follow_up" with no
 * matching canned reply, or when maxTurns is exhausted.
 *
 * NOTE(review): an unrecognized outcome value re-sends the SAME message each
 * turn until maxTurns — confirm that is intended for unknown outcomes.
 *
 * Never throws: any error is converted into a failed result carrying an
 * "error" evidence entry and the error message.
 */
async function runScenario(scenario, config, grader) {
  const startTime = Date.now();
  // Per-scenario maxTurns wins over the global config; default 5.
  const maxTurns = scenario.maxTurns || config.maxTurns || 5;
  const timeout = config.timeout || 6e4;
  const conversation = [];
  let currentMessage = scenario.initialQuery;
  let context = {
    sessionId: `eval_${Date.now()}`,
    conversationHistory: []
  };
  let lastResponse = null;
  let turn = 0;
  try {
    while (turn < maxTurns) {
      turn++;
      const response = await callAgent(config.agentEndpoint, currentMessage, context, timeout);
      lastResponse = response;
      conversation.push({
        turn,
        userMessage: currentMessage,
        agentResponse: response
      });
      // Thread the agent's context/intent and the full exchange into the next turn.
      context = {
        ...context,
        previousContext: response.context,
        previousIntent: response.intent,
        conversationHistory: [
          ...context.conversationHistory || [],
          { role: "user", content: currentMessage },
          { role: "assistant", content: response.answer || response.followUpQuestion || "" }
        ]
      };
      if (response.outcome === "answer") {
        break;
      }
      if (response.outcome === "blocked" || response.outcome === "out_of_scope") {
        break;
      }
      if (response.outcome === "follow_up") {
        const followUpQuestion = response.followUpQuestion || "";
        const responseToFollowUp = generateFollowUpResponse(followUpQuestion, scenario);
        if (!responseToFollowUp) {
          // No canned reply matches the agent's question: stop the scenario.
          break;
        }
        currentMessage = responseToFollowUp;
      }
    }
    const evaluation = await evaluateScenarioResult(
      scenario,
      lastResponse,
      conversation,
      grader
    );
    // Passing requires BOTH the expected outcome (when declared) and a passing grade.
    const outcomeMatch = scenario.expectedOutcome ? lastResponse?.outcome === scenario.expectedOutcome : true;
    const passed = outcomeMatch && evaluation.passed;
    return {
      scenario,
      passed,
      turns: turn,
      finalOutcome: lastResponse?.outcome || "error",
      finalAnswer: lastResponse?.outcome === "answer" ? lastResponse.answer || null : null,
      evaluation,
      conversation,
      duration: Date.now() - startTime
    };
  } catch (error) {
    return {
      scenario,
      passed: false,
      turns: turn,
      finalOutcome: "error",
      finalAnswer: null,
      evaluation: {
        passed: false,
        score: 0,
        evidence: [
          {
            criterion: "error",
            score: 0,
            reasoning: error instanceof Error ? error.message : String(error)
          }
        ]
      },
      conversation,
      duration: Date.now() - startTime,
      error: error instanceof Error ? error.message : String(error)
    };
  }
}
872
/**
 * Send one user message to the agent under test.
 * `endpoint` may be a function (called directly, in-process) or an HTTP URL
 * (POSTed a snake_case JSON payload). HTTP replies are normalized into the
 * internal response shape, accepting both camelCase and snake_case fields.
 *
 * NOTE(review): the AbortController timeout only guards the HTTP path;
 * function endpoints are not time-limited — confirm that is acceptable.
 *
 * @throws {Error} on a non-ok HTTP status, or an AbortError on timeout.
 */
async function callAgent(endpoint, message, context, timeout) {
  if (typeof endpoint === "function") {
    return endpoint(message, context);
  }
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    const response = await fetch(endpoint, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        message,
        session_id: context.sessionId,
        previous_context: context.previousContext,
        previous_intent: context.previousIntent,
        conversation_history: context.conversationHistory
      }),
      signal: controller.signal
    });
    if (!response.ok) {
      throw new Error(`Agent returned ${response.status}: ${response.statusText}`);
    }
    const data = await response.json();
    // Normalize: tolerate snake_case or camelCase keys; default outcome is "answer".
    return {
      outcome: data.outcome || "answer",
      answer: data.answer,
      followUpQuestion: data.followUpQuestion || data.follow_up_question,
      options: data.options,
      context: data.context,
      intent: data.intent,
      trace: data.trace,
      sessionId: data.session_id || data.sessionId
    };
  } finally {
    // Always clear the timer so the process can exit promptly.
    clearTimeout(timeoutId);
  }
}
909
/**
 * Picks a canned user reply for an agent follow-up question.
 *
 * Scenario authors map substring patterns to responses in
 * `scenario.followUpResponses`; the first pattern found (case-insensitively)
 * inside the question wins. Returns null when nothing matches or the
 * scenario defines no follow-up responses.
 *
 * @param {string} question - Follow-up question produced by the agent.
 * @param {object} scenario - Scenario definition; may carry `followUpResponses`.
 * @returns {string|null} The matched canned response, or null.
 */
function generateFollowUpResponse(question, scenario) {
  if (!scenario.followUpResponses) {
    return null;
  }
  // Lowercase once instead of on every pattern comparison (loop-invariant).
  const normalizedQuestion = question.toLowerCase();
  for (const [pattern, response] of Object.entries(scenario.followUpResponses)) {
    if (normalizedQuestion.includes(pattern.toLowerCase())) {
      return response;
    }
  }
  return null;
}
920
/**
 * Scores a completed scenario run.
 *
 * Combines deterministic checks (mustContain / mustNotContain term matching)
 * with three LLM-graded criteria (relevance, completeness, helpfulness) into
 * an averaged 0-100 score; the scenario passes at >= 70.
 *
 * When the agent produced no usable answer, the result is pass/fail purely on
 * whether the outcome matched `scenario.expectedOutcome` (if one was set).
 *
 * Fixes vs. previous version: empty mustContain/mustNotContain arrays no
 * longer produce a 0/0 NaN score (guarded with `?.length`), and the three
 * independent LLM grading calls run in parallel via Promise.all.
 *
 * @param {object} scenario - Scenario definition (query, evaluation criteria, rubric).
 * @param {object|null} response - Final agent response, possibly null on error.
 * @param {Array} conversation - Full turn history (unused here; kept for interface parity).
 * @param {object} grader - LLM grader exposing async `grade(content, criterion, rubric)`.
 * @returns {Promise<{passed: boolean, score: number, evidence: Array}>}
 */
async function evaluateScenarioResult(scenario, response, conversation, grader) {
  const evidence = [];
  // No usable answer: grade purely on the outcome.
  if (!response || response.outcome !== "answer" || !response.answer) {
    if (scenario.expectedOutcome && scenario.expectedOutcome !== "answer") {
      const matched = response?.outcome === scenario.expectedOutcome;
      return {
        passed: matched,
        score: matched ? 100 : 0,
        evidence: [
          {
            criterion: "outcomeMatch",
            score: matched ? 100 : 0,
            reasoning: `Expected ${scenario.expectedOutcome}, got ${response?.outcome || "no response"}.`
          }
        ]
      };
    }
    return {
      passed: false,
      score: 0,
      evidence: [
        {
          criterion: "noAnswer",
          score: 0,
          reasoning: `Expected an answer but got ${response?.outcome || "no response"}.`
        }
      ]
    };
  }
  const answer = response.answer;
  const answerLower = answer.toLowerCase();
  let totalScore = 0;
  let criteriaCount = 0;
  // Deterministic term checks. The `?.length` guard skips empty arrays,
  // which previously fell through to a 0/0 division yielding NaN.
  if (scenario.evaluation.mustContain?.length) {
    const required = scenario.evaluation.mustContain;
    const missing = required.filter((s) => !answerLower.includes(s.toLowerCase()));
    const foundCount = required.length - missing.length;
    const score = normalizeScore(foundCount / required.length * 100);
    totalScore += score;
    criteriaCount++;
    evidence.push({
      criterion: "mustContain",
      score,
      reasoning: `Found ${foundCount}/${required.length} required terms.`,
      examples: missing
    });
  }
  if (scenario.evaluation.mustNotContain?.length) {
    const forbidden = scenario.evaluation.mustNotContain;
    const found = forbidden.filter((s) => answerLower.includes(s.toLowerCase()));
    const score = normalizeScore((forbidden.length - found.length) / forbidden.length * 100);
    totalScore += score;
    criteriaCount++;
    evidence.push({
      criterion: "mustNotContain",
      score,
      reasoning: `Found ${found.length} forbidden terms.`,
      examples: found
    });
  }
  const gradingPrompt = `
Query: ${scenario.initialQuery}
Answer: ${answer}
${scenario.evaluation.rubric ? `Rubric: ${scenario.evaluation.rubric}` : ""}
`;
  // The three LLM criteria are independent, so grade them in parallel
  // rather than awaiting each call sequentially. Evidence order is kept
  // identical to the original (relevance, completeness, helpfulness).
  const criteria = [
    ["relevance", "How relevant is the answer to the query? 100 = directly and completely addresses the query."],
    ["completeness", "How complete is the answer? 100 = fully addresses all aspects of the query."],
    ["helpfulness", "How helpful and actionable is the answer? 100 = provides clear, actionable guidance."]
  ];
  const graded = await Promise.all(
    criteria.map(([criterion, rubric]) => grader.grade(gradingPrompt, criterion, rubric))
  );
  criteria.forEach(([criterion], i) => {
    evidence.push({
      criterion,
      score: graded[i].score,
      reasoning: graded[i].reasoning
    });
    totalScore += graded[i].score;
    criteriaCount++;
  });
  const avgScore = criteriaCount > 0 ? totalScore / criteriaCount : 0;
  return {
    passed: avgScore >= 70,
    score: normalizeScore(avgScore),
    evidence
  };
}
1029
+
1030
+ export { average, calculatePercentage, calculateWeightedScore, createLLMGrader, evaluateAgent, evaluateIntrospection, evaluateRetrieval, formatConsoleReport, formatJsonReport, gradeWithLLM };
1031
+ //# sourceMappingURL=index.js.map
1032
+ //# sourceMappingURL=index.js.map