@kat-ai/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +114 -0
- package/dist/agent/index.cjs +468 -0
- package/dist/agent/index.cjs.map +1 -0
- package/dist/agent/index.d.cts +170 -0
- package/dist/agent/index.d.ts +170 -0
- package/dist/agent/index.js +466 -0
- package/dist/agent/index.js.map +1 -0
- package/dist/index.cjs +1043 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +93 -0
- package/dist/index.d.ts +93 -0
- package/dist/index.js +1032 -0
- package/dist/index.js.map +1 -0
- package/dist/introspection/index.cjs +476 -0
- package/dist/introspection/index.cjs.map +1 -0
- package/dist/introspection/index.d.cts +107 -0
- package/dist/introspection/index.d.ts +107 -0
- package/dist/introspection/index.js +474 -0
- package/dist/introspection/index.js.map +1 -0
- package/dist/retrieval/index.cjs +312 -0
- package/dist/retrieval/index.cjs.map +1 -0
- package/dist/retrieval/index.d.cts +98 -0
- package/dist/retrieval/index.d.ts +98 -0
- package/dist/retrieval/index.js +310 -0
- package/dist/retrieval/index.js.map +1 -0
- package/dist/types-BJjlqNhg.d.cts +112 -0
- package/dist/types-BJjlqNhg.d.ts +112 -0
- package/package.json +79 -0
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var pinecone = require('@pinecone-database/pinecone');
|
|
4
|
+
var openai = require('@ai-sdk/openai');
|
|
5
|
+
var ai = require('ai');
|
|
6
|
+
var zod = require('zod');
|
|
7
|
+
var core = require('@kat/core');
|
|
8
|
+
|
|
9
|
+
// src/retrieval/index.ts
|
|
10
|
+
// Zod schema for a single-criterion grade returned by the LLM:
// a 0-100 score, a reasoning string, and optional supporting examples.
var GradeSchema = zod.z.object({
  score: zod.z.number().min(0).max(100).describe("Score from 0-100"),
  reasoning: zod.z.string().describe("Explanation for the score"),
  examples: zod.z.array(zod.z.string()).optional().describe("Specific examples that influenced the score")
});
// Zod schema for multi-criterion grading: one { criterion, score, reasoning }
// entry per criterion, plus a free-text overall reasoning field.
var MultiCriteriaGradeSchema = zod.z.object({
  scores: zod.z.array(zod.z.object({
    criterion: zod.z.string(),
    score: zod.z.number().min(0).max(100),
    reasoning: zod.z.string()
  })),
  overallReasoning: zod.z.string()
});
|
|
23
|
+
/**
 * Create an LLM-backed grader.
 *
 * Resolves the OpenAI API key from `config.openaiApiKey` or the
 * OPENAI_API_KEY environment variable and throws if neither is set.
 * Returns an object with three async grading methods, each of which
 * calls `ai.generateObject` with a zod schema so the model's reply is
 * structured and validated.
 */
function createLLMGrader(config = {}) {
  const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error("OPENAI_API_KEY is required for LLM grading");
  }
  // Provider instance; invoking it with a model id yields a model handle.
  const openai$1 = openai.createOpenAI({ apiKey });
  const model = config.model || core.resolveDefaultOpenAiChatModelId();
  // `??` (not `||`) so an explicit temperature of 0 is honored;
  // the low 0.1 default keeps grading near-deterministic.
  const temperature = config.temperature ?? 0.1;
  return {
    // Grade `content` on a single named criterion, with an optional
    // free-text rubric. Resolves to { score, reasoning, examples? }
    // as defined by GradeSchema.
    async grade(content, criterion, rubric) {
      const prompt = `You are an expert evaluator. Grade the following content on the criterion "${criterion}".

${rubric ? `Rubric: ${rubric}
` : ""}
Content to evaluate:
"""
${content}
"""

Provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      return result.object;
    },
    // Grade `content` on several weighted criteria in one LLM call.
    // Resolves to an array of { criterion, score, reasoning };
    // the model's `overallReasoning` field is dropped here.
    async gradeMultiple(content, criteria) {
      const criteriaDescription = criteria.map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`).join("\n");
      const prompt = `You are an expert evaluator. Grade the following content on multiple criteria.

Criteria:
${criteriaDescription}

Content to evaluate:
"""
${content}
"""

For each criterion, provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: MultiCriteriaGradeSchema,
        prompt,
        temperature
      });
      return result.object.scores.map((s) => ({
        criterion: s.criterion,
        score: s.score,
        reasoning: s.reasoning
      }));
    },
    // Grade how relevant `content` is to `query` (0 = irrelevant,
    // 100 = directly answers it). Resolves to { score, reasoning }.
    async gradeRelevance(query, content) {
      const prompt = `You are an expert evaluator. Grade how relevant the following content is to the given query.

Query: "${query}"

Content:
"""
${content}
"""

A score of 100 means the content directly and completely answers the query.
A score of 0 means the content is completely irrelevant.

Provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      return {
        score: result.object.score,
        reasoning: result.object.reasoning
      };
    }
  };
}
|
|
103
|
+
|
|
104
|
+
// src/utils/metrics.ts
|
|
105
|
+
/**
 * Weighted average of { score, weight } entries, rounded to the nearest
 * integer. Returns 0 for an empty list or when the weights sum to zero
 * (avoiding a division by zero).
 */
function calculateWeightedScore(scores) {
  if (scores.length === 0) return 0;
  let weightTotal = 0;
  let weightedSum = 0;
  for (const entry of scores) {
    weightTotal += entry.weight;
    weightedSum += entry.score * entry.weight;
  }
  if (weightTotal === 0) return 0;
  return Math.round(weightedSum / weightTotal);
}
|
|
112
|
+
/**
 * Arithmetic mean of an array of numbers; 0 for an empty array.
 */
function average(numbers) {
  if (!numbers.length) return 0;
  let total = 0;
  for (const value of numbers) {
    total += value;
  }
  return total / numbers.length;
}
|
|
116
|
+
/**
 * Restrict `value` to the inclusive range [min, max].
 */
function clamp(value, min, max) {
  const upperBounded = Math.min(max, value);
  return Math.max(min, upperBounded);
}
|
|
119
|
+
/**
 * Round a raw score and clamp it into the canonical 0-100 range.
 * (The clamp is written inline; it is equivalent to clamp(x, 0, 100).)
 */
function normalizeScore(score) {
  const rounded = Math.round(score);
  return Math.max(0, Math.min(100, rounded));
}
|
|
122
|
+
|
|
123
|
+
// src/utils/reporters.ts
|
|
124
|
+
/**
 * Format a camelCase score key for display ("noiseRatio" -> "Noise Ratio").
 */
function formatScoreName(name) {
  return name.replace(/([A-Z])/g, " $1").replace(/^./, (str) => str.toUpperCase()).trim();
}
/**
 * Build a one-line prose summary of a score map.
 *
 * Classifies the average score against the `good` / `acceptable`
 * thresholds, then appends the metrics that scored well and the ones
 * below the acceptable bar. Returns sentences joined by ". " and
 * terminated with ".".
 */
function generateSummary(scores, thresholds = { good: 80, acceptable: 60 }) {
  const entries = Object.entries(scores);
  // Guard: an empty score map previously produced 0/0 -> NaN, so every
  // threshold comparison failed and the summary misleadingly claimed
  // "Performance below acceptable thresholds". Report the absence of
  // data explicitly instead.
  if (entries.length === 0) {
    return "No scores available.";
  }
  const avgScore = entries.reduce((sum, [, score]) => sum + score, 0) / entries.length;
  const goodMetrics = entries.filter(([, score]) => score >= thresholds.good).map(([name]) => name);
  const poorMetrics = entries.filter(([, score]) => score < thresholds.acceptable).map(([name]) => name);
  const parts = [];
  if (avgScore >= thresholds.good) {
    parts.push("Strong overall performance");
  } else if (avgScore >= thresholds.acceptable) {
    parts.push("Acceptable performance with room for improvement");
  } else {
    parts.push("Performance below acceptable thresholds");
  }
  if (goodMetrics.length > 0) {
    parts.push(`Strong: ${goodMetrics.map(formatScoreName).join(", ")}`);
  }
  if (poorMetrics.length > 0) {
    parts.push(`Needs improvement: ${poorMetrics.map(formatScoreName).join(", ")}`);
  }
  return parts.join(". ") + ".";
}
|
|
148
|
+
|
|
149
|
+
// src/retrieval/index.ts
|
|
150
|
+
/**
 * Evaluate retrieval quality for a Pinecone Assistant.
 *
 * For each test query, fetches context snippets from the assistant and
 * grades them with an LLM, then aggregates four metrics (all 0-100):
 *  - relevance: average LLM relevance grade of retrieved chunks
 *  - recall: share of expected topics found in retrieved content
 *  - precision: share of chunks with relevance grade >= 50
 *  - noiseRatio: share of flagged irrelevant topics that appeared
 *    (lower is better; inverted wherever "higher is better" is needed)
 *
 * Requires PINECONE_API_KEY (or config.pineconeApiKey) and, via the
 * grader, OPENAI_API_KEY. Throws if `config.queries` is empty or a key
 * is missing. Resolves to a result object with pass/fail, per-metric
 * scores, evidence entries, a prose summary, duration, and the raw
 * per-query results.
 */
async function evaluateRetrieval(config) {
  const startTime = Date.now();
  if (config.queries.length === 0) {
    throw new Error("At least one query is required for retrieval eval");
  }
  const apiKey = config.pineconeApiKey || process.env.PINECONE_API_KEY;
  if (!apiKey) {
    throw new Error("PINECONE_API_KEY is required for retrieval eval");
  }
  const pinecone$1 = new pinecone.Pinecone({ apiKey });
  const assistant = pinecone$1.assistant(config.assistantName);
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  // NOTE(review): `||` means an explicit topK of 0 falls back to 5 — confirm intended.
  const topK = config.topK || 5;
  // Queries run sequentially; each performs its own retrieval plus LLM grading.
  const queryResults = [];
  for (const testQuery of config.queries) {
    const result = await evaluateQueryRetrieval(
      assistant,
      grader,
      testQuery,
      topK,
      config.verbose
    );
    queryResults.push(result);
  }
  // Relevance: mean of per-query average chunk grades, clamped to 0-100.
  const relevanceScores = queryResults.map((r) => r.relevanceScore);
  const relevance = normalizeScore(average(relevanceScores));
  // Recall: found / (found + missing) expected topics across all queries.
  // Treated as perfect (100) when no expected topics were declared.
  const totalExpected = queryResults.reduce(
    (sum, r) => sum + r.foundTopics.length + r.missingTopics.length,
    0
  );
  const totalFound = queryResults.reduce((sum, r) => sum + r.foundTopics.length, 0);
  const recall = totalExpected === 0 ? 100 : normalizeScore(totalFound / totalExpected * 100);
  // Precision: chunks with an LLM relevance grade of at least 50 count as relevant.
  const allChunks = queryResults.flatMap((r) => r.chunks);
  const relevantChunks = allChunks.filter((c) => c.relevanceGrade >= 50);
  const precision = allChunks.length === 0 ? 100 : normalizeScore(relevantChunks.length / allChunks.length * 100);
  // Noise ratio: of the topics explicitly flagged as irrelevant in the
  // test queries, how many actually showed up in retrieved content.
  // 0 (no noise) when nothing was flagged.
  const totalIrrelevant = queryResults.reduce(
    (sum, r) => sum + r.noiseTopics.length,
    0
  );
  const totalIrrelevantExpected = config.queries.reduce(
    (sum, q) => sum + (q.irrelevantTopics?.length || 0),
    0
  );
  const noiseRatio = totalIrrelevantExpected === 0 ? 0 : normalizeScore(totalIrrelevant / totalIrrelevantExpected * 100);
  const scores = { relevance, recall, precision, noiseRatio };
  // Overall: weighted blend (relevance 35%, recall 30%, precision 25%,
  // inverted noise 10%).
  const overallScore = calculateWeightedScore([
    { score: relevance, weight: 0.35 },
    { score: recall, weight: 0.3 },
    { score: precision, weight: 0.25 },
    { score: 100 - noiseRatio, weight: 0.1 }
    // Invert noise ratio
  ]);
  const evidence = [
    {
      criterion: "relevance",
      score: relevance,
      reasoning: `Average relevance of retrieved chunks across ${queryResults.length} queries.`
    },
    {
      criterion: "recall",
      score: recall,
      reasoning: `Found ${totalFound}/${totalExpected} expected topics in retrieved content.`,
      examples: queryResults.flatMap((r) => r.missingTopics).slice(0, 3)
    },
    {
      criterion: "precision",
      score: precision,
      reasoning: `${relevantChunks.length}/${allChunks.length} retrieved chunks were relevant.`
    },
    {
      criterion: "noiseRatio",
      score: noiseRatio,
      reasoning: `${totalIrrelevant} irrelevant topics appeared in retrieved content.`,
      examples: queryResults.flatMap((r) => r.noiseTopics).slice(0, 3)
    }
  ];
  return {
    // Pass requires a decent blended score AND limited noise.
    passed: overallScore >= 70 && noiseRatio <= 30,
    overallScore,
    scores,
    evidence,
    // Summary receives the inverted noise ratio so all four inputs are
    // "higher is better" for the threshold-based prose.
    summary: generateSummary({ relevance, recall, precision, noiseRatio: 100 - noiseRatio }),
    duration: Date.now() - startTime,
    queryResults
  };
}
|
|
236
|
+
/**
 * Evaluate retrieval for one test query.
 *
 * Fetches up to `topK` context snippets from the Pinecone assistant,
 * grades each snippet's relevance to the query with the LLM grader,
 * and checks which expected / irrelevant topics appear in the combined
 * (lowercased) retrieved text. Returns per-query chunks, an average
 * relevance score, and the found / missing / noise topic lists.
 */
async function evaluateQueryRetrieval(assistant, grader, testQuery, topK, verbose) {
  let contextResult;
  try {
    contextResult = await assistant.context({
      query: testQuery.query,
      topK
    });
  } catch (error) {
    // NOTE(review): the retrieval error is swallowed entirely — the
    // query is scored as a total miss (relevance 0, all expected topics
    // missing) with no logging. Consider surfacing `error` somewhere.
    return {
      query: testQuery.query,
      chunks: [],
      relevanceScore: 0,
      foundTopics: [],
      missingTopics: testQuery.expectedTopics || [],
      noiseTopics: []
    };
  }
  const snippets = contextResult.snippets || [];
  // Grade each snippet sequentially; every iteration is one LLM call.
  const chunks = [];
  for (const snippet of snippets) {
    const relevanceResult = await grader.gradeRelevance(testQuery.query, snippet.content);
    // Reference shape varies; fall back through file name -> name -> "unknown".
    const ref = snippet.reference;
    const sourceFile = ref?.file?.name || ref?.name || "unknown";
    chunks.push({
      content: snippet.content.slice(0, 500),
      // Truncate for storage
      score: snippet.score,
      sourceFile,
      relevanceGrade: relevanceResult.score,
      reasoning: relevanceResult.reasoning
    });
  }
  const relevanceScore = chunks.length === 0 ? 0 : average(chunks.map((c) => c.relevanceGrade));
  // Topic matching is a substring check against the truncated chunk
  // text, lowercased here (checkTopics lowercases the topics too).
  const allContent = chunks.map((c) => c.content).join(" ").toLowerCase();
  const topicResult = checkTopics(
    allContent,
    testQuery.expectedTopics || [],
    testQuery.irrelevantTopics || []
  );
  if (verbose) {
    console.log(`Query: "${testQuery.query}"`);
    console.log(` Relevance: ${relevanceScore.toFixed(1)}/100`);
    console.log(` Topics found: ${topicResult.found.join(", ") || "none"}`);
    console.log(` Topics missing: ${topicResult.missing.join(", ") || "none"}`);
    console.log(` Noise: ${topicResult.noise.join(", ") || "none"}`);
  }
  return {
    query: testQuery.query,
    chunks,
    relevanceScore,
    foundTopics: topicResult.found,
    missingTopics: topicResult.missing,
    noiseTopics: topicResult.noise
  };
}
|
|
291
|
+
/**
 * Partition topic lists by whether each topic occurs as a substring of
 * `content`. Topics are lowercased before matching; `content` is
 * expected to be pre-lowercased by the caller.
 *
 * Returns { found, missing, noise }: expected topics split into
 * found/missing, and irrelevant topics that appeared as noise.
 */
function checkTopics(content, expectedTopics, irrelevantTopics) {
  const found = [];
  const missing = [];
  expectedTopics.forEach((topic) => {
    const bucket = content.includes(topic.toLowerCase()) ? found : missing;
    bucket.push(topic);
  });
  const noise = irrelevantTopics.filter((topic) => content.includes(topic.toLowerCase()));
  return { found, missing, noise };
}
|
|
309
|
+
|
|
310
|
+
// Public API of this bundle: only the top-level evaluator is exported;
// the grader, metric, and reporter helpers above remain module-private.
// NOTE(review): the sourceMappingURL comment appears twice below —
// looks like a bundler artifact; confirm against the build config.
exports.evaluateRetrieval = evaluateRetrieval;
//# sourceMappingURL=index.cjs.map
//# sourceMappingURL=index.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/utils/llm-grader.ts","../../src/utils/metrics.ts","../../src/utils/reporters.ts","../../src/retrieval/index.ts"],"names":["z","openai","createOpenAI","resolveDefaultOpenAiChatModelId","generateObject","pinecone","Pinecone"],"mappings":";;;;;;;;;AAgBA,IAAM,WAAA,GAAcA,MAAE,MAAA,CAAO;AAAA,EAC3B,KAAA,EAAOA,KAAA,CAAE,MAAA,EAAO,CAAE,GAAA,CAAI,CAAC,CAAA,CAAE,GAAA,CAAI,GAAG,CAAA,CAAE,QAAA,CAAS,kBAAkB,CAAA;AAAA,EAC7D,SAAA,EAAWA,KAAA,CAAE,MAAA,EAAO,CAAE,SAAS,2BAA2B,CAAA;AAAA,EAC1D,QAAA,EAAUA,KAAA,CAAE,KAAA,CAAMA,KAAA,CAAE,MAAA,EAAQ,CAAA,CAAE,QAAA,EAAS,CAAE,QAAA,CAAS,6CAA6C;AACjG,CAAC,CAAA;AAED,IAAM,wBAAA,GAA2BA,MAAE,MAAA,CAAO;AAAA,EACxC,MAAA,EAAQA,KAAA,CAAE,KAAA,CAAMA,KAAA,CAAE,MAAA,CAAO;AAAA,IACvB,SAAA,EAAWA,MAAE,MAAA,EAAO;AAAA,IACpB,KAAA,EAAOA,MAAE,MAAA,EAAO,CAAE,IAAI,CAAC,CAAA,CAAE,IAAI,GAAG,CAAA;AAAA,IAChC,SAAA,EAAWA,MAAE,MAAA;AAAO,GACrB,CAAC,CAAA;AAAA,EACF,gBAAA,EAAkBA,MAAE,MAAA;AACtB,CAAC,CAAA;AAiCM,SAAS,eAAA,CAAgB,MAAA,GAA0B,EAAC,EAAc;AACvE,EAAA,MAAM,MAAA,GAAS,MAAA,CAAO,YAAA,IAAgB,OAAA,CAAQ,GAAA,CAAI,cAAA;AAClD,EAAA,IAAI,CAAC,MAAA,EAAQ;AACX,IAAA,MAAM,IAAI,MAAM,4CAA4C,CAAA;AAAA,EAC9D;AAEA,EAAA,MAAMC,QAAA,GAASC,mBAAA,CAAa,EAAE,MAAA,EAAQ,CAAA;AACtC,EAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,KAAA,IAASC,oCAAA,EAAgC;AAC9D,EAAA,MAAM,WAAA,GAAc,OAAO,WAAA,IAAe,GAAA;AAE1C,EAAA,OAAO;AAAA,IACL,MAAM,KAAA,CAAM,OAAA,EAAiB,SAAA,EAAmB,MAAA,EAAiB;AAC/D,MAAA,MAAM,MAAA,GAAS,8EAA8E,SAAS,CAAA;;AAAA,EAE1G,MAAA,GAAS,WAAW,MAAM;AAAA,CAAA,GAAO,EAAE;AAAA;AAAA;AAAA,EAGnC,OAAO;AAAA;;AAAA,sDAAA,CAAA;AAKH,MAAA,MAAM,MAAA,GAAS,MAAMC,iBAAA,CAAe;AAAA,QAClC,KAAA,EAAOH,SAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,WAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO,MAAA,CAAO,MAAA;AAAA,IAChB,CAAA;AAAA,IAEA,MAAM,aAAA,CAAc,OAAA,EAAiB,QAAA,EAA2B;AAC9D,MAAA,MAAM,sBAAsB,QAAA,CACzB,GAAA,CAAI,CAAC,CAAA,KAAM,KAAK,CAAA,CAAE,IAAI,CAAA,EAAA,EAAK,CAAA,CAAE,WAAW,CAAA,UAAA,EAAa,CAAA,CAAE,MAAM,CAAA,CAAA,CAAG,CAAA,CAChE,KAAK,IAAI,CAAA;AAEZ,MAAA,MAAM,MAAA,GAAS,CAAA;;AAAA;AAAA,EAGnB,mBAAmB;;AAAA;AAAA;AAAA,
EAInB,OAAO;AAAA;;AAAA,0EAAA,CAAA;AAKH,MAAA,MAAM,MAAA,GAAS,MAAMG,iBAAA,CAAe;AAAA,QAClC,KAAA,EAAOH,SAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,wBAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO,MAAA,CAAO,MAAA,CAAO,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,MAAO;AAAA,QACtC,WAAW,CAAA,CAAE,SAAA;AAAA,QACb,OAAO,CAAA,CAAE,KAAA;AAAA,QACT,WAAW,CAAA,CAAE;AAAA,OACf,CAAE,CAAA;AAAA,IACJ,CAAA;AAAA,IAEA,MAAM,cAAA,CAAe,KAAA,EAAe,OAAA,EAAiB;AACnD,MAAA,MAAM,MAAA,GAAS,CAAA;;AAAA,QAAA,EAEX,KAAK,CAAA;;AAAA;AAAA;AAAA,EAIb,OAAO;AAAA;;AAAA;AAAA;;AAAA,sDAAA,CAAA;AAQH,MAAA,MAAM,MAAA,GAAS,MAAMG,iBAAA,CAAe;AAAA,QAClC,KAAA,EAAOH,SAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,WAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO;AAAA,QACL,KAAA,EAAO,OAAO,MAAA,CAAO,KAAA;AAAA,QACrB,SAAA,EAAW,OAAO,MAAA,CAAO;AAAA,OAC3B;AAAA,IACF;AAAA,GACF;AACF;;;AC7IO,SAAS,uBACd,MAAA,EACQ;AACR,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,CAAA;AAEhC,EAAA,MAAM,WAAA,GAAc,OAAO,MAAA,CAAO,CAAC,KAAK,CAAA,KAAM,GAAA,GAAM,CAAA,CAAE,MAAA,EAAQ,CAAC,CAAA;AAC/D,EAAA,IAAI,WAAA,KAAgB,GAAG,OAAO,CAAA;AAE9B,EAAA,MAAM,WAAA,GAAc,MAAA,CAAO,MAAA,CAAO,CAAC,GAAA,EAAK,CAAA,KAAM,GAAA,GAAM,CAAA,CAAE,KAAA,GAAQ,CAAA,CAAE,MAAA,EAAQ,CAAC,CAAA;AACzE,EAAA,OAAO,IAAA,CAAK,KAAA,CAAM,WAAA,GAAc,WAAW,CAAA;AAC7C;AAoBO,SAAS,QAAQ,OAAA,EAA2B;AACjD,EAAA,IAAI,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG,OAAO,CAAA;AACjC,EAAA,OAAO,OAAA,CAAQ,OAAO,CAAC,GAAA,EAAK,MAAM,GAAA,GAAM,CAAA,EAAG,CAAC,CAAA,GAAI,OAAA,CAAQ,MAAA;AAC1D;AAiDO,SAAS,KAAA,CAAM,KAAA,EAAe,GAAA,EAAa,GAAA,EAAqB;AACrE,EAAA,OAAO,KAAK,GAAA,CAAI,GAAA,EAAK,KAAK,GAAA,CAAI,GAAA,EAAK,KAAK,CAAC,CAAA;AAC3C;AAKO,SAAS,eAAe,KAAA,EAAuB;AACpD,EAAA,OAAO,MAAM,IAAA,CAAK,KAAA,CAAM,KAAK,CAAA,EAAG,GAAG,GAAG,CAAA;AACxC;;;ACxBA,SAAS,gBAAgB,IAAA,EAAsB;AAC7C,EAAA,OAAO,IAAA,CACJ,OAAA,CAAQ,UAAA,EAAY,KAAK,CAAA,CACzB,OAAA,CAAQ,IAAA,EAAM,CAAC,GAAA,KAAQ,GAAA,CAAI,WAAA,EAAa,EACxC,IAAA,EAAK;AACV;AA2DO,SAAS,eAAA,CACd,QACA,UAAA,GAAmD,EAAE,MAAM,EAAA,EAAI,UAAA,EAAY,IAAG,EACtE;AACR,EAAA,MAAM,OAAA,GAAU,MAAA,CAAO,OAAA,CAAQ,MAAM,CAAA;AACrC,EAAA,MAAM,QAAA,GAAW,OAAA,CAAQ,MAAA,C
AAO,CAAC,GAAA,EAAK,GAAG,KAAK,CAAA,KAAM,GAAA,GAAM,KAAA,EAAO,CAAC,IAAI,OAAA,CAAQ,MAAA;AAE9E,EAAA,MAAM,cAAc,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAG,KAAK,CAAA,KAAM,KAAA,IAAS,UAAA,CAAW,IAAI,CAAA,CAAE,GAAA,CAAI,CAAC,CAAC,IAAI,MAAM,IAAI,CAAA;AAChG,EAAA,MAAM,cAAc,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAG,KAAK,CAAA,KAAM,KAAA,GAAQ,UAAA,CAAW,UAAU,CAAA,CAAE,GAAA,CAAI,CAAC,CAAC,IAAI,MAAM,IAAI,CAAA;AAErG,EAAA,MAAM,QAAkB,EAAC;AAEzB,EAAA,IAAI,QAAA,IAAY,WAAW,IAAA,EAAM;AAC/B,IAAA,KAAA,CAAM,KAAK,4BAA4B,CAAA;AAAA,EACzC,CAAA,MAAA,IAAW,QAAA,IAAY,UAAA,CAAW,UAAA,EAAY;AAC5C,IAAA,KAAA,CAAM,KAAK,kDAAkD,CAAA;AAAA,EAC/D,CAAA,MAAO;AACL,IAAA,KAAA,CAAM,KAAK,yCAAyC,CAAA;AAAA,EACtD;AAEA,EAAA,IAAI,WAAA,CAAY,SAAS,CAAA,EAAG;AAC1B,IAAA,KAAA,CAAM,IAAA,CAAK,WAAW,WAAA,CAAY,GAAA,CAAI,eAAe,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA,CAAE,CAAA;AAAA,EACrE;AAEA,EAAA,IAAI,WAAA,CAAY,SAAS,CAAA,EAAG;AAC1B,IAAA,KAAA,CAAM,IAAA,CAAK,sBAAsB,WAAA,CAAY,GAAA,CAAI,eAAe,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA,CAAE,CAAA;AAAA,EAChF;AAEA,EAAA,OAAO,KAAA,CAAM,IAAA,CAAK,IAAI,CAAA,GAAI,GAAA;AAC5B;;;ACxIA,eAAsB,kBACpB,MAAA,EAC8B;AAC9B,EAAA,MAAM,SAAA,GAAY,KAAK,GAAA,EAAI;AAE3B,EAAA,IAAI,MAAA,CAAO,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG;AAC/B,IAAA,MAAM,IAAI,MAAM,mDAAmD,CAAA;AAAA,EACrE;AAGA,EAAA,MAAM,MAAA,GAAS,MAAA,CAAO,cAAA,IAAkB,OAAA,CAAQ,GAAA,CAAI,gBAAA;AACpD,EAAA,IAAI,CAAC,MAAA,EAAQ;AACX,IAAA,MAAM,IAAI,MAAM,iDAAiD,CAAA;AAAA,EACnE;AAEA,EAAA,MAAMI,UAAA,GAAW,IAAIC,iBAAA,CAAS,EAAE,QAAQ,CAAA;AACxC,EAAA,MAAM,SAAA,GAAYD,UAAA,CAAS,SAAA,CAAU,MAAA,CAAO,aAAa,CAAA;AACzD,EAAA,MAAM,SAAS,eAAA,CAAgB,EAAE,YAAA,EAAc,MAAA,CAAO,cAAc,CAAA;AACpE,EAAA,MAAM,IAAA,GAAO,OAAO,IAAA,IAAQ,CAAA;AAG5B,EAAA,MAAM,eAAuC,EAAC;AAE9C,EAAA,KAAA,MAAW,SAAA,IAAa,OAAO,OAAA,EAAS;AACtC,IAAA,MAAM,SAAS,MAAM,sBAAA;AAAA,MACnB,SAAA;AAAA,MACA,MAAA;AAAA,MACA,SAAA;AAAA,MACA,IAAA;AAAA,MACA,MAAA,CAAO;AAAA,KACT;AACA,IAAA,YAAA,CAAa,KAAK,MAAM,CAAA;AAAA,EAC1B;AAGA,EAAA,MAAM,kBAAkB,YAAA,CAAa,GAAA,CAAI,CAAC,CAAA,KAAM,EAAE,cAAc,CAAA;AAChE,EAAA,MAAM,SAAA,GAAY,cAAA,CAAe,OAAA,CAAQ,eAAe,CAAC,CAAA;AAGzD,EAAA,MAAM,gBAAgB,YAAA,CAAa,MAAA;AAAA,IAC
jC,CAAC,KAAK,CAAA,KAAM,GAAA,GAAM,EAAE,WAAA,CAAY,MAAA,GAAS,EAAE,aAAA,CAAc,MAAA;AAAA,IACzD;AAAA,GACF;AACA,EAAA,MAAM,UAAA,GAAa,YAAA,CAAa,MAAA,CAAO,CAAC,GAAA,EAAK,MAAM,GAAA,GAAM,CAAA,CAAE,WAAA,CAAY,MAAA,EAAQ,CAAC,CAAA;AAChF,EAAA,MAAM,SAAS,aAAA,KAAkB,CAAA,GAAI,MAAM,cAAA,CAAgB,UAAA,GAAa,gBAAiB,GAAG,CAAA;AAG5F,EAAA,MAAM,YAAY,YAAA,CAAa,OAAA,CAAQ,CAAC,CAAA,KAAM,EAAE,MAAM,CAAA;AACtD,EAAA,MAAM,iBAAiB,SAAA,CAAU,MAAA,CAAO,CAAC,CAAA,KAAM,CAAA,CAAE,kBAAkB,EAAE,CAAA;AACrE,EAAA,MAAM,SAAA,GAAY,SAAA,CAAU,MAAA,KAAW,CAAA,GACnC,GAAA,GACA,eAAgB,cAAA,CAAe,MAAA,GAAS,SAAA,CAAU,MAAA,GAAU,GAAG,CAAA;AAGnE,EAAA,MAAM,kBAAkB,YAAA,CAAa,MAAA;AAAA,IACnC,CAAC,GAAA,EAAK,CAAA,KAAM,GAAA,GAAM,EAAE,WAAA,CAAY,MAAA;AAAA,IAChC;AAAA,GACF;AACA,EAAA,MAAM,uBAAA,GAA0B,OAAO,OAAA,CAAQ,MAAA;AAAA,IAC7C,CAAC,GAAA,EAAK,CAAA,KAAM,GAAA,IAAO,CAAA,CAAE,kBAAkB,MAAA,IAAU,CAAA,CAAA;AAAA,IACjD;AAAA,GACF;AACA,EAAA,MAAM,aAAa,uBAAA,KAA4B,CAAA,GAC3C,IACA,cAAA,CAAgB,eAAA,GAAkB,0BAA2B,GAAG,CAAA;AAEpE,EAAA,MAAM,MAAA,GAAS,EAAE,SAAA,EAAW,MAAA,EAAQ,WAAW,UAAA,EAAW;AAG1D,EAAA,MAAM,eAAe,sBAAA,CAAuB;AAAA,IAC1C,EAAE,KAAA,EAAO,SAAA,EAAW,MAAA,EAAQ,IAAA,EAAK;AAAA,IACjC,EAAE,KAAA,EAAO,MAAA,EAAQ,MAAA,EAAQ,GAAA,EAAK;AAAA,IAC9B,EAAE,KAAA,EAAO,SAAA,EAAW,MAAA,EAAQ,IAAA,EAAK;AAAA,IACjC,EAAE,KAAA,EAAO,GAAA,GAAM,UAAA,EAAY,QAAQ,GAAA;AAAK;AAAA,GACzC,CAAA;AAGD,EAAA,MAAM,QAAA,GAA2B;AAAA,IAC/B;AAAA,MACE,SAAA,EAAW,WAAA;AAAA,MACX,KAAA,EAAO,SAAA;AAAA,MACP,SAAA,EAAW,CAAA,6CAAA,EAAgD,YAAA,CAAa,MAAM,CAAA,SAAA;AAAA,KAChF;AAAA,IACA;AAAA,MACE,SAAA,EAAW,QAAA;AAAA,MACX,KAAA,EAAO,MAAA;AAAA,MACP,SAAA,EAAW,CAAA,MAAA,EAAS,UAAU,CAAA,CAAA,EAAI,aAAa,CAAA,sCAAA,CAAA;AAAA,MAC/C,QAAA,EAAU,YAAA,CAAa,OAAA,CAAQ,CAAC,CAAA,KAAM,EAAE,aAAa,CAAA,CAAE,KAAA,CAAM,CAAA,EAAG,CAAC;AAAA,KACnE;AAAA,IACA;AAAA,MACE,SAAA,EAAW,WAAA;AAAA,MACX,KAAA,EAAO,SAAA;AAAA,MACP,WAAW,CAAA,EAAG,cAAA,CAAe,MAAM,CAAA,CAAA,EAAI,UAAU,MAAM,CAAA,gCAAA;AAAA,KACzD;AAAA,IACA;AAAA,MACE,SAAA,EAAW,YAAA;AAAA,MACX,KAAA,EAAO,UAAA;AAAA,MACP,SAAA,EAAW,GAAG,eAAe,CAAA,iDAAA,CAAA;AAAA,MAC7B,QAAA,EAAU,YAAA,CAAa,OAAA,CAAQ,CAAC,CAAA,KAA
M,EAAE,WAAW,CAAA,CAAE,KAAA,CAAM,CAAA,EAAG,CAAC;AAAA;AACjE,GACF;AAEA,EAAA,OAAO;AAAA,IACL,MAAA,EAAQ,YAAA,IAAgB,EAAA,IAAM,UAAA,IAAc,EAAA;AAAA,IAC5C,YAAA;AAAA,IACA,MAAA;AAAA,IACA,QAAA;AAAA,IACA,OAAA,EAAS,gBAAgB,EAAE,SAAA,EAAW,QAAQ,SAAA,EAAW,UAAA,EAAY,GAAA,GAAM,UAAA,EAAY,CAAA;AAAA,IACvF,QAAA,EAAU,IAAA,CAAK,GAAA,EAAI,GAAI,SAAA;AAAA,IACvB;AAAA,GACF;AACF;AASA,eAAe,sBAAA,CACb,SAAA,EACA,MAAA,EACA,SAAA,EACA,MACA,OAAA,EAC+B;AAE/B,EAAA,IAAI,aAAA;AACJ,EAAA,IAAI;AACF,IAAA,aAAA,GAAgB,MAAM,UAAU,OAAA,CAAQ;AAAA,MACtC,OAAO,SAAA,CAAU,KAAA;AAAA,MACjB;AAAA,KACD,CAAA;AAAA,EACH,SAAS,KAAA,EAAO;AAEd,IAAA,OAAO;AAAA,MACL,OAAO,SAAA,CAAU,KAAA;AAAA,MACjB,QAAQ,EAAC;AAAA,MACT,cAAA,EAAgB,CAAA;AAAA,MAChB,aAAa,EAAC;AAAA,MACd,aAAA,EAAe,SAAA,CAAU,cAAA,IAAkB,EAAC;AAAA,MAC5C,aAAa;AAAC,KAChB;AAAA,EACF;AAEA,EAAA,MAAM,QAAA,GAAW,aAAA,CAAc,QAAA,IAAY,EAAC;AAG5C,EAAA,MAAM,SAA2B,EAAC;AAElC,EAAA,KAAA,MAAW,WAAW,QAAA,EAAU;AAC9B,IAAA,MAAM,kBAAkB,MAAM,MAAA,CAAO,eAAe,SAAA,CAAU,KAAA,EAAO,QAAQ,OAAO,CAAA;AAEpF,IAAA,MAAM,MAAM,OAAA,CAAQ,SAAA;AACpB,IAAA,MAAM,UAAA,GACH,GAAA,EAAK,IAAA,EAAkC,IAAA,IACxC,KAAK,IAAA,IACL,SAAA;AAEF,IAAA,MAAA,CAAO,IAAA,CAAK;AAAA,MACV,OAAA,EAAS,OAAA,CAAQ,OAAA,CAAQ,KAAA,CAAM,GAAG,GAAG,CAAA;AAAA;AAAA,MACrC,OAAO,OAAA,CAAQ,KAAA;AAAA,MACf,UAAA;AAAA,MACA,gBAAgB,eAAA,CAAgB,KAAA;AAAA,MAChC,WAAW,eAAA,CAAgB;AAAA,KAC5B,CAAA;AAAA,EACH;AAGA,EAAA,MAAM,cAAA,GAAiB,MAAA,CAAO,MAAA,KAAW,CAAA,GACrC,CAAA,GACA,OAAA,CAAQ,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,KAAM,CAAA,CAAE,cAAc,CAAC,CAAA;AAG/C,EAAA,MAAM,UAAA,GAAa,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,KAAM,CAAA,CAAE,OAAO,CAAA,CAAE,IAAA,CAAK,GAAG,CAAA,CAAE,WAAA,EAAY;AACtE,EAAA,MAAM,WAAA,GAAc,WAAA;AAAA,IAClB,UAAA;AAAA,IACA,SAAA,CAAU,kBAAkB,EAAC;AAAA,IAC7B,SAAA,CAAU,oBAAoB;AAAC,GACjC;AAEA,EAAA,IAAI,OAAA,EAAS;AACX,IAAA,OAAA,CAAQ,GAAA,CAAI,CAAA,QAAA,EAAW,SAAA,CAAU,KAAK,CAAA,CAAA,CAAG,CAAA;AACzC,IAAA,OAAA,CAAQ,IAAI,CAAA,aAAA,EAAgB,cAAA,CAAe,OAAA,CAAQ,CAAC,CAAC,CAAA,IAAA,CAAM,CAAA;AAC3D,IAAA,OAAA,CAAQ,GAAA,CAAI,mBAAmB,WAAA,CAAY,KAAA,CAAM,KAAK,IAAI,CAAA,IAAK,MAAM,CAAA,CAAE,CAAA;AACvE,IAAA,OAAA,CAAQ,
GAAA,CAAI,qBAAqB,WAAA,CAAY,OAAA,CAAQ,KAAK,IAAI,CAAA,IAAK,MAAM,CAAA,CAAE,CAAA;AAC3E,IAAA,OAAA,CAAQ,GAAA,CAAI,YAAY,WAAA,CAAY,KAAA,CAAM,KAAK,IAAI,CAAA,IAAK,MAAM,CAAA,CAAE,CAAA;AAAA,EAClE;AAEA,EAAA,OAAO;AAAA,IACL,OAAO,SAAA,CAAU,KAAA;AAAA,IACjB,MAAA;AAAA,IACA,cAAA;AAAA,IACA,aAAa,WAAA,CAAY,KAAA;AAAA,IACzB,eAAe,WAAA,CAAY,OAAA;AAAA,IAC3B,aAAa,WAAA,CAAY;AAAA,GAC3B;AACF;AASA,SAAS,WAAA,CACP,OAAA,EACA,cAAA,EACA,gBAAA,EACkB;AAClB,EAAA,MAAM,QAAkB,EAAC;AACzB,EAAA,MAAM,UAAoB,EAAC;AAC3B,EAAA,MAAM,QAAkB,EAAC;AAEzB,EAAA,KAAA,MAAW,SAAS,cAAA,EAAgB;AAClC,IAAA,IAAI,OAAA,CAAQ,QAAA,CAAS,KAAA,CAAM,WAAA,EAAa,CAAA,EAAG;AACzC,MAAA,KAAA,CAAM,KAAK,KAAK,CAAA;AAAA,IAClB,CAAA,MAAO;AACL,MAAA,OAAA,CAAQ,KAAK,KAAK,CAAA;AAAA,IACpB;AAAA,EACF;AAEA,EAAA,KAAA,MAAW,SAAS,gBAAA,EAAkB;AACpC,IAAA,IAAI,OAAA,CAAQ,QAAA,CAAS,KAAA,CAAM,WAAA,EAAa,CAAA,EAAG;AACzC,MAAA,KAAA,CAAM,KAAK,KAAK,CAAA;AAAA,IAClB;AAAA,EACF;AAEA,EAAA,OAAO,EAAE,KAAA,EAAO,OAAA,EAAS,KAAA,EAAM;AACjC","file":"index.cjs","sourcesContent":["/**\n * LLM Grading Utilities\n *\n * Uses OpenAI to grade content quality, relevance, and other metrics.\n */\n\nimport { createOpenAI } from '@ai-sdk/openai';\nimport { generateObject } from 'ai';\nimport { z } from 'zod';\nimport { resolveDefaultOpenAiChatModelId } from '@kat/core';\nimport type { LLMGraderConfig, EvalCriterion, EvalEvidence } from '../types.js';\n\n// ============================================================================\n// GRADING SCHEMAS\n// ============================================================================\n\nconst GradeSchema = z.object({\n score: z.number().min(0).max(100).describe('Score from 0-100'),\n reasoning: z.string().describe('Explanation for the score'),\n examples: z.array(z.string()).optional().describe('Specific examples that influenced the score'),\n});\n\nconst MultiCriteriaGradeSchema = z.object({\n scores: z.array(z.object({\n criterion: z.string(),\n score: z.number().min(0).max(100),\n reasoning: z.string(),\n })),\n overallReasoning: 
z.string(),\n});\n\n// ============================================================================\n// GRADER FACTORY\n// ============================================================================\n\nexport interface LLMGrader {\n /**\n * Grade content against a single criterion.\n */\n grade(content: string, criterion: string, rubric?: string): Promise<{\n score: number;\n reasoning: string;\n examples?: string[];\n }>;\n\n /**\n * Grade content against multiple criteria.\n */\n gradeMultiple(content: string, criteria: EvalCriterion[]): Promise<EvalEvidence[]>;\n\n /**\n * Grade relevance of content to a query.\n */\n gradeRelevance(query: string, content: string): Promise<{\n score: number;\n reasoning: string;\n }>;\n}\n\n/**\n * Create an LLM grader with the given configuration.\n */\nexport function createLLMGrader(config: LLMGraderConfig = {}): LLMGrader {\n const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;\n if (!apiKey) {\n throw new Error('OPENAI_API_KEY is required for LLM grading');\n }\n\n const openai = createOpenAI({ apiKey });\n const model = config.model || resolveDefaultOpenAiChatModelId();\n const temperature = config.temperature ?? 0.1;\n\n return {\n async grade(content: string, criterion: string, rubric?: string) {\n const prompt = `You are an expert evaluator. Grade the following content on the criterion \"${criterion}\".\n\n${rubric ? `Rubric: ${rubric}\\n` : ''}\nContent to evaluate:\n\"\"\"\n${content}\n\"\"\"\n\nProvide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: GradeSchema,\n prompt,\n temperature,\n });\n\n return result.object;\n },\n\n async gradeMultiple(content: string, criteria: EvalCriterion[]) {\n const criteriaDescription = criteria\n .map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`)\n .join('\\n');\n\n const prompt = `You are an expert evaluator. 
Grade the following content on multiple criteria.\n\nCriteria:\n${criteriaDescription}\n\nContent to evaluate:\n\"\"\"\n${content}\n\"\"\"\n\nFor each criterion, provide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: MultiCriteriaGradeSchema,\n prompt,\n temperature,\n });\n\n return result.object.scores.map((s) => ({\n criterion: s.criterion,\n score: s.score,\n reasoning: s.reasoning,\n }));\n },\n\n async gradeRelevance(query: string, content: string) {\n const prompt = `You are an expert evaluator. Grade how relevant the following content is to the given query.\n\nQuery: \"${query}\"\n\nContent:\n\"\"\"\n${content}\n\"\"\"\n\nA score of 100 means the content directly and completely answers the query.\nA score of 0 means the content is completely irrelevant.\n\nProvide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: GradeSchema,\n prompt,\n temperature,\n });\n\n return {\n score: result.object.score,\n reasoning: result.object.reasoning,\n };\n },\n };\n}\n\n// ============================================================================\n// CONVENIENCE FUNCTIONS\n// ============================================================================\n\n/**\n * Grade content using a one-off grader instance.\n */\nexport async function gradeWithLLM(\n content: string,\n criterion: string,\n config: LLMGraderConfig = {}\n): Promise<{ score: number; reasoning: string; examples?: string[] }> {\n const grader = createLLMGrader(config);\n return grader.grade(content, criterion);\n}\n\n/**\n * Grade relevance using a one-off grader instance.\n */\nexport async function gradeRelevanceWithLLM(\n query: string,\n content: string,\n config: LLMGraderConfig = {}\n): Promise<{ score: number; reasoning: string }> {\n const grader = createLLMGrader(config);\n return grader.gradeRelevance(query, content);\n}\n","/**\n * Metric 
Calculation Helpers\n *\n * Pure functions for calculating scores and metrics.\n */\n\n/**\n * Calculate a weighted score from individual scores and weights.\n *\n * @param scores - Array of { score, weight } objects\n * @returns Weighted average score (0-100)\n */\nexport function calculateWeightedScore(\n scores: Array<{ score: number; weight: number }>\n): number {\n if (scores.length === 0) return 0;\n\n const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);\n if (totalWeight === 0) return 0;\n\n const weightedSum = scores.reduce((sum, s) => sum + s.score * s.weight, 0);\n return Math.round(weightedSum / totalWeight);\n}\n\n/**\n * Calculate percentage of found items vs expected items.\n *\n * @param found - Number of items found\n * @param expected - Number of items expected\n * @returns Percentage (0-100)\n */\nexport function calculatePercentage(found: number, expected: number): number {\n if (expected === 0) return 100; // Nothing expected, consider it perfect\n return Math.round((found / expected) * 100);\n}\n\n/**\n * Calculate the average of an array of numbers.\n *\n * @param numbers - Array of numbers\n * @returns Average value\n */\nexport function average(numbers: number[]): number {\n if (numbers.length === 0) return 0;\n return numbers.reduce((sum, n) => sum + n, 0) / numbers.length;\n}\n\n/**\n * Calculate precision: true positives / (true positives + false positives).\n *\n * @param truePositives - Number of correct positive predictions\n * @param falsePositives - Number of incorrect positive predictions\n * @returns Precision (0-100)\n */\nexport function calculatePrecision(\n truePositives: number,\n falsePositives: number\n): number {\n const total = truePositives + falsePositives;\n if (total === 0) return 100;\n return Math.round((truePositives / total) * 100);\n}\n\n/**\n * Calculate recall: true positives / (true positives + false negatives).\n *\n * @param truePositives - Number of correct positive predictions\n * @param 
falseNegatives - Number of missed positive predictions\n * @returns Recall (0-100)\n */\nexport function calculateRecall(\n truePositives: number,\n falseNegatives: number\n): number {\n const total = truePositives + falseNegatives;\n if (total === 0) return 100;\n return Math.round((truePositives / total) * 100);\n}\n\n/**\n * Calculate F1 score: harmonic mean of precision and recall.\n *\n * @param precision - Precision value (0-100)\n * @param recall - Recall value (0-100)\n * @returns F1 score (0-100)\n */\nexport function calculateF1(precision: number, recall: number): number {\n if (precision + recall === 0) return 0;\n return Math.round((2 * precision * recall) / (precision + recall));\n}\n\n/**\n * Clamp a value between min and max.\n */\nexport function clamp(value: number, min: number, max: number): number {\n return Math.max(min, Math.min(max, value));\n}\n\n/**\n * Normalize a score to 0-100 range.\n */\nexport function normalizeScore(score: number): number {\n return clamp(Math.round(score), 0, 100);\n}\n","/**\n * Report Formatting Utilities\n *\n * Format eval results for different output targets.\n */\n\nimport type { EvalResult, ReportOptions, EvalEvidence } from '../types.js';\n\n// ============================================================================\n// CONSOLE REPORTER\n// ============================================================================\n\n/**\n * Format an eval result for console output.\n */\nexport function formatConsoleReport(\n result: EvalResult,\n options: Partial<ReportOptions> = {}\n): string {\n const lines: string[] = [];\n const { includeEvidence = true } = options;\n\n // Header\n const status = result.passed ? '✓ PASSED' : '✗ FAILED';\n const statusColor = result.passed ? 
'\\x1b[32m' : '\\x1b[31m';\n const reset = '\\x1b[0m';\n\n lines.push('');\n lines.push('═'.repeat(60));\n lines.push(`${statusColor}${status}${reset} - Overall Score: ${result.overallScore}/100`);\n lines.push('═'.repeat(60));\n\n // Summary\n lines.push('');\n lines.push(`Summary: ${result.summary}`);\n lines.push(`Duration: ${result.duration}ms`);\n\n // Individual scores\n lines.push('');\n lines.push('Scores:');\n for (const [name, score] of Object.entries(result.scores)) {\n const bar = createProgressBar(score, 20);\n const formattedName = formatScoreName(name);\n lines.push(` ${formattedName.padEnd(20)} ${bar} ${score}/100`);\n }\n\n // Evidence (if requested)\n if (includeEvidence && result.evidence.length > 0) {\n lines.push('');\n lines.push('Evidence:');\n for (const evidence of result.evidence) {\n lines.push(` [${evidence.criterion}] (${evidence.score}/100)`);\n lines.push(` ${evidence.reasoning}`);\n if (evidence.examples && evidence.examples.length > 0) {\n for (const example of evidence.examples.slice(0, 3)) {\n lines.push(` - ${example}`);\n }\n }\n }\n }\n\n lines.push('');\n lines.push('─'.repeat(60));\n\n return lines.join('\\n');\n}\n\n/**\n * Create a text progress bar.\n */\nfunction createProgressBar(value: number, width: number): string {\n const filled = Math.round((value / 100) * width);\n const empty = width - filled;\n return `[${'█'.repeat(filled)}${'░'.repeat(empty)}]`;\n}\n\n/**\n * Format a score name for display (camelCase -> Title Case).\n */\nfunction formatScoreName(name: string): string {\n return name\n .replace(/([A-Z])/g, ' $1')\n .replace(/^./, (str) => str.toUpperCase())\n .trim();\n}\n\n// ============================================================================\n// JSON REPORTER\n// ============================================================================\n\n/**\n * Format an eval result as JSON.\n */\nexport function formatJsonReport(\n result: EvalResult,\n options: Partial<ReportOptions> = {}\n): string {\n 
const { includeEvidence = true, includeRawData = false } = options;\n\n const output: Record<string, unknown> = {\n passed: result.passed,\n overallScore: result.overallScore,\n scores: result.scores,\n summary: result.summary,\n duration: result.duration,\n };\n\n if (includeEvidence) {\n output.evidence = result.evidence;\n }\n\n // Include any additional properties from extended result types\n for (const [key, value] of Object.entries(result)) {\n if (\n !['passed', 'overallScore', 'scores', 'evidence', 'summary', 'duration'].includes(key) &&\n (includeRawData || !isRawData(value))\n ) {\n output[key] = value;\n }\n }\n\n return JSON.stringify(output, null, 2);\n}\n\n/**\n * Check if a value looks like raw data (large arrays/objects).\n */\nfunction isRawData(value: unknown): boolean {\n if (Array.isArray(value) && value.length > 10) return true;\n if (typeof value === 'object' && value !== null) {\n const keys = Object.keys(value);\n if (keys.length > 20) return true;\n }\n return false;\n}\n\n// ============================================================================\n// SUMMARY GENERATION\n// ============================================================================\n\n/**\n * Generate a human-readable summary from scores.\n */\nexport function generateSummary(\n scores: Record<string, number>,\n thresholds: { good: number; acceptable: number } = { good: 80, acceptable: 60 }\n): string {\n const entries = Object.entries(scores);\n const avgScore = entries.reduce((sum, [, score]) => sum + score, 0) / entries.length;\n\n const goodMetrics = entries.filter(([, score]) => score >= thresholds.good).map(([name]) => name);\n const poorMetrics = entries.filter(([, score]) => score < thresholds.acceptable).map(([name]) => name);\n\n const parts: string[] = [];\n\n if (avgScore >= thresholds.good) {\n parts.push('Strong overall performance');\n } else if (avgScore >= thresholds.acceptable) {\n parts.push('Acceptable performance with room for improvement');\n } 
else {\n parts.push('Performance below acceptable thresholds');\n }\n\n if (goodMetrics.length > 0) {\n parts.push(`Strong: ${goodMetrics.map(formatScoreName).join(', ')}`);\n }\n\n if (poorMetrics.length > 0) {\n parts.push(`Needs improvement: ${poorMetrics.map(formatScoreName).join(', ')}`);\n }\n\n return parts.join('. ') + '.';\n}\n\n// ============================================================================\n// PRINT HELPERS\n// ============================================================================\n\n/**\n * Print an eval result to the console.\n */\nexport function printReport(result: EvalResult, options: Partial<ReportOptions> = {}): void {\n const format = options.format || 'console';\n\n if (format === 'json') {\n console.log(formatJsonReport(result, options));\n } else {\n console.log(formatConsoleReport(result, options));\n }\n}\n","/**\n * Retrieval Eval - Layer 2\n *\n * Evaluates whether RAG retrieves relevant chunks by testing\n * relevance, recall, precision, and noise ratio.\n */\n\nimport { Pinecone, type ContextModel } from '@pinecone-database/pinecone';\nimport { createLLMGrader } from '../utils/llm-grader.js';\nimport { average, normalizeScore, calculateWeightedScore } from '../utils/metrics.js';\nimport { generateSummary } from '../utils/reporters.js';\nimport type { EvalEvidence } from '../types.js';\nimport type {\n RetrievalEvalConfig,\n RetrievalEvalResult,\n RetrievalTestQuery,\n QueryRetrievalResult,\n RetrievedChunk,\n TopicCheckResult,\n} from './types.js';\n\nexport type {\n RetrievalEvalConfig,\n RetrievalEvalResult,\n RetrievalTestQuery,\n QueryRetrievalResult,\n RetrievedChunk,\n} from './types.js';\n\n// ============================================================================\n// MAIN EVALUATION FUNCTION\n// ============================================================================\n\n/**\n * Evaluate the quality of RAG retrieval for a Pinecone assistant.\n */\nexport async function evaluateRetrieval(\n config: 
RetrievalEvalConfig\n): Promise<RetrievalEvalResult> {\n const startTime = Date.now();\n\n if (config.queries.length === 0) {\n throw new Error('At least one query is required for retrieval eval');\n }\n\n // Initialize Pinecone\n const apiKey = config.pineconeApiKey || process.env.PINECONE_API_KEY;\n if (!apiKey) {\n throw new Error('PINECONE_API_KEY is required for retrieval eval');\n }\n\n const pinecone = new Pinecone({ apiKey });\n const assistant = pinecone.assistant(config.assistantName);\n const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });\n const topK = config.topK || 5;\n\n // Evaluate each query\n const queryResults: QueryRetrievalResult[] = [];\n\n for (const testQuery of config.queries) {\n const result = await evaluateQueryRetrieval(\n assistant,\n grader,\n testQuery,\n topK,\n config.verbose\n );\n queryResults.push(result);\n }\n\n // Calculate aggregate scores\n const relevanceScores = queryResults.map((r) => r.relevanceScore);\n const relevance = normalizeScore(average(relevanceScores));\n\n // Calculate recall: percentage of expected topics found\n const totalExpected = queryResults.reduce(\n (sum, r) => sum + r.foundTopics.length + r.missingTopics.length,\n 0\n );\n const totalFound = queryResults.reduce((sum, r) => sum + r.foundTopics.length, 0);\n const recall = totalExpected === 0 ? 100 : normalizeScore((totalFound / totalExpected) * 100);\n\n // Calculate precision: percentage of retrieved content that's relevant\n const allChunks = queryResults.flatMap((r) => r.chunks);\n const relevantChunks = allChunks.filter((c) => c.relevanceGrade >= 50);\n const precision = allChunks.length === 0\n ? 
100\n : normalizeScore((relevantChunks.length / allChunks.length) * 100);\n\n // Calculate noise ratio: percentage of irrelevant topics found\n const totalIrrelevant = queryResults.reduce(\n (sum, r) => sum + r.noiseTopics.length,\n 0\n );\n const totalIrrelevantExpected = config.queries.reduce(\n (sum, q) => sum + (q.irrelevantTopics?.length || 0),\n 0\n );\n const noiseRatio = totalIrrelevantExpected === 0\n ? 0\n : normalizeScore((totalIrrelevant / totalIrrelevantExpected) * 100);\n\n const scores = { relevance, recall, precision, noiseRatio };\n\n // Calculate overall score (noise ratio is inverted - lower is better)\n const overallScore = calculateWeightedScore([\n { score: relevance, weight: 0.35 },\n { score: recall, weight: 0.30 },\n { score: precision, weight: 0.25 },\n { score: 100 - noiseRatio, weight: 0.10 }, // Invert noise ratio\n ]);\n\n // Build evidence\n const evidence: EvalEvidence[] = [\n {\n criterion: 'relevance',\n score: relevance,\n reasoning: `Average relevance of retrieved chunks across ${queryResults.length} queries.`,\n },\n {\n criterion: 'recall',\n score: recall,\n reasoning: `Found ${totalFound}/${totalExpected} expected topics in retrieved content.`,\n examples: queryResults.flatMap((r) => r.missingTopics).slice(0, 3),\n },\n {\n criterion: 'precision',\n score: precision,\n reasoning: `${relevantChunks.length}/${allChunks.length} retrieved chunks were relevant.`,\n },\n {\n criterion: 'noiseRatio',\n score: noiseRatio,\n reasoning: `${totalIrrelevant} irrelevant topics appeared in retrieved content.`,\n examples: queryResults.flatMap((r) => r.noiseTopics).slice(0, 3),\n },\n ];\n\n return {\n passed: overallScore >= 70 && noiseRatio <= 30,\n overallScore,\n scores,\n evidence,\n summary: generateSummary({ relevance, recall, precision, noiseRatio: 100 - noiseRatio }),\n duration: Date.now() - startTime,\n queryResults,\n };\n}\n\n// ============================================================================\n// QUERY 
EVALUATION\n// ============================================================================\n\n/**\n * Evaluate retrieval for a single query.\n */\nasync function evaluateQueryRetrieval(\n assistant: ReturnType<Pinecone['assistant']>,\n grader: ReturnType<typeof createLLMGrader>,\n testQuery: RetrievalTestQuery,\n topK: number,\n verbose?: boolean\n): Promise<QueryRetrievalResult> {\n // Retrieve chunks using Context API\n let contextResult: ContextModel;\n try {\n contextResult = await assistant.context({\n query: testQuery.query,\n topK,\n });\n } catch (error) {\n // Return empty result on error\n return {\n query: testQuery.query,\n chunks: [],\n relevanceScore: 0,\n foundTopics: [],\n missingTopics: testQuery.expectedTopics || [],\n noiseTopics: [],\n };\n }\n\n const snippets = contextResult.snippets || [];\n\n // Grade each chunk for relevance\n const chunks: RetrievedChunk[] = [];\n\n for (const snippet of snippets) {\n const relevanceResult = await grader.gradeRelevance(testQuery.query, snippet.content);\n\n const ref = snippet.reference as Record<string, unknown>;\n const sourceFile =\n (ref?.file as Record<string, unknown>)?.name ||\n ref?.name ||\n 'unknown';\n\n chunks.push({\n content: snippet.content.slice(0, 500), // Truncate for storage\n score: snippet.score,\n sourceFile: sourceFile as string,\n relevanceGrade: relevanceResult.score,\n reasoning: relevanceResult.reasoning,\n });\n }\n\n // Calculate average relevance\n const relevanceScore = chunks.length === 0\n ? 
0\n : average(chunks.map((c) => c.relevanceGrade));\n\n // Check for expected and irrelevant topics\n const allContent = chunks.map((c) => c.content).join(' ').toLowerCase();\n const topicResult = checkTopics(\n allContent,\n testQuery.expectedTopics || [],\n testQuery.irrelevantTopics || []\n );\n\n if (verbose) {\n console.log(`Query: \"${testQuery.query}\"`);\n console.log(` Relevance: ${relevanceScore.toFixed(1)}/100`);\n console.log(` Topics found: ${topicResult.found.join(', ') || 'none'}`);\n console.log(` Topics missing: ${topicResult.missing.join(', ') || 'none'}`);\n console.log(` Noise: ${topicResult.noise.join(', ') || 'none'}`);\n }\n\n return {\n query: testQuery.query,\n chunks,\n relevanceScore,\n foundTopics: topicResult.found,\n missingTopics: topicResult.missing,\n noiseTopics: topicResult.noise,\n };\n}\n\n// ============================================================================\n// HELPERS\n// ============================================================================\n\n/**\n * Check for expected and irrelevant topics in content.\n */\nfunction checkTopics(\n content: string,\n expectedTopics: string[],\n irrelevantTopics: string[]\n): TopicCheckResult {\n const found: string[] = [];\n const missing: string[] = [];\n const noise: string[] = [];\n\n for (const topic of expectedTopics) {\n if (content.includes(topic.toLowerCase())) {\n found.push(topic);\n } else {\n missing.push(topic);\n }\n }\n\n for (const topic of irrelevantTopics) {\n if (content.includes(topic.toLowerCase())) {\n noise.push(topic);\n }\n }\n\n return { found, missing, noise };\n}\n"]}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import { B as BaseEvalConfig, b as EvalResult } from '../types-BJjlqNhg.cjs';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Retrieval Eval Types
|
|
5
|
+
*
|
|
6
|
+
* Types for evaluating RAG retrieval quality.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Configuration for retrieval evaluation.
|
|
11
|
+
*/
|
|
12
|
+
interface RetrievalEvalConfig extends BaseEvalConfig {
|
|
13
|
+
/** The Pinecone assistant name to evaluate */
|
|
14
|
+
assistantName: string;
|
|
15
|
+
/** Test queries with expected retrieval outcomes */
|
|
16
|
+
queries: RetrievalTestQuery[];
|
|
17
|
+
/** Number of chunks to retrieve per query (default: 5) */
|
|
18
|
+
topK?: number;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* A test query for retrieval evaluation.
|
|
22
|
+
*/
|
|
23
|
+
interface RetrievalTestQuery {
|
|
24
|
+
/** The query to test */
|
|
25
|
+
query: string;
|
|
26
|
+
/** Topics/keywords that SHOULD appear in retrieved chunks */
|
|
27
|
+
expectedTopics?: string[];
|
|
28
|
+
/** Topics/keywords that should NOT appear (noise) */
|
|
29
|
+
irrelevantTopics?: string[];
|
|
30
|
+
/** Minimum relevance score (0-1) for top chunk */
|
|
31
|
+
minRelevance?: number;
|
|
32
|
+
/** Description of what good retrieval looks like */
|
|
33
|
+
description?: string;
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* Result from retrieval evaluation.
|
|
37
|
+
*/
|
|
38
|
+
interface RetrievalEvalResult extends EvalResult {
|
|
39
|
+
/** Individual metric scores */
|
|
40
|
+
scores: {
|
|
41
|
+
/** Average relevance of retrieved chunks (0-100) */
|
|
42
|
+
relevance: number;
|
|
43
|
+
/** Percentage of expected topics found (0-100) */
|
|
44
|
+
recall: number;
|
|
45
|
+
/** Percentage of retrieved content that's relevant (0-100) */
|
|
46
|
+
precision: number;
|
|
47
|
+
/** Percentage of irrelevant content retrieved (0-100, lower is better) */
|
|
48
|
+
noiseRatio: number;
|
|
49
|
+
};
|
|
50
|
+
/** Results for each query */
|
|
51
|
+
queryResults: QueryRetrievalResult[];
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Result for a single query's retrieval.
|
|
55
|
+
*/
|
|
56
|
+
interface QueryRetrievalResult {
|
|
57
|
+
/** The query that was tested */
|
|
58
|
+
query: string;
|
|
59
|
+
/** Retrieved chunks with grading */
|
|
60
|
+
chunks: RetrievedChunk[];
|
|
61
|
+
/** Average relevance score for this query (0-100) */
|
|
62
|
+
relevanceScore: number;
|
|
63
|
+
/** Expected topics that were found */
|
|
64
|
+
foundTopics: string[];
|
|
65
|
+
/** Expected topics that were missing */
|
|
66
|
+
missingTopics: string[];
|
|
67
|
+
/** Irrelevant topics that appeared (noise) */
|
|
68
|
+
noiseTopics: string[];
|
|
69
|
+
}
|
|
70
|
+
/**
|
|
71
|
+
* A retrieved chunk with grading information.
|
|
72
|
+
*/
|
|
73
|
+
interface RetrievedChunk {
|
|
74
|
+
/** The chunk content */
|
|
75
|
+
content: string;
|
|
76
|
+
/** Vector similarity score from Pinecone */
|
|
77
|
+
score: number;
|
|
78
|
+
/** Source file name */
|
|
79
|
+
sourceFile: string;
|
|
80
|
+
/** LLM-graded relevance (0-100) */
|
|
81
|
+
relevanceGrade: number;
|
|
82
|
+
/** LLM reasoning for the relevance grade */
|
|
83
|
+
reasoning: string;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/**
|
|
87
|
+
* Retrieval Eval - Layer 2
|
|
88
|
+
*
|
|
89
|
+
* Evaluates whether RAG retrieves relevant chunks by testing
|
|
90
|
+
* relevance, recall, precision, and noise ratio.
|
|
91
|
+
*/
|
|
92
|
+
|
|
93
|
+
/**
|
|
94
|
+
* Evaluate the quality of RAG retrieval for a Pinecone assistant.
|
|
95
|
+
*/
|
|
96
|
+
declare function evaluateRetrieval(config: RetrievalEvalConfig): Promise<RetrievalEvalResult>;
|
|
97
|
+
|
|
98
|
+
export { type QueryRetrievalResult, type RetrievalEvalConfig, type RetrievalEvalResult, type RetrievalTestQuery, type RetrievedChunk, evaluateRetrieval };
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import { B as BaseEvalConfig, b as EvalResult } from '../types-BJjlqNhg.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Retrieval Eval Types
|
|
5
|
+
*
|
|
6
|
+
* Types for evaluating RAG retrieval quality.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Configuration for retrieval evaluation.
|
|
11
|
+
*/
|
|
12
|
+
interface RetrievalEvalConfig extends BaseEvalConfig {
|
|
13
|
+
/** The Pinecone assistant name to evaluate */
|
|
14
|
+
assistantName: string;
|
|
15
|
+
/** Test queries with expected retrieval outcomes */
|
|
16
|
+
queries: RetrievalTestQuery[];
|
|
17
|
+
/** Number of chunks to retrieve per query (default: 5) */
|
|
18
|
+
topK?: number;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* A test query for retrieval evaluation.
|
|
22
|
+
*/
|
|
23
|
+
interface RetrievalTestQuery {
|
|
24
|
+
/** The query to test */
|
|
25
|
+
query: string;
|
|
26
|
+
/** Topics/keywords that SHOULD appear in retrieved chunks */
|
|
27
|
+
expectedTopics?: string[];
|
|
28
|
+
/** Topics/keywords that should NOT appear (noise) */
|
|
29
|
+
irrelevantTopics?: string[];
|
|
30
|
+
/** Minimum relevance score (0-1) for top chunk */
|
|
31
|
+
minRelevance?: number;
|
|
32
|
+
/** Description of what good retrieval looks like */
|
|
33
|
+
description?: string;
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* Result from retrieval evaluation.
|
|
37
|
+
*/
|
|
38
|
+
interface RetrievalEvalResult extends EvalResult {
|
|
39
|
+
/** Individual metric scores */
|
|
40
|
+
scores: {
|
|
41
|
+
/** Average relevance of retrieved chunks (0-100) */
|
|
42
|
+
relevance: number;
|
|
43
|
+
/** Percentage of expected topics found (0-100) */
|
|
44
|
+
recall: number;
|
|
45
|
+
/** Percentage of retrieved content that's relevant (0-100) */
|
|
46
|
+
precision: number;
|
|
47
|
+
/** Percentage of irrelevant content retrieved (0-100, lower is better) */
|
|
48
|
+
noiseRatio: number;
|
|
49
|
+
};
|
|
50
|
+
/** Results for each query */
|
|
51
|
+
queryResults: QueryRetrievalResult[];
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Result for a single query's retrieval.
|
|
55
|
+
*/
|
|
56
|
+
interface QueryRetrievalResult {
|
|
57
|
+
/** The query that was tested */
|
|
58
|
+
query: string;
|
|
59
|
+
/** Retrieved chunks with grading */
|
|
60
|
+
chunks: RetrievedChunk[];
|
|
61
|
+
/** Average relevance score for this query (0-100) */
|
|
62
|
+
relevanceScore: number;
|
|
63
|
+
/** Expected topics that were found */
|
|
64
|
+
foundTopics: string[];
|
|
65
|
+
/** Expected topics that were missing */
|
|
66
|
+
missingTopics: string[];
|
|
67
|
+
/** Irrelevant topics that appeared (noise) */
|
|
68
|
+
noiseTopics: string[];
|
|
69
|
+
}
|
|
70
|
+
/**
|
|
71
|
+
* A retrieved chunk with grading information.
|
|
72
|
+
*/
|
|
73
|
+
interface RetrievedChunk {
|
|
74
|
+
/** The chunk content */
|
|
75
|
+
content: string;
|
|
76
|
+
/** Vector similarity score from Pinecone */
|
|
77
|
+
score: number;
|
|
78
|
+
/** Source file name */
|
|
79
|
+
sourceFile: string;
|
|
80
|
+
/** LLM-graded relevance (0-100) */
|
|
81
|
+
relevanceGrade: number;
|
|
82
|
+
/** LLM reasoning for the relevance grade */
|
|
83
|
+
reasoning: string;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/**
|
|
87
|
+
* Retrieval Eval - Layer 2
|
|
88
|
+
*
|
|
89
|
+
* Evaluates whether RAG retrieves relevant chunks by testing
|
|
90
|
+
* relevance, recall, precision, and noise ratio.
|
|
91
|
+
*/
|
|
92
|
+
|
|
93
|
+
/**
|
|
94
|
+
* Evaluate the quality of RAG retrieval for a Pinecone assistant.
|
|
95
|
+
*/
|
|
96
|
+
declare function evaluateRetrieval(config: RetrievalEvalConfig): Promise<RetrievalEvalResult>;
|
|
97
|
+
|
|
98
|
+
export { type QueryRetrievalResult, type RetrievalEvalConfig, type RetrievalEvalResult, type RetrievalTestQuery, type RetrievedChunk, evaluateRetrieval };
|