@kat-ai/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +114 -0
- package/dist/agent/index.cjs +468 -0
- package/dist/agent/index.cjs.map +1 -0
- package/dist/agent/index.d.cts +170 -0
- package/dist/agent/index.d.ts +170 -0
- package/dist/agent/index.js +466 -0
- package/dist/agent/index.js.map +1 -0
- package/dist/index.cjs +1043 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +93 -0
- package/dist/index.d.ts +93 -0
- package/dist/index.js +1032 -0
- package/dist/index.js.map +1 -0
- package/dist/introspection/index.cjs +476 -0
- package/dist/introspection/index.cjs.map +1 -0
- package/dist/introspection/index.d.cts +107 -0
- package/dist/introspection/index.d.ts +107 -0
- package/dist/introspection/index.js +474 -0
- package/dist/introspection/index.js.map +1 -0
- package/dist/retrieval/index.cjs +312 -0
- package/dist/retrieval/index.cjs.map +1 -0
- package/dist/retrieval/index.d.cts +98 -0
- package/dist/retrieval/index.d.ts +98 -0
- package/dist/retrieval/index.js +310 -0
- package/dist/retrieval/index.js.map +1 -0
- package/dist/types-BJjlqNhg.d.cts +112 -0
- package/dist/types-BJjlqNhg.d.ts +112 -0
- package/package.json +79 -0
|
@@ -0,0 +1,310 @@
|
|
|
1
|
+
import { Pinecone } from '@pinecone-database/pinecone';
|
|
2
|
+
import { createOpenAI } from '@ai-sdk/openai';
|
|
3
|
+
import { generateObject } from 'ai';
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
import { resolveDefaultOpenAiChatModelId } from '@kat/core';
|
|
6
|
+
|
|
7
|
+
// src/retrieval/index.ts
|
|
8
|
+
// Zod schema for a single-criterion grade produced by the LLM.
// Used as the structured-output schema for generateObject calls.
var GradeSchema = z.object({
  score: z
    .number()
    .min(0)
    .max(100)
    .describe("Score from 0-100"),
  reasoning: z
    .string()
    .describe("Explanation for the score"),
  examples: z
    .array(z.string())
    .optional()
    .describe("Specific examples that influenced the score")
});
|
|
13
|
+
// Zod schema for grading one piece of content against several criteria
// at once: a per-criterion score list plus an overall rationale.
var MultiCriteriaGradeSchema = z.object({
  scores: z.array(
    z.object({
      criterion: z.string(),
      score: z.number().min(0).max(100),
      reasoning: z.string()
    })
  ),
  overallReasoning: z.string()
});
|
|
21
|
+
/**
 * Build an LLM-backed grader around the OpenAI chat API.
 *
 * Resolves the API key from `config.openaiApiKey` or the OPENAI_API_KEY
 * environment variable (throws if neither is set), picks the model from
 * `config.model` or the project default, and uses a low temperature
 * (default 0.1) for stable grading.
 *
 * Returns an object exposing `grade`, `gradeMultiple`, and `gradeRelevance`.
 */
function createLLMGrader(config = {}) {
  const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error("OPENAI_API_KEY is required for LLM grading");
  }
  const openai = createOpenAI({ apiKey });
  const modelId = config.model || resolveDefaultOpenAiChatModelId();
  const temperature = config.temperature ?? 0.1;

  // Every grading method funnels through one structured-output call.
  const ask = (schema, prompt) =>
    generateObject({
      model: openai(modelId),
      schema,
      prompt,
      temperature
    });

  return {
    // Grade content against a single named criterion, with an optional rubric.
    async grade(content, criterion, rubric) {
      const prompt = `You are an expert evaluator. Grade the following content on the criterion "${criterion}".

${rubric ? `Rubric: ${rubric}
` : ""}
Content to evaluate:
"""
${content}
"""

Provide a score from 0-100 and explain your reasoning.`;
      const result = await ask(GradeSchema, prompt);
      return result.object;
    },

    // Grade content against several weighted criteria in one call;
    // returns [{ criterion, score, reasoning }] (overallReasoning is dropped).
    async gradeMultiple(content, criteria) {
      const criteriaDescription = criteria
        .map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`)
        .join("\n");
      const prompt = `You are an expert evaluator. Grade the following content on multiple criteria.

Criteria:
${criteriaDescription}

Content to evaluate:
"""
${content}
"""

For each criterion, provide a score from 0-100 and explain your reasoning.`;
      const result = await ask(MultiCriteriaGradeSchema, prompt);
      return result.object.scores.map((entry) => ({
        criterion: entry.criterion,
        score: entry.score,
        reasoning: entry.reasoning
      }));
    },

    // Grade how relevant `content` is to `query` (100 = fully answers it).
    async gradeRelevance(query, content) {
      const prompt = `You are an expert evaluator. Grade how relevant the following content is to the given query.

Query: "${query}"

Content:
"""
${content}
"""

A score of 100 means the content directly and completely answers the query.
A score of 0 means the content is completely irrelevant.

Provide a score from 0-100 and explain your reasoning.`;
      const { object } = await ask(GradeSchema, prompt);
      return {
        score: object.score,
        reasoning: object.reasoning
      };
    }
  };
}
|
|
101
|
+
|
|
102
|
+
// src/utils/metrics.ts
|
|
103
|
+
/**
 * Weighted average of { score, weight } entries, rounded to the nearest
 * integer. Returns 0 for an empty list or when all weights sum to zero
 * (avoids a divide-by-zero).
 */
function calculateWeightedScore(scores) {
  if (scores.length === 0) return 0;
  let weightTotal = 0;
  let weightedTotal = 0;
  for (const entry of scores) {
    weightTotal += entry.weight;
    weightedTotal += entry.score * entry.weight;
  }
  if (weightTotal === 0) return 0;
  return Math.round(weightedTotal / weightTotal);
}
|
|
110
|
+
/** Arithmetic mean of a list of numbers; 0 for an empty list. */
function average(numbers) {
  if (!numbers.length) return 0;
  let total = 0;
  for (const value of numbers) {
    total += value;
  }
  return total / numbers.length;
}
|
|
114
|
+
/**
 * Restrict `value` to the [min, max] interval.
 * Implemented as max(min, min(max, value)), so if min > max the lower
 * bound wins — same as the usual Math.max/Math.min composition.
 */
function clamp(value, min, max) {
  const upperBounded = Math.min(max, value);
  return Math.max(min, upperBounded);
}
|
|
117
|
+
/** Round a raw score and pin it into the 0-100 range. */
function normalizeScore(score) {
  return Math.max(0, Math.min(100, Math.round(score)));
}
|
|
120
|
+
|
|
121
|
+
// src/utils/reporters.ts
|
|
122
|
+
/** Convert a camelCase metric key to Title Case words, e.g. "noiseRatio" -> "Noise Ratio". */
function formatScoreName(name) {
  const spaced = name.replace(/([A-Z])/g, " $1");
  const capitalized = spaced.replace(/^./, (first) => first.toUpperCase());
  return capitalized.trim();
}
|
|
125
|
+
/**
 * Produce a one-sentence human-readable summary of a metric map.
 *
 * @param scores - map of camelCase metric name -> 0-100 score
 * @param thresholds - { good, acceptable } cut-offs (defaults 80 / 60)
 * @returns sentence(s) joined with ". " and terminated with "."
 */
function generateSummary(scores, thresholds = { good: 80, acceptable: 60 }) {
  const entries = Object.entries(scores);
  // Guard: an empty score map would otherwise divide by zero, making the
  // average NaN and silently reporting "below acceptable thresholds".
  if (entries.length === 0) {
    return "No metrics were evaluated.";
  }
  const avgScore = entries.reduce((sum, [, score]) => sum + score, 0) / entries.length;
  const goodMetrics = entries.filter(([, score]) => score >= thresholds.good).map(([name]) => name);
  const poorMetrics = entries.filter(([, score]) => score < thresholds.acceptable).map(([name]) => name);
  const parts = [];
  // Headline based on the unweighted average of all metrics.
  if (avgScore >= thresholds.good) {
    parts.push("Strong overall performance");
  } else if (avgScore >= thresholds.acceptable) {
    parts.push("Acceptable performance with room for improvement");
  } else {
    parts.push("Performance below acceptable thresholds");
  }
  // Call out the individual strong and weak metrics by display name.
  if (goodMetrics.length > 0) {
    parts.push(`Strong: ${goodMetrics.map(formatScoreName).join(", ")}`);
  }
  if (poorMetrics.length > 0) {
    parts.push(`Needs improvement: ${poorMetrics.map(formatScoreName).join(", ")}`);
  }
  return parts.join(". ") + ".";
}
|
|
146
|
+
|
|
147
|
+
// src/retrieval/index.ts
|
|
148
|
+
/**
 * Evaluate retrieval quality for a Pinecone assistant against a set of
 * test queries.
 *
 * Each query is run (sequentially) through evaluateQueryRetrieval, then the
 * per-query results are aggregated into four 0-100 metrics — relevance,
 * recall, precision, noiseRatio — plus a weighted overall score, evidence
 * entries, and a text summary.
 *
 * Requires PINECONE_API_KEY (or config.pineconeApiKey); createLLMGrader
 * additionally requires an OpenAI key.
 *
 * @param config - { assistantName, queries, topK?, pineconeApiKey?, openaiApiKey?, verbose? }
 * @returns { passed, overallScore, scores, evidence, summary, duration, queryResults }
 * @throws Error when `queries` is empty or the Pinecone API key is missing
 */
async function evaluateRetrieval(config) {
  const startTime = Date.now();
  if (config.queries.length === 0) {
    throw new Error("At least one query is required for retrieval eval");
  }
  const apiKey = config.pineconeApiKey || process.env.PINECONE_API_KEY;
  if (!apiKey) {
    throw new Error("PINECONE_API_KEY is required for retrieval eval");
  }
  const pinecone = new Pinecone({ apiKey });
  const assistant = pinecone.assistant(config.assistantName);
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const topK = config.topK || 5;
  const queryResults = [];
  // Queries are evaluated one at a time (sequential awaits).
  for (const testQuery of config.queries) {
    const result = await evaluateQueryRetrieval(
      assistant,
      grader,
      testQuery,
      topK,
      config.verbose
    );
    queryResults.push(result);
  }
  // Relevance: mean of per-query LLM relevance scores, clamped to 0-100.
  const relevanceScores = queryResults.map((r) => r.relevanceScore);
  const relevance = normalizeScore(average(relevanceScores));
  // Recall: share of expected topics that were found across all queries;
  // 100 when no topics were expected at all.
  const totalExpected = queryResults.reduce(
    (sum, r) => sum + r.foundTopics.length + r.missingTopics.length,
    0
  );
  const totalFound = queryResults.reduce((sum, r) => sum + r.foundTopics.length, 0);
  const recall = totalExpected === 0 ? 100 : normalizeScore(totalFound / totalExpected * 100);
  // Precision: share of retrieved chunks with an LLM relevance grade >= 50;
  // 100 when nothing was retrieved.
  const allChunks = queryResults.flatMap((r) => r.chunks);
  const relevantChunks = allChunks.filter((c) => c.relevanceGrade >= 50);
  const precision = allChunks.length === 0 ? 100 : normalizeScore(relevantChunks.length / allChunks.length * 100);
  // Noise ratio: share of declared irrelevant topics that actually appeared
  // in retrieved content; 0 when no irrelevant topics were declared.
  const totalIrrelevant = queryResults.reduce(
    (sum, r) => sum + r.noiseTopics.length,
    0
  );
  const totalIrrelevantExpected = config.queries.reduce(
    (sum, q) => sum + (q.irrelevantTopics?.length || 0),
    0
  );
  const noiseRatio = totalIrrelevantExpected === 0 ? 0 : normalizeScore(totalIrrelevant / totalIrrelevantExpected * 100);
  const scores = { relevance, recall, precision, noiseRatio };
  // Weighted blend: relevance 35%, recall 30%, precision 25%, noise 10%.
  const overallScore = calculateWeightedScore([
    { score: relevance, weight: 0.35 },
    { score: recall, weight: 0.3 },
    { score: precision, weight: 0.25 },
    { score: 100 - noiseRatio, weight: 0.1 }
    // Invert noise ratio
  ]);
  // Per-criterion evidence; recall/noise include up to 3 example topics.
  const evidence = [
    {
      criterion: "relevance",
      score: relevance,
      reasoning: `Average relevance of retrieved chunks across ${queryResults.length} queries.`
    },
    {
      criterion: "recall",
      score: recall,
      reasoning: `Found ${totalFound}/${totalExpected} expected topics in retrieved content.`,
      examples: queryResults.flatMap((r) => r.missingTopics).slice(0, 3)
    },
    {
      criterion: "precision",
      score: precision,
      reasoning: `${relevantChunks.length}/${allChunks.length} retrieved chunks were relevant.`
    },
    {
      criterion: "noiseRatio",
      score: noiseRatio,
      reasoning: `${totalIrrelevant} irrelevant topics appeared in retrieved content.`,
      examples: queryResults.flatMap((r) => r.noiseTopics).slice(0, 3)
    }
  ];
  return {
    // Pass requires overall >= 70 AND noise ratio <= 30.
    passed: overallScore >= 70 && noiseRatio <= 30,
    overallScore,
    scores,
    evidence,
    // Summary inverts noiseRatio so that higher always means better.
    summary: generateSummary({ relevance, recall, precision, noiseRatio: 100 - noiseRatio }),
    duration: Date.now() - startTime,
    queryResults
  };
}
|
|
234
|
+
/**
 * Evaluate retrieval for a single test query.
 *
 * Fetches context snippets from the assistant, LLM-grades each snippet's
 * relevance to the query, then scans the combined (truncated, lowercased)
 * chunk text for the query's expected and irrelevant topics.
 *
 * If the context call throws, a zeroed result is returned in which every
 * expected topic counts as missing.
 */
async function evaluateQueryRetrieval(assistant, grader, testQuery, topK, verbose) {
  const expected = testQuery.expectedTopics || [];
  const irrelevant = testQuery.irrelevantTopics || [];

  let contextResult;
  try {
    contextResult = await assistant.context({
      query: testQuery.query,
      topK
    });
  } catch (error) {
    // Retrieval failed entirely — report zero relevance, all topics missing.
    return {
      query: testQuery.query,
      chunks: [],
      relevanceScore: 0,
      foundTopics: [],
      missingTopics: expected,
      noiseTopics: []
    };
  }

  const snippets = contextResult.snippets || [];
  const chunks = [];
  // Grade snippets one at a time (sequential awaits).
  for (const snippet of snippets) {
    const relevanceResult = await grader.gradeRelevance(testQuery.query, snippet.content);
    const ref = snippet.reference;
    chunks.push({
      // Truncate for storage
      content: snippet.content.slice(0, 500),
      score: snippet.score,
      sourceFile: ref?.file?.name || ref?.name || "unknown",
      relevanceGrade: relevanceResult.score,
      reasoning: relevanceResult.reasoning
    });
  }

  const relevanceScore = chunks.length ? average(chunks.map((chunk) => chunk.relevanceGrade)) : 0;
  // Topic matching runs on the truncated chunk text, lowercased.
  const haystack = chunks.map((chunk) => chunk.content).join(" ").toLowerCase();
  const topicResult = checkTopics(haystack, expected, irrelevant);

  if (verbose) {
    console.log(`Query: "${testQuery.query}"`);
    console.log(`  Relevance: ${relevanceScore.toFixed(1)}/100`);
    console.log(`  Topics found: ${topicResult.found.join(", ") || "none"}`);
    console.log(`  Topics missing: ${topicResult.missing.join(", ") || "none"}`);
    console.log(`  Noise: ${topicResult.noise.join(", ") || "none"}`);
  }

  return {
    query: testQuery.query,
    chunks,
    relevanceScore,
    foundTopics: topicResult.found,
    missingTopics: topicResult.missing,
    noiseTopics: topicResult.noise
  };
}
|
|
289
|
+
/**
 * Substring-scan `content` for expected and irrelevant topics.
 *
 * Topics are lowercased before matching; `content` is assumed to already be
 * lowercase (the caller lowercases it). Returns { found, missing, noise },
 * each preserving the input topic order and original casing.
 */
function checkTopics(content, expectedTopics, irrelevantTopics) {
  const contains = (topic) => content.includes(topic.toLowerCase());
  const found = expectedTopics.filter(contains);
  const missing = expectedTopics.filter((topic) => !contains(topic));
  const noise = irrelevantTopics.filter(contains);
  return { found, missing, noise };
}
|
|
307
|
+
|
|
308
|
+
export { evaluateRetrieval };
|
|
309
|
+
//# sourceMappingURL=index.js.map
|
|
310
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/utils/llm-grader.ts","../../src/utils/metrics.ts","../../src/utils/reporters.ts","../../src/retrieval/index.ts"],"names":[],"mappings":";;;;;;;AAgBA,IAAM,WAAA,GAAc,EAAE,MAAA,CAAO;AAAA,EAC3B,KAAA,EAAO,CAAA,CAAE,MAAA,EAAO,CAAE,GAAA,CAAI,CAAC,CAAA,CAAE,GAAA,CAAI,GAAG,CAAA,CAAE,QAAA,CAAS,kBAAkB,CAAA;AAAA,EAC7D,SAAA,EAAW,CAAA,CAAE,MAAA,EAAO,CAAE,SAAS,2BAA2B,CAAA;AAAA,EAC1D,QAAA,EAAU,CAAA,CAAE,KAAA,CAAM,CAAA,CAAE,MAAA,EAAQ,CAAA,CAAE,QAAA,EAAS,CAAE,QAAA,CAAS,6CAA6C;AACjG,CAAC,CAAA;AAED,IAAM,wBAAA,GAA2B,EAAE,MAAA,CAAO;AAAA,EACxC,MAAA,EAAQ,CAAA,CAAE,KAAA,CAAM,CAAA,CAAE,MAAA,CAAO;AAAA,IACvB,SAAA,EAAW,EAAE,MAAA,EAAO;AAAA,IACpB,KAAA,EAAO,EAAE,MAAA,EAAO,CAAE,IAAI,CAAC,CAAA,CAAE,IAAI,GAAG,CAAA;AAAA,IAChC,SAAA,EAAW,EAAE,MAAA;AAAO,GACrB,CAAC,CAAA;AAAA,EACF,gBAAA,EAAkB,EAAE,MAAA;AACtB,CAAC,CAAA;AAiCM,SAAS,eAAA,CAAgB,MAAA,GAA0B,EAAC,EAAc;AACvE,EAAA,MAAM,MAAA,GAAS,MAAA,CAAO,YAAA,IAAgB,OAAA,CAAQ,GAAA,CAAI,cAAA;AAClD,EAAA,IAAI,CAAC,MAAA,EAAQ;AACX,IAAA,MAAM,IAAI,MAAM,4CAA4C,CAAA;AAAA,EAC9D;AAEA,EAAA,MAAM,MAAA,GAAS,YAAA,CAAa,EAAE,MAAA,EAAQ,CAAA;AACtC,EAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,KAAA,IAAS,+BAAA,EAAgC;AAC9D,EAAA,MAAM,WAAA,GAAc,OAAO,WAAA,IAAe,GAAA;AAE1C,EAAA,OAAO;AAAA,IACL,MAAM,KAAA,CAAM,OAAA,EAAiB,SAAA,EAAmB,MAAA,EAAiB;AAC/D,MAAA,MAAM,MAAA,GAAS,8EAA8E,SAAS,CAAA;;AAAA,EAE1G,MAAA,GAAS,WAAW,MAAM;AAAA,CAAA,GAAO,EAAE;AAAA;AAAA;AAAA,EAGnC,OAAO;AAAA;;AAAA,sDAAA,CAAA;AAKH,MAAA,MAAM,MAAA,GAAS,MAAM,cAAA,CAAe;AAAA,QAClC,KAAA,EAAO,OAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,WAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO,MAAA,CAAO,MAAA;AAAA,IAChB,CAAA;AAAA,IAEA,MAAM,aAAA,CAAc,OAAA,EAAiB,QAAA,EAA2B;AAC9D,MAAA,MAAM,sBAAsB,QAAA,CACzB,GAAA,CAAI,CAAC,CAAA,KAAM,KAAK,CAAA,CAAE,IAAI,CAAA,EAAA,EAAK,CAAA,CAAE,WAAW,CAAA,UAAA,EAAa,CAAA,CAAE,MAAM,CAAA,CAAA,CAAG,CAAA,CAChE,KAAK,IAAI,CAAA;AAEZ,MAAA,MAAM,MAAA,GAAS,CAAA;;AAAA;AAAA,EAGnB,mBAAmB;;AAAA;AAAA;AAAA,EAInB,OAAO;AAAA;;AAAA,0EAAA,CAAA;AAKH,MAAA,MAAM,MAAA,GAAS,MAAM,cAAA,CAAe;AAAA,QAClC,KAAA,EAAO,OAAO,KAAK,CAAA;AAAA,QACnB,M
AAA,EAAQ,wBAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO,MAAA,CAAO,MAAA,CAAO,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,MAAO;AAAA,QACtC,WAAW,CAAA,CAAE,SAAA;AAAA,QACb,OAAO,CAAA,CAAE,KAAA;AAAA,QACT,WAAW,CAAA,CAAE;AAAA,OACf,CAAE,CAAA;AAAA,IACJ,CAAA;AAAA,IAEA,MAAM,cAAA,CAAe,KAAA,EAAe,OAAA,EAAiB;AACnD,MAAA,MAAM,MAAA,GAAS,CAAA;;AAAA,QAAA,EAEX,KAAK,CAAA;;AAAA;AAAA;AAAA,EAIb,OAAO;AAAA;;AAAA;AAAA;;AAAA,sDAAA,CAAA;AAQH,MAAA,MAAM,MAAA,GAAS,MAAM,cAAA,CAAe;AAAA,QAClC,KAAA,EAAO,OAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,WAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO;AAAA,QACL,KAAA,EAAO,OAAO,MAAA,CAAO,KAAA;AAAA,QACrB,SAAA,EAAW,OAAO,MAAA,CAAO;AAAA,OAC3B;AAAA,IACF;AAAA,GACF;AACF;;;AC7IO,SAAS,uBACd,MAAA,EACQ;AACR,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,CAAA;AAEhC,EAAA,MAAM,WAAA,GAAc,OAAO,MAAA,CAAO,CAAC,KAAK,CAAA,KAAM,GAAA,GAAM,CAAA,CAAE,MAAA,EAAQ,CAAC,CAAA;AAC/D,EAAA,IAAI,WAAA,KAAgB,GAAG,OAAO,CAAA;AAE9B,EAAA,MAAM,WAAA,GAAc,MAAA,CAAO,MAAA,CAAO,CAAC,GAAA,EAAK,CAAA,KAAM,GAAA,GAAM,CAAA,CAAE,KAAA,GAAQ,CAAA,CAAE,MAAA,EAAQ,CAAC,CAAA;AACzE,EAAA,OAAO,IAAA,CAAK,KAAA,CAAM,WAAA,GAAc,WAAW,CAAA;AAC7C;AAoBO,SAAS,QAAQ,OAAA,EAA2B;AACjD,EAAA,IAAI,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG,OAAO,CAAA;AACjC,EAAA,OAAO,OAAA,CAAQ,OAAO,CAAC,GAAA,EAAK,MAAM,GAAA,GAAM,CAAA,EAAG,CAAC,CAAA,GAAI,OAAA,CAAQ,MAAA;AAC1D;AAiDO,SAAS,KAAA,CAAM,KAAA,EAAe,GAAA,EAAa,GAAA,EAAqB;AACrE,EAAA,OAAO,KAAK,GAAA,CAAI,GAAA,EAAK,KAAK,GAAA,CAAI,GAAA,EAAK,KAAK,CAAC,CAAA;AAC3C;AAKO,SAAS,eAAe,KAAA,EAAuB;AACpD,EAAA,OAAO,MAAM,IAAA,CAAK,KAAA,CAAM,KAAK,CAAA,EAAG,GAAG,GAAG,CAAA;AACxC;;;ACxBA,SAAS,gBAAgB,IAAA,EAAsB;AAC7C,EAAA,OAAO,IAAA,CACJ,OAAA,CAAQ,UAAA,EAAY,KAAK,CAAA,CACzB,OAAA,CAAQ,IAAA,EAAM,CAAC,GAAA,KAAQ,GAAA,CAAI,WAAA,EAAa,EACxC,IAAA,EAAK;AACV;AA2DO,SAAS,eAAA,CACd,QACA,UAAA,GAAmD,EAAE,MAAM,EAAA,EAAI,UAAA,EAAY,IAAG,EACtE;AACR,EAAA,MAAM,OAAA,GAAU,MAAA,CAAO,OAAA,CAAQ,MAAM,CAAA;AACrC,EAAA,MAAM,QAAA,GAAW,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAA,EAAK,GAAG,KAAK,CAAA,KAAM,GAAA,GAAM,KAAA,EAAO,CAAC,IAAI,OAAA,CAAQ,MAAA;AAE9E,EAAA,MAAM,cAAc,OAAA,CAAQ,MAAA,CAAO,CA
AC,GAAG,KAAK,CAAA,KAAM,KAAA,IAAS,UAAA,CAAW,IAAI,CAAA,CAAE,GAAA,CAAI,CAAC,CAAC,IAAI,MAAM,IAAI,CAAA;AAChG,EAAA,MAAM,cAAc,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAG,KAAK,CAAA,KAAM,KAAA,GAAQ,UAAA,CAAW,UAAU,CAAA,CAAE,GAAA,CAAI,CAAC,CAAC,IAAI,MAAM,IAAI,CAAA;AAErG,EAAA,MAAM,QAAkB,EAAC;AAEzB,EAAA,IAAI,QAAA,IAAY,WAAW,IAAA,EAAM;AAC/B,IAAA,KAAA,CAAM,KAAK,4BAA4B,CAAA;AAAA,EACzC,CAAA,MAAA,IAAW,QAAA,IAAY,UAAA,CAAW,UAAA,EAAY;AAC5C,IAAA,KAAA,CAAM,KAAK,kDAAkD,CAAA;AAAA,EAC/D,CAAA,MAAO;AACL,IAAA,KAAA,CAAM,KAAK,yCAAyC,CAAA;AAAA,EACtD;AAEA,EAAA,IAAI,WAAA,CAAY,SAAS,CAAA,EAAG;AAC1B,IAAA,KAAA,CAAM,IAAA,CAAK,WAAW,WAAA,CAAY,GAAA,CAAI,eAAe,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA,CAAE,CAAA;AAAA,EACrE;AAEA,EAAA,IAAI,WAAA,CAAY,SAAS,CAAA,EAAG;AAC1B,IAAA,KAAA,CAAM,IAAA,CAAK,sBAAsB,WAAA,CAAY,GAAA,CAAI,eAAe,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA,CAAE,CAAA;AAAA,EAChF;AAEA,EAAA,OAAO,KAAA,CAAM,IAAA,CAAK,IAAI,CAAA,GAAI,GAAA;AAC5B;;;ACxIA,eAAsB,kBACpB,MAAA,EAC8B;AAC9B,EAAA,MAAM,SAAA,GAAY,KAAK,GAAA,EAAI;AAE3B,EAAA,IAAI,MAAA,CAAO,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG;AAC/B,IAAA,MAAM,IAAI,MAAM,mDAAmD,CAAA;AAAA,EACrE;AAGA,EAAA,MAAM,MAAA,GAAS,MAAA,CAAO,cAAA,IAAkB,OAAA,CAAQ,GAAA,CAAI,gBAAA;AACpD,EAAA,IAAI,CAAC,MAAA,EAAQ;AACX,IAAA,MAAM,IAAI,MAAM,iDAAiD,CAAA;AAAA,EACnE;AAEA,EAAA,MAAM,QAAA,GAAW,IAAI,QAAA,CAAS,EAAE,QAAQ,CAAA;AACxC,EAAA,MAAM,SAAA,GAAY,QAAA,CAAS,SAAA,CAAU,MAAA,CAAO,aAAa,CAAA;AACzD,EAAA,MAAM,SAAS,eAAA,CAAgB,EAAE,YAAA,EAAc,MAAA,CAAO,cAAc,CAAA;AACpE,EAAA,MAAM,IAAA,GAAO,OAAO,IAAA,IAAQ,CAAA;AAG5B,EAAA,MAAM,eAAuC,EAAC;AAE9C,EAAA,KAAA,MAAW,SAAA,IAAa,OAAO,OAAA,EAAS;AACtC,IAAA,MAAM,SAAS,MAAM,sBAAA;AAAA,MACnB,SAAA;AAAA,MACA,MAAA;AAAA,MACA,SAAA;AAAA,MACA,IAAA;AAAA,MACA,MAAA,CAAO;AAAA,KACT;AACA,IAAA,YAAA,CAAa,KAAK,MAAM,CAAA;AAAA,EAC1B;AAGA,EAAA,MAAM,kBAAkB,YAAA,CAAa,GAAA,CAAI,CAAC,CAAA,KAAM,EAAE,cAAc,CAAA;AAChE,EAAA,MAAM,SAAA,GAAY,cAAA,CAAe,OAAA,CAAQ,eAAe,CAAC,CAAA;AAGzD,EAAA,MAAM,gBAAgB,YAAA,CAAa,MAAA;AAAA,IACjC,CAAC,KAAK,CAAA,KAAM,GAAA,GAAM,EAAE,WAAA,CAAY,MAAA,GAAS,EAAE,aAAA,CAAc,MAAA;AAAA,IACzD;AAAA,GACF;AACA,EAAA,MAAM,UAAA,GAAa,YAAA,CA
Aa,MAAA,CAAO,CAAC,GAAA,EAAK,MAAM,GAAA,GAAM,CAAA,CAAE,WAAA,CAAY,MAAA,EAAQ,CAAC,CAAA;AAChF,EAAA,MAAM,SAAS,aAAA,KAAkB,CAAA,GAAI,MAAM,cAAA,CAAgB,UAAA,GAAa,gBAAiB,GAAG,CAAA;AAG5F,EAAA,MAAM,YAAY,YAAA,CAAa,OAAA,CAAQ,CAAC,CAAA,KAAM,EAAE,MAAM,CAAA;AACtD,EAAA,MAAM,iBAAiB,SAAA,CAAU,MAAA,CAAO,CAAC,CAAA,KAAM,CAAA,CAAE,kBAAkB,EAAE,CAAA;AACrE,EAAA,MAAM,SAAA,GAAY,SAAA,CAAU,MAAA,KAAW,CAAA,GACnC,GAAA,GACA,eAAgB,cAAA,CAAe,MAAA,GAAS,SAAA,CAAU,MAAA,GAAU,GAAG,CAAA;AAGnE,EAAA,MAAM,kBAAkB,YAAA,CAAa,MAAA;AAAA,IACnC,CAAC,GAAA,EAAK,CAAA,KAAM,GAAA,GAAM,EAAE,WAAA,CAAY,MAAA;AAAA,IAChC;AAAA,GACF;AACA,EAAA,MAAM,uBAAA,GAA0B,OAAO,OAAA,CAAQ,MAAA;AAAA,IAC7C,CAAC,GAAA,EAAK,CAAA,KAAM,GAAA,IAAO,CAAA,CAAE,kBAAkB,MAAA,IAAU,CAAA,CAAA;AAAA,IACjD;AAAA,GACF;AACA,EAAA,MAAM,aAAa,uBAAA,KAA4B,CAAA,GAC3C,IACA,cAAA,CAAgB,eAAA,GAAkB,0BAA2B,GAAG,CAAA;AAEpE,EAAA,MAAM,MAAA,GAAS,EAAE,SAAA,EAAW,MAAA,EAAQ,WAAW,UAAA,EAAW;AAG1D,EAAA,MAAM,eAAe,sBAAA,CAAuB;AAAA,IAC1C,EAAE,KAAA,EAAO,SAAA,EAAW,MAAA,EAAQ,IAAA,EAAK;AAAA,IACjC,EAAE,KAAA,EAAO,MAAA,EAAQ,MAAA,EAAQ,GAAA,EAAK;AAAA,IAC9B,EAAE,KAAA,EAAO,SAAA,EAAW,MAAA,EAAQ,IAAA,EAAK;AAAA,IACjC,EAAE,KAAA,EAAO,GAAA,GAAM,UAAA,EAAY,QAAQ,GAAA;AAAK;AAAA,GACzC,CAAA;AAGD,EAAA,MAAM,QAAA,GAA2B;AAAA,IAC/B;AAAA,MACE,SAAA,EAAW,WAAA;AAAA,MACX,KAAA,EAAO,SAAA;AAAA,MACP,SAAA,EAAW,CAAA,6CAAA,EAAgD,YAAA,CAAa,MAAM,CAAA,SAAA;AAAA,KAChF;AAAA,IACA;AAAA,MACE,SAAA,EAAW,QAAA;AAAA,MACX,KAAA,EAAO,MAAA;AAAA,MACP,SAAA,EAAW,CAAA,MAAA,EAAS,UAAU,CAAA,CAAA,EAAI,aAAa,CAAA,sCAAA,CAAA;AAAA,MAC/C,QAAA,EAAU,YAAA,CAAa,OAAA,CAAQ,CAAC,CAAA,KAAM,EAAE,aAAa,CAAA,CAAE,KAAA,CAAM,CAAA,EAAG,CAAC;AAAA,KACnE;AAAA,IACA;AAAA,MACE,SAAA,EAAW,WAAA;AAAA,MACX,KAAA,EAAO,SAAA;AAAA,MACP,WAAW,CAAA,EAAG,cAAA,CAAe,MAAM,CAAA,CAAA,EAAI,UAAU,MAAM,CAAA,gCAAA;AAAA,KACzD;AAAA,IACA;AAAA,MACE,SAAA,EAAW,YAAA;AAAA,MACX,KAAA,EAAO,UAAA;AAAA,MACP,SAAA,EAAW,GAAG,eAAe,CAAA,iDAAA,CAAA;AAAA,MAC7B,QAAA,EAAU,YAAA,CAAa,OAAA,CAAQ,CAAC,CAAA,KAAM,EAAE,WAAW,CAAA,CAAE,KAAA,CAAM,CAAA,EAAG,CAAC;AAAA;AACjE,GACF;AAEA,EAAA,OAAO;AAAA,IACL,MAAA,EAAQ,YAAA,IAAgB,EAAA,IAAM,UAAA,IAAc,EA
AA;AAAA,IAC5C,YAAA;AAAA,IACA,MAAA;AAAA,IACA,QAAA;AAAA,IACA,OAAA,EAAS,gBAAgB,EAAE,SAAA,EAAW,QAAQ,SAAA,EAAW,UAAA,EAAY,GAAA,GAAM,UAAA,EAAY,CAAA;AAAA,IACvF,QAAA,EAAU,IAAA,CAAK,GAAA,EAAI,GAAI,SAAA;AAAA,IACvB;AAAA,GACF;AACF;AASA,eAAe,sBAAA,CACb,SAAA,EACA,MAAA,EACA,SAAA,EACA,MACA,OAAA,EAC+B;AAE/B,EAAA,IAAI,aAAA;AACJ,EAAA,IAAI;AACF,IAAA,aAAA,GAAgB,MAAM,UAAU,OAAA,CAAQ;AAAA,MACtC,OAAO,SAAA,CAAU,KAAA;AAAA,MACjB;AAAA,KACD,CAAA;AAAA,EACH,SAAS,KAAA,EAAO;AAEd,IAAA,OAAO;AAAA,MACL,OAAO,SAAA,CAAU,KAAA;AAAA,MACjB,QAAQ,EAAC;AAAA,MACT,cAAA,EAAgB,CAAA;AAAA,MAChB,aAAa,EAAC;AAAA,MACd,aAAA,EAAe,SAAA,CAAU,cAAA,IAAkB,EAAC;AAAA,MAC5C,aAAa;AAAC,KAChB;AAAA,EACF;AAEA,EAAA,MAAM,QAAA,GAAW,aAAA,CAAc,QAAA,IAAY,EAAC;AAG5C,EAAA,MAAM,SAA2B,EAAC;AAElC,EAAA,KAAA,MAAW,WAAW,QAAA,EAAU;AAC9B,IAAA,MAAM,kBAAkB,MAAM,MAAA,CAAO,eAAe,SAAA,CAAU,KAAA,EAAO,QAAQ,OAAO,CAAA;AAEpF,IAAA,MAAM,MAAM,OAAA,CAAQ,SAAA;AACpB,IAAA,MAAM,UAAA,GACH,GAAA,EAAK,IAAA,EAAkC,IAAA,IACxC,KAAK,IAAA,IACL,SAAA;AAEF,IAAA,MAAA,CAAO,IAAA,CAAK;AAAA,MACV,OAAA,EAAS,OAAA,CAAQ,OAAA,CAAQ,KAAA,CAAM,GAAG,GAAG,CAAA;AAAA;AAAA,MACrC,OAAO,OAAA,CAAQ,KAAA;AAAA,MACf,UAAA;AAAA,MACA,gBAAgB,eAAA,CAAgB,KAAA;AAAA,MAChC,WAAW,eAAA,CAAgB;AAAA,KAC5B,CAAA;AAAA,EACH;AAGA,EAAA,MAAM,cAAA,GAAiB,MAAA,CAAO,MAAA,KAAW,CAAA,GACrC,CAAA,GACA,OAAA,CAAQ,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,KAAM,CAAA,CAAE,cAAc,CAAC,CAAA;AAG/C,EAAA,MAAM,UAAA,GAAa,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,KAAM,CAAA,CAAE,OAAO,CAAA,CAAE,IAAA,CAAK,GAAG,CAAA,CAAE,WAAA,EAAY;AACtE,EAAA,MAAM,WAAA,GAAc,WAAA;AAAA,IAClB,UAAA;AAAA,IACA,SAAA,CAAU,kBAAkB,EAAC;AAAA,IAC7B,SAAA,CAAU,oBAAoB;AAAC,GACjC;AAEA,EAAA,IAAI,OAAA,EAAS;AACX,IAAA,OAAA,CAAQ,GAAA,CAAI,CAAA,QAAA,EAAW,SAAA,CAAU,KAAK,CAAA,CAAA,CAAG,CAAA;AACzC,IAAA,OAAA,CAAQ,IAAI,CAAA,aAAA,EAAgB,cAAA,CAAe,OAAA,CAAQ,CAAC,CAAC,CAAA,IAAA,CAAM,CAAA;AAC3D,IAAA,OAAA,CAAQ,GAAA,CAAI,mBAAmB,WAAA,CAAY,KAAA,CAAM,KAAK,IAAI,CAAA,IAAK,MAAM,CAAA,CAAE,CAAA;AACvE,IAAA,OAAA,CAAQ,GAAA,CAAI,qBAAqB,WAAA,CAAY,OAAA,CAAQ,KAAK,IAAI,CAAA,IAAK,MAAM,CAAA,CAAE,CAAA;AAC3E,IAAA,OAAA,CAAQ,GAAA,CAAI,YAAY,WAAA,CAAY,KAAA,CAA
M,KAAK,IAAI,CAAA,IAAK,MAAM,CAAA,CAAE,CAAA;AAAA,EAClE;AAEA,EAAA,OAAO;AAAA,IACL,OAAO,SAAA,CAAU,KAAA;AAAA,IACjB,MAAA;AAAA,IACA,cAAA;AAAA,IACA,aAAa,WAAA,CAAY,KAAA;AAAA,IACzB,eAAe,WAAA,CAAY,OAAA;AAAA,IAC3B,aAAa,WAAA,CAAY;AAAA,GAC3B;AACF;AASA,SAAS,WAAA,CACP,OAAA,EACA,cAAA,EACA,gBAAA,EACkB;AAClB,EAAA,MAAM,QAAkB,EAAC;AACzB,EAAA,MAAM,UAAoB,EAAC;AAC3B,EAAA,MAAM,QAAkB,EAAC;AAEzB,EAAA,KAAA,MAAW,SAAS,cAAA,EAAgB;AAClC,IAAA,IAAI,OAAA,CAAQ,QAAA,CAAS,KAAA,CAAM,WAAA,EAAa,CAAA,EAAG;AACzC,MAAA,KAAA,CAAM,KAAK,KAAK,CAAA;AAAA,IAClB,CAAA,MAAO;AACL,MAAA,OAAA,CAAQ,KAAK,KAAK,CAAA;AAAA,IACpB;AAAA,EACF;AAEA,EAAA,KAAA,MAAW,SAAS,gBAAA,EAAkB;AACpC,IAAA,IAAI,OAAA,CAAQ,QAAA,CAAS,KAAA,CAAM,WAAA,EAAa,CAAA,EAAG;AACzC,MAAA,KAAA,CAAM,KAAK,KAAK,CAAA;AAAA,IAClB;AAAA,EACF;AAEA,EAAA,OAAO,EAAE,KAAA,EAAO,OAAA,EAAS,KAAA,EAAM;AACjC","file":"index.js","sourcesContent":["/**\n * LLM Grading Utilities\n *\n * Uses OpenAI to grade content quality, relevance, and other metrics.\n */\n\nimport { createOpenAI } from '@ai-sdk/openai';\nimport { generateObject } from 'ai';\nimport { z } from 'zod';\nimport { resolveDefaultOpenAiChatModelId } from '@kat/core';\nimport type { LLMGraderConfig, EvalCriterion, EvalEvidence } from '../types.js';\n\n// ============================================================================\n// GRADING SCHEMAS\n// ============================================================================\n\nconst GradeSchema = z.object({\n score: z.number().min(0).max(100).describe('Score from 0-100'),\n reasoning: z.string().describe('Explanation for the score'),\n examples: z.array(z.string()).optional().describe('Specific examples that influenced the score'),\n});\n\nconst MultiCriteriaGradeSchema = z.object({\n scores: z.array(z.object({\n criterion: z.string(),\n score: z.number().min(0).max(100),\n reasoning: z.string(),\n })),\n overallReasoning: z.string(),\n});\n\n// ============================================================================\n// GRADER FACTORY\n// 
============================================================================\n\nexport interface LLMGrader {\n /**\n * Grade content against a single criterion.\n */\n grade(content: string, criterion: string, rubric?: string): Promise<{\n score: number;\n reasoning: string;\n examples?: string[];\n }>;\n\n /**\n * Grade content against multiple criteria.\n */\n gradeMultiple(content: string, criteria: EvalCriterion[]): Promise<EvalEvidence[]>;\n\n /**\n * Grade relevance of content to a query.\n */\n gradeRelevance(query: string, content: string): Promise<{\n score: number;\n reasoning: string;\n }>;\n}\n\n/**\n * Create an LLM grader with the given configuration.\n */\nexport function createLLMGrader(config: LLMGraderConfig = {}): LLMGrader {\n const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;\n if (!apiKey) {\n throw new Error('OPENAI_API_KEY is required for LLM grading');\n }\n\n const openai = createOpenAI({ apiKey });\n const model = config.model || resolveDefaultOpenAiChatModelId();\n const temperature = config.temperature ?? 0.1;\n\n return {\n async grade(content: string, criterion: string, rubric?: string) {\n const prompt = `You are an expert evaluator. Grade the following content on the criterion \"${criterion}\".\n\n${rubric ? `Rubric: ${rubric}\\n` : ''}\nContent to evaluate:\n\"\"\"\n${content}\n\"\"\"\n\nProvide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: GradeSchema,\n prompt,\n temperature,\n });\n\n return result.object;\n },\n\n async gradeMultiple(content: string, criteria: EvalCriterion[]) {\n const criteriaDescription = criteria\n .map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`)\n .join('\\n');\n\n const prompt = `You are an expert evaluator. 
Grade the following content on multiple criteria.\n\nCriteria:\n${criteriaDescription}\n\nContent to evaluate:\n\"\"\"\n${content}\n\"\"\"\n\nFor each criterion, provide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: MultiCriteriaGradeSchema,\n prompt,\n temperature,\n });\n\n return result.object.scores.map((s) => ({\n criterion: s.criterion,\n score: s.score,\n reasoning: s.reasoning,\n }));\n },\n\n async gradeRelevance(query: string, content: string) {\n const prompt = `You are an expert evaluator. Grade how relevant the following content is to the given query.\n\nQuery: \"${query}\"\n\nContent:\n\"\"\"\n${content}\n\"\"\"\n\nA score of 100 means the content directly and completely answers the query.\nA score of 0 means the content is completely irrelevant.\n\nProvide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: GradeSchema,\n prompt,\n temperature,\n });\n\n return {\n score: result.object.score,\n reasoning: result.object.reasoning,\n };\n },\n };\n}\n\n// ============================================================================\n// CONVENIENCE FUNCTIONS\n// ============================================================================\n\n/**\n * Grade content using a one-off grader instance.\n */\nexport async function gradeWithLLM(\n content: string,\n criterion: string,\n config: LLMGraderConfig = {}\n): Promise<{ score: number; reasoning: string; examples?: string[] }> {\n const grader = createLLMGrader(config);\n return grader.grade(content, criterion);\n}\n\n/**\n * Grade relevance using a one-off grader instance.\n */\nexport async function gradeRelevanceWithLLM(\n query: string,\n content: string,\n config: LLMGraderConfig = {}\n): Promise<{ score: number; reasoning: string }> {\n const grader = createLLMGrader(config);\n return grader.gradeRelevance(query, content);\n}\n","/**\n * Metric 
Calculation Helpers\n *\n * Pure functions for calculating scores and metrics.\n */\n\n/**\n * Calculate a weighted score from individual scores and weights.\n *\n * @param scores - Array of { score, weight } objects\n * @returns Weighted average score (0-100)\n */\nexport function calculateWeightedScore(\n scores: Array<{ score: number; weight: number }>\n): number {\n if (scores.length === 0) return 0;\n\n const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);\n if (totalWeight === 0) return 0;\n\n const weightedSum = scores.reduce((sum, s) => sum + s.score * s.weight, 0);\n return Math.round(weightedSum / totalWeight);\n}\n\n/**\n * Calculate percentage of found items vs expected items.\n *\n * @param found - Number of items found\n * @param expected - Number of items expected\n * @returns Percentage (0-100)\n */\nexport function calculatePercentage(found: number, expected: number): number {\n if (expected === 0) return 100; // Nothing expected, consider it perfect\n return Math.round((found / expected) * 100);\n}\n\n/**\n * Calculate the average of an array of numbers.\n *\n * @param numbers - Array of numbers\n * @returns Average value\n */\nexport function average(numbers: number[]): number {\n if (numbers.length === 0) return 0;\n return numbers.reduce((sum, n) => sum + n, 0) / numbers.length;\n}\n\n/**\n * Calculate precision: true positives / (true positives + false positives).\n *\n * @param truePositives - Number of correct positive predictions\n * @param falsePositives - Number of incorrect positive predictions\n * @returns Precision (0-100)\n */\nexport function calculatePrecision(\n truePositives: number,\n falsePositives: number\n): number {\n const total = truePositives + falsePositives;\n if (total === 0) return 100;\n return Math.round((truePositives / total) * 100);\n}\n\n/**\n * Calculate recall: true positives / (true positives + false negatives).\n *\n * @param truePositives - Number of correct positive predictions\n * @param 
falseNegatives - Number of missed positive predictions\n * @returns Recall (0-100)\n */\nexport function calculateRecall(\n truePositives: number,\n falseNegatives: number\n): number {\n const total = truePositives + falseNegatives;\n if (total === 0) return 100;\n return Math.round((truePositives / total) * 100);\n}\n\n/**\n * Calculate F1 score: harmonic mean of precision and recall.\n *\n * @param precision - Precision value (0-100)\n * @param recall - Recall value (0-100)\n * @returns F1 score (0-100)\n */\nexport function calculateF1(precision: number, recall: number): number {\n if (precision + recall === 0) return 0;\n return Math.round((2 * precision * recall) / (precision + recall));\n}\n\n/**\n * Clamp a value between min and max.\n */\nexport function clamp(value: number, min: number, max: number): number {\n return Math.max(min, Math.min(max, value));\n}\n\n/**\n * Normalize a score to 0-100 range.\n */\nexport function normalizeScore(score: number): number {\n return clamp(Math.round(score), 0, 100);\n}\n","/**\n * Report Formatting Utilities\n *\n * Format eval results for different output targets.\n */\n\nimport type { EvalResult, ReportOptions, EvalEvidence } from '../types.js';\n\n// ============================================================================\n// CONSOLE REPORTER\n// ============================================================================\n\n/**\n * Format an eval result for console output.\n */\nexport function formatConsoleReport(\n result: EvalResult,\n options: Partial<ReportOptions> = {}\n): string {\n const lines: string[] = [];\n const { includeEvidence = true } = options;\n\n // Header\n const status = result.passed ? '✓ PASSED' : '✗ FAILED';\n const statusColor = result.passed ? 
'\\x1b[32m' : '\\x1b[31m';\n const reset = '\\x1b[0m';\n\n lines.push('');\n lines.push('═'.repeat(60));\n lines.push(`${statusColor}${status}${reset} - Overall Score: ${result.overallScore}/100`);\n lines.push('═'.repeat(60));\n\n // Summary\n lines.push('');\n lines.push(`Summary: ${result.summary}`);\n lines.push(`Duration: ${result.duration}ms`);\n\n // Individual scores\n lines.push('');\n lines.push('Scores:');\n for (const [name, score] of Object.entries(result.scores)) {\n const bar = createProgressBar(score, 20);\n const formattedName = formatScoreName(name);\n lines.push(` ${formattedName.padEnd(20)} ${bar} ${score}/100`);\n }\n\n // Evidence (if requested)\n if (includeEvidence && result.evidence.length > 0) {\n lines.push('');\n lines.push('Evidence:');\n for (const evidence of result.evidence) {\n lines.push(` [${evidence.criterion}] (${evidence.score}/100)`);\n lines.push(` ${evidence.reasoning}`);\n if (evidence.examples && evidence.examples.length > 0) {\n for (const example of evidence.examples.slice(0, 3)) {\n lines.push(` - ${example}`);\n }\n }\n }\n }\n\n lines.push('');\n lines.push('─'.repeat(60));\n\n return lines.join('\\n');\n}\n\n/**\n * Create a text progress bar.\n */\nfunction createProgressBar(value: number, width: number): string {\n const filled = Math.round((value / 100) * width);\n const empty = width - filled;\n return `[${'█'.repeat(filled)}${'░'.repeat(empty)}]`;\n}\n\n/**\n * Format a score name for display (camelCase -> Title Case).\n */\nfunction formatScoreName(name: string): string {\n return name\n .replace(/([A-Z])/g, ' $1')\n .replace(/^./, (str) => str.toUpperCase())\n .trim();\n}\n\n// ============================================================================\n// JSON REPORTER\n// ============================================================================\n\n/**\n * Format an eval result as JSON.\n */\nexport function formatJsonReport(\n result: EvalResult,\n options: Partial<ReportOptions> = {}\n): string {\n 
const { includeEvidence = true, includeRawData = false } = options;\n\n const output: Record<string, unknown> = {\n passed: result.passed,\n overallScore: result.overallScore,\n scores: result.scores,\n summary: result.summary,\n duration: result.duration,\n };\n\n if (includeEvidence) {\n output.evidence = result.evidence;\n }\n\n // Include any additional properties from extended result types\n for (const [key, value] of Object.entries(result)) {\n if (\n !['passed', 'overallScore', 'scores', 'evidence', 'summary', 'duration'].includes(key) &&\n (includeRawData || !isRawData(value))\n ) {\n output[key] = value;\n }\n }\n\n return JSON.stringify(output, null, 2);\n}\n\n/**\n * Check if a value looks like raw data (large arrays/objects).\n */\nfunction isRawData(value: unknown): boolean {\n if (Array.isArray(value) && value.length > 10) return true;\n if (typeof value === 'object' && value !== null) {\n const keys = Object.keys(value);\n if (keys.length > 20) return true;\n }\n return false;\n}\n\n// ============================================================================\n// SUMMARY GENERATION\n// ============================================================================\n\n/**\n * Generate a human-readable summary from scores.\n */\nexport function generateSummary(\n scores: Record<string, number>,\n thresholds: { good: number; acceptable: number } = { good: 80, acceptable: 60 }\n): string {\n const entries = Object.entries(scores);\n const avgScore = entries.reduce((sum, [, score]) => sum + score, 0) / entries.length;\n\n const goodMetrics = entries.filter(([, score]) => score >= thresholds.good).map(([name]) => name);\n const poorMetrics = entries.filter(([, score]) => score < thresholds.acceptable).map(([name]) => name);\n\n const parts: string[] = [];\n\n if (avgScore >= thresholds.good) {\n parts.push('Strong overall performance');\n } else if (avgScore >= thresholds.acceptable) {\n parts.push('Acceptable performance with room for improvement');\n } 
else {\n parts.push('Performance below acceptable thresholds');\n }\n\n if (goodMetrics.length > 0) {\n parts.push(`Strong: ${goodMetrics.map(formatScoreName).join(', ')}`);\n }\n\n if (poorMetrics.length > 0) {\n parts.push(`Needs improvement: ${poorMetrics.map(formatScoreName).join(', ')}`);\n }\n\n return parts.join('. ') + '.';\n}\n\n// ============================================================================\n// PRINT HELPERS\n// ============================================================================\n\n/**\n * Print an eval result to the console.\n */\nexport function printReport(result: EvalResult, options: Partial<ReportOptions> = {}): void {\n const format = options.format || 'console';\n\n if (format === 'json') {\n console.log(formatJsonReport(result, options));\n } else {\n console.log(formatConsoleReport(result, options));\n }\n}\n","/**\n * Retrieval Eval - Layer 2\n *\n * Evaluates whether RAG retrieves relevant chunks by testing\n * relevance, recall, precision, and noise ratio.\n */\n\nimport { Pinecone, type ContextModel } from '@pinecone-database/pinecone';\nimport { createLLMGrader } from '../utils/llm-grader.js';\nimport { average, normalizeScore, calculateWeightedScore } from '../utils/metrics.js';\nimport { generateSummary } from '../utils/reporters.js';\nimport type { EvalEvidence } from '../types.js';\nimport type {\n RetrievalEvalConfig,\n RetrievalEvalResult,\n RetrievalTestQuery,\n QueryRetrievalResult,\n RetrievedChunk,\n TopicCheckResult,\n} from './types.js';\n\nexport type {\n RetrievalEvalConfig,\n RetrievalEvalResult,\n RetrievalTestQuery,\n QueryRetrievalResult,\n RetrievedChunk,\n} from './types.js';\n\n// ============================================================================\n// MAIN EVALUATION FUNCTION\n// ============================================================================\n\n/**\n * Evaluate the quality of RAG retrieval for a Pinecone assistant.\n */\nexport async function evaluateRetrieval(\n config: 
RetrievalEvalConfig\n): Promise<RetrievalEvalResult> {\n const startTime = Date.now();\n\n if (config.queries.length === 0) {\n throw new Error('At least one query is required for retrieval eval');\n }\n\n // Initialize Pinecone\n const apiKey = config.pineconeApiKey || process.env.PINECONE_API_KEY;\n if (!apiKey) {\n throw new Error('PINECONE_API_KEY is required for retrieval eval');\n }\n\n const pinecone = new Pinecone({ apiKey });\n const assistant = pinecone.assistant(config.assistantName);\n const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });\n const topK = config.topK || 5;\n\n // Evaluate each query\n const queryResults: QueryRetrievalResult[] = [];\n\n for (const testQuery of config.queries) {\n const result = await evaluateQueryRetrieval(\n assistant,\n grader,\n testQuery,\n topK,\n config.verbose\n );\n queryResults.push(result);\n }\n\n // Calculate aggregate scores\n const relevanceScores = queryResults.map((r) => r.relevanceScore);\n const relevance = normalizeScore(average(relevanceScores));\n\n // Calculate recall: percentage of expected topics found\n const totalExpected = queryResults.reduce(\n (sum, r) => sum + r.foundTopics.length + r.missingTopics.length,\n 0\n );\n const totalFound = queryResults.reduce((sum, r) => sum + r.foundTopics.length, 0);\n const recall = totalExpected === 0 ? 100 : normalizeScore((totalFound / totalExpected) * 100);\n\n // Calculate precision: percentage of retrieved content that's relevant\n const allChunks = queryResults.flatMap((r) => r.chunks);\n const relevantChunks = allChunks.filter((c) => c.relevanceGrade >= 50);\n const precision = allChunks.length === 0\n ? 
100\n : normalizeScore((relevantChunks.length / allChunks.length) * 100);\n\n // Calculate noise ratio: percentage of irrelevant topics found\n const totalIrrelevant = queryResults.reduce(\n (sum, r) => sum + r.noiseTopics.length,\n 0\n );\n const totalIrrelevantExpected = config.queries.reduce(\n (sum, q) => sum + (q.irrelevantTopics?.length || 0),\n 0\n );\n const noiseRatio = totalIrrelevantExpected === 0\n ? 0\n : normalizeScore((totalIrrelevant / totalIrrelevantExpected) * 100);\n\n const scores = { relevance, recall, precision, noiseRatio };\n\n // Calculate overall score (noise ratio is inverted - lower is better)\n const overallScore = calculateWeightedScore([\n { score: relevance, weight: 0.35 },\n { score: recall, weight: 0.30 },\n { score: precision, weight: 0.25 },\n { score: 100 - noiseRatio, weight: 0.10 }, // Invert noise ratio\n ]);\n\n // Build evidence\n const evidence: EvalEvidence[] = [\n {\n criterion: 'relevance',\n score: relevance,\n reasoning: `Average relevance of retrieved chunks across ${queryResults.length} queries.`,\n },\n {\n criterion: 'recall',\n score: recall,\n reasoning: `Found ${totalFound}/${totalExpected} expected topics in retrieved content.`,\n examples: queryResults.flatMap((r) => r.missingTopics).slice(0, 3),\n },\n {\n criterion: 'precision',\n score: precision,\n reasoning: `${relevantChunks.length}/${allChunks.length} retrieved chunks were relevant.`,\n },\n {\n criterion: 'noiseRatio',\n score: noiseRatio,\n reasoning: `${totalIrrelevant} irrelevant topics appeared in retrieved content.`,\n examples: queryResults.flatMap((r) => r.noiseTopics).slice(0, 3),\n },\n ];\n\n return {\n passed: overallScore >= 70 && noiseRatio <= 30,\n overallScore,\n scores,\n evidence,\n summary: generateSummary({ relevance, recall, precision, noiseRatio: 100 - noiseRatio }),\n duration: Date.now() - startTime,\n queryResults,\n };\n}\n\n// ============================================================================\n// QUERY 
EVALUATION\n// ============================================================================\n\n/**\n * Evaluate retrieval for a single query.\n */\nasync function evaluateQueryRetrieval(\n assistant: ReturnType<Pinecone['assistant']>,\n grader: ReturnType<typeof createLLMGrader>,\n testQuery: RetrievalTestQuery,\n topK: number,\n verbose?: boolean\n): Promise<QueryRetrievalResult> {\n // Retrieve chunks using Context API\n let contextResult: ContextModel;\n try {\n contextResult = await assistant.context({\n query: testQuery.query,\n topK,\n });\n } catch (error) {\n // Return empty result on error\n return {\n query: testQuery.query,\n chunks: [],\n relevanceScore: 0,\n foundTopics: [],\n missingTopics: testQuery.expectedTopics || [],\n noiseTopics: [],\n };\n }\n\n const snippets = contextResult.snippets || [];\n\n // Grade each chunk for relevance\n const chunks: RetrievedChunk[] = [];\n\n for (const snippet of snippets) {\n const relevanceResult = await grader.gradeRelevance(testQuery.query, snippet.content);\n\n const ref = snippet.reference as Record<string, unknown>;\n const sourceFile =\n (ref?.file as Record<string, unknown>)?.name ||\n ref?.name ||\n 'unknown';\n\n chunks.push({\n content: snippet.content.slice(0, 500), // Truncate for storage\n score: snippet.score,\n sourceFile: sourceFile as string,\n relevanceGrade: relevanceResult.score,\n reasoning: relevanceResult.reasoning,\n });\n }\n\n // Calculate average relevance\n const relevanceScore = chunks.length === 0\n ? 
0\n : average(chunks.map((c) => c.relevanceGrade));\n\n // Check for expected and irrelevant topics\n const allContent = chunks.map((c) => c.content).join(' ').toLowerCase();\n const topicResult = checkTopics(\n allContent,\n testQuery.expectedTopics || [],\n testQuery.irrelevantTopics || []\n );\n\n if (verbose) {\n console.log(`Query: \"${testQuery.query}\"`);\n console.log(` Relevance: ${relevanceScore.toFixed(1)}/100`);\n console.log(` Topics found: ${topicResult.found.join(', ') || 'none'}`);\n console.log(` Topics missing: ${topicResult.missing.join(', ') || 'none'}`);\n console.log(` Noise: ${topicResult.noise.join(', ') || 'none'}`);\n }\n\n return {\n query: testQuery.query,\n chunks,\n relevanceScore,\n foundTopics: topicResult.found,\n missingTopics: topicResult.missing,\n noiseTopics: topicResult.noise,\n };\n}\n\n// ============================================================================\n// HELPERS\n// ============================================================================\n\n/**\n * Check for expected and irrelevant topics in content.\n */\nfunction checkTopics(\n content: string,\n expectedTopics: string[],\n irrelevantTopics: string[]\n): TopicCheckResult {\n const found: string[] = [];\n const missing: string[] = [];\n const noise: string[] = [];\n\n for (const topic of expectedTopics) {\n if (content.includes(topic.toLowerCase())) {\n found.push(topic);\n } else {\n missing.push(topic);\n }\n }\n\n for (const topic of irrelevantTopics) {\n if (content.includes(topic.toLowerCase())) {\n noise.push(topic);\n }\n }\n\n return { found, missing, noise };\n}\n"]}
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @kat/eval - Shared Type Definitions
|
|
3
|
+
*
|
|
4
|
+
* Core types used across all eval layers (introspection, retrieval, agent).
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Base eval result with scores and evidence.
|
|
8
|
+
* All eval functions return a result that extends this interface.
|
|
9
|
+
*/
|
|
10
|
+
interface EvalResult {
|
|
11
|
+
/** Whether the eval passed (typically overallScore >= 70) */
|
|
12
|
+
passed: boolean;
|
|
13
|
+
/** Overall score from 0-100 */
|
|
14
|
+
overallScore: number;
|
|
15
|
+
/** Individual scores by metric name */
|
|
16
|
+
scores: Record<string, number>;
|
|
17
|
+
/** Evidence supporting the scores */
|
|
18
|
+
evidence: EvalEvidence[];
|
|
19
|
+
/** Human-readable summary of the evaluation */
|
|
20
|
+
summary: string;
|
|
21
|
+
/** Duration of the evaluation in milliseconds */
|
|
22
|
+
duration: number;
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Evidence supporting an eval score.
|
|
26
|
+
* Provides transparency into why a score was assigned.
|
|
27
|
+
*/
|
|
28
|
+
interface EvalEvidence {
|
|
29
|
+
/** Name of the criterion being evaluated */
|
|
30
|
+
criterion: string;
|
|
31
|
+
/** Score for this criterion (0-100) */
|
|
32
|
+
score: number;
|
|
33
|
+
/** LLM or algorithmic reasoning for the score */
|
|
34
|
+
reasoning: string;
|
|
35
|
+
/** Optional examples that influenced the score */
|
|
36
|
+
examples?: string[];
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Configuration for LLM-graded evaluation.
|
|
40
|
+
* Used when an LLM judges quality (e.g., relevance, helpfulness).
|
|
41
|
+
*/
|
|
42
|
+
interface LLMGraderConfig {
|
|
43
|
+
/** OpenAI API key (uses env var if not provided) */
|
|
44
|
+
openaiApiKey?: string;
|
|
45
|
+
/** Model to use for grading (default: gpt-4o-mini) */
|
|
46
|
+
model?: string;
|
|
47
|
+
/** Temperature for grading (default: 0.1 for consistency) */
|
|
48
|
+
temperature?: number;
|
|
49
|
+
/** Custom rubric for grading */
|
|
50
|
+
rubric?: string;
|
|
51
|
+
/** Specific criteria to evaluate */
|
|
52
|
+
criteria?: EvalCriterion[];
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* A single evaluation criterion.
|
|
56
|
+
*/
|
|
57
|
+
interface EvalCriterion {
|
|
58
|
+
/** Name of the criterion (e.g., "relevance", "accuracy") */
|
|
59
|
+
name: string;
|
|
60
|
+
/** Description of what this criterion measures */
|
|
61
|
+
description: string;
|
|
62
|
+
/** Weight for this criterion (0-1, sum should equal 1) */
|
|
63
|
+
weight: number;
|
|
64
|
+
}
|
|
65
|
+
/**
|
|
66
|
+
* Common configuration options for eval functions.
|
|
67
|
+
*/
|
|
68
|
+
interface BaseEvalConfig {
|
|
69
|
+
/** Pinecone API key (uses env var if not provided) */
|
|
70
|
+
pineconeApiKey?: string;
|
|
71
|
+
/** OpenAI API key for LLM grading (uses env var if not provided) */
|
|
72
|
+
openaiApiKey?: string;
|
|
73
|
+
/** Whether to run in verbose mode */
|
|
74
|
+
verbose?: boolean;
|
|
75
|
+
}
|
|
76
|
+
/**
|
|
77
|
+
* Output format for eval results.
|
|
78
|
+
*/
|
|
79
|
+
type OutputFormat = 'console' | 'json' | 'html';
|
|
80
|
+
/**
|
|
81
|
+
* Options for report generation.
|
|
82
|
+
*/
|
|
83
|
+
interface ReportOptions {
|
|
84
|
+
/** Output format */
|
|
85
|
+
format: OutputFormat;
|
|
86
|
+
/** Whether to include detailed evidence */
|
|
87
|
+
includeEvidence?: boolean;
|
|
88
|
+
/** Whether to include raw data */
|
|
89
|
+
includeRawData?: boolean;
|
|
90
|
+
}
|
|
91
|
+
/**
|
|
92
|
+
* Result from a sub-evaluation (used internally).
|
|
93
|
+
*/
|
|
94
|
+
interface SubEvalResult {
|
|
95
|
+
/** Score from 0-100 */
|
|
96
|
+
score: number;
|
|
97
|
+
/** Evidence for this sub-evaluation */
|
|
98
|
+
evidence: EvalEvidence[];
|
|
99
|
+
}
|
|
100
|
+
/**
|
|
101
|
+
* Result from entity/topic checking.
|
|
102
|
+
*/
|
|
103
|
+
interface CheckResult {
|
|
104
|
+
/** Items that were found */
|
|
105
|
+
found: string[];
|
|
106
|
+
/** Items that were expected but missing */
|
|
107
|
+
missing: string[];
|
|
108
|
+
/** Score based on found/expected ratio */
|
|
109
|
+
score: number;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
export type { BaseEvalConfig as B, CheckResult as C, EvalCriterion as E, LLMGraderConfig as L, OutputFormat as O, ReportOptions as R, SubEvalResult as S, EvalEvidence as a, EvalResult as b };
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @kat/eval - Shared Type Definitions
|
|
3
|
+
*
|
|
4
|
+
* Core types used across all eval layers (introspection, retrieval, agent).
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Base eval result with scores and evidence.
|
|
8
|
+
* All eval functions return a result that extends this interface.
|
|
9
|
+
*/
|
|
10
|
+
interface EvalResult {
|
|
11
|
+
/** Whether the eval passed (typically overallScore >= 70) */
|
|
12
|
+
passed: boolean;
|
|
13
|
+
/** Overall score from 0-100 */
|
|
14
|
+
overallScore: number;
|
|
15
|
+
/** Individual scores by metric name */
|
|
16
|
+
scores: Record<string, number>;
|
|
17
|
+
/** Evidence supporting the scores */
|
|
18
|
+
evidence: EvalEvidence[];
|
|
19
|
+
/** Human-readable summary of the evaluation */
|
|
20
|
+
summary: string;
|
|
21
|
+
/** Duration of the evaluation in milliseconds */
|
|
22
|
+
duration: number;
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Evidence supporting an eval score.
|
|
26
|
+
* Provides transparency into why a score was assigned.
|
|
27
|
+
*/
|
|
28
|
+
interface EvalEvidence {
|
|
29
|
+
/** Name of the criterion being evaluated */
|
|
30
|
+
criterion: string;
|
|
31
|
+
/** Score for this criterion (0-100) */
|
|
32
|
+
score: number;
|
|
33
|
+
/** LLM or algorithmic reasoning for the score */
|
|
34
|
+
reasoning: string;
|
|
35
|
+
/** Optional examples that influenced the score */
|
|
36
|
+
examples?: string[];
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Configuration for LLM-graded evaluation.
|
|
40
|
+
* Used when an LLM judges quality (e.g., relevance, helpfulness).
|
|
41
|
+
*/
|
|
42
|
+
interface LLMGraderConfig {
|
|
43
|
+
/** OpenAI API key (uses env var if not provided) */
|
|
44
|
+
openaiApiKey?: string;
|
|
45
|
+
/** Model to use for grading (default: gpt-4o-mini) */
|
|
46
|
+
model?: string;
|
|
47
|
+
/** Temperature for grading (default: 0.1 for consistency) */
|
|
48
|
+
temperature?: number;
|
|
49
|
+
/** Custom rubric for grading */
|
|
50
|
+
rubric?: string;
|
|
51
|
+
/** Specific criteria to evaluate */
|
|
52
|
+
criteria?: EvalCriterion[];
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* A single evaluation criterion.
|
|
56
|
+
*/
|
|
57
|
+
interface EvalCriterion {
|
|
58
|
+
/** Name of the criterion (e.g., "relevance", "accuracy") */
|
|
59
|
+
name: string;
|
|
60
|
+
/** Description of what this criterion measures */
|
|
61
|
+
description: string;
|
|
62
|
+
/** Weight for this criterion (0-1, sum should equal 1) */
|
|
63
|
+
weight: number;
|
|
64
|
+
}
|
|
65
|
+
/**
|
|
66
|
+
* Common configuration options for eval functions.
|
|
67
|
+
*/
|
|
68
|
+
interface BaseEvalConfig {
|
|
69
|
+
/** Pinecone API key (uses env var if not provided) */
|
|
70
|
+
pineconeApiKey?: string;
|
|
71
|
+
/** OpenAI API key for LLM grading (uses env var if not provided) */
|
|
72
|
+
openaiApiKey?: string;
|
|
73
|
+
/** Whether to run in verbose mode */
|
|
74
|
+
verbose?: boolean;
|
|
75
|
+
}
|
|
76
|
+
/**
|
|
77
|
+
* Output format for eval results.
|
|
78
|
+
*/
|
|
79
|
+
type OutputFormat = 'console' | 'json' | 'html';
|
|
80
|
+
/**
|
|
81
|
+
* Options for report generation.
|
|
82
|
+
*/
|
|
83
|
+
interface ReportOptions {
|
|
84
|
+
/** Output format */
|
|
85
|
+
format: OutputFormat;
|
|
86
|
+
/** Whether to include detailed evidence */
|
|
87
|
+
includeEvidence?: boolean;
|
|
88
|
+
/** Whether to include raw data */
|
|
89
|
+
includeRawData?: boolean;
|
|
90
|
+
}
|
|
91
|
+
/**
|
|
92
|
+
* Result from a sub-evaluation (used internally).
|
|
93
|
+
*/
|
|
94
|
+
interface SubEvalResult {
|
|
95
|
+
/** Score from 0-100 */
|
|
96
|
+
score: number;
|
|
97
|
+
/** Evidence for this sub-evaluation */
|
|
98
|
+
evidence: EvalEvidence[];
|
|
99
|
+
}
|
|
100
|
+
/**
|
|
101
|
+
* Result from entity/topic checking.
|
|
102
|
+
*/
|
|
103
|
+
interface CheckResult {
|
|
104
|
+
/** Items that were found */
|
|
105
|
+
found: string[];
|
|
106
|
+
/** Items that were expected but missing */
|
|
107
|
+
missing: string[];
|
|
108
|
+
/** Score based on found/expected ratio */
|
|
109
|
+
score: number;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
export type { BaseEvalConfig as B, CheckResult as C, EvalCriterion as E, LLMGraderConfig as L, OutputFormat as O, ReportOptions as R, SubEvalResult as S, EvalEvidence as a, EvalResult as b };
|