@kat-ai/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +114 -0
- package/dist/agent/index.cjs +468 -0
- package/dist/agent/index.cjs.map +1 -0
- package/dist/agent/index.d.cts +170 -0
- package/dist/agent/index.d.ts +170 -0
- package/dist/agent/index.js +466 -0
- package/dist/agent/index.js.map +1 -0
- package/dist/index.cjs +1043 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +93 -0
- package/dist/index.d.ts +93 -0
- package/dist/index.js +1032 -0
- package/dist/index.js.map +1 -0
- package/dist/introspection/index.cjs +476 -0
- package/dist/introspection/index.cjs.map +1 -0
- package/dist/introspection/index.d.cts +107 -0
- package/dist/introspection/index.d.ts +107 -0
- package/dist/introspection/index.js +474 -0
- package/dist/introspection/index.js.map +1 -0
- package/dist/retrieval/index.cjs +312 -0
- package/dist/retrieval/index.cjs.map +1 -0
- package/dist/retrieval/index.d.cts +98 -0
- package/dist/retrieval/index.d.ts +98 -0
- package/dist/retrieval/index.js +310 -0
- package/dist/retrieval/index.js.map +1 -0
- package/dist/types-BJjlqNhg.d.cts +112 -0
- package/dist/types-BJjlqNhg.d.ts +112 -0
- package/package.json +79 -0
|
@@ -0,0 +1,476 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var pinecone = require('@pinecone-database/pinecone');
|
|
4
|
+
var openai = require('@ai-sdk/openai');
|
|
5
|
+
var ai = require('ai');
|
|
6
|
+
var zod = require('zod');
|
|
7
|
+
var core = require('@kat/core');
|
|
8
|
+
|
|
9
|
+
// src/introspection/index.ts
|
|
10
|
+
// Structured-output contract for single-criterion grading calls: the LLM must
// return a bounded numeric score, its justification, and optional supporting
// examples. Passed to ai.generateObject as the response schema.
var GradeSchema = zod.z.object({
  score: zod.z.number().min(0).max(100).describe("Score from 0-100"),
  reasoning: zod.z.string().describe("Explanation for the score"),
  examples: zod.z.array(zod.z.string()).optional().describe("Specific examples that influenced the score")
});
// Structured-output contract for multi-criterion grading: one scored entry per
// criterion plus an overall free-text justification.
var MultiCriteriaGradeSchema = zod.z.object({
  scores: zod.z.array(zod.z.object({
    criterion: zod.z.string(),
    score: zod.z.number().min(0).max(100),
    reasoning: zod.z.string()
  })),
  overallReasoning: zod.z.string()
});
|
|
23
|
+
/**
 * Factory for an LLM-as-judge grader backed by the OpenAI provider of the
 * Vercel AI SDK.
 *
 * Config (all optional): `openaiApiKey` (falls back to OPENAI_API_KEY env var),
 * `model` (falls back to core.resolveDefaultOpenAiChatModelId()), and
 * `temperature` (defaults to 0.1 — kept low so grades are near-deterministic).
 *
 * Returns an object with three async methods:
 *  - grade(content, criterion, rubric?): single-criterion 0-100 grade.
 *  - gradeMultiple(content, criteria): one grade per weighted criterion.
 *  - gradeRelevance(query, content): 0-100 relevance of content to query.
 *
 * Throws if no API key can be resolved.
 */
function createLLMGrader(config = {}) {
  const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error("OPENAI_API_KEY is required for LLM grading");
  }
  const openai$1 = openai.createOpenAI({ apiKey });
  const model = config.model || core.resolveDefaultOpenAiChatModelId();
  // ?? (not ||) so an explicit temperature of 0 is honored.
  const temperature = config.temperature ?? 0.1;
  return {
    // Grade `content` against one named criterion; `rubric` is optional and
    // only interpolated into the prompt when provided.
    async grade(content, criterion, rubric) {
      const prompt = `You are an expert evaluator. Grade the following content on the criterion "${criterion}".

${rubric ? `Rubric: ${rubric}
` : ""}
Content to evaluate:
"""
${content}
"""

Provide a score from 0-100 and explain your reasoning.`;
      // generateObject constrains the model output to GradeSchema.
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      return result.object;
    },
    // Grade `content` against several criteria in a single model call.
    // Each criterion is an object with name/description/weight fields.
    async gradeMultiple(content, criteria) {
      const criteriaDescription = criteria.map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`).join("\n");
      const prompt = `You are an expert evaluator. Grade the following content on multiple criteria.

Criteria:
${criteriaDescription}

Content to evaluate:
"""
${content}
"""

For each criterion, provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: MultiCriteriaGradeSchema,
        prompt,
        temperature
      });
      // Re-shape to plain {criterion, score, reasoning} records; the schema's
      // overallReasoning field is intentionally dropped here.
      return result.object.scores.map((s) => ({
        criterion: s.criterion,
        score: s.score,
        reasoning: s.reasoning
      }));
    },
    // Grade how relevant `content` is to `query` (100 = direct, complete
    // answer; 0 = completely irrelevant). Returns only {score, reasoning}.
    async gradeRelevance(query, content) {
      const prompt = `You are an expert evaluator. Grade how relevant the following content is to the given query.

Query: "${query}"

Content:
"""
${content}
"""

A score of 100 means the content directly and completely answers the query.
A score of 0 means the content is completely irrelevant.

Provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      return {
        score: result.object.score,
        reasoning: result.object.reasoning
      };
    }
  };
}
|
|
103
|
+
|
|
104
|
+
// src/utils/metrics.ts
|
|
105
|
+
/**
 * Collapse a list of {score, weight} entries into one rounded weighted
 * average. Returns 0 for an empty list or when the weights sum to zero
 * (avoids a NaN from dividing by zero).
 */
function calculateWeightedScore(scores) {
  let weightTotal = 0;
  let weightedTotal = 0;
  for (const entry of scores) {
    weightTotal += entry.weight;
    weightedTotal += entry.score * entry.weight;
  }
  // Covers both the empty list and the all-zero-weights case.
  if (weightTotal === 0) return 0;
  return Math.round(weightedTotal / weightTotal);
}
|
|
112
|
+
/** Restrict `value` to the inclusive [min, max] range. */
function clamp(value, min, max) {
  const upperBounded = Math.min(max, value);
  return Math.max(min, upperBounded);
}
|
|
115
|
+
/**
 * Round a raw score to the nearest integer and pin it into the canonical
 * 0-100 range used by every evaluator in this module.
 */
function normalizeScore(score) {
  const rounded = Math.round(score);
  return Math.max(0, Math.min(100, rounded));
}
|
|
118
|
+
|
|
119
|
+
// src/utils/reporters.ts
|
|
120
|
+
/**
 * Turn a camelCase metric key into a human-readable title,
 * e.g. "slotAccuracy" -> "Slot Accuracy".
 */
function formatScoreName(name) {
  const spaced = name.replace(/([A-Z])/g, " $1");
  const capitalized = spaced.replace(/^./, (first) => first.toUpperCase());
  return capitalized.trim();
}
|
|
123
|
+
/**
 * Render a one-sentence human-readable summary of a metric score map
 * ({ metricName: 0-100 }). Thresholds split metrics into "Strong"
 * (>= good) and "Needs improvement" (< acceptable) buckets and classify
 * the average score.
 *
 * Fixed defect: an empty score map previously divided by zero, making the
 * average NaN and always reporting "Performance below acceptable
 * thresholds"; it now reports the absence of data explicitly.
 */
function generateSummary(scores, thresholds = { good: 80, acceptable: 60 }) {
  const entries = Object.entries(scores);
  // Guard the 0/0 division below.
  if (entries.length === 0) {
    return "No metrics available.";
  }
  const avgScore = entries.reduce((sum, [, score]) => sum + score, 0) / entries.length;
  const goodMetrics = entries.filter(([, score]) => score >= thresholds.good).map(([name]) => name);
  const poorMetrics = entries.filter(([, score]) => score < thresholds.acceptable).map(([name]) => name);
  const parts = [];
  // Headline sentence driven by the unweighted average.
  if (avgScore >= thresholds.good) {
    parts.push("Strong overall performance");
  } else if (avgScore >= thresholds.acceptable) {
    parts.push("Acceptable performance with room for improvement");
  } else {
    parts.push("Performance below acceptable thresholds");
  }
  if (goodMetrics.length > 0) {
    parts.push(`Strong: ${goodMetrics.map(formatScoreName).join(", ")}`);
  }
  if (poorMetrics.length > 0) {
    parts.push(`Needs improvement: ${poorMetrics.map(formatScoreName).join(", ")}`);
  }
  return parts.join(". ") + ".";
}
|
|
144
|
+
|
|
145
|
+
// src/introspection/index.ts
|
|
146
|
+
/**
 * Top-level introspection evaluation: connects to a Pinecone assistant and
 * runs four sub-evaluations in parallel (entity coverage, slot accuracy,
 * scope precision, capability match), then folds them into a single weighted
 * score (weights 0.25 / 0.3 / 0.25 / 0.2) with a pass threshold of 70.
 *
 * Config must provide `assistantName` and `manifest`; `pineconeApiKey` falls
 * back to the PINECONE_API_KEY env var (throws if neither is set).
 * `openaiApiKey` / `groundTruth` are forwarded to the sub-evaluations.
 *
 * Returns { passed, overallScore, scores, evidence, summary, duration,
 * details, manifest }.
 */
async function evaluateIntrospection(config) {
  const startTime = Date.now();
  const apiKey = config.pineconeApiKey || process.env.PINECONE_API_KEY;
  if (!apiKey) {
    throw new Error("PINECONE_API_KEY is required for introspection eval");
  }
  const pinecone$1 = new pinecone.Pinecone({ apiKey });
  const assistant = pinecone$1.assistant(config.assistantName);
  // The four sub-evaluations are independent, so run them concurrently.
  const [entityResult, slotResult, scopeResult, capabilityResult] = await Promise.all([
    evaluateEntityCoverage(assistant, config.manifest, config),
    evaluateSlotAccuracy(assistant, config.manifest, config),
    evaluateScopePrecision(assistant, config.manifest, config),
    evaluateCapabilityMatch(assistant, config.manifest, config)
  ]);
  const scores = {
    entityCoverage: entityResult.score,
    slotAccuracy: slotResult.score,
    scopePrecision: scopeResult.score,
    capabilityMatch: capabilityResult.score
  };
  // Slot accuracy carries the highest weight (0.3); weights sum to 1.
  const overallScore = calculateWeightedScore([
    { score: scores.entityCoverage, weight: 0.25 },
    { score: scores.slotAccuracy, weight: 0.3 },
    { score: scores.scopePrecision, weight: 0.25 },
    { score: scores.capabilityMatch, weight: 0.2 }
  ]);
  // Flatten per-criterion evidence entries into one list for reporting.
  const evidence = [
    ...entityResult.evidence,
    ...slotResult.evidence,
    ...scopeResult.evidence,
    ...capabilityResult.evidence
  ];
  return {
    passed: overallScore >= 70,
    overallScore,
    scores,
    evidence,
    summary: generateSummary(scores),
    duration: Date.now() - startTime,
    details: {
      missingEntities: entityResult.missing,
      incorrectSlots: slotResult.incorrect,
      scopeMisclassifications: scopeResult.misclassified,
      mismatchedCapabilities: capabilityResult.mismatched
    },
    manifest: config.manifest
  };
}
|
|
194
|
+
/**
 * Check how well the manifest covers the entities the knowledge base actually
 * talks about. Asks the assistant three discovery questions, has the LLM
 * grader extract entity names from each reply, merges in any
 * `groundTruth[].expectedEntities`, then scores the fraction of discovered
 * entities that substring-match an entity derived from the manifest.
 *
 * Returns { score, evidence, found, missing } where `missing` holds
 * discovered entities NOT present in the manifest (surfaced upstream as
 * `missingEntities`).
 */
async function evaluateEntityCoverage(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const manifestEntities = extractEntitiesFromManifest(manifest);
  // Open-ended probes intended to make the KB enumerate its own content.
  const discoveryQueries = [
    "What are the main topics you can help with?",
    "What products or entities do you have information about?",
    "List the categories of information you contain."
  ];
  const discoveredEntities = /* @__PURE__ */ new Set();
  for (const query of discoveryQueries) {
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: query }]
      });
      const entities = await extractEntitiesWithLLM(grader, response.message?.content || "");
      entities.forEach((e) => discoveredEntities.add(e.toLowerCase()));
    } catch {
      // Best effort: a failed discovery query simply contributes no entities.
    }
  }
  // Ground-truth expected entities are treated as "discovered" too, so the
  // manifest is also checked against them.
  if (config.groundTruth) {
    for (const gt of config.groundTruth) {
      if (gt.expectedEntities) {
        gt.expectedEntities.forEach((e) => discoveredEntities.add(e.toLowerCase()));
      }
    }
  }
  const found = [];
  const missing = [];
  for (const entity of discoveredEntities) {
    // Loose bidirectional substring match between discovered and manifest
    // entities (both lowercased).
    if (manifestEntities.some((me) => me.toLowerCase().includes(entity) || entity.includes(me.toLowerCase()))) {
      found.push(entity);
    } else {
      missing.push(entity);
    }
  }
  // NOTE(review): zero discovered entities yields a perfect 100 — i.e. a KB
  // that reveals nothing is never penalized here; confirm that is intended.
  const score = discoveredEntities.size === 0 ? 100 : normalizeScore(found.length / discoveredEntities.size * 100);
  return {
    score,
    evidence: [
      {
        criterion: "entityCoverage",
        score,
        reasoning: `Found ${found.length}/${discoveredEntities.size} expected entities in the manifest.`,
        examples: missing.slice(0, 5)
      }
    ],
    found,
    missing
  };
}
|
|
244
|
+
/**
 * Check that each slot declared in the manifest is actually relevant to the
 * knowledge base. For every slot, asks the assistant whether the slot matters
 * for the manifest's domain, then has the LLM grader score the slot's
 * relevance against the reply; >= 60 counts the slot as correct.
 *
 * A manifest with no slots gets a flat 50 (penalized, not failed).
 * Returns { score, evidence, correct, incorrect }.
 */
async function evaluateSlotAccuracy(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const slots = manifest.slots || [];
  if (slots.length === 0) {
    return {
      score: 50,
      // Penalize but don't fail for no slots
      evidence: [
        {
          criterion: "slotAccuracy",
          score: 50,
          reasoning: "No slots defined in manifest. Consider adding slots for common query parameters."
        }
      ],
      correct: [],
      incorrect: []
    };
  }
  const correct = [];
  const incorrect = [];
  for (const slot of slots) {
    const testQuery = `To answer questions about ${manifest.domain || "this topic"}, do I need to know the ${slot.name}?`;
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: testQuery }]
      });
      const evaluation = await grader.grade(
        `Slot: ${slot.name}
Description: ${slot.description || "N/A"}
KB Response: ${response.message?.content}`,
        "slot relevance",
        "Score 100 if the slot seems relevant to the KB content, 0 if completely irrelevant."
      );
      if (evaluation.score >= 60) {
        correct.push(slot.name);
      } else {
        incorrect.push(slot.name);
      }
    } catch {
      // Benefit of the doubt: an infrastructure failure counts the slot as
      // correct rather than penalizing the manifest.
      correct.push(slot.name);
    }
  }
  const score = normalizeScore(correct.length / slots.length * 100);
  return {
    score,
    evidence: [
      {
        criterion: "slotAccuracy",
        score,
        reasoning: `${correct.length}/${slots.length} slots appear relevant to the KB content.`,
        examples: incorrect
      }
    ],
    correct,
    incorrect
  };
}
|
|
301
|
+
/**
 * Measure how well the manifest's declared scope matches the KB's actual
 * answering behavior. Test cases come from `scope.inScopeExamples`,
 * `scope.outOfScopeExamples`, and any `groundTruth` entries carrying
 * `shouldBeInScope`. Each sampled query is sent to the assistant; the LLM
 * grader judges whether the reply was a substantive answer (score >= 50 ==
 * "treated as in-scope") and that verdict is compared with the expectation.
 *
 * Missing scope definition scores a flat 60; zero test cases a flat 70.
 *
 * Fixed defect: only the first 10 test cases are executed, but the score was
 * divided by the FULL testCases.length, so suites with more than 10 cases
 * were unfairly penalized. Score and totalClassifications now use the
 * sampled count.
 *
 * Returns { score, evidence, correctClassifications, totalClassifications,
 * misclassified }.
 */
async function evaluateScopePrecision(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  const scope = manifest.scope;
  if (!scope) {
    return {
      score: 60,
      // Penalize but don't fail
      evidence: [
        {
          criterion: "scopePrecision",
          score: 60,
          reasoning: "No scope definition in manifest. Consider defining in-scope and out-of-scope examples."
        }
      ],
      correctClassifications: 0,
      totalClassifications: 0,
      misclassified: []
    };
  }
  const testCases = [];
  if (scope.inScopeExamples) {
    scope.inScopeExamples.forEach((q) => testCases.push({ query: q, expectedInScope: true }));
  }
  if (scope.outOfScopeExamples) {
    scope.outOfScopeExamples.forEach((q) => testCases.push({ query: q, expectedInScope: false }));
  }
  if (config.groundTruth) {
    config.groundTruth.filter((gt) => gt.shouldBeInScope !== void 0).forEach((gt) => testCases.push({ query: gt.query, expectedInScope: gt.shouldBeInScope }));
  }
  if (testCases.length === 0) {
    return {
      score: 70,
      evidence: [
        {
          criterion: "scopePrecision",
          score: 70,
          reasoning: "No scope test cases available. Add in-scope and out-of-scope examples to test."
        }
      ],
      correctClassifications: 0,
      totalClassifications: 0,
      misclassified: []
    };
  }
  // Cap API usage: only the first 10 cases are executed, and the score is
  // computed over exactly the cases we ran.
  const sampledCases = testCases.slice(0, 10);
  let correctClassifications = 0;
  const misclassified = [];
  for (const testCase of sampledCases) {
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: testCase.query }]
      });
      const evaluation = await grader.grade(
        `Query: ${testCase.query}
KB Response: ${response.message?.content}`,
        "answerability",
        "Score 100 if the KB provided a substantive, on-topic answer. Score 0 if it said it cannot help or gave an off-topic response."
      );
      // Grader score is the proxy for "the KB treated this query as in-scope".
      const actuallyInScope = evaluation.score >= 50;
      if (actuallyInScope === testCase.expectedInScope) {
        correctClassifications++;
      } else {
        misclassified.push(`"${testCase.query}" (expected ${testCase.expectedInScope ? "in-scope" : "out-of-scope"})`);
      }
    } catch {
      // NOTE(review): chat/grading failures are silently skipped; they still
      // count against the score through the sampled denominator.
    }
  }
  const score = normalizeScore(correctClassifications / sampledCases.length * 100);
  return {
    score,
    evidence: [
      {
        criterion: "scopePrecision",
        score,
        reasoning: `${correctClassifications}/${sampledCases.length} scope classifications were correct.`,
        examples: misclassified.slice(0, 3)
      }
    ],
    correctClassifications,
    totalClassifications: sampledCases.length,
    misclassified
  };
}
|
|
383
|
+
/**
 * Check that the capabilities stated in the manifest are backed by actual KB
 * content. Each sampled capability is turned into a "Can you help me
 * with: ..." query; the LLM grader scores whether the reply demonstrates the
 * capability (>= 60 counts as matched). Infrastructure failures give the
 * capability the benefit of the doubt.
 *
 * A manifest with no capabilities scores a flat 50.
 *
 * Fixed defect: only the first 5 capabilities are probed, but the score was
 * divided by the FULL capabilities.length, so manifests with more than 5
 * capabilities were unfairly penalized. The score now uses the sampled count.
 *
 * Returns { score, evidence, matched, mismatched }.
 */
async function evaluateCapabilityMatch(assistant, manifest, config) {
  const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });
  // Capabilities may be plain strings or { text } objects; normalize to text.
  const capabilities = (manifest.capabilities || []).map(
    (c) => typeof c === "string" ? c : c.text
  );
  if (capabilities.length === 0) {
    return {
      score: 50,
      evidence: [
        {
          criterion: "capabilityMatch",
          score: 50,
          reasoning: "No capabilities defined in manifest."
        }
      ],
      matched: [],
      mismatched: []
    };
  }
  // Cap API usage: only the first 5 capabilities are probed, and the score is
  // computed over exactly those.
  const sampledCapabilities = capabilities.slice(0, 5);
  const matched = [];
  const mismatched = [];
  for (const capability of sampledCapabilities) {
    const testQuery = `Can you help me with: ${capability}`;
    try {
      const response = await assistant.chat({
        messages: [{ role: "user", content: testQuery }]
      });
      const evaluation = await grader.grade(
        `Capability: ${capability}
KB Response: ${response.message?.content}`,
        "capability fulfillment",
        "Score 100 if the KB demonstrated it can help with this capability. Score 0 if it cannot."
      );
      if (evaluation.score >= 60) {
        matched.push(capability);
      } else {
        mismatched.push(capability);
      }
    } catch {
      // Benefit of the doubt on infrastructure failure.
      matched.push(capability);
    }
  }
  const score = normalizeScore(matched.length / sampledCapabilities.length * 100);
  return {
    score,
    evidence: [
      {
        criterion: "capabilityMatch",
        score,
        reasoning: `${matched.length}/${sampledCapabilities.length} stated capabilities match actual KB content.`,
        examples: mismatched
      }
    ],
    matched,
    mismatched
  };
}
|
|
440
|
+
/**
 * Derive a flat, de-duplicated list of candidate entity strings from a
 * manifest: every slot example value, the domain string, and each
 * capitalized word longer than 3 characters found in capability text
 * (lowercased). Order follows first insertion.
 */
function extractEntitiesFromManifest(manifest) {
  const collected = /* @__PURE__ */ new Set();
  for (const slot of manifest.slots || []) {
    for (const example of slot.examples || []) {
      collected.add(example);
    }
  }
  if (manifest.domain) {
    collected.add(manifest.domain);
  }
  for (const capability of manifest.capabilities || []) {
    const text = typeof capability === "string" ? capability : capability.text;
    for (const word of text.split(/\s+/)) {
      // Heuristic: capitalized words longer than 3 chars look like names.
      if (word.length > 3 && /^[A-Z]/.test(word)) {
        collected.add(word.toLowerCase());
      }
    }
  }
  return [...collected];
}
|
|
461
|
+
/**
 * Ask the grader to enumerate the entities mentioned in `text`. The grader's
 * free-text `reasoning` field is expected to carry a comma-separated list;
 * it is split, trimmed, lowercased, and filtered to names longer than 2
 * characters. Any failure yields an empty list.
 */
async function extractEntitiesWithLLM(grader, text) {
  try {
    const graded = await grader.grade(
      text,
      "entity extraction",
      "List the main entities (products, topics, categories) mentioned. Return just the entity names separated by commas."
    );
    const rawNames = graded.reasoning.split(",");
    const cleaned = rawNames.map((name) => name.trim().toLowerCase());
    return cleaned.filter((name) => name.length > 2);
  } catch {
    // Best effort: treat grader errors as "no entities found".
    return [];
  }
}
|
|
473
|
+
|
|
474
|
+
// Public API of this bundle: only the top-level introspection evaluator.
exports.evaluateIntrospection = evaluateIntrospection;
// NOTE(review): the sourceMappingURL pragma is emitted twice below — looks
// like a bundler artifact; harmless, but worth confirming in the build setup.
//# sourceMappingURL=index.cjs.map
//# sourceMappingURL=index.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/utils/llm-grader.ts","../../src/utils/metrics.ts","../../src/utils/reporters.ts","../../src/introspection/index.ts"],"names":["z","openai","createOpenAI","resolveDefaultOpenAiChatModelId","generateObject","pinecone","Pinecone"],"mappings":";;;;;;;;;AAgBA,IAAM,WAAA,GAAcA,MAAE,MAAA,CAAO;AAAA,EAC3B,KAAA,EAAOA,KAAA,CAAE,MAAA,EAAO,CAAE,GAAA,CAAI,CAAC,CAAA,CAAE,GAAA,CAAI,GAAG,CAAA,CAAE,QAAA,CAAS,kBAAkB,CAAA;AAAA,EAC7D,SAAA,EAAWA,KAAA,CAAE,MAAA,EAAO,CAAE,SAAS,2BAA2B,CAAA;AAAA,EAC1D,QAAA,EAAUA,KAAA,CAAE,KAAA,CAAMA,KAAA,CAAE,MAAA,EAAQ,CAAA,CAAE,QAAA,EAAS,CAAE,QAAA,CAAS,6CAA6C;AACjG,CAAC,CAAA;AAED,IAAM,wBAAA,GAA2BA,MAAE,MAAA,CAAO;AAAA,EACxC,MAAA,EAAQA,KAAA,CAAE,KAAA,CAAMA,KAAA,CAAE,MAAA,CAAO;AAAA,IACvB,SAAA,EAAWA,MAAE,MAAA,EAAO;AAAA,IACpB,KAAA,EAAOA,MAAE,MAAA,EAAO,CAAE,IAAI,CAAC,CAAA,CAAE,IAAI,GAAG,CAAA;AAAA,IAChC,SAAA,EAAWA,MAAE,MAAA;AAAO,GACrB,CAAC,CAAA;AAAA,EACF,gBAAA,EAAkBA,MAAE,MAAA;AACtB,CAAC,CAAA;AAiCM,SAAS,eAAA,CAAgB,MAAA,GAA0B,EAAC,EAAc;AACvE,EAAA,MAAM,MAAA,GAAS,MAAA,CAAO,YAAA,IAAgB,OAAA,CAAQ,GAAA,CAAI,cAAA;AAClD,EAAA,IAAI,CAAC,MAAA,EAAQ;AACX,IAAA,MAAM,IAAI,MAAM,4CAA4C,CAAA;AAAA,EAC9D;AAEA,EAAA,MAAMC,QAAA,GAASC,mBAAA,CAAa,EAAE,MAAA,EAAQ,CAAA;AACtC,EAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,KAAA,IAASC,oCAAA,EAAgC;AAC9D,EAAA,MAAM,WAAA,GAAc,OAAO,WAAA,IAAe,GAAA;AAE1C,EAAA,OAAO;AAAA,IACL,MAAM,KAAA,CAAM,OAAA,EAAiB,SAAA,EAAmB,MAAA,EAAiB;AAC/D,MAAA,MAAM,MAAA,GAAS,8EAA8E,SAAS,CAAA;;AAAA,EAE1G,MAAA,GAAS,WAAW,MAAM;AAAA,CAAA,GAAO,EAAE;AAAA;AAAA;AAAA,EAGnC,OAAO;AAAA;;AAAA,sDAAA,CAAA;AAKH,MAAA,MAAM,MAAA,GAAS,MAAMC,iBAAA,CAAe;AAAA,QAClC,KAAA,EAAOH,SAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,WAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO,MAAA,CAAO,MAAA;AAAA,IAChB,CAAA;AAAA,IAEA,MAAM,aAAA,CAAc,OAAA,EAAiB,QAAA,EAA2B;AAC9D,MAAA,MAAM,sBAAsB,QAAA,CACzB,GAAA,CAAI,CAAC,CAAA,KAAM,KAAK,CAAA,CAAE,IAAI,CAAA,EAAA,EAAK,CAAA,CAAE,WAAW,CAAA,UAAA,EAAa,CAAA,CAAE,MAAM,CAAA,CAAA,CAAG,CAAA,CAChE,KAAK,IAAI,CAAA;AAEZ,MAAA,MAAM,MAAA,GAAS,CAAA;;AAAA;AAAA,EAGnB,mBAAmB;;AAAA;AAAA;A
AAA,EAInB,OAAO;AAAA;;AAAA,0EAAA,CAAA;AAKH,MAAA,MAAM,MAAA,GAAS,MAAMG,iBAAA,CAAe;AAAA,QAClC,KAAA,EAAOH,SAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,wBAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO,MAAA,CAAO,MAAA,CAAO,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,MAAO;AAAA,QACtC,WAAW,CAAA,CAAE,SAAA;AAAA,QACb,OAAO,CAAA,CAAE,KAAA;AAAA,QACT,WAAW,CAAA,CAAE;AAAA,OACf,CAAE,CAAA;AAAA,IACJ,CAAA;AAAA,IAEA,MAAM,cAAA,CAAe,KAAA,EAAe,OAAA,EAAiB;AACnD,MAAA,MAAM,MAAA,GAAS,CAAA;;AAAA,QAAA,EAEX,KAAK,CAAA;;AAAA;AAAA;AAAA,EAIb,OAAO;AAAA;;AAAA;AAAA;;AAAA,sDAAA,CAAA;AAQH,MAAA,MAAM,MAAA,GAAS,MAAMG,iBAAA,CAAe;AAAA,QAClC,KAAA,EAAOH,SAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,WAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO;AAAA,QACL,KAAA,EAAO,OAAO,MAAA,CAAO,KAAA;AAAA,QACrB,SAAA,EAAW,OAAO,MAAA,CAAO;AAAA,OAC3B;AAAA,IACF;AAAA,GACF;AACF;;;AC7IO,SAAS,uBACd,MAAA,EACQ;AACR,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,CAAA;AAEhC,EAAA,MAAM,WAAA,GAAc,OAAO,MAAA,CAAO,CAAC,KAAK,CAAA,KAAM,GAAA,GAAM,CAAA,CAAE,MAAA,EAAQ,CAAC,CAAA;AAC/D,EAAA,IAAI,WAAA,KAAgB,GAAG,OAAO,CAAA;AAE9B,EAAA,MAAM,WAAA,GAAc,MAAA,CAAO,MAAA,CAAO,CAAC,GAAA,EAAK,CAAA,KAAM,GAAA,GAAM,CAAA,CAAE,KAAA,GAAQ,CAAA,CAAE,MAAA,EAAQ,CAAC,CAAA;AACzE,EAAA,OAAO,IAAA,CAAK,KAAA,CAAM,WAAA,GAAc,WAAW,CAAA;AAC7C;AAwEO,SAAS,KAAA,CAAM,KAAA,EAAe,GAAA,EAAa,GAAA,EAAqB;AACrE,EAAA,OAAO,KAAK,GAAA,CAAI,GAAA,EAAK,KAAK,GAAA,CAAI,GAAA,EAAK,KAAK,CAAC,CAAA;AAC3C;AAKO,SAAS,eAAe,KAAA,EAAuB;AACpD,EAAA,OAAO,MAAM,IAAA,CAAK,KAAA,CAAM,KAAK,CAAA,EAAG,GAAG,GAAG,CAAA;AACxC;;;ACxBA,SAAS,gBAAgB,IAAA,EAAsB;AAC7C,EAAA,OAAO,IAAA,CACJ,OAAA,CAAQ,UAAA,EAAY,KAAK,CAAA,CACzB,OAAA,CAAQ,IAAA,EAAM,CAAC,GAAA,KAAQ,GAAA,CAAI,WAAA,EAAa,EACxC,IAAA,EAAK;AACV;AA2DO,SAAS,eAAA,CACd,QACA,UAAA,GAAmD,EAAE,MAAM,EAAA,EAAI,UAAA,EAAY,IAAG,EACtE;AACR,EAAA,MAAM,OAAA,GAAU,MAAA,CAAO,OAAA,CAAQ,MAAM,CAAA;AACrC,EAAA,MAAM,QAAA,GAAW,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAA,EAAK,GAAG,KAAK,CAAA,KAAM,GAAA,GAAM,KAAA,EAAO,CAAC,IAAI,OAAA,CAAQ,MAAA;AAE9E,EAAA,MAAM,cAAc,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAG,KAAK,CAAA,KAAM,KAAA,IAAS,UAAA,CAAW,IAAI,CAAA,CAAE,G
AAA,CAAI,CAAC,CAAC,IAAI,MAAM,IAAI,CAAA;AAChG,EAAA,MAAM,cAAc,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAG,KAAK,CAAA,KAAM,KAAA,GAAQ,UAAA,CAAW,UAAU,CAAA,CAAE,GAAA,CAAI,CAAC,CAAC,IAAI,MAAM,IAAI,CAAA;AAErG,EAAA,MAAM,QAAkB,EAAC;AAEzB,EAAA,IAAI,QAAA,IAAY,WAAW,IAAA,EAAM;AAC/B,IAAA,KAAA,CAAM,KAAK,4BAA4B,CAAA;AAAA,EACzC,CAAA,MAAA,IAAW,QAAA,IAAY,UAAA,CAAW,UAAA,EAAY;AAC5C,IAAA,KAAA,CAAM,KAAK,kDAAkD,CAAA;AAAA,EAC/D,CAAA,MAAO;AACL,IAAA,KAAA,CAAM,KAAK,yCAAyC,CAAA;AAAA,EACtD;AAEA,EAAA,IAAI,WAAA,CAAY,SAAS,CAAA,EAAG;AAC1B,IAAA,KAAA,CAAM,IAAA,CAAK,WAAW,WAAA,CAAY,GAAA,CAAI,eAAe,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA,CAAE,CAAA;AAAA,EACrE;AAEA,EAAA,IAAI,WAAA,CAAY,SAAS,CAAA,EAAG;AAC1B,IAAA,KAAA,CAAM,IAAA,CAAK,sBAAsB,WAAA,CAAY,GAAA,CAAI,eAAe,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA,CAAE,CAAA;AAAA,EAChF;AAEA,EAAA,OAAO,KAAA,CAAM,IAAA,CAAK,IAAI,CAAA,GAAI,GAAA;AAC5B;;;AC7IA,eAAsB,sBACpB,MAAA,EACkC;AAClC,EAAA,MAAM,SAAA,GAAY,KAAK,GAAA,EAAI;AAG3B,EAAA,MAAM,MAAA,GAAS,MAAA,CAAO,cAAA,IAAkB,OAAA,CAAQ,GAAA,CAAI,gBAAA;AACpD,EAAA,IAAI,CAAC,MAAA,EAAQ;AACX,IAAA,MAAM,IAAI,MAAM,qDAAqD,CAAA;AAAA,EACvE;AAEA,EAAA,MAAMI,UAAA,GAAW,IAAIC,iBAAA,CAAS,EAAE,QAAQ,CAAA;AACxC,EAAA,MAAM,SAAA,GAAYD,UAAA,CAAS,SAAA,CAAU,MAAA,CAAO,aAAa,CAAA;AAGzD,EAAA,MAAM,CAAC,cAAc,UAAA,EAAY,WAAA,EAAa,gBAAgB,CAAA,GAAI,MAAM,QAAQ,GAAA,CAAI;AAAA,IAClF,sBAAA,CAAuB,SAAA,EAAW,MAAA,CAAO,QAAA,EAAU,MAAM,CAAA;AAAA,IACzD,oBAAA,CAAqB,SAAA,EAAW,MAAA,CAAO,QAAA,EAAU,MAAM,CAAA;AAAA,IACvD,sBAAA,CAAuB,SAAA,EAAW,MAAA,CAAO,QAAA,EAAU,MAAM,CAAA;AAAA,IACzD,uBAAA,CAAwB,SAAA,EAAW,MAAA,CAAO,QAAA,EAAU,MAAM;AAAA,GAC3D,CAAA;AAGD,EAAA,MAAM,MAAA,GAAS;AAAA,IACb,gBAAgB,YAAA,CAAa,KAAA;AAAA,IAC7B,cAAc,UAAA,CAAW,KAAA;AAAA,IACzB,gBAAgB,WAAA,CAAY,KAAA;AAAA,IAC5B,iBAAiB,gBAAA,CAAiB;AAAA,GACpC;AAEA,EAAA,MAAM,eAAe,sBAAA,CAAuB;AAAA,IAC1C,EAAE,KAAA,EAAO,MAAA,CAAO,cAAA,EAAgB,QAAQ,IAAA,EAAK;AAAA,IAC7C,EAAE,KAAA,EAAO,MAAA,CAAO,YAAA,EAAc,QAAQ,GAAA,EAAK;AAAA,IAC3C,EAAE,KAAA,EAAO,MAAA,CAAO,cAAA,EAAgB,QAAQ,IAAA,EAAK;AAAA,IAC7C,EAAE,KAAA,EAAO,MAAA,CAAO,eAAA,EAAiB,QAAQ,GAAA;AAAK,GAC/C,CAAA;AAGD,EAAA,MAAM,QAAA,GAA2B;
AAAA,IAC/B,GAAG,YAAA,CAAa,QAAA;AAAA,IAChB,GAAG,UAAA,CAAW,QAAA;AAAA,IACd,GAAG,WAAA,CAAY,QAAA;AAAA,IACf,GAAG,gBAAA,CAAiB;AAAA,GACtB;AAEA,EAAA,OAAO;AAAA,IACL,QAAQ,YAAA,IAAgB,EAAA;AAAA,IACxB,YAAA;AAAA,IACA,MAAA;AAAA,IACA,QAAA;AAAA,IACA,OAAA,EAAS,gBAAgB,MAAM,CAAA;AAAA,IAC/B,QAAA,EAAU,IAAA,CAAK,GAAA,EAAI,GAAI,SAAA;AAAA,IACvB,OAAA,EAAS;AAAA,MACP,iBAAiB,YAAA,CAAa,OAAA;AAAA,MAC9B,gBAAgB,UAAA,CAAW,SAAA;AAAA,MAC3B,yBAAyB,WAAA,CAAY,aAAA;AAAA,MACrC,wBAAwB,gBAAA,CAAiB;AAAA,KAC3C;AAAA,IACA,UAAU,MAAA,CAAO;AAAA,GACnB;AACF;AASA,eAAe,sBAAA,CACb,SAAA,EACA,QAAA,EACA,MAAA,EAC+B;AAC/B,EAAA,MAAM,SAAS,eAAA,CAAgB,EAAE,YAAA,EAAc,MAAA,CAAO,cAAc,CAAA;AAGpE,EAAA,MAAM,gBAAA,GAAmB,4BAA4B,QAAQ,CAAA;AAG7D,EAAA,MAAM,gBAAA,GAAmB;AAAA,IACvB,6CAAA;AAAA,IACA,0DAAA;AAAA,IACA;AAAA,GACF;AAEA,EAAA,MAAM,kBAAA,uBAAsC,GAAA,EAAI;AAChD,EAAA,KAAA,MAAW,SAAS,gBAAA,EAAkB;AACpC,IAAA,IAAI;AACF,MAAA,MAAM,QAAA,GAAW,MAAM,SAAA,CAAU,IAAA,CAAK;AAAA,QACpC,UAAU,CAAC,EAAE,MAAM,MAAA,EAAQ,OAAA,EAAS,OAAO;AAAA,OAC5C,CAAA;AACD,MAAA,MAAM,WAAW,MAAM,sBAAA,CAAuB,QAAQ,QAAA,CAAS,OAAA,EAAS,WAAW,EAAE,CAAA;AACrF,MAAA,QAAA,CAAS,OAAA,CAAQ,CAAC,CAAA,KAAM,kBAAA,CAAmB,IAAI,CAAA,CAAE,WAAA,EAAa,CAAC,CAAA;AAAA,IACjE,CAAA,CAAA,MAAQ;AAAA,IAER;AAAA,EACF;AAGA,EAAA,IAAI,OAAO,WAAA,EAAa;AACtB,IAAA,KAAA,MAAW,EAAA,IAAM,OAAO,WAAA,EAAa;AACnC,MAAA,IAAI,GAAG,gBAAA,EAAkB;AACvB,QAAA,EAAA,CAAG,gBAAA,CAAiB,QAAQ,CAAC,CAAA,KAAM,mBAAmB,GAAA,CAAI,CAAA,CAAE,WAAA,EAAa,CAAC,CAAA;AAAA,MAC5E;AAAA,IACF;AAAA,EACF;AAGA,EAAA,MAAM,QAAkB,EAAC;AACzB,EAAA,MAAM,UAAoB,EAAC;AAE3B,EAAA,KAAA,MAAW,UAAU,kBAAA,EAAoB;AACvC,IAAA,IAAI,iBAAiB,IAAA,CAAK,CAAC,EAAA,KAAO,EAAA,CAAG,aAAY,CAAE,QAAA,CAAS,MAAM,CAAA,IAAK,OAAO,QAAA,CAAS,EAAA,CAAG,WAAA,EAAa,CAAC,CAAA,EAAG;AACzG,MAAA,KAAA,CAAM,KAAK,MAAM,CAAA;AAAA,IACnB,CAAA,MAAO;AACL,MAAA,OAAA,CAAQ,KAAK,MAAM,CAAA;AAAA,IACrB;AAAA,EACF;AAEA,EAAA,MAAM,KAAA,GAAQ,kBAAA,CAAmB,IAAA,KAAS,CAAA,GACtC,GAAA,GACA,eAAgB,KAAA,CAAM,MAAA,GAAS,kBAAA,CAAmB,IAAA,GAAQ,GAAG,CAAA;AAEjE,EAAA,OAAO;AAAA,IACL,KAAA;AAAA,IACA,QAAA,EAAU;AAAA,MACR;AAAA,QACE,SAAA,EAAW,gBAAA;AAAA,QACX,KAAA
;AAAA,QACA,WAAW,CAAA,MAAA,EAAS,KAAA,CAAM,MAAM,CAAA,CAAA,EAAI,mBAAmB,IAAI,CAAA,mCAAA,CAAA;AAAA,QAC3D,QAAA,EAAU,OAAA,CAAQ,KAAA,CAAM,CAAA,EAAG,CAAC;AAAA;AAC9B,KACF;AAAA,IACA,KAAA;AAAA,IACA;AAAA,GACF;AACF;AAKA,eAAe,oBAAA,CACb,SAAA,EACA,QAAA,EACA,MAAA,EAC6B;AAC7B,EAAA,MAAM,SAAS,eAAA,CAAgB,EAAE,YAAA,EAAc,MAAA,CAAO,cAAc,CAAA;AAEpE,EAAA,MAAM,KAAA,GAAQ,QAAA,CAAS,KAAA,IAAS,EAAC;AACjC,EAAA,IAAI,KAAA,CAAM,WAAW,CAAA,EAAG;AACtB,IAAA,OAAO;AAAA,MACL,KAAA,EAAO,EAAA;AAAA;AAAA,MACP,QAAA,EAAU;AAAA,QACR;AAAA,UACE,SAAA,EAAW,cAAA;AAAA,UACX,KAAA,EAAO,EAAA;AAAA,UACP,SAAA,EAAW;AAAA;AACb,OACF;AAAA,MACA,SAAS,EAAC;AAAA,MACV,WAAW;AAAC,KACd;AAAA,EACF;AAGA,EAAA,MAAM,UAAoB,EAAC;AAC3B,EAAA,MAAM,YAAsB,EAAC;AAE7B,EAAA,KAAA,MAAW,QAAQ,KAAA,EAAO;AACxB,IAAA,MAAM,YAAY,CAAA,0BAAA,EAA6B,QAAA,CAAS,UAAU,YAAY,CAAA,wBAAA,EAA2B,KAAK,IAAI,CAAA,CAAA,CAAA;AAElH,IAAA,IAAI;AACF,MAAA,MAAM,QAAA,GAAW,MAAM,SAAA,CAAU,IAAA,CAAK;AAAA,QACpC,UAAU,CAAC,EAAE,MAAM,MAAA,EAAQ,OAAA,EAAS,WAAW;AAAA,OAChD,CAAA;AAGD,MAAA,MAAM,UAAA,GAAa,MAAM,MAAA,CAAO,KAAA;AAAA,QAC9B,CAAA,MAAA,EAAS,KAAK,IAAI;AAAA,aAAA,EAAkB,IAAA,CAAK,eAAe,KAAK;AAAA,aAAA,EAAkB,QAAA,CAAS,SAAS,OAAO,CAAA,CAAA;AAAA,QACxG,gBAAA;AAAA,QACA;AAAA,OACF;AAEA,MAAA,IAAI,UAAA,CAAW,SAAS,EAAA,EAAI;AAC1B,QAAA,OAAA,CAAQ,IAAA,CAAK,KAAK,IAAI,CAAA;AAAA,MACxB,CAAA,MAAO;AACL,QAAA,SAAA,CAAU,IAAA,CAAK,KAAK,IAAI,CAAA;AAAA,MAC1B;AAAA,IACF,CAAA,CAAA,MAAQ;AAEN,MAAA,OAAA,CAAQ,IAAA,CAAK,KAAK,IAAI,CAAA;AAAA,IACxB;AAAA,EACF;AAEA,EAAA,MAAM,QAAQ,cAAA,CAAgB,OAAA,CAAQ,MAAA,GAAS,KAAA,CAAM,SAAU,GAAG,CAAA;AAElE,EAAA,OAAO;AAAA,IACL,KAAA;AAAA,IACA,QAAA,EAAU;AAAA,MACR;AAAA,QACE,SAAA,EAAW,cAAA;AAAA,QACX,KAAA;AAAA,QACA,WAAW,CAAA,EAAG,OAAA,CAAQ,MAAM,CAAA,CAAA,EAAI,MAAM,MAAM,CAAA,yCAAA,CAAA;AAAA,QAC5C,QAAA,EAAU;AAAA;AACZ,KACF;AAAA,IACA,OAAA;AAAA,IACA;AAAA,GACF;AACF;AAKA,eAAe,sBAAA,CACb,SAAA,EACA,QAAA,EACA,MAAA,EAC+B;AAC/B,EAAA,MAAM,SAAS,eAAA,CAAgB,EAAE,YAAA,EAAc,MAAA,CAAO,cAAc,CAAA;AAEpE,EAAA,MAAM,QAAQ,QAAA,CAAS,KAAA;AACvB,EAAA,IAAI,CAAC,KAAA,EAAO;AACV,IAAA,OAAO;AAAA,MACL,KAAA,EAAO,EAAA;AAAA;AAAA,MACP,QAAA,EAAU;AAAA,Q
ACR;AAAA,UACE,SAAA,EAAW,gBAAA;AAAA,UACX,KAAA,EAAO,EAAA;AAAA,UACP,SAAA,EAAW;AAAA;AACb,OACF;AAAA,MACA,sBAAA,EAAwB,CAAA;AAAA,MACxB,oBAAA,EAAsB,CAAA;AAAA,MACtB,eAAe;AAAC,KAClB;AAAA,EACF;AAEA,EAAA,MAAM,YAAgE,EAAC;AAGvE,EAAA,IAAI,MAAM,eAAA,EAAiB;AACzB,IAAA,KAAA,CAAM,eAAA,CAAgB,OAAA,CAAQ,CAAC,CAAA,KAAM,SAAA,CAAU,IAAA,CAAK,EAAE,KAAA,EAAO,CAAA,EAAG,eAAA,EAAiB,IAAA,EAAM,CAAC,CAAA;AAAA,EAC1F;AACA,EAAA,IAAI,MAAM,kBAAA,EAAoB;AAC5B,IAAA,KAAA,CAAM,kBAAA,CAAmB,OAAA,CAAQ,CAAC,CAAA,KAAM,SAAA,CAAU,IAAA,CAAK,EAAE,KAAA,EAAO,CAAA,EAAG,eAAA,EAAiB,KAAA,EAAO,CAAC,CAAA;AAAA,EAC9F;AAGA,EAAA,IAAI,OAAO,WAAA,EAAa;AACtB,IAAA,MAAA,CAAO,WAAA,CACJ,OAAO,CAAC,EAAA,KAAO,GAAG,eAAA,KAAoB,MAAS,EAC/C,OAAA,CAAQ,CAAC,OAAO,SAAA,CAAU,IAAA,CAAK,EAAE,KAAA,EAAO,EAAA,CAAG,OAAO,eAAA,EAAiB,EAAA,CAAG,eAAA,EAAkB,CAAC,CAAA;AAAA,EAC9F;AAEA,EAAA,IAAI,SAAA,CAAU,WAAW,CAAA,EAAG;AAC1B,IAAA,OAAO;AAAA,MACL,KAAA,EAAO,EAAA;AAAA,MACP,QAAA,EAAU;AAAA,QACR;AAAA,UACE,SAAA,EAAW,gBAAA;AAAA,UACX,KAAA,EAAO,EAAA;AAAA,UACP,SAAA,EAAW;AAAA;AACb,OACF;AAAA,MACA,sBAAA,EAAwB,CAAA;AAAA,MACxB,oBAAA,EAAsB,CAAA;AAAA,MACtB,eAAe;AAAC,KAClB;AAAA,EACF;AAEA,EAAA,IAAI,sBAAA,GAAyB,CAAA;AAC7B,EAAA,MAAM,gBAA0B,EAAC;AAEjC,EAAA,KAAA,MAAW,QAAA,IAAY,SAAA,CAAU,KAAA,CAAM,CAAA,EAAG,EAAE,CAAA,EAAG;AAC7C,IAAA,IAAI;AACF,MAAA,MAAM,QAAA,GAAW,MAAM,SAAA,CAAU,IAAA,CAAK;AAAA,QACpC,QAAA,EAAU,CAAC,EAAE,IAAA,EAAM,QAAQ,OAAA,EAAS,QAAA,CAAS,OAAO;AAAA,OACrD,CAAA;AAGD,MAAA,MAAM,UAAA,GAAa,MAAM,MAAA,CAAO,KAAA;AAAA,QAC9B,CAAA,OAAA,EAAU,SAAS,KAAK;AAAA,aAAA,EAAkB,QAAA,CAAS,SAAS,OAAO,CAAA,CAAA;AAAA,QACnE,eAAA;AAAA,QACA;AAAA,OACF;AAEA,MAAA,MAAM,eAAA,GAAkB,WAAW,KAAA,IAAS,EAAA;AAE5C,MAAA,IAAI,eAAA,KAAoB,SAAS,eAAA,EAAiB;AAChD,QAAA,sBAAA,EAAA;AAAA,MACF,CAAA,MAAO;AACL,QAAA,aAAA,CAAc,IAAA,CAAK,IAAI,QAAA,CAAS,KAAK,eAAe,QAAA,CAAS,eAAA,GAAkB,UAAA,GAAa,cAAc,CAAA,CAAA,CAAG,CAAA;AAAA,MAC/G;AAAA,IACF,CAAA,CAAA,MAAQ;AAAA,IAER;AAAA,EACF;AAEA,EAAA,MAAM,KAAA,GAAQ,cAAA,CAAgB,sBAAA,GAAyB,SAAA,CAAU,SAAU,GAAG,CAAA;AAE9E,EAAA,OAAO;AAAA,IACL,KAAA;AAAA,IACA,QAAA,EAAU;AAAA,MACR;AAAA,QACE,SAAA,EAAW,gBAAA;AAAA,QACX
,KAAA;AAAA,QACA,SAAA,EAAW,CAAA,EAAG,sBAAsB,CAAA,CAAA,EAAI,UAAU,MAAM,CAAA,oCAAA,CAAA;AAAA,QACxD,QAAA,EAAU,aAAA,CAAc,KAAA,CAAM,CAAA,EAAG,CAAC;AAAA;AACpC,KACF;AAAA,IACA,sBAAA;AAAA,IACA,sBAAsB,SAAA,CAAU,MAAA;AAAA,IAChC;AAAA,GACF;AACF;AAKA,eAAe,uBAAA,CACb,SAAA,EACA,QAAA,EACA,MAAA,EACgC;AAChC,EAAA,MAAM,SAAS,eAAA,CAAgB,EAAE,YAAA,EAAc,MAAA,CAAO,cAAc,CAAA;AAEpE,EAAA,MAAM,YAAA,GAAA,CAAgB,QAAA,CAAS,YAAA,IAAgB,EAAC,EAAG,GAAA;AAAA,IAAI,CAAC,CAAA,KACtD,OAAO,CAAA,KAAM,QAAA,GAAW,IAAI,CAAA,CAAE;AAAA,GAChC;AAEA,EAAA,IAAI,YAAA,CAAa,WAAW,CAAA,EAAG;AAC7B,IAAA,OAAO;AAAA,MACL,KAAA,EAAO,EAAA;AAAA,MACP,QAAA,EAAU;AAAA,QACR;AAAA,UACE,SAAA,EAAW,iBAAA;AAAA,UACX,KAAA,EAAO,EAAA;AAAA,UACP,SAAA,EAAW;AAAA;AACb,OACF;AAAA,MACA,SAAS,EAAC;AAAA,MACV,YAAY;AAAC,KACf;AAAA,EACF;AAEA,EAAA,MAAM,UAAoB,EAAC;AAC3B,EAAA,MAAM,aAAuB,EAAC;AAE9B,EAAA,KAAA,MAAW,UAAA,IAAc,YAAA,CAAa,KAAA,CAAM,CAAA,EAAG,CAAC,CAAA,EAAG;AAEjD,IAAA,MAAM,SAAA,GAAY,yBAAyB,UAAU,CAAA,CAAA;AAErD,IAAA,IAAI;AACF,MAAA,MAAM,QAAA,GAAW,MAAM,SAAA,CAAU,IAAA,CAAK;AAAA,QACpC,UAAU,CAAC,EAAE,MAAM,MAAA,EAAQ,OAAA,EAAS,WAAW;AAAA,OAChD,CAAA;AAGD,MAAA,MAAM,UAAA,GAAa,MAAM,MAAA,CAAO,KAAA;AAAA,QAC9B,eAAe,UAAU;AAAA,aAAA,EAAkB,QAAA,CAAS,SAAS,OAAO,CAAA,CAAA;AAAA,QACpE,wBAAA;AAAA,QACA;AAAA,OACF;AAEA,MAAA,IAAI,UAAA,CAAW,SAAS,EAAA,EAAI;AAC1B,QAAA,OAAA,CAAQ,KAAK,UAAU,CAAA;AAAA,MACzB,CAAA,MAAO;AACL,QAAA,UAAA,CAAW,KAAK,UAAU,CAAA;AAAA,MAC5B;AAAA,IACF,CAAA,CAAA,MAAQ;AAEN,MAAA,OAAA,CAAQ,KAAK,UAAU,CAAA;AAAA,IACzB;AAAA,EACF;AAEA,EAAA,MAAM,QAAQ,cAAA,CAAgB,OAAA,CAAQ,MAAA,GAAS,YAAA,CAAa,SAAU,GAAG,CAAA;AAEzE,EAAA,OAAO;AAAA,IACL,KAAA;AAAA,IACA,QAAA,EAAU;AAAA,MACR;AAAA,QACE,SAAA,EAAW,iBAAA;AAAA,QACX,KAAA;AAAA,QACA,WAAW,CAAA,EAAG,OAAA,CAAQ,MAAM,CAAA,CAAA,EAAI,aAAa,MAAM,CAAA,6CAAA,CAAA;AAAA,QACnD,QAAA,EAAU;AAAA;AACZ,KACF;AAAA,IACA,OAAA;AAAA,IACA;AAAA,GACF;AACF;AASA,SAAS,4BAA4B,QAAA,EAAgC;AACnE,EAAA,MAAM,QAAA,uBAA4B,GAAA,EAAI;AAGtC,EAAA,IAAI,SAAS,KAAA,EAAO;AAClB,IAAA,QAAA,CAAS,KAAA,CAAM,OAAA,CAAQ,CAAC,IAAA,KAAS;AAC/B,MAAA,IAAI,KAAK,QAAA,EAAU;AACjB,QAAA,IAAA,CAAK,SAAS,OAAA,CAAQ,CAAC,MAAM,Q
AAA,CAAS,GAAA,CAAI,CAAC,CAAC,CAAA;AAAA,MAC9C;AAAA,IACF,CAAC,CAAA;AAAA,EACH;AAGA,EAAA,IAAI,SAAS,MAAA,EAAQ;AACnB,IAAA,QAAA,CAAS,GAAA,CAAI,SAAS,MAAM,CAAA;AAAA,EAC9B;AAGA,EAAA,IAAI,SAAS,YAAA,EAAc;AACzB,IAAA,QAAA,CAAS,YAAA,CAAa,OAAA,CAAQ,CAAC,CAAA,KAAM;AACnC,MAAA,MAAM,IAAA,GAAO,OAAO,CAAA,KAAM,QAAA,GAAW,IAAI,CAAA,CAAE,IAAA;AAE3C,MAAA,MAAM,KAAA,GAAQ,IAAA,CAAK,KAAA,CAAM,KAAK,EAAE,MAAA,CAAO,CAAC,CAAA,KAAM,CAAA,CAAE,MAAA,GAAS,CAAA,IAAK,QAAA,CAAS,IAAA,CAAK,CAAC,CAAC,CAAA;AAC9E,MAAA,KAAA,CAAM,OAAA,CAAQ,CAAC,CAAA,KAAM,QAAA,CAAS,IAAI,CAAA,CAAE,WAAA,EAAa,CAAC,CAAA;AAAA,IACpD,CAAC,CAAA;AAAA,EACH;AAEA,EAAA,OAAO,KAAA,CAAM,KAAK,QAAQ,CAAA;AAC5B;AAKA,eAAe,sBAAA,CACb,QACA,IAAA,EACmB;AACnB,EAAA,IAAI;AACF,IAAA,MAAM,MAAA,GAAS,MAAM,MAAA,CAAO,KAAA;AAAA,MAC1B,IAAA;AAAA,MACA,mBAAA;AAAA,MACA;AAAA,KACF;AAGA,IAAA,OAAO,OAAO,SAAA,CACX,KAAA,CAAM,GAAG,CAAA,CACT,GAAA,CAAI,CAAC,CAAA,KAAM,CAAA,CAAE,MAAK,CAAE,WAAA,EAAa,CAAA,CACjC,MAAA,CAAO,CAAC,CAAA,KAAM,CAAA,CAAE,SAAS,CAAC,CAAA;AAAA,EAC/B,CAAA,CAAA,MAAQ;AACN,IAAA,OAAO,EAAC;AAAA,EACV;AACF","file":"index.cjs","sourcesContent":["/**\n * LLM Grading Utilities\n *\n * Uses OpenAI to grade content quality, relevance, and other metrics.\n */\n\nimport { createOpenAI } from '@ai-sdk/openai';\nimport { generateObject } from 'ai';\nimport { z } from 'zod';\nimport { resolveDefaultOpenAiChatModelId } from '@kat/core';\nimport type { LLMGraderConfig, EvalCriterion, EvalEvidence } from '../types.js';\n\n// ============================================================================\n// GRADING SCHEMAS\n// ============================================================================\n\nconst GradeSchema = z.object({\n score: z.number().min(0).max(100).describe('Score from 0-100'),\n reasoning: z.string().describe('Explanation for the score'),\n examples: z.array(z.string()).optional().describe('Specific examples that influenced the score'),\n});\n\nconst MultiCriteriaGradeSchema = z.object({\n scores: z.array(z.object({\n criterion: z.string(),\n score: 
z.number().min(0).max(100),\n reasoning: z.string(),\n })),\n overallReasoning: z.string(),\n});\n\n// ============================================================================\n// GRADER FACTORY\n// ============================================================================\n\nexport interface LLMGrader {\n /**\n * Grade content against a single criterion.\n */\n grade(content: string, criterion: string, rubric?: string): Promise<{\n score: number;\n reasoning: string;\n examples?: string[];\n }>;\n\n /**\n * Grade content against multiple criteria.\n */\n gradeMultiple(content: string, criteria: EvalCriterion[]): Promise<EvalEvidence[]>;\n\n /**\n * Grade relevance of content to a query.\n */\n gradeRelevance(query: string, content: string): Promise<{\n score: number;\n reasoning: string;\n }>;\n}\n\n/**\n * Create an LLM grader with the given configuration.\n */\nexport function createLLMGrader(config: LLMGraderConfig = {}): LLMGrader {\n const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;\n if (!apiKey) {\n throw new Error('OPENAI_API_KEY is required for LLM grading');\n }\n\n const openai = createOpenAI({ apiKey });\n const model = config.model || resolveDefaultOpenAiChatModelId();\n const temperature = config.temperature ?? 0.1;\n\n return {\n async grade(content: string, criterion: string, rubric?: string) {\n const prompt = `You are an expert evaluator. Grade the following content on the criterion \"${criterion}\".\n\n${rubric ? 
`Rubric: ${rubric}\\n` : ''}\nContent to evaluate:\n\"\"\"\n${content}\n\"\"\"\n\nProvide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: GradeSchema,\n prompt,\n temperature,\n });\n\n return result.object;\n },\n\n async gradeMultiple(content: string, criteria: EvalCriterion[]) {\n const criteriaDescription = criteria\n .map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`)\n .join('\\n');\n\n const prompt = `You are an expert evaluator. Grade the following content on multiple criteria.\n\nCriteria:\n${criteriaDescription}\n\nContent to evaluate:\n\"\"\"\n${content}\n\"\"\"\n\nFor each criterion, provide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: MultiCriteriaGradeSchema,\n prompt,\n temperature,\n });\n\n return result.object.scores.map((s) => ({\n criterion: s.criterion,\n score: s.score,\n reasoning: s.reasoning,\n }));\n },\n\n async gradeRelevance(query: string, content: string) {\n const prompt = `You are an expert evaluator. 
Grade how relevant the following content is to the given query.\n\nQuery: \"${query}\"\n\nContent:\n\"\"\"\n${content}\n\"\"\"\n\nA score of 100 means the content directly and completely answers the query.\nA score of 0 means the content is completely irrelevant.\n\nProvide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: GradeSchema,\n prompt,\n temperature,\n });\n\n return {\n score: result.object.score,\n reasoning: result.object.reasoning,\n };\n },\n };\n}\n\n// ============================================================================\n// CONVENIENCE FUNCTIONS\n// ============================================================================\n\n/**\n * Grade content using a one-off grader instance.\n */\nexport async function gradeWithLLM(\n content: string,\n criterion: string,\n config: LLMGraderConfig = {}\n): Promise<{ score: number; reasoning: string; examples?: string[] }> {\n const grader = createLLMGrader(config);\n return grader.grade(content, criterion);\n}\n\n/**\n * Grade relevance using a one-off grader instance.\n */\nexport async function gradeRelevanceWithLLM(\n query: string,\n content: string,\n config: LLMGraderConfig = {}\n): Promise<{ score: number; reasoning: string }> {\n const grader = createLLMGrader(config);\n return grader.gradeRelevance(query, content);\n}\n","/**\n * Metric Calculation Helpers\n *\n * Pure functions for calculating scores and metrics.\n */\n\n/**\n * Calculate a weighted score from individual scores and weights.\n *\n * @param scores - Array of { score, weight } objects\n * @returns Weighted average score (0-100)\n */\nexport function calculateWeightedScore(\n scores: Array<{ score: number; weight: number }>\n): number {\n if (scores.length === 0) return 0;\n\n const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);\n if (totalWeight === 0) return 0;\n\n const weightedSum = scores.reduce((sum, s) => sum + s.score * s.weight, 
0);\n return Math.round(weightedSum / totalWeight);\n}\n\n/**\n * Calculate percentage of found items vs expected items.\n *\n * @param found - Number of items found\n * @param expected - Number of items expected\n * @returns Percentage (0-100)\n */\nexport function calculatePercentage(found: number, expected: number): number {\n if (expected === 0) return 100; // Nothing expected, consider it perfect\n return Math.round((found / expected) * 100);\n}\n\n/**\n * Calculate the average of an array of numbers.\n *\n * @param numbers - Array of numbers\n * @returns Average value\n */\nexport function average(numbers: number[]): number {\n if (numbers.length === 0) return 0;\n return numbers.reduce((sum, n) => sum + n, 0) / numbers.length;\n}\n\n/**\n * Calculate precision: true positives / (true positives + false positives).\n *\n * @param truePositives - Number of correct positive predictions\n * @param falsePositives - Number of incorrect positive predictions\n * @returns Precision (0-100)\n */\nexport function calculatePrecision(\n truePositives: number,\n falsePositives: number\n): number {\n const total = truePositives + falsePositives;\n if (total === 0) return 100;\n return Math.round((truePositives / total) * 100);\n}\n\n/**\n * Calculate recall: true positives / (true positives + false negatives).\n *\n * @param truePositives - Number of correct positive predictions\n * @param falseNegatives - Number of missed positive predictions\n * @returns Recall (0-100)\n */\nexport function calculateRecall(\n truePositives: number,\n falseNegatives: number\n): number {\n const total = truePositives + falseNegatives;\n if (total === 0) return 100;\n return Math.round((truePositives / total) * 100);\n}\n\n/**\n * Calculate F1 score: harmonic mean of precision and recall.\n *\n * @param precision - Precision value (0-100)\n * @param recall - Recall value (0-100)\n * @returns F1 score (0-100)\n */\nexport function calculateF1(precision: number, recall: number): number {\n if 
(precision + recall === 0) return 0;\n return Math.round((2 * precision * recall) / (precision + recall));\n}\n\n/**\n * Clamp a value between min and max.\n */\nexport function clamp(value: number, min: number, max: number): number {\n return Math.max(min, Math.min(max, value));\n}\n\n/**\n * Normalize a score to 0-100 range.\n */\nexport function normalizeScore(score: number): number {\n return clamp(Math.round(score), 0, 100);\n}\n","/**\n * Report Formatting Utilities\n *\n * Format eval results for different output targets.\n */\n\nimport type { EvalResult, ReportOptions, EvalEvidence } from '../types.js';\n\n// ============================================================================\n// CONSOLE REPORTER\n// ============================================================================\n\n/**\n * Format an eval result for console output.\n */\nexport function formatConsoleReport(\n result: EvalResult,\n options: Partial<ReportOptions> = {}\n): string {\n const lines: string[] = [];\n const { includeEvidence = true } = options;\n\n // Header\n const status = result.passed ? '✓ PASSED' : '✗ FAILED';\n const statusColor = result.passed ? 
'\\x1b[32m' : '\\x1b[31m';\n const reset = '\\x1b[0m';\n\n lines.push('');\n lines.push('═'.repeat(60));\n lines.push(`${statusColor}${status}${reset} - Overall Score: ${result.overallScore}/100`);\n lines.push('═'.repeat(60));\n\n // Summary\n lines.push('');\n lines.push(`Summary: ${result.summary}`);\n lines.push(`Duration: ${result.duration}ms`);\n\n // Individual scores\n lines.push('');\n lines.push('Scores:');\n for (const [name, score] of Object.entries(result.scores)) {\n const bar = createProgressBar(score, 20);\n const formattedName = formatScoreName(name);\n lines.push(` ${formattedName.padEnd(20)} ${bar} ${score}/100`);\n }\n\n // Evidence (if requested)\n if (includeEvidence && result.evidence.length > 0) {\n lines.push('');\n lines.push('Evidence:');\n for (const evidence of result.evidence) {\n lines.push(` [${evidence.criterion}] (${evidence.score}/100)`);\n lines.push(` ${evidence.reasoning}`);\n if (evidence.examples && evidence.examples.length > 0) {\n for (const example of evidence.examples.slice(0, 3)) {\n lines.push(` - ${example}`);\n }\n }\n }\n }\n\n lines.push('');\n lines.push('─'.repeat(60));\n\n return lines.join('\\n');\n}\n\n/**\n * Create a text progress bar.\n */\nfunction createProgressBar(value: number, width: number): string {\n const filled = Math.round((value / 100) * width);\n const empty = width - filled;\n return `[${'█'.repeat(filled)}${'░'.repeat(empty)}]`;\n}\n\n/**\n * Format a score name for display (camelCase -> Title Case).\n */\nfunction formatScoreName(name: string): string {\n return name\n .replace(/([A-Z])/g, ' $1')\n .replace(/^./, (str) => str.toUpperCase())\n .trim();\n}\n\n// ============================================================================\n// JSON REPORTER\n// ============================================================================\n\n/**\n * Format an eval result as JSON.\n */\nexport function formatJsonReport(\n result: EvalResult,\n options: Partial<ReportOptions> = {}\n): string {\n 
const { includeEvidence = true, includeRawData = false } = options;\n\n const output: Record<string, unknown> = {\n passed: result.passed,\n overallScore: result.overallScore,\n scores: result.scores,\n summary: result.summary,\n duration: result.duration,\n };\n\n if (includeEvidence) {\n output.evidence = result.evidence;\n }\n\n // Include any additional properties from extended result types\n for (const [key, value] of Object.entries(result)) {\n if (\n !['passed', 'overallScore', 'scores', 'evidence', 'summary', 'duration'].includes(key) &&\n (includeRawData || !isRawData(value))\n ) {\n output[key] = value;\n }\n }\n\n return JSON.stringify(output, null, 2);\n}\n\n/**\n * Check if a value looks like raw data (large arrays/objects).\n */\nfunction isRawData(value: unknown): boolean {\n if (Array.isArray(value) && value.length > 10) return true;\n if (typeof value === 'object' && value !== null) {\n const keys = Object.keys(value);\n if (keys.length > 20) return true;\n }\n return false;\n}\n\n// ============================================================================\n// SUMMARY GENERATION\n// ============================================================================\n\n/**\n * Generate a human-readable summary from scores.\n */\nexport function generateSummary(\n scores: Record<string, number>,\n thresholds: { good: number; acceptable: number } = { good: 80, acceptable: 60 }\n): string {\n const entries = Object.entries(scores);\n const avgScore = entries.reduce((sum, [, score]) => sum + score, 0) / entries.length;\n\n const goodMetrics = entries.filter(([, score]) => score >= thresholds.good).map(([name]) => name);\n const poorMetrics = entries.filter(([, score]) => score < thresholds.acceptable).map(([name]) => name);\n\n const parts: string[] = [];\n\n if (avgScore >= thresholds.good) {\n parts.push('Strong overall performance');\n } else if (avgScore >= thresholds.acceptable) {\n parts.push('Acceptable performance with room for improvement');\n } 
else {\n parts.push('Performance below acceptable thresholds');\n }\n\n if (goodMetrics.length > 0) {\n parts.push(`Strong: ${goodMetrics.map(formatScoreName).join(', ')}`);\n }\n\n if (poorMetrics.length > 0) {\n parts.push(`Needs improvement: ${poorMetrics.map(formatScoreName).join(', ')}`);\n }\n\n return parts.join('. ') + '.';\n}\n\n// ============================================================================\n// PRINT HELPERS\n// ============================================================================\n\n/**\n * Print an eval result to the console.\n */\nexport function printReport(result: EvalResult, options: Partial<ReportOptions> = {}): void {\n const format = options.format || 'console';\n\n if (format === 'json') {\n console.log(formatJsonReport(result, options));\n } else {\n console.log(formatConsoleReport(result, options));\n }\n}\n","/**\n * Introspection Eval - Layer 1\n *\n * Evaluates whether introspection correctly understands a KB's content\n * by testing entity coverage, slot accuracy, scope precision, and capability matching.\n */\n\nimport { Pinecone } from '@pinecone-database/pinecone';\nimport { createLLMGrader } from '../utils/llm-grader.js';\nimport { calculateWeightedScore, normalizeScore } from '../utils/metrics.js';\nimport { generateSummary } from '../utils/reporters.js';\nimport type { EvalEvidence } from '../types.js';\nimport type {\n IntrospectionEvalConfig,\n IntrospectionEvalResult,\n KBManifest,\n EntityCoverageResult,\n SlotAccuracyResult,\n ScopePrecisionResult,\n CapabilityMatchResult,\n} from './types.js';\n\nexport type { IntrospectionEvalConfig, IntrospectionEvalResult, GroundTruthQuery, KBManifest } from './types.js';\n\n// ============================================================================\n// MAIN EVALUATION FUNCTION\n// ============================================================================\n\n/**\n * Evaluate the quality of a generated manifest against a Pinecone assistant.\n */\nexport async 
function evaluateIntrospection(\n config: IntrospectionEvalConfig\n): Promise<IntrospectionEvalResult> {\n const startTime = Date.now();\n\n // Initialize Pinecone\n const apiKey = config.pineconeApiKey || process.env.PINECONE_API_KEY;\n if (!apiKey) {\n throw new Error('PINECONE_API_KEY is required for introspection eval');\n }\n\n const pinecone = new Pinecone({ apiKey });\n const assistant = pinecone.assistant(config.assistantName);\n\n // Run sub-evaluations\n const [entityResult, slotResult, scopeResult, capabilityResult] = await Promise.all([\n evaluateEntityCoverage(assistant, config.manifest, config),\n evaluateSlotAccuracy(assistant, config.manifest, config),\n evaluateScopePrecision(assistant, config.manifest, config),\n evaluateCapabilityMatch(assistant, config.manifest, config),\n ]);\n\n // Calculate overall score with weights\n const scores = {\n entityCoverage: entityResult.score,\n slotAccuracy: slotResult.score,\n scopePrecision: scopeResult.score,\n capabilityMatch: capabilityResult.score,\n };\n\n const overallScore = calculateWeightedScore([\n { score: scores.entityCoverage, weight: 0.25 },\n { score: scores.slotAccuracy, weight: 0.30 },\n { score: scores.scopePrecision, weight: 0.25 },\n { score: scores.capabilityMatch, weight: 0.20 },\n ]);\n\n // Combine evidence\n const evidence: EvalEvidence[] = [\n ...entityResult.evidence,\n ...slotResult.evidence,\n ...scopeResult.evidence,\n ...capabilityResult.evidence,\n ];\n\n return {\n passed: overallScore >= 70,\n overallScore,\n scores,\n evidence,\n summary: generateSummary(scores),\n duration: Date.now() - startTime,\n details: {\n missingEntities: entityResult.missing,\n incorrectSlots: slotResult.incorrect,\n scopeMisclassifications: scopeResult.misclassified,\n mismatchedCapabilities: capabilityResult.mismatched,\n },\n manifest: config.manifest,\n };\n}\n\n// ============================================================================\n// SUB-EVALUATIONS\n// 
============================================================================\n\n/**\n * Evaluate entity coverage: Does the manifest capture entities from the KB?\n */\nasync function evaluateEntityCoverage(\n assistant: ReturnType<Pinecone['assistant']>,\n manifest: KBManifest,\n config: IntrospectionEvalConfig\n): Promise<EntityCoverageResult> {\n const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });\n\n // Get entities from manifest (from slots, capabilities, description)\n const manifestEntities = extractEntitiesFromManifest(manifest);\n\n // Query the KB to discover actual entities\n const discoveryQueries = [\n 'What are the main topics you can help with?',\n 'What products or entities do you have information about?',\n 'List the categories of information you contain.',\n ];\n\n const discoveredEntities: Set<string> = new Set();\n for (const query of discoveryQueries) {\n try {\n const response = await assistant.chat({\n messages: [{ role: 'user', content: query }],\n });\n const entities = await extractEntitiesWithLLM(grader, response.message?.content || '');\n entities.forEach((e) => discoveredEntities.add(e.toLowerCase()));\n } catch {\n // Skip failed queries\n }\n }\n\n // Also check ground truth if provided\n if (config.groundTruth) {\n for (const gt of config.groundTruth) {\n if (gt.expectedEntities) {\n gt.expectedEntities.forEach((e) => discoveredEntities.add(e.toLowerCase()));\n }\n }\n }\n\n // Calculate coverage\n const found: string[] = [];\n const missing: string[] = [];\n\n for (const entity of discoveredEntities) {\n if (manifestEntities.some((me) => me.toLowerCase().includes(entity) || entity.includes(me.toLowerCase()))) {\n found.push(entity);\n } else {\n missing.push(entity);\n }\n }\n\n const score = discoveredEntities.size === 0\n ? 
100\n : normalizeScore((found.length / discoveredEntities.size) * 100);\n\n return {\n score,\n evidence: [\n {\n criterion: 'entityCoverage',\n score,\n reasoning: `Found ${found.length}/${discoveredEntities.size} expected entities in the manifest.`,\n examples: missing.slice(0, 5),\n },\n ],\n found,\n missing,\n };\n}\n\n/**\n * Evaluate slot accuracy: Are the defined slots appropriate?\n */\nasync function evaluateSlotAccuracy(\n assistant: ReturnType<Pinecone['assistant']>,\n manifest: KBManifest,\n config: IntrospectionEvalConfig\n): Promise<SlotAccuracyResult> {\n const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });\n\n const slots = manifest.slots || [];\n if (slots.length === 0) {\n return {\n score: 50, // Penalize but don't fail for no slots\n evidence: [\n {\n criterion: 'slotAccuracy',\n score: 50,\n reasoning: 'No slots defined in manifest. Consider adding slots for common query parameters.',\n },\n ],\n correct: [],\n incorrect: [],\n };\n }\n\n // Test each slot by asking the KB questions that would require it\n const correct: string[] = [];\n const incorrect: string[] = [];\n\n for (const slot of slots) {\n const testQuery = `To answer questions about ${manifest.domain || 'this topic'}, do I need to know the ${slot.name}?`;\n\n try {\n const response = await assistant.chat({\n messages: [{ role: 'user', content: testQuery }],\n });\n\n // Use LLM to evaluate if the slot seems relevant\n const evaluation = await grader.grade(\n `Slot: ${slot.name}\\nDescription: ${slot.description || 'N/A'}\\nKB Response: ${response.message?.content}`,\n 'slot relevance',\n 'Score 100 if the slot seems relevant to the KB content, 0 if completely irrelevant.'\n );\n\n if (evaluation.score >= 60) {\n correct.push(slot.name);\n } else {\n incorrect.push(slot.name);\n }\n } catch {\n // If we can't test, assume correct\n correct.push(slot.name);\n }\n }\n\n const score = normalizeScore((correct.length / slots.length) * 100);\n\n return {\n score,\n 
evidence: [\n {\n criterion: 'slotAccuracy',\n score,\n reasoning: `${correct.length}/${slots.length} slots appear relevant to the KB content.`,\n examples: incorrect,\n },\n ],\n correct,\n incorrect,\n };\n}\n\n/**\n * Evaluate scope precision: Are in/out scope boundaries accurate?\n */\nasync function evaluateScopePrecision(\n assistant: ReturnType<Pinecone['assistant']>,\n manifest: KBManifest,\n config: IntrospectionEvalConfig\n): Promise<ScopePrecisionResult> {\n const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });\n\n const scope = manifest.scope;\n if (!scope) {\n return {\n score: 60, // Penalize but don't fail\n evidence: [\n {\n criterion: 'scopePrecision',\n score: 60,\n reasoning: 'No scope definition in manifest. Consider defining in-scope and out-of-scope examples.',\n },\n ],\n correctClassifications: 0,\n totalClassifications: 0,\n misclassified: [],\n };\n }\n\n const testCases: Array<{ query: string; expectedInScope: boolean }> = [];\n\n // Add scope examples from manifest\n if (scope.inScopeExamples) {\n scope.inScopeExamples.forEach((q) => testCases.push({ query: q, expectedInScope: true }));\n }\n if (scope.outOfScopeExamples) {\n scope.outOfScopeExamples.forEach((q) => testCases.push({ query: q, expectedInScope: false }));\n }\n\n // Add ground truth if provided\n if (config.groundTruth) {\n config.groundTruth\n .filter((gt) => gt.shouldBeInScope !== undefined)\n .forEach((gt) => testCases.push({ query: gt.query, expectedInScope: gt.shouldBeInScope! }));\n }\n\n if (testCases.length === 0) {\n return {\n score: 70,\n evidence: [\n {\n criterion: 'scopePrecision',\n score: 70,\n reasoning: 'No scope test cases available. 
Add in-scope and out-of-scope examples to test.',\n },\n ],\n correctClassifications: 0,\n totalClassifications: 0,\n misclassified: [],\n };\n }\n\n let correctClassifications = 0;\n const misclassified: string[] = [];\n\n for (const testCase of testCases.slice(0, 10)) { // Limit to 10 tests\n try {\n const response = await assistant.chat({\n messages: [{ role: 'user', content: testCase.query }],\n });\n\n // Use LLM to determine if the KB could answer\n const evaluation = await grader.grade(\n `Query: ${testCase.query}\\nKB Response: ${response.message?.content}`,\n 'answerability',\n 'Score 100 if the KB provided a substantive, on-topic answer. Score 0 if it said it cannot help or gave an off-topic response.'\n );\n\n const actuallyInScope = evaluation.score >= 50;\n\n if (actuallyInScope === testCase.expectedInScope) {\n correctClassifications++;\n } else {\n misclassified.push(`\"${testCase.query}\" (expected ${testCase.expectedInScope ? 'in-scope' : 'out-of-scope'})`);\n }\n } catch {\n // Skip failed queries\n }\n }\n\n const score = normalizeScore((correctClassifications / testCases.length) * 100);\n\n return {\n score,\n evidence: [\n {\n criterion: 'scopePrecision',\n score,\n reasoning: `${correctClassifications}/${testCases.length} scope classifications were correct.`,\n examples: misclassified.slice(0, 3),\n },\n ],\n correctClassifications,\n totalClassifications: testCases.length,\n misclassified,\n };\n}\n\n/**\n * Evaluate capability match: Do stated capabilities match KB content?\n */\nasync function evaluateCapabilityMatch(\n assistant: ReturnType<Pinecone['assistant']>,\n manifest: KBManifest,\n config: IntrospectionEvalConfig\n): Promise<CapabilityMatchResult> {\n const grader = createLLMGrader({ openaiApiKey: config.openaiApiKey });\n\n const capabilities = (manifest.capabilities || []).map((c) =>\n typeof c === 'string' ? 
c : c.text\n );\n\n if (capabilities.length === 0) {\n return {\n score: 50,\n evidence: [\n {\n criterion: 'capabilityMatch',\n score: 50,\n reasoning: 'No capabilities defined in manifest.',\n },\n ],\n matched: [],\n mismatched: [],\n };\n }\n\n const matched: string[] = [];\n const mismatched: string[] = [];\n\n for (const capability of capabilities.slice(0, 5)) { // Limit to 5\n // Generate a test query for this capability\n const testQuery = `Can you help me with: ${capability}`;\n\n try {\n const response = await assistant.chat({\n messages: [{ role: 'user', content: testQuery }],\n });\n\n // Check if the KB can actually help with this\n const evaluation = await grader.grade(\n `Capability: ${capability}\\nKB Response: ${response.message?.content}`,\n 'capability fulfillment',\n 'Score 100 if the KB demonstrated it can help with this capability. Score 0 if it cannot.'\n );\n\n if (evaluation.score >= 60) {\n matched.push(capability);\n } else {\n mismatched.push(capability);\n }\n } catch {\n // Assume matched if we can't test\n matched.push(capability);\n }\n }\n\n const score = normalizeScore((matched.length / capabilities.length) * 100);\n\n return {\n score,\n evidence: [\n {\n criterion: 'capabilityMatch',\n score,\n reasoning: `${matched.length}/${capabilities.length} stated capabilities match actual KB content.`,\n examples: mismatched,\n },\n ],\n matched,\n mismatched,\n };\n}\n\n// ============================================================================\n// HELPERS\n// ============================================================================\n\n/**\n * Extract entity-like terms from a manifest.\n */\nfunction extractEntitiesFromManifest(manifest: KBManifest): string[] {\n const entities: Set<string> = new Set();\n\n // From slots\n if (manifest.slots) {\n manifest.slots.forEach((slot) => {\n if (slot.examples) {\n slot.examples.forEach((e) => entities.add(e));\n }\n });\n }\n\n // From domain\n if (manifest.domain) {\n 
entities.add(manifest.domain);\n }\n\n // From capabilities\n if (manifest.capabilities) {\n manifest.capabilities.forEach((c) => {\n const text = typeof c === 'string' ? c : c.text;\n // Extract nouns (simple heuristic)\n const words = text.split(/\\s+/).filter((w) => w.length > 3 && /^[A-Z]/.test(w));\n words.forEach((w) => entities.add(w.toLowerCase()));\n });\n }\n\n return Array.from(entities);\n}\n\n/**\n * Use LLM to extract entities from text.\n */\nasync function extractEntitiesWithLLM(\n grader: ReturnType<typeof createLLMGrader>,\n text: string\n): Promise<string[]> {\n try {\n const result = await grader.grade(\n text,\n 'entity extraction',\n 'List the main entities (products, topics, categories) mentioned. Return just the entity names separated by commas.'\n );\n\n // Parse the reasoning as a comma-separated list\n return result.reasoning\n .split(',')\n .map((e) => e.trim().toLowerCase())\n .filter((e) => e.length > 2);\n } catch {\n return [];\n }\n}\n"]}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import { B as BaseEvalConfig, b as EvalResult } from '../types-BJjlqNhg.cjs';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Introspection Eval Types
|
|
5
|
+
*
|
|
6
|
+
* Types for evaluating manifest quality and introspection accuracy.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Configuration for introspection evaluation.
|
|
11
|
+
*/
|
|
12
|
+
interface IntrospectionEvalConfig extends BaseEvalConfig {
|
|
13
|
+
/** The Pinecone assistant name to evaluate against */
|
|
14
|
+
assistantName: string;
|
|
15
|
+
/** The generated manifest to evaluate */
|
|
16
|
+
manifest: KBManifest;
|
|
17
|
+
/** Ground truth queries with expected outcomes */
|
|
18
|
+
groundTruth?: GroundTruthQuery[];
|
|
19
|
+
/** Number of test queries to generate if no ground truth provided */
|
|
20
|
+
autoGenerateQueries?: number;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* A ground truth query for validation.
|
|
24
|
+
*/
|
|
25
|
+
interface GroundTruthQuery {
|
|
26
|
+
/** The query to test */
|
|
27
|
+
query: string;
|
|
28
|
+
/** Expected entities that should be recognized */
|
|
29
|
+
expectedEntities?: string[];
|
|
30
|
+
/** Expected slots that should be extracted */
|
|
31
|
+
expectedSlots?: string[];
|
|
32
|
+
/** Expected capabilities that should be matched */
|
|
33
|
+
expectedCapabilities?: string[];
|
|
34
|
+
/** Whether this query should be in scope (true) or out of scope (false) */
|
|
35
|
+
shouldBeInScope?: boolean;
|
|
36
|
+
}
|
|
37
|
+
/**
|
|
38
|
+
* Result from introspection evaluation.
|
|
39
|
+
*/
|
|
40
|
+
interface IntrospectionEvalResult extends EvalResult {
|
|
41
|
+
/** Individual metric scores */
|
|
42
|
+
scores: {
|
|
43
|
+
/** Percentage of expected entities captured in the manifest */
|
|
44
|
+
entityCoverage: number;
|
|
45
|
+
/** Accuracy of slot extraction */
|
|
46
|
+
slotAccuracy: number;
|
|
47
|
+
/** Precision of scope boundary classification */
|
|
48
|
+
scopePrecision: number;
|
|
49
|
+
/** How well capabilities match actual KB content */
|
|
50
|
+
capabilityMatch: number;
|
|
51
|
+
};
|
|
52
|
+
/** Detailed findings */
|
|
53
|
+
details: {
|
|
54
|
+
/** Entities that were expected but missing from manifest */
|
|
55
|
+
missingEntities: string[];
|
|
56
|
+
/** Slots that were incorrectly defined or missing */
|
|
57
|
+
incorrectSlots: string[];
|
|
58
|
+
/** Queries that were misclassified as in/out of scope */
|
|
59
|
+
scopeMisclassifications: string[];
|
|
60
|
+
/** Capabilities that don't match KB content */
|
|
61
|
+
mismatchedCapabilities: string[];
|
|
62
|
+
};
|
|
63
|
+
/** The manifest that was evaluated */
|
|
64
|
+
manifest: KBManifest;
|
|
65
|
+
}
|
|
66
|
+
/**
|
|
67
|
+
* Simplified manifest type for eval.
|
|
68
|
+
* The full type is in @kat/core, but we define a minimal version here
|
|
69
|
+
* to avoid a hard dependency.
|
|
70
|
+
*/
|
|
71
|
+
interface KBManifest {
|
|
72
|
+
id: string;
|
|
73
|
+
assistantName: string;
|
|
74
|
+
domain?: string;
|
|
75
|
+
description?: string;
|
|
76
|
+
capabilities?: Array<string | {
|
|
77
|
+
text: string;
|
|
78
|
+
}>;
|
|
79
|
+
slots?: Array<{
|
|
80
|
+
name: string;
|
|
81
|
+
type?: string;
|
|
82
|
+
description?: string;
|
|
83
|
+
required?: boolean;
|
|
84
|
+
examples?: string[];
|
|
85
|
+
}>;
|
|
86
|
+
scope?: {
|
|
87
|
+
description?: string;
|
|
88
|
+
inScopeExamples?: string[];
|
|
89
|
+
outOfScopeExamples?: string[];
|
|
90
|
+
excludedIntents?: string[];
|
|
91
|
+
};
|
|
92
|
+
outputs?: string[];
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Introspection Eval - Layer 1
|
|
97
|
+
*
|
|
98
|
+
* Evaluates whether introspection correctly understands a KB's content
|
|
99
|
+
* by testing entity coverage, slot accuracy, scope precision, and capability matching.
|
|
100
|
+
*/
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* Evaluate the quality of a generated manifest against a Pinecone assistant.
|
|
104
|
+
*/
|
|
105
|
+
declare function evaluateIntrospection(config: IntrospectionEvalConfig): Promise<IntrospectionEvalResult>;
|
|
106
|
+
|
|
107
|
+
export { type GroundTruthQuery, type IntrospectionEvalConfig, type IntrospectionEvalResult, type KBManifest, evaluateIntrospection };
|