@kat-ai/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +114 -0
- package/dist/agent/index.cjs +468 -0
- package/dist/agent/index.cjs.map +1 -0
- package/dist/agent/index.d.cts +170 -0
- package/dist/agent/index.d.ts +170 -0
- package/dist/agent/index.js +466 -0
- package/dist/agent/index.js.map +1 -0
- package/dist/index.cjs +1043 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +93 -0
- package/dist/index.d.ts +93 -0
- package/dist/index.js +1032 -0
- package/dist/index.js.map +1 -0
- package/dist/introspection/index.cjs +476 -0
- package/dist/introspection/index.cjs.map +1 -0
- package/dist/introspection/index.d.cts +107 -0
- package/dist/introspection/index.d.ts +107 -0
- package/dist/introspection/index.js +474 -0
- package/dist/introspection/index.js.map +1 -0
- package/dist/retrieval/index.cjs +312 -0
- package/dist/retrieval/index.cjs.map +1 -0
- package/dist/retrieval/index.d.cts +98 -0
- package/dist/retrieval/index.d.ts +98 -0
- package/dist/retrieval/index.js +310 -0
- package/dist/retrieval/index.js.map +1 -0
- package/dist/types-BJjlqNhg.d.cts +112 -0
- package/dist/types-BJjlqNhg.d.ts +112 -0
- package/package.json +79 -0
package/README.md
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# @kat-ai/eval
|
|
2
|
+
|
|
3
|
+
Evaluation framework for KAT RAG systems. Provides layered quality metrics for introspection, retrieval, and end-to-end agent behavior.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install @kat-ai/eval
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Release baseline (compatible package set):
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
npm install @kat-ai/sdk@0.1.0 @kat-ai/eval@0.1.0 @kat-ai/cli@0.1.0
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Quick Start
|
|
18
|
+
|
|
19
|
+
```typescript
|
|
20
|
+
import { evaluateIntrospection, evaluateRetrieval, evaluateAgent } from '@kat-ai/eval';
|
|
21
|
+
|
|
22
|
+
// Layer 1: Evaluate manifest quality
|
|
23
|
+
const introspectionResult = await evaluateIntrospection({
|
|
24
|
+
assistantName: 'my-kb',
|
|
25
|
+
manifest: generatedManifest,
|
|
26
|
+
groundTruth: [
|
|
27
|
+
{ query: 'What products do you cover?', expectedEntities: ['toaster', 'blender'] },
|
|
28
|
+
],
|
|
29
|
+
});
|
|
30
|
+
console.log(`Introspection score: ${introspectionResult.overallScore}/100`);
|
|
31
|
+
|
|
32
|
+
// Layer 2: Evaluate retrieval quality
|
|
33
|
+
const retrievalResult = await evaluateRetrieval({
|
|
34
|
+
assistantName: 'my-kb',
|
|
35
|
+
queries: [
|
|
36
|
+
{ query: 'How to fix a toaster?', expectedTopics: ['heating element', 'troubleshooting'] },
|
|
37
|
+
],
|
|
38
|
+
});
|
|
39
|
+
console.log(`Retrieval score: ${retrievalResult.overallScore}/100`);
|
|
40
|
+
|
|
41
|
+
// Layer 3: Evaluate agent behavior
|
|
42
|
+
const agentResult = await evaluateAgent({
|
|
43
|
+
agentEndpoint: 'http://localhost:3000/api/chat',
|
|
44
|
+
scenarios: [
|
|
45
|
+
{
|
|
46
|
+
name: 'basic-troubleshoot',
|
|
47
|
+
initialQuery: "My toaster won't heat up",
|
|
48
|
+
expectedOutcome: 'answer',
|
|
49
|
+
evaluation: { mustContain: ['heating element'] },
|
|
50
|
+
},
|
|
51
|
+
],
|
|
52
|
+
});
|
|
53
|
+
console.log(`Agent score: ${agentResult.overallScore}/100`);
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Eval Layers
|
|
57
|
+
|
|
58
|
+
### Layer 1: Introspection Eval
|
|
59
|
+
|
|
60
|
+
Evaluates whether introspection correctly understands a KB's content:
|
|
61
|
+
|
|
62
|
+
- **Entity Coverage**: Does the manifest capture all entities in the KB?
|
|
63
|
+
- **Slot Accuracy**: Are extracted slots correct for the domain?
|
|
64
|
+
- **Scope Precision**: Are in/out scope boundaries accurate?
|
|
65
|
+
- **Capability Match**: Do capabilities match actual KB content?
|
|
66
|
+
|
|
67
|
+
### Layer 2: Retrieval Eval
|
|
68
|
+
|
|
69
|
+
Evaluates whether RAG retrieves relevant chunks:
|
|
70
|
+
|
|
71
|
+
- **Relevance**: Are retrieved chunks relevant to the query?
|
|
72
|
+
- **Recall**: Are expected topics found in retrieved chunks?
|
|
73
|
+
- **Precision**: What percentage of retrieved content is relevant?
|
|
74
|
+
- **Noise Ratio**: How much irrelevant content is retrieved?
|
|
75
|
+
|
|
76
|
+
### Layer 3: Agent Eval
|
|
77
|
+
|
|
78
|
+
Evaluates end-to-end agent behavior:
|
|
79
|
+
|
|
80
|
+
- **Accuracy**: Does the agent produce the expected outcome type?
|
|
81
|
+
- **Relevance**: Is the answer relevant to the query?
|
|
82
|
+
- **Completeness**: Does the answer fully address the question?
|
|
83
|
+
- **Helpfulness**: Is the response actionable and helpful?
|
|
84
|
+
|
|
85
|
+
## CLI Usage
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# Run all eval layers with the canonical baseline bundle
|
|
89
|
+
kat eval --assistant my-kb --endpoint http://localhost:3000/api/chat --baseline
|
|
90
|
+
# Equivalent explicit path:
|
|
91
|
+
# kat eval --assistant my-kb --endpoint http://localhost:3000/api/chat --scenarios ./eval/baseline/naive-rag-baseline.json
|
|
92
|
+
|
|
93
|
+
# Run specific layer
|
|
94
|
+
kat eval --layer introspection --assistant my-kb --scenarios ./eval/baseline/introspection-ground-truth.json
|
|
95
|
+
kat eval --layer retrieval --assistant my-kb --scenarios ./eval/baseline/retrieval-queries.json
|
|
96
|
+
kat eval --layer agent --endpoint http://localhost:3000/api/chat --scenarios ./eval/baseline/agent-scenarios.json
|
|
97
|
+
|
|
98
|
+
# Output as JSON
|
|
99
|
+
kat eval --assistant my-kb --output json > results.json
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Baseline fixtures are checked in at:
|
|
103
|
+
- `eval/baseline/naive-rag-baseline.json`
|
|
104
|
+
- `eval/baseline/introspection-ground-truth.json`
|
|
105
|
+
- `eval/baseline/retrieval-queries.json`
|
|
106
|
+
- `eval/baseline/agent-scenarios.json`
|
|
107
|
+
|
|
108
|
+
When running `--output json`:
|
|
109
|
+
- `--layer all` outputs an array: `[{ layer, result }, ...]`
|
|
110
|
+
- single-layer runs output only that layer's `result` object
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
MIT
|
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var openai = require('@ai-sdk/openai');
|
|
4
|
+
var ai = require('ai');
|
|
5
|
+
var zod = require('zod');
|
|
6
|
+
var core = require('@kat/core');
|
|
7
|
+
|
|
8
|
+
// src/utils/llm-grader.ts
|
|
9
|
+
// Structured-output schema for a single-criterion grade returned by the LLM.
// `examples` is optional supporting evidence cited by the grader.
var GradeSchema = zod.z.object({
  score: zod.z.number().min(0).max(100).describe("Score from 0-100"),
  reasoning: zod.z.string().describe("Explanation for the score"),
  examples: zod.z.array(zod.z.string()).optional().describe("Specific examples that influenced the score")
});
// Structured-output schema for grading one piece of content on several
// criteria at once, plus an overall free-text rationale.
var MultiCriteriaGradeSchema = zod.z.object({
  scores: zod.z.array(zod.z.object({
    criterion: zod.z.string(),
    score: zod.z.number().min(0).max(100),
    reasoning: zod.z.string()
  })),
  overallReasoning: zod.z.string()
});
|
|
22
|
+
/**
 * Build an LLM-backed grader bound to an OpenAI model.
 *
 * Resolves the API key from `config.openaiApiKey` or the OPENAI_API_KEY env
 * var and throws if neither is present. The model falls back to
 * `core.resolveDefaultOpenAiChatModelId()` (project helper; exact default
 * model unknown from here). Temperature defaults to 0.1 — `??` is used so an
 * explicit 0 is honored.
 *
 * Returns an object with three async methods:
 *  - grade(content, criterion, rubric?): single-criterion 0-100 grade.
 *  - gradeMultiple(content, criteria): one grade per weighted criterion.
 *  - gradeRelevance(query, content): relevance of content to a query.
 * All three call `ai.generateObject` with a zod schema, so results are
 * structured, not free text.
 */
function createLLMGrader(config = {}) {
  const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error("OPENAI_API_KEY is required for LLM grading");
  }
  // Shadowed name: `openai$1` is the configured provider instance; `openai`
  // is the imported @ai-sdk/openai module.
  const openai$1 = openai.createOpenAI({ apiKey });
  const model = config.model || core.resolveDefaultOpenAiChatModelId();
  const temperature = config.temperature ?? 0.1;
  return {
    // Grade `content` against one named criterion; `rubric` is optional
    // extra guidance interpolated into the prompt.
    async grade(content, criterion, rubric) {
      const prompt = `You are an expert evaluator. Grade the following content on the criterion "${criterion}".

${rubric ? `Rubric: ${rubric}
` : ""}
Content to evaluate:
"""
${content}
"""

Provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      return result.object;
    },
    // Grade `content` against every criterion in one model call.
    // `criteria` items are expected to carry { name, description, weight };
    // the weight is only shown to the model here, not applied numerically.
    async gradeMultiple(content, criteria) {
      const criteriaDescription = criteria.map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`).join("\n");
      const prompt = `You are an expert evaluator. Grade the following content on multiple criteria.

Criteria:
${criteriaDescription}

Content to evaluate:
"""
${content}
"""

For each criterion, provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: MultiCriteriaGradeSchema,
        prompt,
        temperature
      });
      // Re-shape to plain { criterion, score, reasoning } records.
      return result.object.scores.map((s) => ({
        criterion: s.criterion,
        score: s.score,
        reasoning: s.reasoning
      }));
    },
    // Grade how relevant `content` is to `query` (100 = direct, complete
    // answer; 0 = irrelevant). Returns only { score, reasoning }.
    async gradeRelevance(query, content) {
      const prompt = `You are an expert evaluator. Grade how relevant the following content is to the given query.

Query: "${query}"

Content:
"""
${content}
"""

A score of 100 means the content directly and completely answers the query.
A score of 0 means the content is completely irrelevant.

Provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      return {
        score: result.object.score,
        reasoning: result.object.reasoning
      };
    }
  };
}
|
|
102
|
+
|
|
103
|
+
// src/utils/metrics.ts
|
|
104
|
+
/**
 * Combine per-criterion scores into one rounded score using their weights.
 * Each entry is { score, weight }. Returns 0 for an empty list or when the
 * weights sum to zero (avoids division by zero).
 */
function calculateWeightedScore(scores) {
  if (scores.length === 0) {
    return 0;
  }
  let weightTotal = 0;
  let weightedTotal = 0;
  for (const entry of scores) {
    weightTotal += entry.weight;
    weightedTotal += entry.score * entry.weight;
  }
  if (weightTotal === 0) {
    return 0;
  }
  return Math.round(weightedTotal / weightTotal);
}
|
|
111
|
+
/**
 * Arithmetic mean of a list of numbers; 0 for an empty list.
 */
function average(numbers) {
  if (numbers.length === 0) {
    return 0;
  }
  let total = 0;
  for (const value of numbers) {
    total += value;
  }
  return total / numbers.length;
}
|
|
115
|
+
/**
 * Restrict `value` to the inclusive range [min, max].
 */
function clamp(value, min, max) {
  const upperBounded = Math.min(max, value);
  return Math.max(min, upperBounded);
}
|
|
118
|
+
/**
 * Round a raw score and pin it to the 0-100 range.
 */
function normalizeScore(score) {
  const rounded = Math.round(score);
  return Math.max(0, Math.min(100, rounded));
}
|
|
121
|
+
|
|
122
|
+
// src/utils/reporters.ts
|
|
123
|
+
/**
 * Turn a camelCase metric key into a human-readable title,
 * e.g. "entityCoverage" -> "Entity Coverage".
 */
function formatScoreName(name) {
  const spaced = name.replace(/([A-Z])/g, " $1");
  const capitalized = spaced.replace(/^./, (first) => first.toUpperCase());
  return capitalized.trim();
}
|
|
126
|
+
/**
 * Build a one-line human summary from a map of metric name -> 0-100 score.
 * The average score picks the opening sentence; metrics at/above
 * `thresholds.good` and below `thresholds.acceptable` are listed by name.
 */
function generateSummary(scores, thresholds = { good: 80, acceptable: 60 }) {
  const pairs = Object.entries(scores);
  const mean = pairs.reduce((total, [, value]) => total + value, 0) / pairs.length;
  const strong = [];
  const weak = [];
  for (const [name, value] of pairs) {
    if (value >= thresholds.good) {
      strong.push(name);
    }
    if (value < thresholds.acceptable) {
      weak.push(name);
    }
  }
  const sentences = [];
  if (mean >= thresholds.good) {
    sentences.push("Strong overall performance");
  } else if (mean >= thresholds.acceptable) {
    sentences.push("Acceptable performance with room for improvement");
  } else {
    sentences.push("Performance below acceptable thresholds");
  }
  if (strong.length > 0) {
    sentences.push(`Strong: ${strong.map(formatScoreName).join(", ")}`);
  }
  if (weak.length > 0) {
    sentences.push(`Needs improvement: ${weak.map(formatScoreName).join(", ")}`);
  }
  return sentences.join(". ") + ".";
}
|
|
147
|
+
|
|
148
|
+
// src/agent/index.ts
|
|
149
|
+
/**
 * Layer-3 eval entry point: run every configured scenario end-to-end against
 * the agent, then aggregate per-criterion scores into an overall result.
 *
 * Throws if `config.scenarios` is empty. Scenarios run sequentially (each
 * `runScenario` is awaited before the next starts).
 *
 * Scoring:
 *  - accuracy: percentage of scenarios whose `passed` flag is true.
 *  - relevance / completeness / helpfulness: averages of the matching
 *    evidence entries across scenarios that produced any evidence; 0 when no
 *    scenario yielded that criterion.
 *  - overallScore: weighted blend (0.3 / 0.25 / 0.25 / 0.2).
 * The run passes when overallScore >= 70.
 */
async function evaluateAgent(config) {
  const startTime = Date.now();
  if (config.scenarios.length === 0) {
    throw new Error("At least one scenario is required for agent eval");
  }
  const grader = createLLMGrader({
    openaiApiKey: config.openaiApiKey,
    model: config.graderConfig?.model,
    temperature: config.graderConfig?.temperature
  });
  const scenarioResults = [];
  // Sequential on purpose: scenarios share nothing, but serial execution
  // keeps agent/API load and ordering predictable.
  for (const scenario of config.scenarios) {
    const result = await runScenario(scenario, config, grader);
    scenarioResults.push(result);
  }
  const passedScenarios = scenarioResults.filter((r) => r.passed);
  const accuracy = normalizeScore(passedScenarios.length / scenarioResults.length * 100);
  // Only scenarios that produced evidence contribute to the graded averages.
  const gradedResults = scenarioResults.filter((r) => r.evaluation.evidence.length > 0);
  const relevanceScores = gradedResults.flatMap(
    (r) => r.evaluation.evidence.filter((e) => e.criterion === "relevance").map((e) => e.score)
  );
  const completenessScores = gradedResults.flatMap(
    (r) => r.evaluation.evidence.filter((e) => e.criterion === "completeness").map((e) => e.score)
  );
  const helpfulnessScores = gradedResults.flatMap(
    (r) => r.evaluation.evidence.filter((e) => e.criterion === "helpfulness").map((e) => e.score)
  );
  const relevance = relevanceScores.length > 0 ? normalizeScore(average(relevanceScores)) : 0;
  const completeness = completenessScores.length > 0 ? normalizeScore(average(completenessScores)) : 0;
  const helpfulness = helpfulnessScores.length > 0 ? normalizeScore(average(helpfulnessScores)) : 0;
  const scores = { accuracy, relevance, completeness, helpfulness };
  const overallScore = calculateWeightedScore([
    { score: accuracy, weight: 0.3 },
    { score: relevance, weight: 0.25 },
    { score: completeness, weight: 0.25 },
    { score: helpfulness, weight: 0.2 }
  ]);
  // Top-level evidence mirrors the four aggregate criteria; accuracy lists
  // up to three failing scenario names as examples.
  const evidence = [
    {
      criterion: "accuracy",
      score: accuracy,
      reasoning: `${passedScenarios.length}/${scenarioResults.length} scenarios passed.`,
      examples: scenarioResults.filter((r) => !r.passed).map((r) => r.scenario.name).slice(0, 3)
    },
    {
      criterion: "relevance",
      score: relevance,
      reasoning: `Average answer relevance across ${gradedResults.length} graded scenarios.`
    },
    {
      criterion: "completeness",
      score: completeness,
      reasoning: `Average answer completeness across ${gradedResults.length} graded scenarios.`
    },
    {
      criterion: "helpfulness",
      score: helpfulness,
      reasoning: `Average helpfulness across ${gradedResults.length} graded scenarios.`
    }
  ];
  return {
    passed: overallScore >= 70,
    overallScore,
    scores,
    evidence,
    summary: generateSummary(scores),
    duration: Date.now() - startTime,
    scenarioResults
  };
}
|
|
219
|
+
/**
 * Drive one conversational scenario against the agent, up to `maxTurns`
 * turns, then grade the final response.
 *
 * Loop behavior: each turn calls the agent, records the exchange, and rolls
 * the agent's returned context/intent plus the user/assistant messages into
 * the context for the next call. The loop stops early on an "answer",
 * "blocked", or "out_of_scope" outcome, or on a "follow_up" for which no
 * canned reply matches in `scenario.followUpResponses`.
 *
 * Any thrown error (agent call or grading) is converted into a failed
 * result with `finalOutcome: "error"` rather than propagating.
 */
async function runScenario(scenario, config, grader) {
  const startTime = Date.now();
  // Per-scenario turn limit wins over the config-wide one; default 5.
  const maxTurns = scenario.maxTurns || config.maxTurns || 5;
  // Default timeout is 60000 ms (6e4).
  const timeout = config.timeout || 6e4;
  const conversation = [];
  let currentMessage = scenario.initialQuery;
  let context = {
    // NOTE(review): Date.now()-based session ids can collide for scenarios
    // started in the same millisecond — presumably acceptable for evals.
    sessionId: `eval_${Date.now()}`,
    conversationHistory: []
  };
  let lastResponse = null;
  let turn = 0;
  try {
    while (turn < maxTurns) {
      turn++;
      const response = await callAgent(config.agentEndpoint, currentMessage, context, timeout);
      lastResponse = response;
      conversation.push({
        turn,
        userMessage: currentMessage,
        agentResponse: response
      });
      // Carry forward agent-provided context/intent and append both sides of
      // this turn to the running history.
      context = {
        ...context,
        previousContext: response.context,
        previousIntent: response.intent,
        conversationHistory: [
          ...context.conversationHistory || [],
          { role: "user", content: currentMessage },
          { role: "assistant", content: response.answer || response.followUpQuestion || "" }
        ]
      };
      if (response.outcome === "answer") {
        break;
      }
      if (response.outcome === "blocked" || response.outcome === "out_of_scope") {
        break;
      }
      if (response.outcome === "follow_up") {
        const followUpQuestion = response.followUpQuestion || "";
        const responseToFollowUp = generateFollowUpResponse(followUpQuestion, scenario);
        if (!responseToFollowUp) {
          // No scripted reply matches this follow-up: end the conversation.
          break;
        }
        currentMessage = responseToFollowUp;
      }
    }
    const evaluation = await evaluateScenarioResult(
      scenario,
      lastResponse,
      conversation,
      grader
    );
    // With no expectedOutcome, any outcome counts as a match.
    const outcomeMatch = scenario.expectedOutcome ? lastResponse?.outcome === scenario.expectedOutcome : true;
    const passed = outcomeMatch && evaluation.passed;
    return {
      scenario,
      passed,
      turns: turn,
      finalOutcome: lastResponse?.outcome || "error",
      finalAnswer: lastResponse?.outcome === "answer" ? lastResponse.answer || null : null,
      evaluation,
      conversation,
      duration: Date.now() - startTime
    };
  } catch (error) {
    // Fold any failure into a zero-score result so one broken scenario
    // cannot abort the whole eval run.
    return {
      scenario,
      passed: false,
      turns: turn,
      finalOutcome: "error",
      finalAnswer: null,
      evaluation: {
        passed: false,
        score: 0,
        evidence: [
          {
            criterion: "error",
            score: 0,
            reasoning: error instanceof Error ? error.message : String(error)
          }
        ]
      },
      conversation,
      duration: Date.now() - startTime,
      error: error instanceof Error ? error.message : String(error)
    };
  }
}
|
|
308
|
+
/**
 * Send one message to the agent and normalize its reply.
 *
 * `endpoint` is either a function (called directly — no timeout applied in
 * that path) or an HTTP URL, which receives a POST with snake_case fields
 * (message, session_id, previous_context, previous_intent,
 * conversation_history).
 *
 * The HTTP call is aborted via AbortController after `timeout` ms (surfaces
 * as an AbortError from fetch). Non-2xx responses throw. The JSON body is
 * normalized to accept both camelCase and snake_case field names, and a
 * missing `outcome` defaults to "answer".
 */
async function callAgent(endpoint, message, context, timeout) {
  if (typeof endpoint === "function") {
    return endpoint(message, context);
  }
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    const response = await fetch(endpoint, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        message,
        session_id: context.sessionId,
        previous_context: context.previousContext,
        previous_intent: context.previousIntent,
        conversation_history: context.conversationHistory
      }),
      signal: controller.signal
    });
    if (!response.ok) {
      throw new Error(`Agent returned ${response.status}: ${response.statusText}`);
    }
    const data = await response.json();
    return {
      outcome: data.outcome || "answer",
      answer: data.answer,
      followUpQuestion: data.followUpQuestion || data.follow_up_question,
      options: data.options,
      context: data.context,
      intent: data.intent,
      trace: data.trace,
      sessionId: data.session_id || data.sessionId
    };
  } finally {
    // Always cancel the pending abort timer, success or failure.
    clearTimeout(timeoutId);
  }
}
|
|
345
|
+
/**
 * Look up a scripted user reply for an agent follow-up question.
 * `scenario.followUpResponses` maps substring patterns to canned replies;
 * the first pattern found (case-insensitively) inside the question wins.
 * Returns null when the scenario has no responses or nothing matches.
 */
function generateFollowUpResponse(question, scenario) {
  const responses = scenario.followUpResponses;
  if (!responses) {
    return null;
  }
  const loweredQuestion = question.toLowerCase();
  for (const [pattern, reply] of Object.entries(responses)) {
    if (loweredQuestion.includes(pattern.toLowerCase())) {
      return reply;
    }
  }
  return null;
}
|
|
356
|
+
/**
 * Grade the final state of one scenario run.
 *
 * Non-answer outcomes short-circuit: if the scenario expected a non-"answer"
 * outcome (e.g. "blocked"), pass/fail is decided purely by outcome match
 * (100 or 0); otherwise a missing answer scores 0 with "noAnswer" evidence.
 *
 * For real answers, up to five criteria are averaged with equal weight:
 *  - mustContain: fraction of required terms present (case-insensitive
 *    substring match); missing terms are listed as examples.
 *  - mustNotContain: fraction of forbidden terms absent; found terms are
 *    listed as examples.
 *  - relevance / completeness / helpfulness: LLM grades via `grader.grade`.
 * The scenario passes when the unweighted average is >= 70.
 *
 * NOTE(review): `conversation` is accepted but never read here — presumably
 * reserved for multi-turn grading; confirm before removing.
 */
async function evaluateScenarioResult(scenario, response, conversation, grader) {
  const evidence = [];
  if (!response || response.outcome !== "answer" || !response.answer) {
    if (scenario.expectedOutcome && scenario.expectedOutcome !== "answer") {
      return {
        passed: response?.outcome === scenario.expectedOutcome,
        score: response?.outcome === scenario.expectedOutcome ? 100 : 0,
        evidence: [
          {
            criterion: "outcomeMatch",
            score: response?.outcome === scenario.expectedOutcome ? 100 : 0,
            reasoning: `Expected ${scenario.expectedOutcome}, got ${response?.outcome || "no response"}.`
          }
        ]
      };
    }
    return {
      passed: false,
      score: 0,
      evidence: [
        {
          criterion: "noAnswer",
          score: 0,
          reasoning: `Expected an answer but got ${response?.outcome || "no response"}.`
        }
      ]
    };
  }
  const answer = response.answer;
  let totalScore = 0;
  let criteriaCount = 0;
  // Deterministic string checks first (no LLM call needed).
  if (scenario.evaluation.mustContain) {
    const found = scenario.evaluation.mustContain.filter(
      (s) => answer.toLowerCase().includes(s.toLowerCase())
    );
    const score = normalizeScore(found.length / scenario.evaluation.mustContain.length * 100);
    totalScore += score;
    criteriaCount++;
    evidence.push({
      criterion: "mustContain",
      score,
      reasoning: `Found ${found.length}/${scenario.evaluation.mustContain.length} required terms.`,
      examples: scenario.evaluation.mustContain.filter(
        (s) => !answer.toLowerCase().includes(s.toLowerCase())
      )
    });
  }
  if (scenario.evaluation.mustNotContain) {
    const found = scenario.evaluation.mustNotContain.filter(
      (s) => answer.toLowerCase().includes(s.toLowerCase())
    );
    const score = normalizeScore((scenario.evaluation.mustNotContain.length - found.length) / scenario.evaluation.mustNotContain.length * 100);
    totalScore += score;
    criteriaCount++;
    evidence.push({
      criterion: "mustNotContain",
      score,
      reasoning: `Found ${found.length} forbidden terms.`,
      examples: found
    });
  }
  // Shared prompt for the three LLM-graded criteria; the optional rubric is
  // only appended when the scenario defines one.
  const gradingPrompt = `
Query: ${scenario.initialQuery}
Answer: ${answer}
${scenario.evaluation.rubric ? `Rubric: ${scenario.evaluation.rubric}` : ""}
`;
  // The three grader calls below run sequentially; they are independent, so
  // they could be parallelized if grading latency becomes a concern.
  const relevanceResult = await grader.grade(
    gradingPrompt,
    "relevance",
    "How relevant is the answer to the query? 100 = directly and completely addresses the query."
  );
  evidence.push({
    criterion: "relevance",
    score: relevanceResult.score,
    reasoning: relevanceResult.reasoning
  });
  totalScore += relevanceResult.score;
  criteriaCount++;
  const completenessResult = await grader.grade(
    gradingPrompt,
    "completeness",
    "How complete is the answer? 100 = fully addresses all aspects of the query."
  );
  evidence.push({
    criterion: "completeness",
    score: completenessResult.score,
    reasoning: completenessResult.reasoning
  });
  totalScore += completenessResult.score;
  criteriaCount++;
  const helpfulnessResult = await grader.grade(
    gradingPrompt,
    "helpfulness",
    "How helpful and actionable is the answer? 100 = provides clear, actionable guidance."
  );
  evidence.push({
    criterion: "helpfulness",
    score: helpfulnessResult.score,
    reasoning: helpfulnessResult.reasoning
  });
  totalScore += helpfulnessResult.score;
  criteriaCount++;
  const avgScore = criteriaCount > 0 ? totalScore / criteriaCount : 0;
  return {
    passed: avgScore >= 70,
    score: normalizeScore(avgScore),
    evidence
  };
}
|
|
465
|
+
|
|
466
|
+
// Public CJS surface of this bundle: only the layer-3 entry point.
exports.evaluateAgent = evaluateAgent;
// NOTE(review): the sourceMappingURL comment is duplicated below — looks
// like a bundler artifact; harmless, tools use the last occurrence.
//# sourceMappingURL=index.cjs.map
//# sourceMappingURL=index.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/utils/llm-grader.ts","../../src/utils/metrics.ts","../../src/utils/reporters.ts","../../src/agent/index.ts"],"names":["z","openai","createOpenAI","resolveDefaultOpenAiChatModelId","generateObject"],"mappings":";;;;;;;;AAgBA,IAAM,WAAA,GAAcA,MAAE,MAAA,CAAO;AAAA,EAC3B,KAAA,EAAOA,KAAA,CAAE,MAAA,EAAO,CAAE,GAAA,CAAI,CAAC,CAAA,CAAE,GAAA,CAAI,GAAG,CAAA,CAAE,QAAA,CAAS,kBAAkB,CAAA;AAAA,EAC7D,SAAA,EAAWA,KAAA,CAAE,MAAA,EAAO,CAAE,SAAS,2BAA2B,CAAA;AAAA,EAC1D,QAAA,EAAUA,KAAA,CAAE,KAAA,CAAMA,KAAA,CAAE,MAAA,EAAQ,CAAA,CAAE,QAAA,EAAS,CAAE,QAAA,CAAS,6CAA6C;AACjG,CAAC,CAAA;AAED,IAAM,wBAAA,GAA2BA,MAAE,MAAA,CAAO;AAAA,EACxC,MAAA,EAAQA,KAAA,CAAE,KAAA,CAAMA,KAAA,CAAE,MAAA,CAAO;AAAA,IACvB,SAAA,EAAWA,MAAE,MAAA,EAAO;AAAA,IACpB,KAAA,EAAOA,MAAE,MAAA,EAAO,CAAE,IAAI,CAAC,CAAA,CAAE,IAAI,GAAG,CAAA;AAAA,IAChC,SAAA,EAAWA,MAAE,MAAA;AAAO,GACrB,CAAC,CAAA;AAAA,EACF,gBAAA,EAAkBA,MAAE,MAAA;AACtB,CAAC,CAAA;AAiCM,SAAS,eAAA,CAAgB,MAAA,GAA0B,EAAC,EAAc;AACvE,EAAA,MAAM,MAAA,GAAS,MAAA,CAAO,YAAA,IAAgB,OAAA,CAAQ,GAAA,CAAI,cAAA;AAClD,EAAA,IAAI,CAAC,MAAA,EAAQ;AACX,IAAA,MAAM,IAAI,MAAM,4CAA4C,CAAA;AAAA,EAC9D;AAEA,EAAA,MAAMC,QAAA,GAASC,mBAAA,CAAa,EAAE,MAAA,EAAQ,CAAA;AACtC,EAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,KAAA,IAASC,oCAAA,EAAgC;AAC9D,EAAA,MAAM,WAAA,GAAc,OAAO,WAAA,IAAe,GAAA;AAE1C,EAAA,OAAO;AAAA,IACL,MAAM,KAAA,CAAM,OAAA,EAAiB,SAAA,EAAmB,MAAA,EAAiB;AAC/D,MAAA,MAAM,MAAA,GAAS,8EAA8E,SAAS,CAAA;;AAAA,EAE1G,MAAA,GAAS,WAAW,MAAM;AAAA,CAAA,GAAO,EAAE;AAAA;AAAA;AAAA,EAGnC,OAAO;AAAA;;AAAA,sDAAA,CAAA;AAKH,MAAA,MAAM,MAAA,GAAS,MAAMC,iBAAA,CAAe;AAAA,QAClC,KAAA,EAAOH,SAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,WAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO,MAAA,CAAO,MAAA;AAAA,IAChB,CAAA;AAAA,IAEA,MAAM,aAAA,CAAc,OAAA,EAAiB,QAAA,EAA2B;AAC9D,MAAA,MAAM,sBAAsB,QAAA,CACzB,GAAA,CAAI,CAAC,CAAA,KAAM,KAAK,CAAA,CAAE,IAAI,CAAA,EAAA,EAAK,CAAA,CAAE,WAAW,CAAA,UAAA,EAAa,CAAA,CAAE,MAAM,CAAA,CAAA,CAAG,CAAA,CAChE,KAAK,IAAI,CAAA;AAEZ,MAAA,MAAM,MAAA,GAAS,CAAA;;AAAA;AAAA,EAGnB,mBAAmB;;AAAA;AAAA;AAAA,EAInB,OAAO;AAAA;;AAAA,0EAAA
,CAAA;AAKH,MAAA,MAAM,MAAA,GAAS,MAAMG,iBAAA,CAAe;AAAA,QAClC,KAAA,EAAOH,SAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,wBAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO,MAAA,CAAO,MAAA,CAAO,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,MAAO;AAAA,QACtC,WAAW,CAAA,CAAE,SAAA;AAAA,QACb,OAAO,CAAA,CAAE,KAAA;AAAA,QACT,WAAW,CAAA,CAAE;AAAA,OACf,CAAE,CAAA;AAAA,IACJ,CAAA;AAAA,IAEA,MAAM,cAAA,CAAe,KAAA,EAAe,OAAA,EAAiB;AACnD,MAAA,MAAM,MAAA,GAAS,CAAA;;AAAA,QAAA,EAEX,KAAK,CAAA;;AAAA;AAAA;AAAA,EAIb,OAAO;AAAA;;AAAA;AAAA;;AAAA,sDAAA,CAAA;AAQH,MAAA,MAAM,MAAA,GAAS,MAAMG,iBAAA,CAAe;AAAA,QAClC,KAAA,EAAOH,SAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,WAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO;AAAA,QACL,KAAA,EAAO,OAAO,MAAA,CAAO,KAAA;AAAA,QACrB,SAAA,EAAW,OAAO,MAAA,CAAO;AAAA,OAC3B;AAAA,IACF;AAAA,GACF;AACF;;;AC7IO,SAAS,uBACd,MAAA,EACQ;AACR,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,CAAA;AAEhC,EAAA,MAAM,WAAA,GAAc,OAAO,MAAA,CAAO,CAAC,KAAK,CAAA,KAAM,GAAA,GAAM,CAAA,CAAE,MAAA,EAAQ,CAAC,CAAA;AAC/D,EAAA,IAAI,WAAA,KAAgB,GAAG,OAAO,CAAA;AAE9B,EAAA,MAAM,WAAA,GAAc,MAAA,CAAO,MAAA,CAAO,CAAC,GAAA,EAAK,CAAA,KAAM,GAAA,GAAM,CAAA,CAAE,KAAA,GAAQ,CAAA,CAAE,MAAA,EAAQ,CAAC,CAAA;AACzE,EAAA,OAAO,IAAA,CAAK,KAAA,CAAM,WAAA,GAAc,WAAW,CAAA;AAC7C;AAoBO,SAAS,QAAQ,OAAA,EAA2B;AACjD,EAAA,IAAI,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG,OAAO,CAAA;AACjC,EAAA,OAAO,OAAA,CAAQ,OAAO,CAAC,GAAA,EAAK,MAAM,GAAA,GAAM,CAAA,EAAG,CAAC,CAAA,GAAI,OAAA,CAAQ,MAAA;AAC1D;AAiDO,SAAS,KAAA,CAAM,KAAA,EAAe,GAAA,EAAa,GAAA,EAAqB;AACrE,EAAA,OAAO,KAAK,GAAA,CAAI,GAAA,EAAK,KAAK,GAAA,CAAI,GAAA,EAAK,KAAK,CAAC,CAAA;AAC3C;AAKO,SAAS,eAAe,KAAA,EAAuB;AACpD,EAAA,OAAO,MAAM,IAAA,CAAK,KAAA,CAAM,KAAK,CAAA,EAAG,GAAG,GAAG,CAAA;AACxC;;;ACxBA,SAAS,gBAAgB,IAAA,EAAsB;AAC7C,EAAA,OAAO,IAAA,CACJ,OAAA,CAAQ,UAAA,EAAY,KAAK,CAAA,CACzB,OAAA,CAAQ,IAAA,EAAM,CAAC,GAAA,KAAQ,GAAA,CAAI,WAAA,EAAa,EACxC,IAAA,EAAK;AACV;AA2DO,SAAS,eAAA,CACd,QACA,UAAA,GAAmD,EAAE,MAAM,EAAA,EAAI,UAAA,EAAY,IAAG,EACtE;AACR,EAAA,MAAM,OAAA,GAAU,MAAA,CAAO,OAAA,CAAQ,MAAM,CAAA;AACrC,EAAA,MAAM,QAAA,GAAW,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAA,EAAK,GAAG,KAA
K,CAAA,KAAM,GAAA,GAAM,KAAA,EAAO,CAAC,IAAI,OAAA,CAAQ,MAAA;AAE9E,EAAA,MAAM,cAAc,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAG,KAAK,CAAA,KAAM,KAAA,IAAS,UAAA,CAAW,IAAI,CAAA,CAAE,GAAA,CAAI,CAAC,CAAC,IAAI,MAAM,IAAI,CAAA;AAChG,EAAA,MAAM,cAAc,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAG,KAAK,CAAA,KAAM,KAAA,GAAQ,UAAA,CAAW,UAAU,CAAA,CAAE,GAAA,CAAI,CAAC,CAAC,IAAI,MAAM,IAAI,CAAA;AAErG,EAAA,MAAM,QAAkB,EAAC;AAEzB,EAAA,IAAI,QAAA,IAAY,WAAW,IAAA,EAAM;AAC/B,IAAA,KAAA,CAAM,KAAK,4BAA4B,CAAA;AAAA,EACzC,CAAA,MAAA,IAAW,QAAA,IAAY,UAAA,CAAW,UAAA,EAAY;AAC5C,IAAA,KAAA,CAAM,KAAK,kDAAkD,CAAA;AAAA,EAC/D,CAAA,MAAO;AACL,IAAA,KAAA,CAAM,KAAK,yCAAyC,CAAA;AAAA,EACtD;AAEA,EAAA,IAAI,WAAA,CAAY,SAAS,CAAA,EAAG;AAC1B,IAAA,KAAA,CAAM,IAAA,CAAK,WAAW,WAAA,CAAY,GAAA,CAAI,eAAe,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA,CAAE,CAAA;AAAA,EACrE;AAEA,EAAA,IAAI,WAAA,CAAY,SAAS,CAAA,EAAG;AAC1B,IAAA,KAAA,CAAM,IAAA,CAAK,sBAAsB,WAAA,CAAY,GAAA,CAAI,eAAe,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA,CAAE,CAAA;AAAA,EAChF;AAEA,EAAA,OAAO,KAAA,CAAM,IAAA,CAAK,IAAI,CAAA,GAAI,GAAA;AAC5B;;;ACrIA,eAAsB,cACpB,MAAA,EAC0B;AAC1B,EAAA,MAAM,SAAA,GAAY,KAAK,GAAA,EAAI;AAE3B,EAAA,IAAI,MAAA,CAAO,SAAA,CAAU,MAAA,KAAW,CAAA,EAAG;AACjC,IAAA,MAAM,IAAI,MAAM,kDAAkD,CAAA;AAAA,EACpE;AAEA,EAAA,MAAM,SAAS,eAAA,CAAgB;AAAA,IAC7B,cAAc,MAAA,CAAO,YAAA;AAAA,IACrB,KAAA,EAAO,OAAO,YAAA,EAAc,KAAA;AAAA,IAC5B,WAAA,EAAa,OAAO,YAAA,EAAc;AAAA,GACnC,CAAA;AAGD,EAAA,MAAM,kBAAoC,EAAC;AAE3C,EAAA,KAAA,MAAW,QAAA,IAAY,OAAO,SAAA,EAAW;AACvC,IAAA,MAAM,MAAA,GAAS,MAAM,WAAA,CAAY,QAAA,EAAU,QAAQ,MAAM,CAAA;AACzD,IAAA,eAAA,CAAgB,KAAK,MAAM,CAAA;AAAA,EAC7B;AAGA,EAAA,MAAM,kBAAkB,eAAA,CAAgB,MAAA,CAAO,CAAC,CAAA,KAAM,EAAE,MAAM,CAAA;AAC9D,EAAA,MAAM,WAAW,cAAA,CAAgB,eAAA,CAAgB,MAAA,GAAS,eAAA,CAAgB,SAAU,GAAG,CAAA;AAGvF,EAAA,MAAM,aAAA,GAAgB,gBAAgB,MAAA,CAAO,CAAC,MAAM,CAAA,CAAE,UAAA,CAAW,QAAA,CAAS,MAAA,GAAS,CAAC,CAAA;AACpF,EAAA,MAAM,kBAAkB,aAAA,CAAc,OAAA;AAAA,IAAQ,CAAC,CAAA,KAC7C,CAAA,CAAE,UAAA,CAAW,QAAA,CAAS,OAAO,CAAC,CAAA,KAAM,CAAA,CAAE,SAAA,KAAc,WAAW,CAAA,CAAE,GAAA,CAAI,CAAC,CAAA,KAAM,EAAE,KAAK;AAAA,GACrF;AACA,EAAA,MAAM,qBAAqB,aAAA,CAAc,OAAA;AAAA,IAA
Q,CAAC,CAAA,KAChD,CAAA,CAAE,UAAA,CAAW,QAAA,CAAS,OAAO,CAAC,CAAA,KAAM,CAAA,CAAE,SAAA,KAAc,cAAc,CAAA,CAAE,GAAA,CAAI,CAAC,CAAA,KAAM,EAAE,KAAK;AAAA,GACxF;AACA,EAAA,MAAM,oBAAoB,aAAA,CAAc,OAAA;AAAA,IAAQ,CAAC,CAAA,KAC/C,CAAA,CAAE,UAAA,CAAW,QAAA,CAAS,OAAO,CAAC,CAAA,KAAM,CAAA,CAAE,SAAA,KAAc,aAAa,CAAA,CAAE,GAAA,CAAI,CAAC,CAAA,KAAM,EAAE,KAAK;AAAA,GACvF;AAEA,EAAA,MAAM,SAAA,GAAY,gBAAgB,MAAA,GAAS,CAAA,GAAI,eAAe,OAAA,CAAQ,eAAe,CAAC,CAAA,GAAI,CAAA;AAC1F,EAAA,MAAM,YAAA,GAAe,mBAAmB,MAAA,GAAS,CAAA,GAAI,eAAe,OAAA,CAAQ,kBAAkB,CAAC,CAAA,GAAI,CAAA;AACnG,EAAA,MAAM,WAAA,GAAc,kBAAkB,MAAA,GAAS,CAAA,GAAI,eAAe,OAAA,CAAQ,iBAAiB,CAAC,CAAA,GAAI,CAAA;AAEhG,EAAA,MAAM,MAAA,GAAS,EAAE,QAAA,EAAU,SAAA,EAAW,cAAc,WAAA,EAAY;AAEhE,EAAA,MAAM,eAAe,sBAAA,CAAuB;AAAA,IAC1C,EAAE,KAAA,EAAO,QAAA,EAAU,MAAA,EAAQ,GAAA,EAAK;AAAA,IAChC,EAAE,KAAA,EAAO,SAAA,EAAW,MAAA,EAAQ,IAAA,EAAK;AAAA,IACjC,EAAE,KAAA,EAAO,YAAA,EAAc,MAAA,EAAQ,IAAA,EAAK;AAAA,IACpC,EAAE,KAAA,EAAO,WAAA,EAAa,MAAA,EAAQ,GAAA;AAAK,GACpC,CAAA;AAGD,EAAA,MAAM,QAAA,GAA2B;AAAA,IAC/B;AAAA,MACE,SAAA,EAAW,UAAA;AAAA,MACX,KAAA,EAAO,QAAA;AAAA,MACP,WAAW,CAAA,EAAG,eAAA,CAAgB,MAAM,CAAA,CAAA,EAAI,gBAAgB,MAAM,CAAA,kBAAA,CAAA;AAAA,MAC9D,UAAU,eAAA,CAAgB,MAAA,CAAO,CAAC,CAAA,KAAM,CAAC,EAAE,MAAM,CAAA,CAAE,GAAA,CAAI,CAAC,MAAM,CAAA,CAAE,QAAA,CAAS,IAAI,CAAA,CAAE,KAAA,CAAM,GAAG,CAAC;AAAA,KAC3F;AAAA,IACA;AAAA,MACE,SAAA,EAAW,WAAA;AAAA,MACX,KAAA,EAAO,SAAA;AAAA,MACP,SAAA,EAAW,CAAA,gCAAA,EAAmC,aAAA,CAAc,MAAM,CAAA,kBAAA;AAAA,KACpE;AAAA,IACA;AAAA,MACE,SAAA,EAAW,cAAA;AAAA,MACX,KAAA,EAAO,YAAA;AAAA,MACP,SAAA,EAAW,CAAA,mCAAA,EAAsC,aAAA,CAAc,MAAM,CAAA,kBAAA;AAAA,KACvE;AAAA,IACA;AAAA,MACE,SAAA,EAAW,aAAA;AAAA,MACX,KAAA,EAAO,WAAA;AAAA,MACP,SAAA,EAAW,CAAA,2BAAA,EAA8B,aAAA,CAAc,MAAM,CAAA,kBAAA;AAAA;AAC/D,GACF;AAEA,EAAA,OAAO;AAAA,IACL,QAAQ,YAAA,IAAgB,EAAA;AAAA,IACxB,YAAA;AAAA,IACA,MAAA;AAAA,IACA,QAAA;AAAA,IACA,OAAA,EAAS,gBAAgB,MAAM,CAAA;AAAA,IAC/B,QAAA,EAAU,IAAA,CAAK,GAAA,EAAI,GAAI,SAAA;AAAA,IACvB;AAAA,GACF;AACF;AASA,eAAe,WAAA,CACb,QAAA,EACA,MAAA,EACA,MAAA,EACyB;AACzB,EAAA,MAAM,SAAA,GAAY,KAAK,GAAA,EAAI;AA
C3B,EAAA,MAAM,QAAA,GAAW,QAAA,CAAS,QAAA,IAAY,MAAA,CAAO,QAAA,IAAY,CAAA;AACzD,EAAA,MAAM,OAAA,GAAU,OAAO,OAAA,IAAW,GAAA;AAElC,EAAA,MAAM,eAAmC,EAAC;AAC1C,EAAA,IAAI,iBAAiB,QAAA,CAAS,YAAA;AAC9B,EAAA,IAAI,OAAA,GAAwB;AAAA,IAC1B,SAAA,EAAW,CAAA,KAAA,EAAQ,IAAA,CAAK,GAAA,EAAK,CAAA,CAAA;AAAA,IAC7B,qBAAqB;AAAC,GACxB;AACA,EAAA,IAAI,YAAA,GAAqC,IAAA;AACzC,EAAA,IAAI,IAAA,GAAO,CAAA;AAEX,EAAA,IAAI;AACF,IAAA,OAAO,OAAO,QAAA,EAAU;AACtB,MAAA,IAAA,EAAA;AAGA,MAAA,MAAM,WAAW,MAAM,SAAA,CAAU,OAAO,aAAA,EAAe,cAAA,EAAgB,SAAS,OAAO,CAAA;AACvF,MAAA,YAAA,GAAe,QAAA;AAGf,MAAA,YAAA,CAAa,IAAA,CAAK;AAAA,QAChB,IAAA;AAAA,QACA,WAAA,EAAa,cAAA;AAAA,QACb,aAAA,EAAe;AAAA,OAChB,CAAA;AAGD,MAAA,OAAA,GAAU;AAAA,QACR,GAAG,OAAA;AAAA,QACH,iBAAiB,QAAA,CAAS,OAAA;AAAA,QAC1B,gBAAgB,QAAA,CAAS,MAAA;AAAA,QACzB,mBAAA,EAAqB;AAAA,UACnB,GAAI,OAAA,CAAQ,mBAAA,IAAuB,EAAC;AAAA,UACpC,EAAE,IAAA,EAAM,MAAA,EAAiB,OAAA,EAAS,cAAA,EAAe;AAAA,UACjD,EAAE,MAAM,WAAA,EAAsB,OAAA,EAAS,SAAS,MAAA,IAAU,QAAA,CAAS,oBAAoB,EAAA;AAAG;AAC5F,OACF;AAGA,MAAA,IAAI,QAAA,CAAS,YAAY,QAAA,EAAU;AACjC,QAAA;AAAA,MACF;AAEA,MAAA,IAAI,QAAA,CAAS,OAAA,KAAY,SAAA,IAAa,QAAA,CAAS,YAAY,cAAA,EAAgB;AACzE,QAAA;AAAA,MACF;AAEA,MAAA,IAAI,QAAA,CAAS,YAAY,WAAA,EAAa;AAEpC,QAAA,MAAM,gBAAA,GAAmB,SAAS,gBAAA,IAAoB,EAAA;AACtD,QAAA,MAAM,kBAAA,GAAqB,wBAAA,CAAyB,gBAAA,EAAkB,QAAQ,CAAA;AAE9E,QAAA,IAAI,CAAC,kBAAA,EAAoB;AAEvB,UAAA;AAAA,QACF;AAEA,QAAA,cAAA,GAAiB,kBAAA;AAAA,MACnB;AAAA,IACF;AAGA,IAAA,MAAM,aAAa,MAAM,sBAAA;AAAA,MACvB,QAAA;AAAA,MACA,YAAA;AAAA,MACA,YAAA;AAAA,MACA;AAAA,KACF;AAGA,IAAA,MAAM,eAAe,QAAA,CAAS,eAAA,GAC1B,YAAA,EAAc,OAAA,KAAY,SAAS,eAAA,GACnC,IAAA;AAEJ,IAAA,MAAM,MAAA,GAAS,gBAAgB,UAAA,CAAW,MAAA;AAE1C,IAAA,OAAO;AAAA,MACL,QAAA;AAAA,MACA,MAAA;AAAA,MACA,KAAA,EAAO,IAAA;AAAA,MACP,YAAA,EAAc,cAAc,OAAA,IAAW,OAAA;AAAA,MACvC,aAAa,YAAA,EAAc,OAAA,KAAY,QAAA,GAAW,YAAA,CAAa,UAAU,IAAA,GAAO,IAAA;AAAA,MAChF,UAAA;AAAA,MACA,YAAA;AAAA,MACA,QAAA,EAAU,IAAA,CAAK,GAAA,EAAI,GAAI;AAAA,KACzB;AAAA,EACF,SAAS,KAAA,EAAO;AACd,IAAA,OAAO;AAAA,MACL,QAAA;AAAA,MACA,MAAA,EAAQ,KAAA;AAAA,MACR,KAAA,EAAO,IAAA;AAAA,MACP,YAAA,EAAc,OAAA;A
AAA,MACd,WAAA,EAAa,IAAA;AAAA,MACb,UAAA,EAAY;AAAA,QACV,MAAA,EAAQ,KAAA;AAAA,QACR,KAAA,EAAO,CAAA;AAAA,QACP,QAAA,EAAU;AAAA,UACR;AAAA,YACE,SAAA,EAAW,OAAA;AAAA,YACX,KAAA,EAAO,CAAA;AAAA,YACP,WAAW,KAAA,YAAiB,KAAA,GAAQ,KAAA,CAAM,OAAA,GAAU,OAAO,KAAK;AAAA;AAClE;AACF,OACF;AAAA,MACA,YAAA;AAAA,MACA,QAAA,EAAU,IAAA,CAAK,GAAA,EAAI,GAAI,SAAA;AAAA,MACvB,OAAO,KAAA,YAAiB,KAAA,GAAQ,KAAA,CAAM,OAAA,GAAU,OAAO,KAAK;AAAA,KAC9D;AAAA,EACF;AACF;AASA,eAAe,SAAA,CACb,QAAA,EACA,OAAA,EACA,OAAA,EACA,OAAA,EACwB;AACxB,EAAA,IAAI,OAAO,aAAa,UAAA,EAAY;AAClC,IAAA,OAAO,QAAA,CAAS,SAAS,OAAO,CAAA;AAAA,EAClC;AAGA,EAAA,MAAM,UAAA,GAAa,IAAI,eAAA,EAAgB;AACvC,EAAA,MAAM,YAAY,UAAA,CAAW,MAAM,UAAA,CAAW,KAAA,IAAS,OAAO,CAAA;AAE9D,EAAA,IAAI;AACF,IAAA,MAAM,QAAA,GAAW,MAAM,KAAA,CAAM,QAAA,EAAU;AAAA,MACrC,MAAA,EAAQ,MAAA;AAAA,MACR,OAAA,EAAS,EAAE,cAAA,EAAgB,kBAAA,EAAmB;AAAA,MAC9C,IAAA,EAAM,KAAK,SAAA,CAAU;AAAA,QACnB,OAAA;AAAA,QACA,YAAY,OAAA,CAAQ,SAAA;AAAA,QACpB,kBAAkB,OAAA,CAAQ,eAAA;AAAA,QAC1B,iBAAiB,OAAA,CAAQ,cAAA;AAAA,QACzB,sBAAsB,OAAA,CAAQ;AAAA,OAC/B,CAAA;AAAA,MACD,QAAQ,UAAA,CAAW;AAAA,KACpB,CAAA;AAED,IAAA,IAAI,CAAC,SAAS,EAAA,EAAI;AAChB,MAAA,MAAM,IAAI,MAAM,CAAA,eAAA,EAAkB,QAAA,CAAS,MAAM,CAAA,EAAA,EAAK,QAAA,CAAS,UAAU,CAAA,CAAE,CAAA;AAAA,IAC7E;AAEA,IAAA,MAAM,IAAA,GAAO,MAAM,QAAA,CAAS,IAAA,EAAK;AAEjC,IAAA,OAAO;AAAA,MACL,OAAA,EAAU,KAAK,OAAA,IAAwC,QAAA;AAAA,MACvD,QAAQ,IAAA,CAAK,MAAA;AAAA,MACb,gBAAA,EAAmB,IAAA,CAAK,gBAAA,IAAoB,IAAA,CAAK,kBAAA;AAAA,MACjD,SAAS,IAAA,CAAK,OAAA;AAAA,MACd,SAAS,IAAA,CAAK,OAAA;AAAA,MACd,QAAQ,IAAA,CAAK,MAAA;AAAA,MACb,OAAO,IAAA,CAAK,KAAA;AAAA,MACZ,SAAA,EAAY,IAAA,CAAK,UAAA,IAAc,IAAA,CAAK;AAAA,KACtC;AAAA,EACF,CAAA,SAAE;AACA,IAAA,YAAA,CAAa,SAAS,CAAA;AAAA,EACxB;AACF;AASA,SAAS,wBAAA,CACP,UACA,QAAA,EACe;AACf,EAAA,IAAI,CAAC,SAAS,iBAAA,EAAmB;AAC/B,IAAA,OAAO,IAAA;AAAA,EACT;AAGA,EAAA,KAAA,MAAW,CAAC,SAAS,QAAQ,CAAA,IAAK,OAAO,OAAA,CAAQ,QAAA,CAAS,iBAAiB,CAAA,EAAG;AAC5E,IAAA,IAAI,SAAS,WAAA,EAAY,CAAE,SAAS,OAAA,CAAQ,WAAA,EAAa,CAAA,EAAG;AAC1D,MAAA,OAAO,QAAA;AAAA,IACT;AAAA,EACF;AAEA,EAAA,OAAO,IAAA;AACT;AASA,eAAe,sBAAA,CACb,QAAA,EACA
,QAAA,EACA,YAAA,EACA,MAAA,EACuE;AACvE,EAAA,MAAM,WAA2B,EAAC;AAElC,EAAA,IAAI,CAAC,QAAA,IAAY,QAAA,CAAS,YAAY,QAAA,IAAY,CAAC,SAAS,MAAA,EAAQ;AAElE,IAAA,IAAI,QAAA,CAAS,eAAA,IAAmB,QAAA,CAAS,eAAA,KAAoB,QAAA,EAAU;AAErE,MAAA,OAAO;AAAA,QACL,MAAA,EAAQ,QAAA,EAAU,OAAA,KAAY,QAAA,CAAS,eAAA;AAAA,QACvC,KAAA,EAAO,QAAA,EAAU,OAAA,KAAY,QAAA,CAAS,kBAAkB,GAAA,GAAM,CAAA;AAAA,QAC9D,QAAA,EAAU;AAAA,UACR;AAAA,YACE,SAAA,EAAW,cAAA;AAAA,YACX,KAAA,EAAO,QAAA,EAAU,OAAA,KAAY,QAAA,CAAS,kBAAkB,GAAA,GAAM,CAAA;AAAA,YAC9D,WAAW,CAAA,SAAA,EAAY,QAAA,CAAS,eAAe,CAAA,MAAA,EAAS,QAAA,EAAU,WAAW,aAAa,CAAA,CAAA;AAAA;AAC5F;AACF,OACF;AAAA,IACF;AAEA,IAAA,OAAO;AAAA,MACL,MAAA,EAAQ,KAAA;AAAA,MACR,KAAA,EAAO,CAAA;AAAA,MACP,QAAA,EAAU;AAAA,QACR;AAAA,UACE,SAAA,EAAW,UAAA;AAAA,UACX,KAAA,EAAO,CAAA;AAAA,UACP,SAAA,EAAW,CAAA,2BAAA,EAA8B,QAAA,EAAU,OAAA,IAAW,aAAa,CAAA,CAAA;AAAA;AAC7E;AACF,KACF;AAAA,EACF;AAEA,EAAA,MAAM,SAAS,QAAA,CAAS,MAAA;AACxB,EAAA,IAAI,UAAA,GAAa,CAAA;AACjB,EAAA,IAAI,aAAA,GAAgB,CAAA;AAGpB,EAAA,IAAI,QAAA,CAAS,WAAW,WAAA,EAAa;AACnC,IAAA,MAAM,KAAA,GAAQ,QAAA,CAAS,UAAA,CAAW,WAAA,CAAY,MAAA;AAAA,MAAO,CAAC,MACpD,MAAA,CAAO,WAAA,GAAc,QAAA,CAAS,CAAA,CAAE,aAAa;AAAA,KAC/C;AACA,IAAA,MAAM,KAAA,GAAQ,eAAgB,KAAA,CAAM,MAAA,GAAS,SAAS,UAAA,CAAW,WAAA,CAAY,SAAU,GAAG,CAAA;AAC1F,IAAA,UAAA,IAAc,KAAA;AACd,IAAA,aAAA,EAAA;AAEA,IAAA,QAAA,CAAS,IAAA,CAAK;AAAA,MACZ,SAAA,EAAW,aAAA;AAAA,MACX,KAAA;AAAA,MACA,SAAA,EAAW,SAAS,KAAA,CAAM,MAAM,IAAI,QAAA,CAAS,UAAA,CAAW,YAAY,MAAM,CAAA,gBAAA,CAAA;AAAA,MAC1E,QAAA,EAAU,QAAA,CAAS,UAAA,CAAW,WAAA,CAAY,MAAA;AAAA,QACxC,CAAC,MAAM,CAAC,MAAA,CAAO,aAAY,CAAE,QAAA,CAAS,CAAA,CAAE,WAAA,EAAa;AAAA;AACvD,KACD,CAAA;AAAA,EACH;AAGA,EAAA,IAAI,QAAA,CAAS,WAAW,cAAA,EAAgB;AACtC,IAAA,MAAM,KAAA,GAAQ,QAAA,CAAS,UAAA,CAAW,cAAA,CAAe,MAAA;AAAA,MAAO,CAAC,MACvD,MAAA,CAAO,WAAA,GAAc,QAAA,CAAS,CAAA,CAAE,aAAa;AAAA,KAC/C;AACA,IAAA,MAAM,KAAA,GAAQ,cAAA,CAAA,CAAiB,QAAA,CAAS,UAAA,CAAW,cAAA,CAAe,MAAA,GAAS,KAAA,CAAM,MAAA,IAAU,QAAA,CAAS,UAAA,CAAW,cAAA,CAAe,MAAA,GAAU,GAAG,CAAA;AAC3I,IAAA,UAAA,IAAc,KAAA;AACd,IAAA,aAAA,EAAA;AAEA,IAAA,QAAA,CAAS,IAAA,CAAK;AAAA,MACZ,S
AAA,EAAW,gBAAA;AAAA,MACX,KAAA;AAAA,MACA,SAAA,EAAW,CAAA,MAAA,EAAS,KAAA,CAAM,MAAM,CAAA,iBAAA,CAAA;AAAA,MAChC,QAAA,EAAU;AAAA,KACX,CAAA;AAAA,EACH;AAGA,EAAA,MAAM,aAAA,GAAgB;AAAA,OAAA,EACf,SAAS,YAAY;AAAA,QAAA,EACpB,MAAM;AAAA,EACd,QAAA,CAAS,WAAW,MAAA,GAAS,CAAA,QAAA,EAAW,SAAS,UAAA,CAAW,MAAM,KAAK,EAAE;AAAA,CAAA;AAIzE,EAAA,MAAM,eAAA,GAAkB,MAAM,MAAA,CAAO,KAAA;AAAA,IACnC,aAAA;AAAA,IACA,WAAA;AAAA,IACA;AAAA,GACF;AACA,EAAA,QAAA,CAAS,IAAA,CAAK;AAAA,IACZ,SAAA,EAAW,WAAA;AAAA,IACX,OAAO,eAAA,CAAgB,KAAA;AAAA,IACvB,WAAW,eAAA,CAAgB;AAAA,GAC5B,CAAA;AACD,EAAA,UAAA,IAAc,eAAA,CAAgB,KAAA;AAC9B,EAAA,aAAA,EAAA;AAGA,EAAA,MAAM,kBAAA,GAAqB,MAAM,MAAA,CAAO,KAAA;AAAA,IACtC,aAAA;AAAA,IACA,cAAA;AAAA,IACA;AAAA,GACF;AACA,EAAA,QAAA,CAAS,IAAA,CAAK;AAAA,IACZ,SAAA,EAAW,cAAA;AAAA,IACX,OAAO,kBAAA,CAAmB,KAAA;AAAA,IAC1B,WAAW,kBAAA,CAAmB;AAAA,GAC/B,CAAA;AACD,EAAA,UAAA,IAAc,kBAAA,CAAmB,KAAA;AACjC,EAAA,aAAA,EAAA;AAGA,EAAA,MAAM,iBAAA,GAAoB,MAAM,MAAA,CAAO,KAAA;AAAA,IACrC,aAAA;AAAA,IACA,aAAA;AAAA,IACA;AAAA,GACF;AACA,EAAA,QAAA,CAAS,IAAA,CAAK;AAAA,IACZ,SAAA,EAAW,aAAA;AAAA,IACX,OAAO,iBAAA,CAAkB,KAAA;AAAA,IACzB,WAAW,iBAAA,CAAkB;AAAA,GAC9B,CAAA;AACD,EAAA,UAAA,IAAc,iBAAA,CAAkB,KAAA;AAChC,EAAA,aAAA,EAAA;AAEA,EAAA,MAAM,QAAA,GAAW,aAAA,GAAgB,CAAA,GAAI,UAAA,GAAa,aAAA,GAAgB,CAAA;AAElE,EAAA,OAAO;AAAA,IACL,QAAQ,QAAA,IAAY,EAAA;AAAA,IACpB,KAAA,EAAO,eAAe,QAAQ,CAAA;AAAA,IAC9B;AAAA,GACF;AACF","file":"index.cjs","sourcesContent":["/**\n * LLM Grading Utilities\n *\n * Uses OpenAI to grade content quality, relevance, and other metrics.\n */\n\nimport { createOpenAI } from '@ai-sdk/openai';\nimport { generateObject } from 'ai';\nimport { z } from 'zod';\nimport { resolveDefaultOpenAiChatModelId } from '@kat/core';\nimport type { LLMGraderConfig, EvalCriterion, EvalEvidence } from '../types.js';\n\n// ============================================================================\n// GRADING SCHEMAS\n// ============================================================================\n\nconst GradeSchema = z.object({\n score: 
z.number().min(0).max(100).describe('Score from 0-100'),\n reasoning: z.string().describe('Explanation for the score'),\n examples: z.array(z.string()).optional().describe('Specific examples that influenced the score'),\n});\n\nconst MultiCriteriaGradeSchema = z.object({\n scores: z.array(z.object({\n criterion: z.string(),\n score: z.number().min(0).max(100),\n reasoning: z.string(),\n })),\n overallReasoning: z.string(),\n});\n\n// ============================================================================\n// GRADER FACTORY\n// ============================================================================\n\nexport interface LLMGrader {\n /**\n * Grade content against a single criterion.\n */\n grade(content: string, criterion: string, rubric?: string): Promise<{\n score: number;\n reasoning: string;\n examples?: string[];\n }>;\n\n /**\n * Grade content against multiple criteria.\n */\n gradeMultiple(content: string, criteria: EvalCriterion[]): Promise<EvalEvidence[]>;\n\n /**\n * Grade relevance of content to a query.\n */\n gradeRelevance(query: string, content: string): Promise<{\n score: number;\n reasoning: string;\n }>;\n}\n\n/**\n * Create an LLM grader with the given configuration.\n */\nexport function createLLMGrader(config: LLMGraderConfig = {}): LLMGrader {\n const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;\n if (!apiKey) {\n throw new Error('OPENAI_API_KEY is required for LLM grading');\n }\n\n const openai = createOpenAI({ apiKey });\n const model = config.model || resolveDefaultOpenAiChatModelId();\n const temperature = config.temperature ?? 0.1;\n\n return {\n async grade(content: string, criterion: string, rubric?: string) {\n const prompt = `You are an expert evaluator. Grade the following content on the criterion \"${criterion}\".\n\n${rubric ? 
`Rubric: ${rubric}\\n` : ''}\nContent to evaluate:\n\"\"\"\n${content}\n\"\"\"\n\nProvide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: GradeSchema,\n prompt,\n temperature,\n });\n\n return result.object;\n },\n\n async gradeMultiple(content: string, criteria: EvalCriterion[]) {\n const criteriaDescription = criteria\n .map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`)\n .join('\\n');\n\n const prompt = `You are an expert evaluator. Grade the following content on multiple criteria.\n\nCriteria:\n${criteriaDescription}\n\nContent to evaluate:\n\"\"\"\n${content}\n\"\"\"\n\nFor each criterion, provide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: MultiCriteriaGradeSchema,\n prompt,\n temperature,\n });\n\n return result.object.scores.map((s) => ({\n criterion: s.criterion,\n score: s.score,\n reasoning: s.reasoning,\n }));\n },\n\n async gradeRelevance(query: string, content: string) {\n const prompt = `You are an expert evaluator. 
Grade how relevant the following content is to the given query.\n\nQuery: \"${query}\"\n\nContent:\n\"\"\"\n${content}\n\"\"\"\n\nA score of 100 means the content directly and completely answers the query.\nA score of 0 means the content is completely irrelevant.\n\nProvide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: GradeSchema,\n prompt,\n temperature,\n });\n\n return {\n score: result.object.score,\n reasoning: result.object.reasoning,\n };\n },\n };\n}\n\n// ============================================================================\n// CONVENIENCE FUNCTIONS\n// ============================================================================\n\n/**\n * Grade content using a one-off grader instance.\n */\nexport async function gradeWithLLM(\n content: string,\n criterion: string,\n config: LLMGraderConfig = {}\n): Promise<{ score: number; reasoning: string; examples?: string[] }> {\n const grader = createLLMGrader(config);\n return grader.grade(content, criterion);\n}\n\n/**\n * Grade relevance using a one-off grader instance.\n */\nexport async function gradeRelevanceWithLLM(\n query: string,\n content: string,\n config: LLMGraderConfig = {}\n): Promise<{ score: number; reasoning: string }> {\n const grader = createLLMGrader(config);\n return grader.gradeRelevance(query, content);\n}\n","/**\n * Metric Calculation Helpers\n *\n * Pure functions for calculating scores and metrics.\n */\n\n/**\n * Calculate a weighted score from individual scores and weights.\n *\n * @param scores - Array of { score, weight } objects\n * @returns Weighted average score (0-100)\n */\nexport function calculateWeightedScore(\n scores: Array<{ score: number; weight: number }>\n): number {\n if (scores.length === 0) return 0;\n\n const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);\n if (totalWeight === 0) return 0;\n\n const weightedSum = scores.reduce((sum, s) => sum + s.score * s.weight, 
0);\n return Math.round(weightedSum / totalWeight);\n}\n\n/**\n * Calculate percentage of found items vs expected items.\n *\n * @param found - Number of items found\n * @param expected - Number of items expected\n * @returns Percentage (0-100)\n */\nexport function calculatePercentage(found: number, expected: number): number {\n if (expected === 0) return 100; // Nothing expected, consider it perfect\n return Math.round((found / expected) * 100);\n}\n\n/**\n * Calculate the average of an array of numbers.\n *\n * @param numbers - Array of numbers\n * @returns Average value\n */\nexport function average(numbers: number[]): number {\n if (numbers.length === 0) return 0;\n return numbers.reduce((sum, n) => sum + n, 0) / numbers.length;\n}\n\n/**\n * Calculate precision: true positives / (true positives + false positives).\n *\n * @param truePositives - Number of correct positive predictions\n * @param falsePositives - Number of incorrect positive predictions\n * @returns Precision (0-100)\n */\nexport function calculatePrecision(\n truePositives: number,\n falsePositives: number\n): number {\n const total = truePositives + falsePositives;\n if (total === 0) return 100;\n return Math.round((truePositives / total) * 100);\n}\n\n/**\n * Calculate recall: true positives / (true positives + false negatives).\n *\n * @param truePositives - Number of correct positive predictions\n * @param falseNegatives - Number of missed positive predictions\n * @returns Recall (0-100)\n */\nexport function calculateRecall(\n truePositives: number,\n falseNegatives: number\n): number {\n const total = truePositives + falseNegatives;\n if (total === 0) return 100;\n return Math.round((truePositives / total) * 100);\n}\n\n/**\n * Calculate F1 score: harmonic mean of precision and recall.\n *\n * @param precision - Precision value (0-100)\n * @param recall - Recall value (0-100)\n * @returns F1 score (0-100)\n */\nexport function calculateF1(precision: number, recall: number): number {\n if 
(precision + recall === 0) return 0;\n return Math.round((2 * precision * recall) / (precision + recall));\n}\n\n/**\n * Clamp a value between min and max.\n */\nexport function clamp(value: number, min: number, max: number): number {\n return Math.max(min, Math.min(max, value));\n}\n\n/**\n * Normalize a score to 0-100 range.\n */\nexport function normalizeScore(score: number): number {\n return clamp(Math.round(score), 0, 100);\n}\n","/**\n * Report Formatting Utilities\n *\n * Format eval results for different output targets.\n */\n\nimport type { EvalResult, ReportOptions, EvalEvidence } from '../types.js';\n\n// ============================================================================\n// CONSOLE REPORTER\n// ============================================================================\n\n/**\n * Format an eval result for console output.\n */\nexport function formatConsoleReport(\n result: EvalResult,\n options: Partial<ReportOptions> = {}\n): string {\n const lines: string[] = [];\n const { includeEvidence = true } = options;\n\n // Header\n const status = result.passed ? '✓ PASSED' : '✗ FAILED';\n const statusColor = result.passed ? 
'\\x1b[32m' : '\\x1b[31m';\n const reset = '\\x1b[0m';\n\n lines.push('');\n lines.push('═'.repeat(60));\n lines.push(`${statusColor}${status}${reset} - Overall Score: ${result.overallScore}/100`);\n lines.push('═'.repeat(60));\n\n // Summary\n lines.push('');\n lines.push(`Summary: ${result.summary}`);\n lines.push(`Duration: ${result.duration}ms`);\n\n // Individual scores\n lines.push('');\n lines.push('Scores:');\n for (const [name, score] of Object.entries(result.scores)) {\n const bar = createProgressBar(score, 20);\n const formattedName = formatScoreName(name);\n lines.push(` ${formattedName.padEnd(20)} ${bar} ${score}/100`);\n }\n\n // Evidence (if requested)\n if (includeEvidence && result.evidence.length > 0) {\n lines.push('');\n lines.push('Evidence:');\n for (const evidence of result.evidence) {\n lines.push(` [${evidence.criterion}] (${evidence.score}/100)`);\n lines.push(` ${evidence.reasoning}`);\n if (evidence.examples && evidence.examples.length > 0) {\n for (const example of evidence.examples.slice(0, 3)) {\n lines.push(` - ${example}`);\n }\n }\n }\n }\n\n lines.push('');\n lines.push('─'.repeat(60));\n\n return lines.join('\\n');\n}\n\n/**\n * Create a text progress bar.\n */\nfunction createProgressBar(value: number, width: number): string {\n const filled = Math.round((value / 100) * width);\n const empty = width - filled;\n return `[${'█'.repeat(filled)}${'░'.repeat(empty)}]`;\n}\n\n/**\n * Format a score name for display (camelCase -> Title Case).\n */\nfunction formatScoreName(name: string): string {\n return name\n .replace(/([A-Z])/g, ' $1')\n .replace(/^./, (str) => str.toUpperCase())\n .trim();\n}\n\n// ============================================================================\n// JSON REPORTER\n// ============================================================================\n\n/**\n * Format an eval result as JSON.\n */\nexport function formatJsonReport(\n result: EvalResult,\n options: Partial<ReportOptions> = {}\n): string {\n 
const { includeEvidence = true, includeRawData = false } = options;\n\n const output: Record<string, unknown> = {\n passed: result.passed,\n overallScore: result.overallScore,\n scores: result.scores,\n summary: result.summary,\n duration: result.duration,\n };\n\n if (includeEvidence) {\n output.evidence = result.evidence;\n }\n\n // Include any additional properties from extended result types\n for (const [key, value] of Object.entries(result)) {\n if (\n !['passed', 'overallScore', 'scores', 'evidence', 'summary', 'duration'].includes(key) &&\n (includeRawData || !isRawData(value))\n ) {\n output[key] = value;\n }\n }\n\n return JSON.stringify(output, null, 2);\n}\n\n/**\n * Check if a value looks like raw data (large arrays/objects).\n */\nfunction isRawData(value: unknown): boolean {\n if (Array.isArray(value) && value.length > 10) return true;\n if (typeof value === 'object' && value !== null) {\n const keys = Object.keys(value);\n if (keys.length > 20) return true;\n }\n return false;\n}\n\n// ============================================================================\n// SUMMARY GENERATION\n// ============================================================================\n\n/**\n * Generate a human-readable summary from scores.\n */\nexport function generateSummary(\n scores: Record<string, number>,\n thresholds: { good: number; acceptable: number } = { good: 80, acceptable: 60 }\n): string {\n const entries = Object.entries(scores);\n const avgScore = entries.reduce((sum, [, score]) => sum + score, 0) / entries.length;\n\n const goodMetrics = entries.filter(([, score]) => score >= thresholds.good).map(([name]) => name);\n const poorMetrics = entries.filter(([, score]) => score < thresholds.acceptable).map(([name]) => name);\n\n const parts: string[] = [];\n\n if (avgScore >= thresholds.good) {\n parts.push('Strong overall performance');\n } else if (avgScore >= thresholds.acceptable) {\n parts.push('Acceptable performance with room for improvement');\n } 
else {\n parts.push('Performance below acceptable thresholds');\n }\n\n if (goodMetrics.length > 0) {\n parts.push(`Strong: ${goodMetrics.map(formatScoreName).join(', ')}`);\n }\n\n if (poorMetrics.length > 0) {\n parts.push(`Needs improvement: ${poorMetrics.map(formatScoreName).join(', ')}`);\n }\n\n return parts.join('. ') + '.';\n}\n\n// ============================================================================\n// PRINT HELPERS\n// ============================================================================\n\n/**\n * Print an eval result to the console.\n */\nexport function printReport(result: EvalResult, options: Partial<ReportOptions> = {}): void {\n const format = options.format || 'console';\n\n if (format === 'json') {\n console.log(formatJsonReport(result, options));\n } else {\n console.log(formatConsoleReport(result, options));\n }\n}\n","/**\n * Agent Eval - Layer 3\n *\n * Evaluates end-to-end agent behavior by running multi-turn\n * conversation scenarios and grading the responses.\n */\n\nimport { createLLMGrader } from '../utils/llm-grader.js';\nimport { average, normalizeScore, calculateWeightedScore } from '../utils/metrics.js';\nimport { generateSummary } from '../utils/reporters.js';\nimport type { EvalEvidence } from '../types.js';\nimport type {\n AgentEvalConfig,\n AgentEvalResult,\n AgentTestScenario,\n ScenarioResult,\n ConversationTurn,\n AgentResponse,\n AgentContext,\n AgentFunction,\n} from './types.js';\n\nexport type {\n AgentEvalConfig,\n AgentEvalResult,\n AgentTestScenario,\n ScenarioEvaluation,\n ScenarioResult,\n ConversationTurn,\n AgentResponse,\n} from './types.js';\n\n// ============================================================================\n// MAIN EVALUATION FUNCTION\n// ============================================================================\n\n/**\n * Evaluate agent behavior by running test scenarios.\n */\nexport async function evaluateAgent(\n config: AgentEvalConfig\n): Promise<AgentEvalResult> {\n const 
startTime = Date.now();\n\n if (config.scenarios.length === 0) {\n throw new Error('At least one scenario is required for agent eval');\n }\n\n const grader = createLLMGrader({\n openaiApiKey: config.openaiApiKey,\n model: config.graderConfig?.model,\n temperature: config.graderConfig?.temperature,\n });\n\n // Run each scenario\n const scenarioResults: ScenarioResult[] = [];\n\n for (const scenario of config.scenarios) {\n const result = await runScenario(scenario, config, grader);\n scenarioResults.push(result);\n }\n\n // Calculate aggregate scores\n const passedScenarios = scenarioResults.filter((r) => r.passed);\n const accuracy = normalizeScore((passedScenarios.length / scenarioResults.length) * 100);\n\n // Calculate average scores from graded scenarios\n const gradedResults = scenarioResults.filter((r) => r.evaluation.evidence.length > 0);\n const relevanceScores = gradedResults.flatMap((r) =>\n r.evaluation.evidence.filter((e) => e.criterion === 'relevance').map((e) => e.score)\n );\n const completenessScores = gradedResults.flatMap((r) =>\n r.evaluation.evidence.filter((e) => e.criterion === 'completeness').map((e) => e.score)\n );\n const helpfulnessScores = gradedResults.flatMap((r) =>\n r.evaluation.evidence.filter((e) => e.criterion === 'helpfulness').map((e) => e.score)\n );\n\n const relevance = relevanceScores.length > 0 ? normalizeScore(average(relevanceScores)) : 0;\n const completeness = completenessScores.length > 0 ? normalizeScore(average(completenessScores)) : 0;\n const helpfulness = helpfulnessScores.length > 0 ? 
normalizeScore(average(helpfulnessScores)) : 0;\n\n const scores = { accuracy, relevance, completeness, helpfulness };\n\n const overallScore = calculateWeightedScore([\n { score: accuracy, weight: 0.30 },\n { score: relevance, weight: 0.25 },\n { score: completeness, weight: 0.25 },\n { score: helpfulness, weight: 0.20 },\n ]);\n\n // Build evidence\n const evidence: EvalEvidence[] = [\n {\n criterion: 'accuracy',\n score: accuracy,\n reasoning: `${passedScenarios.length}/${scenarioResults.length} scenarios passed.`,\n examples: scenarioResults.filter((r) => !r.passed).map((r) => r.scenario.name).slice(0, 3),\n },\n {\n criterion: 'relevance',\n score: relevance,\n reasoning: `Average answer relevance across ${gradedResults.length} graded scenarios.`,\n },\n {\n criterion: 'completeness',\n score: completeness,\n reasoning: `Average answer completeness across ${gradedResults.length} graded scenarios.`,\n },\n {\n criterion: 'helpfulness',\n score: helpfulness,\n reasoning: `Average helpfulness across ${gradedResults.length} graded scenarios.`,\n },\n ];\n\n return {\n passed: overallScore >= 70,\n overallScore,\n scores,\n evidence,\n summary: generateSummary(scores),\n duration: Date.now() - startTime,\n scenarioResults,\n };\n}\n\n// ============================================================================\n// SCENARIO RUNNER\n// ============================================================================\n\n/**\n * Run a single scenario and evaluate the result.\n */\nasync function runScenario(\n scenario: AgentTestScenario,\n config: AgentEvalConfig,\n grader: ReturnType<typeof createLLMGrader>\n): Promise<ScenarioResult> {\n const startTime = Date.now();\n const maxTurns = scenario.maxTurns || config.maxTurns || 5;\n const timeout = config.timeout || 60000;\n\n const conversation: ConversationTurn[] = [];\n let currentMessage = scenario.initialQuery;\n let context: AgentContext = {\n sessionId: `eval_${Date.now()}`,\n conversationHistory: [],\n };\n let 
lastResponse: AgentResponse | null = null;\n let turn = 0;\n\n try {\n while (turn < maxTurns) {\n turn++;\n\n // Call the agent\n const response = await callAgent(config.agentEndpoint, currentMessage, context, timeout);\n lastResponse = response;\n\n // Record the conversation\n conversation.push({\n turn,\n userMessage: currentMessage,\n agentResponse: response,\n });\n\n // Update context\n context = {\n ...context,\n previousContext: response.context,\n previousIntent: response.intent,\n conversationHistory: [\n ...(context.conversationHistory || []),\n { role: 'user' as const, content: currentMessage },\n { role: 'assistant' as const, content: response.answer || response.followUpQuestion || '' },\n ],\n };\n\n // Check outcome\n if (response.outcome === 'answer') {\n break; // Got an answer, done\n }\n\n if (response.outcome === 'blocked' || response.outcome === 'out_of_scope') {\n break; // Terminal state\n }\n\n if (response.outcome === 'follow_up') {\n // Generate response to follow-up question\n const followUpQuestion = response.followUpQuestion || '';\n const responseToFollowUp = generateFollowUpResponse(followUpQuestion, scenario);\n\n if (!responseToFollowUp) {\n // No response configured, end the conversation\n break;\n }\n\n currentMessage = responseToFollowUp;\n }\n }\n\n // Evaluate the result\n const evaluation = await evaluateScenarioResult(\n scenario,\n lastResponse,\n conversation,\n grader\n );\n\n // Determine if passed\n const outcomeMatch = scenario.expectedOutcome\n ? lastResponse?.outcome === scenario.expectedOutcome\n : true;\n\n const passed = outcomeMatch && evaluation.passed;\n\n return {\n scenario,\n passed,\n turns: turn,\n finalOutcome: lastResponse?.outcome || 'error',\n finalAnswer: lastResponse?.outcome === 'answer' ? 
lastResponse.answer || null : null,\n evaluation,\n conversation,\n duration: Date.now() - startTime,\n };\n } catch (error) {\n return {\n scenario,\n passed: false,\n turns: turn,\n finalOutcome: 'error',\n finalAnswer: null,\n evaluation: {\n passed: false,\n score: 0,\n evidence: [\n {\n criterion: 'error',\n score: 0,\n reasoning: error instanceof Error ? error.message : String(error),\n },\n ],\n },\n conversation,\n duration: Date.now() - startTime,\n error: error instanceof Error ? error.message : String(error),\n };\n }\n}\n\n// ============================================================================\n// AGENT CALLING\n// ============================================================================\n\n/**\n * Call the agent (either via HTTP or direct function).\n */\nasync function callAgent(\n endpoint: string | AgentFunction,\n message: string,\n context: AgentContext,\n timeout: number\n): Promise<AgentResponse> {\n if (typeof endpoint === 'function') {\n return endpoint(message, context);\n }\n\n // HTTP call\n const controller = new AbortController();\n const timeoutId = setTimeout(() => controller.abort(), timeout);\n\n try {\n const response = await fetch(endpoint, {\n method: 'POST',\n headers: { 'Content-Type': 'application/json' },\n body: JSON.stringify({\n message,\n session_id: context.sessionId,\n previous_context: context.previousContext,\n previous_intent: context.previousIntent,\n conversation_history: context.conversationHistory,\n }),\n signal: controller.signal,\n });\n\n if (!response.ok) {\n throw new Error(`Agent returned ${response.status}: ${response.statusText}`);\n }\n\n const data = await response.json() as Record<string, unknown>;\n\n return {\n outcome: (data.outcome as AgentResponse['outcome']) || 'answer',\n answer: data.answer as string | undefined,\n followUpQuestion: (data.followUpQuestion || data.follow_up_question) as string | undefined,\n options: data.options as AgentResponse['options'],\n context: data.context,\n 
intent: data.intent,\n trace: data.trace,\n sessionId: (data.session_id || data.sessionId) as string | undefined,\n };\n } finally {\n clearTimeout(timeoutId);\n }\n}\n\n// ============================================================================\n// FOLLOW-UP RESPONSE GENERATION\n// ============================================================================\n\n/**\n * Generate a response to a follow-up question based on scenario config.\n */\nfunction generateFollowUpResponse(\n question: string,\n scenario: AgentTestScenario\n): string | null {\n if (!scenario.followUpResponses) {\n return null;\n }\n\n // Check each pattern\n for (const [pattern, response] of Object.entries(scenario.followUpResponses)) {\n if (question.toLowerCase().includes(pattern.toLowerCase())) {\n return response;\n }\n }\n\n return null;\n}\n\n// ============================================================================\n// SCENARIO EVALUATION\n// ============================================================================\n\n/**\n * Evaluate the result of a scenario.\n */\nasync function evaluateScenarioResult(\n scenario: AgentTestScenario,\n response: AgentResponse | null,\n conversation: ConversationTurn[],\n grader: ReturnType<typeof createLLMGrader>\n): Promise<{ passed: boolean; score: number; evidence: EvalEvidence[] }> {\n const evidence: EvalEvidence[] = [];\n\n if (!response || response.outcome !== 'answer' || !response.answer) {\n // No answer to evaluate\n if (scenario.expectedOutcome && scenario.expectedOutcome !== 'answer') {\n // Expected non-answer outcome\n return {\n passed: response?.outcome === scenario.expectedOutcome,\n score: response?.outcome === scenario.expectedOutcome ? 100 : 0,\n evidence: [\n {\n criterion: 'outcomeMatch',\n score: response?.outcome === scenario.expectedOutcome ? 
100 : 0,\n reasoning: `Expected ${scenario.expectedOutcome}, got ${response?.outcome || 'no response'}.`,\n },\n ],\n };\n }\n\n return {\n passed: false,\n score: 0,\n evidence: [\n {\n criterion: 'noAnswer',\n score: 0,\n reasoning: `Expected an answer but got ${response?.outcome || 'no response'}.`,\n },\n ],\n };\n }\n\n const answer = response.answer;\n let totalScore = 0;\n let criteriaCount = 0;\n\n // Check mustContain\n if (scenario.evaluation.mustContain) {\n const found = scenario.evaluation.mustContain.filter((s) =>\n answer.toLowerCase().includes(s.toLowerCase())\n );\n const score = normalizeScore((found.length / scenario.evaluation.mustContain.length) * 100);\n totalScore += score;\n criteriaCount++;\n\n evidence.push({\n criterion: 'mustContain',\n score,\n reasoning: `Found ${found.length}/${scenario.evaluation.mustContain.length} required terms.`,\n examples: scenario.evaluation.mustContain.filter(\n (s) => !answer.toLowerCase().includes(s.toLowerCase())\n ),\n });\n }\n\n // Check mustNotContain\n if (scenario.evaluation.mustNotContain) {\n const found = scenario.evaluation.mustNotContain.filter((s) =>\n answer.toLowerCase().includes(s.toLowerCase())\n );\n const score = normalizeScore(((scenario.evaluation.mustNotContain.length - found.length) / scenario.evaluation.mustNotContain.length) * 100);\n totalScore += score;\n criteriaCount++;\n\n evidence.push({\n criterion: 'mustNotContain',\n score,\n reasoning: `Found ${found.length} forbidden terms.`,\n examples: found,\n });\n }\n\n // LLM grading for relevance, completeness, helpfulness\n const gradingPrompt = `\nQuery: ${scenario.initialQuery}\nAnswer: ${answer}\n${scenario.evaluation.rubric ? `Rubric: ${scenario.evaluation.rubric}` : ''}\n`;\n\n // Grade relevance\n const relevanceResult = await grader.grade(\n gradingPrompt,\n 'relevance',\n 'How relevant is the answer to the query? 
100 = directly and completely addresses the query.'\n );\n evidence.push({\n criterion: 'relevance',\n score: relevanceResult.score,\n reasoning: relevanceResult.reasoning,\n });\n totalScore += relevanceResult.score;\n criteriaCount++;\n\n // Grade completeness\n const completenessResult = await grader.grade(\n gradingPrompt,\n 'completeness',\n 'How complete is the answer? 100 = fully addresses all aspects of the query.'\n );\n evidence.push({\n criterion: 'completeness',\n score: completenessResult.score,\n reasoning: completenessResult.reasoning,\n });\n totalScore += completenessResult.score;\n criteriaCount++;\n\n // Grade helpfulness\n const helpfulnessResult = await grader.grade(\n gradingPrompt,\n 'helpfulness',\n 'How helpful and actionable is the answer? 100 = provides clear, actionable guidance.'\n );\n evidence.push({\n criterion: 'helpfulness',\n score: helpfulnessResult.score,\n reasoning: helpfulnessResult.reasoning,\n });\n totalScore += helpfulnessResult.score;\n criteriaCount++;\n\n const avgScore = criteriaCount > 0 ? totalScore / criteriaCount : 0;\n\n return {\n passed: avgScore >= 70,\n score: normalizeScore(avgScore),\n evidence,\n };\n}\n"]}
|