@kat-ai/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,114 @@
1
+ # @kat-ai/eval
2
+
3
+ Evaluation framework for KAT RAG systems. Provides layered quality metrics for introspection, retrieval, and end-to-end agent behavior.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install @kat-ai/eval
9
+ ```
10
+
11
+ Release baseline (compatible package set):
12
+
13
+ ```bash
14
+ npm install @kat-ai/sdk@0.1.0 @kat-ai/eval@0.1.0 @kat-ai/cli@0.1.0
15
+ ```
16
+
17
+ ## Quick Start
18
+
19
+ ```typescript
20
+ import { evaluateIntrospection, evaluateRetrieval, evaluateAgent } from '@kat-ai/eval';
21
+
22
+ // Layer 1: Evaluate manifest quality
23
+ const introspectionResult = await evaluateIntrospection({
24
+ assistantName: 'my-kb',
25
+ manifest: generatedManifest,
26
+ groundTruth: [
27
+ { query: 'What products do you cover?', expectedEntities: ['toaster', 'blender'] },
28
+ ],
29
+ });
30
+ console.log(`Introspection score: ${introspectionResult.overallScore}/100`);
31
+
32
+ // Layer 2: Evaluate retrieval quality
33
+ const retrievalResult = await evaluateRetrieval({
34
+ assistantName: 'my-kb',
35
+ queries: [
36
+ { query: 'How to fix a toaster?', expectedTopics: ['heating element', 'troubleshooting'] },
37
+ ],
38
+ });
39
+ console.log(`Retrieval score: ${retrievalResult.overallScore}/100`);
40
+
41
+ // Layer 3: Evaluate agent behavior
42
+ const agentResult = await evaluateAgent({
43
+ agentEndpoint: 'http://localhost:3000/api/chat',
44
+ scenarios: [
45
+ {
46
+ name: 'basic-troubleshoot',
47
+ initialQuery: "My toaster won't heat up",
48
+ expectedOutcome: 'answer',
49
+ evaluation: { mustContain: ['heating element'] },
50
+ },
51
+ ],
52
+ });
53
+ console.log(`Agent score: ${agentResult.overallScore}/100`);
54
+ ```
55
+
56
+ ## Eval Layers
57
+
58
+ ### Layer 1: Introspection Eval
59
+
60
+ Evaluates whether introspection correctly understands a KB's content:
61
+
62
+ - **Entity Coverage**: Does the manifest capture all entities in the KB?
63
+ - **Slot Accuracy**: Are extracted slots correct for the domain?
64
+ - **Scope Precision**: Are in/out scope boundaries accurate?
65
+ - **Capability Match**: Do capabilities match actual KB content?
66
+
67
+ ### Layer 2: Retrieval Eval
68
+
69
+ Evaluates whether RAG retrieves relevant chunks:
70
+
71
+ - **Relevance**: Are retrieved chunks relevant to the query?
72
+ - **Recall**: Are expected topics found in retrieved chunks?
73
+ - **Precision**: What percentage of retrieved content is relevant?
74
+ - **Noise Ratio**: How much irrelevant content is retrieved?
75
+
76
+ ### Layer 3: Agent Eval
77
+
78
+ Evaluates end-to-end agent behavior:
79
+
80
+ - **Accuracy**: Does the agent produce the expected outcome type?
81
+ - **Relevance**: Is the answer relevant to the query?
82
+ - **Completeness**: Does the answer fully address the question?
83
+ - **Helpfulness**: Is the response actionable and helpful?
84
+
85
+ ## CLI Usage
86
+
87
+ ```bash
88
+ # Run all eval layers with the canonical baseline bundle
89
+ kat eval --assistant my-kb --endpoint http://localhost:3000/api/chat --baseline
90
+ # Equivalent explicit path:
91
+ # kat eval --assistant my-kb --endpoint http://localhost:3000/api/chat --scenarios ./eval/baseline/naive-rag-baseline.json
92
+
93
+ # Run specific layer
94
+ kat eval --layer introspection --assistant my-kb --scenarios ./eval/baseline/introspection-ground-truth.json
95
+ kat eval --layer retrieval --assistant my-kb --scenarios ./eval/baseline/retrieval-queries.json
96
+ kat eval --layer agent --endpoint http://localhost:3000/api/chat --scenarios ./eval/baseline/agent-scenarios.json
97
+
98
+ # Output as JSON
99
+ kat eval --assistant my-kb --output json > results.json
100
+ ```
101
+
102
+ Baseline fixtures are checked in at:
103
+ - `eval/baseline/naive-rag-baseline.json`
104
+ - `eval/baseline/introspection-ground-truth.json`
105
+ - `eval/baseline/retrieval-queries.json`
106
+ - `eval/baseline/agent-scenarios.json`
107
+
108
+ When running `--output json`:
109
+ - `--layer all` outputs an array: `[{ layer, result }, ...]`
110
+ - single-layer runs output only that layer's `result` object
111
+
112
+ ## License
113
+
114
+ MIT
@@ -0,0 +1,468 @@
1
+ 'use strict';
2
+
3
+ var openai = require('@ai-sdk/openai');
4
+ var ai = require('ai');
5
+ var zod = require('zod');
6
+ var core = require('@kat/core');
7
+
8
// src/utils/llm-grader.ts
// Structured-output schemas the LLM grader asks the model to produce.
const { z: z$1 } = zod;

// Single-criterion grade: bounded numeric score plus a textual justification.
const GradeSchema = z$1.object({
  score: z$1.number().min(0).max(100).describe("Score from 0-100"),
  reasoning: z$1.string().describe("Explanation for the score"),
  examples: z$1.array(z$1.string()).optional().describe("Specific examples that influenced the score")
});

// Multi-criterion grade: one scored entry per criterion plus an overall note.
const MultiCriteriaGradeSchema = z$1.object({
  scores: z$1.array(
    z$1.object({
      criterion: z$1.string(),
      score: z$1.number().min(0).max(100),
      reasoning: z$1.string()
    })
  ),
  overallReasoning: z$1.string()
});
22
/**
 * Build an LLM-backed grader used by the eval layers.
 *
 * @param {object} [config]
 * @param {string} [config.openaiApiKey] - API key; falls back to OPENAI_API_KEY env var.
 * @param {string} [config.model] - Chat model id; falls back to the project default
 *   resolved by `@kat/core` (`resolveDefaultOpenAiChatModelId`).
 * @param {number} [config.temperature] - Sampling temperature; defaults to 0.1
 *   (low, for grading stability). `??` keeps an explicit 0 intact.
 * @returns grader with `grade`, `gradeMultiple`, and `gradeRelevance` methods,
 *   each returning structured output validated against a zod schema.
 * @throws {Error} when no API key is available.
 */
function createLLMGrader(config = {}) {
  const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error("OPENAI_API_KEY is required for LLM grading");
  }
  const openai$1 = openai.createOpenAI({ apiKey });
  const model = config.model || core.resolveDefaultOpenAiChatModelId();
  const temperature = config.temperature ?? 0.1;
  return {
    // Grade `content` on a single named criterion; `rubric` is optional
    // guidance spliced into the prompt when provided.
    async grade(content, criterion, rubric) {
      const prompt = `You are an expert evaluator. Grade the following content on the criterion "${criterion}".

${rubric ? `Rubric: ${rubric}
` : ""}
Content to evaluate:
"""
${content}
"""

Provide a score from 0-100 and explain your reasoning.`;
      // generateObject constrains the model's reply to GradeSchema.
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      return result.object;
    },
    // Grade `content` on several weighted criteria in one model call.
    // `criteria` entries need { name, description, weight }.
    async gradeMultiple(content, criteria) {
      const criteriaDescription = criteria.map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`).join("\n");
      const prompt = `You are an expert evaluator. Grade the following content on multiple criteria.

Criteria:
${criteriaDescription}

Content to evaluate:
"""
${content}
"""

For each criterion, provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: MultiCriteriaGradeSchema,
        prompt,
        temperature
      });
      // Return only the per-criterion scores; overallReasoning is dropped here.
      return result.object.scores.map((s) => ({
        criterion: s.criterion,
        score: s.score,
        reasoning: s.reasoning
      }));
    },
    // Specialized single-criterion grade: how relevant `content` is to `query`.
    async gradeRelevance(query, content) {
      const prompt = `You are an expert evaluator. Grade how relevant the following content is to the given query.

Query: "${query}"

Content:
"""
${content}
"""

A score of 100 means the content directly and completely answers the query.
A score of 0 means the content is completely irrelevant.

Provide a score from 0-100 and explain your reasoning.`;
      const result = await ai.generateObject({
        model: openai$1(model),
        schema: GradeSchema,
        prompt,
        temperature
      });
      // Drop optional `examples`; callers only use score + reasoning here.
      return {
        score: result.object.score,
        reasoning: result.object.reasoning
      };
    }
  };
}
102
+
103
+ // src/utils/metrics.ts
104
/**
 * Combine per-criterion scores into a single rounded value, weighting each
 * entry by its `weight`. Returns 0 for an empty list or when the weights sum
 * to zero (avoids division by zero).
 *
 * @param {{score: number, weight: number}[]} scores
 * @returns {number} rounded weighted average
 */
function calculateWeightedScore(scores) {
  let weightTotal = 0;
  let weightedTotal = 0;
  for (const { score, weight } of scores) {
    weightTotal += weight;
    weightedTotal += score * weight;
  }
  if (weightTotal === 0) {
    return 0;
  }
  return Math.round(weightedTotal / weightTotal);
}
111
/**
 * Arithmetic mean of a list of numbers; 0 for an empty list.
 *
 * @param {number[]} numbers
 * @returns {number}
 */
function average(numbers) {
  const count = numbers.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  for (const value of numbers) {
    total += value;
  }
  return total / count;
}
115
/**
 * Restrict `value` to the inclusive range [min, max].
 * Implemented as max(min, min(max, value)) so NaN propagates unchanged.
 *
 * @param {number} value
 * @param {number} min
 * @param {number} max
 * @returns {number}
 */
function clamp(value, min, max) {
  const capped = Math.min(max, value);
  return Math.max(min, capped);
}
118
/**
 * Round a raw score and pin it into the canonical 0-100 range.
 * (Inlines the clamp: min(100, ...) then max(0, ...), same composition
 * as the shared `clamp` helper.)
 *
 * @param {number} score - possibly fractional / out-of-range raw score
 * @returns {number} integer in [0, 100]
 */
function normalizeScore(score) {
  const rounded = Math.round(score);
  return Math.max(0, Math.min(100, rounded));
}
121
+
122
+ // src/utils/reporters.ts
123
/**
 * Turn a camelCase metric key into a human-readable title,
 * e.g. "overallScore" -> "Overall Score".
 *
 * @param {string} name - camelCase identifier
 * @returns {string} spaced, capitalized, trimmed label
 */
function formatScoreName(name) {
  const spaced = name.replace(/([A-Z])/g, " $1");
  const capitalized = spaced.replace(/^./, (first) => first.toUpperCase());
  return capitalized.trim();
}
126
/**
 * Build a one-line human-readable summary from a map of metric name -> score
 * (0-100). The average score picks the headline sentence; metrics at or above
 * `thresholds.good` are listed as strong, metrics below `thresholds.acceptable`
 * as needing improvement.
 *
 * @param {Record<string, number>} scores
 * @param {{good: number, acceptable: number}} [thresholds]
 * @returns {string}
 */
function generateSummary(scores, thresholds = { good: 80, acceptable: 60 }) {
  const entries = Object.entries(scores);
  // Fix: an empty score map previously produced NaN (0 / 0) for the average
  // and silently fell through to the "below thresholds" branch.
  if (entries.length === 0) {
    return "No metrics to summarize.";
  }
  const avgScore = entries.reduce((sum, [, score]) => sum + score, 0) / entries.length;
  const goodMetrics = entries.filter(([, score]) => score >= thresholds.good).map(([name]) => name);
  const poorMetrics = entries.filter(([, score]) => score < thresholds.acceptable).map(([name]) => name);
  const parts = [];
  if (avgScore >= thresholds.good) {
    parts.push("Strong overall performance");
  } else if (avgScore >= thresholds.acceptable) {
    parts.push("Acceptable performance with room for improvement");
  } else {
    parts.push("Performance below acceptable thresholds");
  }
  if (goodMetrics.length > 0) {
    parts.push(`Strong: ${goodMetrics.map(formatScoreName).join(", ")}`);
  }
  if (poorMetrics.length > 0) {
    parts.push(`Needs improvement: ${poorMetrics.map(formatScoreName).join(", ")}`);
  }
  return parts.join(". ") + ".";
}
147
+
148
+ // src/agent/index.ts
149
/**
 * Layer 3 entry point: run every configured scenario against the agent and
 * aggregate the results into a single scored report.
 *
 * @param {object} config
 * @param {Array} config.scenarios - scenario definitions (must be non-empty).
 * @param {string|Function} config.agentEndpoint - HTTP URL or in-process agent fn.
 * @param {string} [config.openaiApiKey] - key for the LLM grader.
 * @param {object} [config.graderConfig] - optional grader model/temperature.
 * @returns report with overallScore (0-100), per-metric scores, evidence,
 *   summary text, duration (ms), and per-scenario results.
 * @throws {Error} when no scenarios are provided (avoids 0/0 accuracy below).
 */
async function evaluateAgent(config) {
  const startTime = Date.now();
  if (config.scenarios.length === 0) {
    throw new Error("At least one scenario is required for agent eval");
  }
  const grader = createLLMGrader({
    openaiApiKey: config.openaiApiKey,
    model: config.graderConfig?.model,
    temperature: config.graderConfig?.temperature
  });
  // Scenarios run sequentially; each may be a multi-turn conversation.
  const scenarioResults = [];
  for (const scenario of config.scenarios) {
    const result = await runScenario(scenario, config, grader);
    scenarioResults.push(result);
  }
  // Accuracy = fraction of scenarios that passed, scaled to 0-100.
  const passedScenarios = scenarioResults.filter((r) => r.passed);
  const accuracy = normalizeScore(passedScenarios.length / scenarioResults.length * 100);
  // Only scenarios that produced evaluation evidence contribute to the
  // LLM-graded metrics below (errored runs still carry "error" evidence,
  // but it has no relevance/completeness/helpfulness entries).
  const gradedResults = scenarioResults.filter((r) => r.evaluation.evidence.length > 0);
  const relevanceScores = gradedResults.flatMap(
    (r) => r.evaluation.evidence.filter((e) => e.criterion === "relevance").map((e) => e.score)
  );
  const completenessScores = gradedResults.flatMap(
    (r) => r.evaluation.evidence.filter((e) => e.criterion === "completeness").map((e) => e.score)
  );
  const helpfulnessScores = gradedResults.flatMap(
    (r) => r.evaluation.evidence.filter((e) => e.criterion === "helpfulness").map((e) => e.score)
  );
  // Missing criteria score 0 rather than being excluded from the weighting.
  const relevance = relevanceScores.length > 0 ? normalizeScore(average(relevanceScores)) : 0;
  const completeness = completenessScores.length > 0 ? normalizeScore(average(completenessScores)) : 0;
  const helpfulness = helpfulnessScores.length > 0 ? normalizeScore(average(helpfulnessScores)) : 0;
  const scores = { accuracy, relevance, completeness, helpfulness };
  // Fixed weighting: accuracy 30%, relevance 25%, completeness 25%, helpfulness 20%.
  const overallScore = calculateWeightedScore([
    { score: accuracy, weight: 0.3 },
    { score: relevance, weight: 0.25 },
    { score: completeness, weight: 0.25 },
    { score: helpfulness, weight: 0.2 }
  ]);
  const evidence = [
    {
      criterion: "accuracy",
      score: accuracy,
      reasoning: `${passedScenarios.length}/${scenarioResults.length} scenarios passed.`,
      // Up to three failing scenario names as concrete examples.
      examples: scenarioResults.filter((r) => !r.passed).map((r) => r.scenario.name).slice(0, 3)
    },
    {
      criterion: "relevance",
      score: relevance,
      reasoning: `Average answer relevance across ${gradedResults.length} graded scenarios.`
    },
    {
      criterion: "completeness",
      score: completeness,
      reasoning: `Average answer completeness across ${gradedResults.length} graded scenarios.`
    },
    {
      criterion: "helpfulness",
      score: helpfulness,
      reasoning: `Average helpfulness across ${gradedResults.length} graded scenarios.`
    }
  ];
  return {
    // Overall pass threshold is fixed at 70.
    passed: overallScore >= 70,
    overallScore,
    scores,
    evidence,
    summary: generateSummary(scores),
    duration: Date.now() - startTime,
    scenarioResults
  };
}
219
/**
 * Drive a single scenario as a (possibly multi-turn) conversation with the
 * agent, then grade the final response.
 *
 * Turn loop: send the current message, record the exchange, then stop on a
 * terminal outcome ("answer", "blocked", "out_of_scope"), or continue with a
 * scripted reply when the agent asks a follow-up. Any thrown error is caught
 * and converted into a failed result rather than propagating.
 *
 * @param {object} scenario - has initialQuery, optional expectedOutcome,
 *   maxTurns, followUpResponses, and an evaluation spec.
 * @param {object} config - eval config (agentEndpoint, maxTurns, timeout).
 * @param {object} grader - LLM grader from createLLMGrader.
 * @returns per-scenario result: passed flag, turns, finalOutcome, finalAnswer,
 *   evaluation, full conversation transcript, duration (and error message on failure).
 */
async function runScenario(scenario, config, grader) {
  const startTime = Date.now();
  // Scenario-level maxTurns wins over config-level; default 5.
  const maxTurns = scenario.maxTurns || config.maxTurns || 5;
  // Per-call timeout in ms; default 60000 (6e4).
  const timeout = config.timeout || 6e4;
  const conversation = [];
  let currentMessage = scenario.initialQuery;
  let context = {
    // Millisecond timestamp as a (best-effort unique) eval session id.
    sessionId: `eval_${Date.now()}`,
    conversationHistory: []
  };
  let lastResponse = null;
  let turn = 0;
  try {
    while (turn < maxTurns) {
      turn++;
      const response = await callAgent(config.agentEndpoint, currentMessage, context, timeout);
      lastResponse = response;
      conversation.push({
        turn,
        userMessage: currentMessage,
        agentResponse: response
      });
      // Thread the agent's returned context/intent and the full transcript
      // into the next call so multi-turn agents keep state.
      context = {
        ...context,
        previousContext: response.context,
        previousIntent: response.intent,
        conversationHistory: [
          ...context.conversationHistory || [],
          { role: "user", content: currentMessage },
          { role: "assistant", content: response.answer || response.followUpQuestion || "" }
        ]
      };
      if (response.outcome === "answer") {
        break;
      }
      if (response.outcome === "blocked" || response.outcome === "out_of_scope") {
        break;
      }
      if (response.outcome === "follow_up") {
        const followUpQuestion = response.followUpQuestion || "";
        const responseToFollowUp = generateFollowUpResponse(followUpQuestion, scenario);
        // No scripted reply matches the follow-up: end the conversation here.
        if (!responseToFollowUp) {
          break;
        }
        currentMessage = responseToFollowUp;
      }
      // NOTE(review): any other outcome value falls through and re-sends the
      // same message next turn (bounded by maxTurns) — presumably intentional
      // retry behavior; confirm against the agent's outcome vocabulary.
    }
    const evaluation = await evaluateScenarioResult(
      scenario,
      lastResponse,
      conversation,
      grader
    );
    // Scenarios without expectedOutcome only need the graded evaluation to pass.
    const outcomeMatch = scenario.expectedOutcome ? lastResponse?.outcome === scenario.expectedOutcome : true;
    const passed = outcomeMatch && evaluation.passed;
    return {
      scenario,
      passed,
      turns: turn,
      finalOutcome: lastResponse?.outcome || "error",
      finalAnswer: lastResponse?.outcome === "answer" ? lastResponse.answer || null : null,
      evaluation,
      conversation,
      duration: Date.now() - startTime
    };
  } catch (error) {
    // Convert transport/grading errors into a zero-score failed result so one
    // broken scenario does not abort the whole eval run.
    return {
      scenario,
      passed: false,
      turns: turn,
      finalOutcome: "error",
      finalAnswer: null,
      evaluation: {
        passed: false,
        score: 0,
        evidence: [
          {
            criterion: "error",
            score: 0,
            reasoning: error instanceof Error ? error.message : String(error)
          }
        ]
      },
      conversation,
      duration: Date.now() - startTime,
      error: error instanceof Error ? error.message : String(error)
    };
  }
}
308
/**
 * Deliver one user message to the agent under test.
 *
 * @param {string|Function} endpoint - HTTP URL, or an in-process agent
 *   function `(message, context) => response` (called directly, no timeout).
 * @param {string} message - user message for this turn.
 * @param {object} context - session id, previous context/intent, transcript.
 * @param {number} timeout - abort the HTTP request after this many ms.
 * @returns normalized agent response (outcome defaults to "answer").
 * @throws {Error} on non-2xx HTTP status; AbortError on timeout.
 */
async function callAgent(endpoint, message, context, timeout) {
  // In-process agents are invoked directly; no HTTP and no abort timer.
  if (typeof endpoint === "function") {
    return endpoint(message, context);
  }
  const abort = new AbortController();
  const timer = setTimeout(() => abort.abort(), timeout);
  const payload = {
    message,
    session_id: context.sessionId,
    previous_context: context.previousContext,
    previous_intent: context.previousIntent,
    conversation_history: context.conversationHistory
  };
  try {
    const res = await fetch(endpoint, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload),
      signal: abort.signal
    });
    if (!res.ok) {
      throw new Error(`Agent returned ${res.status}: ${res.statusText}`);
    }
    const data = await res.json();
    // Accept both camelCase and snake_case field variants from the server.
    return {
      outcome: data.outcome || "answer",
      answer: data.answer,
      followUpQuestion: data.followUpQuestion || data.follow_up_question,
      options: data.options,
      context: data.context,
      intent: data.intent,
      trace: data.trace,
      sessionId: data.session_id || data.sessionId
    };
  } finally {
    // Always release the timer, on success and on failure alike.
    clearTimeout(timer);
  }
}
345
/**
 * Pick the scripted user reply for an agent follow-up question.
 * Matches case-insensitively: the first `followUpResponses` key that appears
 * as a substring of the question wins; null when nothing matches or the
 * scenario defines no scripted replies.
 *
 * @param {string} question - the agent's follow-up question.
 * @param {object} scenario - may carry a followUpResponses pattern->reply map.
 * @returns {string|null}
 */
function generateFollowUpResponse(question, scenario) {
  const responses = scenario.followUpResponses;
  if (!responses) {
    return null;
  }
  const needle = question.toLowerCase();
  const match = Object.entries(responses).find(
    ([pattern]) => needle.includes(pattern.toLowerCase())
  );
  return match ? match[1] : null;
}
356
/**
 * Grade a completed scenario run.
 *
 * Non-answer outcomes are judged purely on outcome matching. Answer outcomes
 * are scored as an unweighted average of: optional mustContain / mustNotContain
 * lexical checks plus three LLM-graded criteria (relevance, completeness,
 * helpfulness). The scenario passes when the average reaches 70.
 *
 * @param {object} scenario - scenario definition (evaluation spec optional).
 * @param {object|null} response - final agent response, or null if none.
 * @param {Array} conversation - transcript (currently unused here).
 * @param {object} grader - LLM grader from createLLMGrader.
 * @returns {{passed: boolean, score: number, evidence: Array}}
 */
async function evaluateScenarioResult(scenario, response, conversation, grader) {
  const evidence = [];
  if (!response || response.outcome !== "answer" || !response.answer) {
    // No answer was produced. If the scenario expected a non-answer outcome
    // (e.g. "blocked"), pass/fail on the outcome alone.
    if (scenario.expectedOutcome && scenario.expectedOutcome !== "answer") {
      const matched = response?.outcome === scenario.expectedOutcome;
      return {
        passed: matched,
        score: matched ? 100 : 0,
        evidence: [
          {
            criterion: "outcomeMatch",
            score: matched ? 100 : 0,
            reasoning: `Expected ${scenario.expectedOutcome}, got ${response?.outcome || "no response"}.`
          }
        ]
      };
    }
    return {
      passed: false,
      score: 0,
      evidence: [
        {
          criterion: "noAnswer",
          score: 0,
          reasoning: `Expected an answer but got ${response?.outcome || "no response"}.`
        }
      ]
    };
  }
  const answer = response.answer;
  // Robustness fix: a scenario without an `evaluation` spec previously threw
  // a TypeError below; treat it as "no lexical checks, LLM grading only".
  const spec = scenario.evaluation ?? {};
  let totalScore = 0;
  let criteriaCount = 0;
  // Lexical check: score by the fraction of required terms present
  // (case-insensitive); missing terms are reported as examples.
  if (spec.mustContain) {
    const found = spec.mustContain.filter(
      (s) => answer.toLowerCase().includes(s.toLowerCase())
    );
    const score = normalizeScore(found.length / spec.mustContain.length * 100);
    totalScore += score;
    criteriaCount++;
    evidence.push({
      criterion: "mustContain",
      score,
      reasoning: `Found ${found.length}/${spec.mustContain.length} required terms.`,
      examples: spec.mustContain.filter(
        (s) => !answer.toLowerCase().includes(s.toLowerCase())
      )
    });
  }
  // Lexical check: score by the fraction of forbidden terms that are absent.
  if (spec.mustNotContain) {
    const found = spec.mustNotContain.filter(
      (s) => answer.toLowerCase().includes(s.toLowerCase())
    );
    const score = normalizeScore((spec.mustNotContain.length - found.length) / spec.mustNotContain.length * 100);
    totalScore += score;
    criteriaCount++;
    evidence.push({
      criterion: "mustNotContain",
      score,
      reasoning: `Found ${found.length} forbidden terms.`,
      examples: found
    });
  }
  // The three LLM-graded criteria share one prompt with query, answer,
  // and the optional rubric.
  const gradingPrompt = `
Query: ${scenario.initialQuery}
Answer: ${answer}
${spec.rubric ? `Rubric: ${spec.rubric}` : ""}
`;
  const relevanceResult = await grader.grade(
    gradingPrompt,
    "relevance",
    "How relevant is the answer to the query? 100 = directly and completely addresses the query."
  );
  evidence.push({
    criterion: "relevance",
    score: relevanceResult.score,
    reasoning: relevanceResult.reasoning
  });
  totalScore += relevanceResult.score;
  criteriaCount++;
  const completenessResult = await grader.grade(
    gradingPrompt,
    "completeness",
    "How complete is the answer? 100 = fully addresses all aspects of the query."
  );
  evidence.push({
    criterion: "completeness",
    score: completenessResult.score,
    reasoning: completenessResult.reasoning
  });
  totalScore += completenessResult.score;
  criteriaCount++;
  const helpfulnessResult = await grader.grade(
    gradingPrompt,
    "helpfulness",
    "How helpful and actionable is the answer? 100 = provides clear, actionable guidance."
  );
  evidence.push({
    criterion: "helpfulness",
    score: helpfulnessResult.score,
    reasoning: helpfulnessResult.reasoning
  });
  totalScore += helpfulnessResult.score;
  criteriaCount++;
  const avgScore = criteriaCount > 0 ? totalScore / criteriaCount : 0;
  return {
    passed: avgScore >= 70,
    score: normalizeScore(avgScore),
    evidence
  };
}
465
+
466
// Public API of this bundle.
exports.evaluateAgent = evaluateAgent;
// Fix: the sourceMappingURL comment was emitted twice by the bundler; tools
// only honor the last occurrence, so keep a single copy.
//# sourceMappingURL=index.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/utils/llm-grader.ts","../../src/utils/metrics.ts","../../src/utils/reporters.ts","../../src/agent/index.ts"],"names":["z","openai","createOpenAI","resolveDefaultOpenAiChatModelId","generateObject"],"mappings":";;;;;;;;AAgBA,IAAM,WAAA,GAAcA,MAAE,MAAA,CAAO;AAAA,EAC3B,KAAA,EAAOA,KAAA,CAAE,MAAA,EAAO,CAAE,GAAA,CAAI,CAAC,CAAA,CAAE,GAAA,CAAI,GAAG,CAAA,CAAE,QAAA,CAAS,kBAAkB,CAAA;AAAA,EAC7D,SAAA,EAAWA,KAAA,CAAE,MAAA,EAAO,CAAE,SAAS,2BAA2B,CAAA;AAAA,EAC1D,QAAA,EAAUA,KAAA,CAAE,KAAA,CAAMA,KAAA,CAAE,MAAA,EAAQ,CAAA,CAAE,QAAA,EAAS,CAAE,QAAA,CAAS,6CAA6C;AACjG,CAAC,CAAA;AAED,IAAM,wBAAA,GAA2BA,MAAE,MAAA,CAAO;AAAA,EACxC,MAAA,EAAQA,KAAA,CAAE,KAAA,CAAMA,KAAA,CAAE,MAAA,CAAO;AAAA,IACvB,SAAA,EAAWA,MAAE,MAAA,EAAO;AAAA,IACpB,KAAA,EAAOA,MAAE,MAAA,EAAO,CAAE,IAAI,CAAC,CAAA,CAAE,IAAI,GAAG,CAAA;AAAA,IAChC,SAAA,EAAWA,MAAE,MAAA;AAAO,GACrB,CAAC,CAAA;AAAA,EACF,gBAAA,EAAkBA,MAAE,MAAA;AACtB,CAAC,CAAA;AAiCM,SAAS,eAAA,CAAgB,MAAA,GAA0B,EAAC,EAAc;AACvE,EAAA,MAAM,MAAA,GAAS,MAAA,CAAO,YAAA,IAAgB,OAAA,CAAQ,GAAA,CAAI,cAAA;AAClD,EAAA,IAAI,CAAC,MAAA,EAAQ;AACX,IAAA,MAAM,IAAI,MAAM,4CAA4C,CAAA;AAAA,EAC9D;AAEA,EAAA,MAAMC,QAAA,GAASC,mBAAA,CAAa,EAAE,MAAA,EAAQ,CAAA;AACtC,EAAA,MAAM,KAAA,GAAQ,MAAA,CAAO,KAAA,IAASC,oCAAA,EAAgC;AAC9D,EAAA,MAAM,WAAA,GAAc,OAAO,WAAA,IAAe,GAAA;AAE1C,EAAA,OAAO;AAAA,IACL,MAAM,KAAA,CAAM,OAAA,EAAiB,SAAA,EAAmB,MAAA,EAAiB;AAC/D,MAAA,MAAM,MAAA,GAAS,8EAA8E,SAAS,CAAA;;AAAA,EAE1G,MAAA,GAAS,WAAW,MAAM;AAAA,CAAA,GAAO,EAAE;AAAA;AAAA;AAAA,EAGnC,OAAO;AAAA;;AAAA,sDAAA,CAAA;AAKH,MAAA,MAAM,MAAA,GAAS,MAAMC,iBAAA,CAAe;AAAA,QAClC,KAAA,EAAOH,SAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,WAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO,MAAA,CAAO,MAAA;AAAA,IAChB,CAAA;AAAA,IAEA,MAAM,aAAA,CAAc,OAAA,EAAiB,QAAA,EAA2B;AAC9D,MAAA,MAAM,sBAAsB,QAAA,CACzB,GAAA,CAAI,CAAC,CAAA,KAAM,KAAK,CAAA,CAAE,IAAI,CAAA,EAAA,EAAK,CAAA,CAAE,WAAW,CAAA,UAAA,EAAa,CAAA,CAAE,MAAM,CAAA,CAAA,CAAG,CAAA,CAChE,KAAK,IAAI,CAAA;AAEZ,MAAA,MAAM,MAAA,GAAS,CAAA;;AAAA;AAAA,EAGnB,mBAAmB;;AAAA;AAAA;AAAA,EAInB,OAAO;AAAA;;AAAA,0EA
AA,CAAA;AAKH,MAAA,MAAM,MAAA,GAAS,MAAMG,iBAAA,CAAe;AAAA,QAClC,KAAA,EAAOH,SAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,wBAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO,MAAA,CAAO,MAAA,CAAO,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,MAAO;AAAA,QACtC,WAAW,CAAA,CAAE,SAAA;AAAA,QACb,OAAO,CAAA,CAAE,KAAA;AAAA,QACT,WAAW,CAAA,CAAE;AAAA,OACf,CAAE,CAAA;AAAA,IACJ,CAAA;AAAA,IAEA,MAAM,cAAA,CAAe,KAAA,EAAe,OAAA,EAAiB;AACnD,MAAA,MAAM,MAAA,GAAS,CAAA;;AAAA,QAAA,EAEX,KAAK,CAAA;;AAAA;AAAA;AAAA,EAIb,OAAO;AAAA;;AAAA;AAAA;;AAAA,sDAAA,CAAA;AAQH,MAAA,MAAM,MAAA,GAAS,MAAMG,iBAAA,CAAe;AAAA,QAClC,KAAA,EAAOH,SAAO,KAAK,CAAA;AAAA,QACnB,MAAA,EAAQ,WAAA;AAAA,QACR,MAAA;AAAA,QACA;AAAA,OACD,CAAA;AAED,MAAA,OAAO;AAAA,QACL,KAAA,EAAO,OAAO,MAAA,CAAO,KAAA;AAAA,QACrB,SAAA,EAAW,OAAO,MAAA,CAAO;AAAA,OAC3B;AAAA,IACF;AAAA,GACF;AACF;;;AC7IO,SAAS,uBACd,MAAA,EACQ;AACR,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,OAAO,CAAA;AAEhC,EAAA,MAAM,WAAA,GAAc,OAAO,MAAA,CAAO,CAAC,KAAK,CAAA,KAAM,GAAA,GAAM,CAAA,CAAE,MAAA,EAAQ,CAAC,CAAA;AAC/D,EAAA,IAAI,WAAA,KAAgB,GAAG,OAAO,CAAA;AAE9B,EAAA,MAAM,WAAA,GAAc,MAAA,CAAO,MAAA,CAAO,CAAC,GAAA,EAAK,CAAA,KAAM,GAAA,GAAM,CAAA,CAAE,KAAA,GAAQ,CAAA,CAAE,MAAA,EAAQ,CAAC,CAAA;AACzE,EAAA,OAAO,IAAA,CAAK,KAAA,CAAM,WAAA,GAAc,WAAW,CAAA;AAC7C;AAoBO,SAAS,QAAQ,OAAA,EAA2B;AACjD,EAAA,IAAI,OAAA,CAAQ,MAAA,KAAW,CAAA,EAAG,OAAO,CAAA;AACjC,EAAA,OAAO,OAAA,CAAQ,OAAO,CAAC,GAAA,EAAK,MAAM,GAAA,GAAM,CAAA,EAAG,CAAC,CAAA,GAAI,OAAA,CAAQ,MAAA;AAC1D;AAiDO,SAAS,KAAA,CAAM,KAAA,EAAe,GAAA,EAAa,GAAA,EAAqB;AACrE,EAAA,OAAO,KAAK,GAAA,CAAI,GAAA,EAAK,KAAK,GAAA,CAAI,GAAA,EAAK,KAAK,CAAC,CAAA;AAC3C;AAKO,SAAS,eAAe,KAAA,EAAuB;AACpD,EAAA,OAAO,MAAM,IAAA,CAAK,KAAA,CAAM,KAAK,CAAA,EAAG,GAAG,GAAG,CAAA;AACxC;;;ACxBA,SAAS,gBAAgB,IAAA,EAAsB;AAC7C,EAAA,OAAO,IAAA,CACJ,OAAA,CAAQ,UAAA,EAAY,KAAK,CAAA,CACzB,OAAA,CAAQ,IAAA,EAAM,CAAC,GAAA,KAAQ,GAAA,CAAI,WAAA,EAAa,EACxC,IAAA,EAAK;AACV;AA2DO,SAAS,eAAA,CACd,QACA,UAAA,GAAmD,EAAE,MAAM,EAAA,EAAI,UAAA,EAAY,IAAG,EACtE;AACR,EAAA,MAAM,OAAA,GAAU,MAAA,CAAO,OAAA,CAAQ,MAAM,CAAA;AACrC,EAAA,MAAM,QAAA,GAAW,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAA,EAAK,GAAG,K
AAK,CAAA,KAAM,GAAA,GAAM,KAAA,EAAO,CAAC,IAAI,OAAA,CAAQ,MAAA;AAE9E,EAAA,MAAM,cAAc,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAG,KAAK,CAAA,KAAM,KAAA,IAAS,UAAA,CAAW,IAAI,CAAA,CAAE,GAAA,CAAI,CAAC,CAAC,IAAI,MAAM,IAAI,CAAA;AAChG,EAAA,MAAM,cAAc,OAAA,CAAQ,MAAA,CAAO,CAAC,GAAG,KAAK,CAAA,KAAM,KAAA,GAAQ,UAAA,CAAW,UAAU,CAAA,CAAE,GAAA,CAAI,CAAC,CAAC,IAAI,MAAM,IAAI,CAAA;AAErG,EAAA,MAAM,QAAkB,EAAC;AAEzB,EAAA,IAAI,QAAA,IAAY,WAAW,IAAA,EAAM;AAC/B,IAAA,KAAA,CAAM,KAAK,4BAA4B,CAAA;AAAA,EACzC,CAAA,MAAA,IAAW,QAAA,IAAY,UAAA,CAAW,UAAA,EAAY;AAC5C,IAAA,KAAA,CAAM,KAAK,kDAAkD,CAAA;AAAA,EAC/D,CAAA,MAAO;AACL,IAAA,KAAA,CAAM,KAAK,yCAAyC,CAAA;AAAA,EACtD;AAEA,EAAA,IAAI,WAAA,CAAY,SAAS,CAAA,EAAG;AAC1B,IAAA,KAAA,CAAM,IAAA,CAAK,WAAW,WAAA,CAAY,GAAA,CAAI,eAAe,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA,CAAE,CAAA;AAAA,EACrE;AAEA,EAAA,IAAI,WAAA,CAAY,SAAS,CAAA,EAAG;AAC1B,IAAA,KAAA,CAAM,IAAA,CAAK,sBAAsB,WAAA,CAAY,GAAA,CAAI,eAAe,CAAA,CAAE,IAAA,CAAK,IAAI,CAAC,CAAA,CAAE,CAAA;AAAA,EAChF;AAEA,EAAA,OAAO,KAAA,CAAM,IAAA,CAAK,IAAI,CAAA,GAAI,GAAA;AAC5B;;;ACrIA,eAAsB,cACpB,MAAA,EAC0B;AAC1B,EAAA,MAAM,SAAA,GAAY,KAAK,GAAA,EAAI;AAE3B,EAAA,IAAI,MAAA,CAAO,SAAA,CAAU,MAAA,KAAW,CAAA,EAAG;AACjC,IAAA,MAAM,IAAI,MAAM,kDAAkD,CAAA;AAAA,EACpE;AAEA,EAAA,MAAM,SAAS,eAAA,CAAgB;AAAA,IAC7B,cAAc,MAAA,CAAO,YAAA;AAAA,IACrB,KAAA,EAAO,OAAO,YAAA,EAAc,KAAA;AAAA,IAC5B,WAAA,EAAa,OAAO,YAAA,EAAc;AAAA,GACnC,CAAA;AAGD,EAAA,MAAM,kBAAoC,EAAC;AAE3C,EAAA,KAAA,MAAW,QAAA,IAAY,OAAO,SAAA,EAAW;AACvC,IAAA,MAAM,MAAA,GAAS,MAAM,WAAA,CAAY,QAAA,EAAU,QAAQ,MAAM,CAAA;AACzD,IAAA,eAAA,CAAgB,KAAK,MAAM,CAAA;AAAA,EAC7B;AAGA,EAAA,MAAM,kBAAkB,eAAA,CAAgB,MAAA,CAAO,CAAC,CAAA,KAAM,EAAE,MAAM,CAAA;AAC9D,EAAA,MAAM,WAAW,cAAA,CAAgB,eAAA,CAAgB,MAAA,GAAS,eAAA,CAAgB,SAAU,GAAG,CAAA;AAGvF,EAAA,MAAM,aAAA,GAAgB,gBAAgB,MAAA,CAAO,CAAC,MAAM,CAAA,CAAE,UAAA,CAAW,QAAA,CAAS,MAAA,GAAS,CAAC,CAAA;AACpF,EAAA,MAAM,kBAAkB,aAAA,CAAc,OAAA;AAAA,IAAQ,CAAC,CAAA,KAC7C,CAAA,CAAE,UAAA,CAAW,QAAA,CAAS,OAAO,CAAC,CAAA,KAAM,CAAA,CAAE,SAAA,KAAc,WAAW,CAAA,CAAE,GAAA,CAAI,CAAC,CAAA,KAAM,EAAE,KAAK;AAAA,GACrF;AACA,EAAA,MAAM,qBAAqB,aAAA,CAAc,OAAA;AAAA,I
AAQ,CAAC,CAAA,KAChD,CAAA,CAAE,UAAA,CAAW,QAAA,CAAS,OAAO,CAAC,CAAA,KAAM,CAAA,CAAE,SAAA,KAAc,cAAc,CAAA,CAAE,GAAA,CAAI,CAAC,CAAA,KAAM,EAAE,KAAK;AAAA,GACxF;AACA,EAAA,MAAM,oBAAoB,aAAA,CAAc,OAAA;AAAA,IAAQ,CAAC,CAAA,KAC/C,CAAA,CAAE,UAAA,CAAW,QAAA,CAAS,OAAO,CAAC,CAAA,KAAM,CAAA,CAAE,SAAA,KAAc,aAAa,CAAA,CAAE,GAAA,CAAI,CAAC,CAAA,KAAM,EAAE,KAAK;AAAA,GACvF;AAEA,EAAA,MAAM,SAAA,GAAY,gBAAgB,MAAA,GAAS,CAAA,GAAI,eAAe,OAAA,CAAQ,eAAe,CAAC,CAAA,GAAI,CAAA;AAC1F,EAAA,MAAM,YAAA,GAAe,mBAAmB,MAAA,GAAS,CAAA,GAAI,eAAe,OAAA,CAAQ,kBAAkB,CAAC,CAAA,GAAI,CAAA;AACnG,EAAA,MAAM,WAAA,GAAc,kBAAkB,MAAA,GAAS,CAAA,GAAI,eAAe,OAAA,CAAQ,iBAAiB,CAAC,CAAA,GAAI,CAAA;AAEhG,EAAA,MAAM,MAAA,GAAS,EAAE,QAAA,EAAU,SAAA,EAAW,cAAc,WAAA,EAAY;AAEhE,EAAA,MAAM,eAAe,sBAAA,CAAuB;AAAA,IAC1C,EAAE,KAAA,EAAO,QAAA,EAAU,MAAA,EAAQ,GAAA,EAAK;AAAA,IAChC,EAAE,KAAA,EAAO,SAAA,EAAW,MAAA,EAAQ,IAAA,EAAK;AAAA,IACjC,EAAE,KAAA,EAAO,YAAA,EAAc,MAAA,EAAQ,IAAA,EAAK;AAAA,IACpC,EAAE,KAAA,EAAO,WAAA,EAAa,MAAA,EAAQ,GAAA;AAAK,GACpC,CAAA;AAGD,EAAA,MAAM,QAAA,GAA2B;AAAA,IAC/B;AAAA,MACE,SAAA,EAAW,UAAA;AAAA,MACX,KAAA,EAAO,QAAA;AAAA,MACP,WAAW,CAAA,EAAG,eAAA,CAAgB,MAAM,CAAA,CAAA,EAAI,gBAAgB,MAAM,CAAA,kBAAA,CAAA;AAAA,MAC9D,UAAU,eAAA,CAAgB,MAAA,CAAO,CAAC,CAAA,KAAM,CAAC,EAAE,MAAM,CAAA,CAAE,GAAA,CAAI,CAAC,MAAM,CAAA,CAAE,QAAA,CAAS,IAAI,CAAA,CAAE,KAAA,CAAM,GAAG,CAAC;AAAA,KAC3F;AAAA,IACA;AAAA,MACE,SAAA,EAAW,WAAA;AAAA,MACX,KAAA,EAAO,SAAA;AAAA,MACP,SAAA,EAAW,CAAA,gCAAA,EAAmC,aAAA,CAAc,MAAM,CAAA,kBAAA;AAAA,KACpE;AAAA,IACA;AAAA,MACE,SAAA,EAAW,cAAA;AAAA,MACX,KAAA,EAAO,YAAA;AAAA,MACP,SAAA,EAAW,CAAA,mCAAA,EAAsC,aAAA,CAAc,MAAM,CAAA,kBAAA;AAAA,KACvE;AAAA,IACA;AAAA,MACE,SAAA,EAAW,aAAA;AAAA,MACX,KAAA,EAAO,WAAA;AAAA,MACP,SAAA,EAAW,CAAA,2BAAA,EAA8B,aAAA,CAAc,MAAM,CAAA,kBAAA;AAAA;AAC/D,GACF;AAEA,EAAA,OAAO;AAAA,IACL,QAAQ,YAAA,IAAgB,EAAA;AAAA,IACxB,YAAA;AAAA,IACA,MAAA;AAAA,IACA,QAAA;AAAA,IACA,OAAA,EAAS,gBAAgB,MAAM,CAAA;AAAA,IAC/B,QAAA,EAAU,IAAA,CAAK,GAAA,EAAI,GAAI,SAAA;AAAA,IACvB;AAAA,GACF;AACF;AASA,eAAe,WAAA,CACb,QAAA,EACA,MAAA,EACA,MAAA,EACyB;AACzB,EAAA,MAAM,SAAA,GAAY,KAAK,GAAA,EAAI;
AAC3B,EAAA,MAAM,QAAA,GAAW,QAAA,CAAS,QAAA,IAAY,MAAA,CAAO,QAAA,IAAY,CAAA;AACzD,EAAA,MAAM,OAAA,GAAU,OAAO,OAAA,IAAW,GAAA;AAElC,EAAA,MAAM,eAAmC,EAAC;AAC1C,EAAA,IAAI,iBAAiB,QAAA,CAAS,YAAA;AAC9B,EAAA,IAAI,OAAA,GAAwB;AAAA,IAC1B,SAAA,EAAW,CAAA,KAAA,EAAQ,IAAA,CAAK,GAAA,EAAK,CAAA,CAAA;AAAA,IAC7B,qBAAqB;AAAC,GACxB;AACA,EAAA,IAAI,YAAA,GAAqC,IAAA;AACzC,EAAA,IAAI,IAAA,GAAO,CAAA;AAEX,EAAA,IAAI;AACF,IAAA,OAAO,OAAO,QAAA,EAAU;AACtB,MAAA,IAAA,EAAA;AAGA,MAAA,MAAM,WAAW,MAAM,SAAA,CAAU,OAAO,aAAA,EAAe,cAAA,EAAgB,SAAS,OAAO,CAAA;AACvF,MAAA,YAAA,GAAe,QAAA;AAGf,MAAA,YAAA,CAAa,IAAA,CAAK;AAAA,QAChB,IAAA;AAAA,QACA,WAAA,EAAa,cAAA;AAAA,QACb,aAAA,EAAe;AAAA,OAChB,CAAA;AAGD,MAAA,OAAA,GAAU;AAAA,QACR,GAAG,OAAA;AAAA,QACH,iBAAiB,QAAA,CAAS,OAAA;AAAA,QAC1B,gBAAgB,QAAA,CAAS,MAAA;AAAA,QACzB,mBAAA,EAAqB;AAAA,UACnB,GAAI,OAAA,CAAQ,mBAAA,IAAuB,EAAC;AAAA,UACpC,EAAE,IAAA,EAAM,MAAA,EAAiB,OAAA,EAAS,cAAA,EAAe;AAAA,UACjD,EAAE,MAAM,WAAA,EAAsB,OAAA,EAAS,SAAS,MAAA,IAAU,QAAA,CAAS,oBAAoB,EAAA;AAAG;AAC5F,OACF;AAGA,MAAA,IAAI,QAAA,CAAS,YAAY,QAAA,EAAU;AACjC,QAAA;AAAA,MACF;AAEA,MAAA,IAAI,QAAA,CAAS,OAAA,KAAY,SAAA,IAAa,QAAA,CAAS,YAAY,cAAA,EAAgB;AACzE,QAAA;AAAA,MACF;AAEA,MAAA,IAAI,QAAA,CAAS,YAAY,WAAA,EAAa;AAEpC,QAAA,MAAM,gBAAA,GAAmB,SAAS,gBAAA,IAAoB,EAAA;AACtD,QAAA,MAAM,kBAAA,GAAqB,wBAAA,CAAyB,gBAAA,EAAkB,QAAQ,CAAA;AAE9E,QAAA,IAAI,CAAC,kBAAA,EAAoB;AAEvB,UAAA;AAAA,QACF;AAEA,QAAA,cAAA,GAAiB,kBAAA;AAAA,MACnB;AAAA,IACF;AAGA,IAAA,MAAM,aAAa,MAAM,sBAAA;AAAA,MACvB,QAAA;AAAA,MACA,YAAA;AAAA,MACA,YAAA;AAAA,MACA;AAAA,KACF;AAGA,IAAA,MAAM,eAAe,QAAA,CAAS,eAAA,GAC1B,YAAA,EAAc,OAAA,KAAY,SAAS,eAAA,GACnC,IAAA;AAEJ,IAAA,MAAM,MAAA,GAAS,gBAAgB,UAAA,CAAW,MAAA;AAE1C,IAAA,OAAO;AAAA,MACL,QAAA;AAAA,MACA,MAAA;AAAA,MACA,KAAA,EAAO,IAAA;AAAA,MACP,YAAA,EAAc,cAAc,OAAA,IAAW,OAAA;AAAA,MACvC,aAAa,YAAA,EAAc,OAAA,KAAY,QAAA,GAAW,YAAA,CAAa,UAAU,IAAA,GAAO,IAAA;AAAA,MAChF,UAAA;AAAA,MACA,YAAA;AAAA,MACA,QAAA,EAAU,IAAA,CAAK,GAAA,EAAI,GAAI;AAAA,KACzB;AAAA,EACF,SAAS,KAAA,EAAO;AACd,IAAA,OAAO;AAAA,MACL,QAAA;AAAA,MACA,MAAA,EAAQ,KAAA;AAAA,MACR,KAAA,EAAO,IAAA;AAAA,MACP,YAAA,EAAc,OAAA
;AAAA,MACd,WAAA,EAAa,IAAA;AAAA,MACb,UAAA,EAAY;AAAA,QACV,MAAA,EAAQ,KAAA;AAAA,QACR,KAAA,EAAO,CAAA;AAAA,QACP,QAAA,EAAU;AAAA,UACR;AAAA,YACE,SAAA,EAAW,OAAA;AAAA,YACX,KAAA,EAAO,CAAA;AAAA,YACP,WAAW,KAAA,YAAiB,KAAA,GAAQ,KAAA,CAAM,OAAA,GAAU,OAAO,KAAK;AAAA;AAClE;AACF,OACF;AAAA,MACA,YAAA;AAAA,MACA,QAAA,EAAU,IAAA,CAAK,GAAA,EAAI,GAAI,SAAA;AAAA,MACvB,OAAO,KAAA,YAAiB,KAAA,GAAQ,KAAA,CAAM,OAAA,GAAU,OAAO,KAAK;AAAA,KAC9D;AAAA,EACF;AACF;AASA,eAAe,SAAA,CACb,QAAA,EACA,OAAA,EACA,OAAA,EACA,OAAA,EACwB;AACxB,EAAA,IAAI,OAAO,aAAa,UAAA,EAAY;AAClC,IAAA,OAAO,QAAA,CAAS,SAAS,OAAO,CAAA;AAAA,EAClC;AAGA,EAAA,MAAM,UAAA,GAAa,IAAI,eAAA,EAAgB;AACvC,EAAA,MAAM,YAAY,UAAA,CAAW,MAAM,UAAA,CAAW,KAAA,IAAS,OAAO,CAAA;AAE9D,EAAA,IAAI;AACF,IAAA,MAAM,QAAA,GAAW,MAAM,KAAA,CAAM,QAAA,EAAU;AAAA,MACrC,MAAA,EAAQ,MAAA;AAAA,MACR,OAAA,EAAS,EAAE,cAAA,EAAgB,kBAAA,EAAmB;AAAA,MAC9C,IAAA,EAAM,KAAK,SAAA,CAAU;AAAA,QACnB,OAAA;AAAA,QACA,YAAY,OAAA,CAAQ,SAAA;AAAA,QACpB,kBAAkB,OAAA,CAAQ,eAAA;AAAA,QAC1B,iBAAiB,OAAA,CAAQ,cAAA;AAAA,QACzB,sBAAsB,OAAA,CAAQ;AAAA,OAC/B,CAAA;AAAA,MACD,QAAQ,UAAA,CAAW;AAAA,KACpB,CAAA;AAED,IAAA,IAAI,CAAC,SAAS,EAAA,EAAI;AAChB,MAAA,MAAM,IAAI,MAAM,CAAA,eAAA,EAAkB,QAAA,CAAS,MAAM,CAAA,EAAA,EAAK,QAAA,CAAS,UAAU,CAAA,CAAE,CAAA;AAAA,IAC7E;AAEA,IAAA,MAAM,IAAA,GAAO,MAAM,QAAA,CAAS,IAAA,EAAK;AAEjC,IAAA,OAAO;AAAA,MACL,OAAA,EAAU,KAAK,OAAA,IAAwC,QAAA;AAAA,MACvD,QAAQ,IAAA,CAAK,MAAA;AAAA,MACb,gBAAA,EAAmB,IAAA,CAAK,gBAAA,IAAoB,IAAA,CAAK,kBAAA;AAAA,MACjD,SAAS,IAAA,CAAK,OAAA;AAAA,MACd,SAAS,IAAA,CAAK,OAAA;AAAA,MACd,QAAQ,IAAA,CAAK,MAAA;AAAA,MACb,OAAO,IAAA,CAAK,KAAA;AAAA,MACZ,SAAA,EAAY,IAAA,CAAK,UAAA,IAAc,IAAA,CAAK;AAAA,KACtC;AAAA,EACF,CAAA,SAAE;AACA,IAAA,YAAA,CAAa,SAAS,CAAA;AAAA,EACxB;AACF;AASA,SAAS,wBAAA,CACP,UACA,QAAA,EACe;AACf,EAAA,IAAI,CAAC,SAAS,iBAAA,EAAmB;AAC/B,IAAA,OAAO,IAAA;AAAA,EACT;AAGA,EAAA,KAAA,MAAW,CAAC,SAAS,QAAQ,CAAA,IAAK,OAAO,OAAA,CAAQ,QAAA,CAAS,iBAAiB,CAAA,EAAG;AAC5E,IAAA,IAAI,SAAS,WAAA,EAAY,CAAE,SAAS,OAAA,CAAQ,WAAA,EAAa,CAAA,EAAG;AAC1D,MAAA,OAAO,QAAA;AAAA,IACT;AAAA,EACF;AAEA,EAAA,OAAO,IAAA;AACT;AASA,eAAe,sBAAA,CACb,QAAA,EA
CA,QAAA,EACA,YAAA,EACA,MAAA,EACuE;AACvE,EAAA,MAAM,WAA2B,EAAC;AAElC,EAAA,IAAI,CAAC,QAAA,IAAY,QAAA,CAAS,YAAY,QAAA,IAAY,CAAC,SAAS,MAAA,EAAQ;AAElE,IAAA,IAAI,QAAA,CAAS,eAAA,IAAmB,QAAA,CAAS,eAAA,KAAoB,QAAA,EAAU;AAErE,MAAA,OAAO;AAAA,QACL,MAAA,EAAQ,QAAA,EAAU,OAAA,KAAY,QAAA,CAAS,eAAA;AAAA,QACvC,KAAA,EAAO,QAAA,EAAU,OAAA,KAAY,QAAA,CAAS,kBAAkB,GAAA,GAAM,CAAA;AAAA,QAC9D,QAAA,EAAU;AAAA,UACR;AAAA,YACE,SAAA,EAAW,cAAA;AAAA,YACX,KAAA,EAAO,QAAA,EAAU,OAAA,KAAY,QAAA,CAAS,kBAAkB,GAAA,GAAM,CAAA;AAAA,YAC9D,WAAW,CAAA,SAAA,EAAY,QAAA,CAAS,eAAe,CAAA,MAAA,EAAS,QAAA,EAAU,WAAW,aAAa,CAAA,CAAA;AAAA;AAC5F;AACF,OACF;AAAA,IACF;AAEA,IAAA,OAAO;AAAA,MACL,MAAA,EAAQ,KAAA;AAAA,MACR,KAAA,EAAO,CAAA;AAAA,MACP,QAAA,EAAU;AAAA,QACR;AAAA,UACE,SAAA,EAAW,UAAA;AAAA,UACX,KAAA,EAAO,CAAA;AAAA,UACP,SAAA,EAAW,CAAA,2BAAA,EAA8B,QAAA,EAAU,OAAA,IAAW,aAAa,CAAA,CAAA;AAAA;AAC7E;AACF,KACF;AAAA,EACF;AAEA,EAAA,MAAM,SAAS,QAAA,CAAS,MAAA;AACxB,EAAA,IAAI,UAAA,GAAa,CAAA;AACjB,EAAA,IAAI,aAAA,GAAgB,CAAA;AAGpB,EAAA,IAAI,QAAA,CAAS,WAAW,WAAA,EAAa;AACnC,IAAA,MAAM,KAAA,GAAQ,QAAA,CAAS,UAAA,CAAW,WAAA,CAAY,MAAA;AAAA,MAAO,CAAC,MACpD,MAAA,CAAO,WAAA,GAAc,QAAA,CAAS,CAAA,CAAE,aAAa;AAAA,KAC/C;AACA,IAAA,MAAM,KAAA,GAAQ,eAAgB,KAAA,CAAM,MAAA,GAAS,SAAS,UAAA,CAAW,WAAA,CAAY,SAAU,GAAG,CAAA;AAC1F,IAAA,UAAA,IAAc,KAAA;AACd,IAAA,aAAA,EAAA;AAEA,IAAA,QAAA,CAAS,IAAA,CAAK;AAAA,MACZ,SAAA,EAAW,aAAA;AAAA,MACX,KAAA;AAAA,MACA,SAAA,EAAW,SAAS,KAAA,CAAM,MAAM,IAAI,QAAA,CAAS,UAAA,CAAW,YAAY,MAAM,CAAA,gBAAA,CAAA;AAAA,MAC1E,QAAA,EAAU,QAAA,CAAS,UAAA,CAAW,WAAA,CAAY,MAAA;AAAA,QACxC,CAAC,MAAM,CAAC,MAAA,CAAO,aAAY,CAAE,QAAA,CAAS,CAAA,CAAE,WAAA,EAAa;AAAA;AACvD,KACD,CAAA;AAAA,EACH;AAGA,EAAA,IAAI,QAAA,CAAS,WAAW,cAAA,EAAgB;AACtC,IAAA,MAAM,KAAA,GAAQ,QAAA,CAAS,UAAA,CAAW,cAAA,CAAe,MAAA;AAAA,MAAO,CAAC,MACvD,MAAA,CAAO,WAAA,GAAc,QAAA,CAAS,CAAA,CAAE,aAAa;AAAA,KAC/C;AACA,IAAA,MAAM,KAAA,GAAQ,cAAA,CAAA,CAAiB,QAAA,CAAS,UAAA,CAAW,cAAA,CAAe,MAAA,GAAS,KAAA,CAAM,MAAA,IAAU,QAAA,CAAS,UAAA,CAAW,cAAA,CAAe,MAAA,GAAU,GAAG,CAAA;AAC3I,IAAA,UAAA,IAAc,KAAA;AACd,IAAA,aAAA,EAAA;AAEA,IAAA,QAAA,CAAS,IAAA,CAAK;AAAA,MACZ
,SAAA,EAAW,gBAAA;AAAA,MACX,KAAA;AAAA,MACA,SAAA,EAAW,CAAA,MAAA,EAAS,KAAA,CAAM,MAAM,CAAA,iBAAA,CAAA;AAAA,MAChC,QAAA,EAAU;AAAA,KACX,CAAA;AAAA,EACH;AAGA,EAAA,MAAM,aAAA,GAAgB;AAAA,OAAA,EACf,SAAS,YAAY;AAAA,QAAA,EACpB,MAAM;AAAA,EACd,QAAA,CAAS,WAAW,MAAA,GAAS,CAAA,QAAA,EAAW,SAAS,UAAA,CAAW,MAAM,KAAK,EAAE;AAAA,CAAA;AAIzE,EAAA,MAAM,eAAA,GAAkB,MAAM,MAAA,CAAO,KAAA;AAAA,IACnC,aAAA;AAAA,IACA,WAAA;AAAA,IACA;AAAA,GACF;AACA,EAAA,QAAA,CAAS,IAAA,CAAK;AAAA,IACZ,SAAA,EAAW,WAAA;AAAA,IACX,OAAO,eAAA,CAAgB,KAAA;AAAA,IACvB,WAAW,eAAA,CAAgB;AAAA,GAC5B,CAAA;AACD,EAAA,UAAA,IAAc,eAAA,CAAgB,KAAA;AAC9B,EAAA,aAAA,EAAA;AAGA,EAAA,MAAM,kBAAA,GAAqB,MAAM,MAAA,CAAO,KAAA;AAAA,IACtC,aAAA;AAAA,IACA,cAAA;AAAA,IACA;AAAA,GACF;AACA,EAAA,QAAA,CAAS,IAAA,CAAK;AAAA,IACZ,SAAA,EAAW,cAAA;AAAA,IACX,OAAO,kBAAA,CAAmB,KAAA;AAAA,IAC1B,WAAW,kBAAA,CAAmB;AAAA,GAC/B,CAAA;AACD,EAAA,UAAA,IAAc,kBAAA,CAAmB,KAAA;AACjC,EAAA,aAAA,EAAA;AAGA,EAAA,MAAM,iBAAA,GAAoB,MAAM,MAAA,CAAO,KAAA;AAAA,IACrC,aAAA;AAAA,IACA,aAAA;AAAA,IACA;AAAA,GACF;AACA,EAAA,QAAA,CAAS,IAAA,CAAK;AAAA,IACZ,SAAA,EAAW,aAAA;AAAA,IACX,OAAO,iBAAA,CAAkB,KAAA;AAAA,IACzB,WAAW,iBAAA,CAAkB;AAAA,GAC9B,CAAA;AACD,EAAA,UAAA,IAAc,iBAAA,CAAkB,KAAA;AAChC,EAAA,aAAA,EAAA;AAEA,EAAA,MAAM,QAAA,GAAW,aAAA,GAAgB,CAAA,GAAI,UAAA,GAAa,aAAA,GAAgB,CAAA;AAElE,EAAA,OAAO;AAAA,IACL,QAAQ,QAAA,IAAY,EAAA;AAAA,IACpB,KAAA,EAAO,eAAe,QAAQ,CAAA;AAAA,IAC9B;AAAA,GACF;AACF","file":"index.cjs","sourcesContent":["/**\n * LLM Grading Utilities\n *\n * Uses OpenAI to grade content quality, relevance, and other metrics.\n */\n\nimport { createOpenAI } from '@ai-sdk/openai';\nimport { generateObject } from 'ai';\nimport { z } from 'zod';\nimport { resolveDefaultOpenAiChatModelId } from '@kat/core';\nimport type { LLMGraderConfig, EvalCriterion, EvalEvidence } from '../types.js';\n\n// ============================================================================\n// GRADING SCHEMAS\n// ============================================================================\n\nconst GradeSchema = z.object({\n score: 
z.number().min(0).max(100).describe('Score from 0-100'),\n reasoning: z.string().describe('Explanation for the score'),\n examples: z.array(z.string()).optional().describe('Specific examples that influenced the score'),\n});\n\nconst MultiCriteriaGradeSchema = z.object({\n scores: z.array(z.object({\n criterion: z.string(),\n score: z.number().min(0).max(100),\n reasoning: z.string(),\n })),\n overallReasoning: z.string(),\n});\n\n// ============================================================================\n// GRADER FACTORY\n// ============================================================================\n\nexport interface LLMGrader {\n /**\n * Grade content against a single criterion.\n */\n grade(content: string, criterion: string, rubric?: string): Promise<{\n score: number;\n reasoning: string;\n examples?: string[];\n }>;\n\n /**\n * Grade content against multiple criteria.\n */\n gradeMultiple(content: string, criteria: EvalCriterion[]): Promise<EvalEvidence[]>;\n\n /**\n * Grade relevance of content to a query.\n */\n gradeRelevance(query: string, content: string): Promise<{\n score: number;\n reasoning: string;\n }>;\n}\n\n/**\n * Create an LLM grader with the given configuration.\n */\nexport function createLLMGrader(config: LLMGraderConfig = {}): LLMGrader {\n const apiKey = config.openaiApiKey || process.env.OPENAI_API_KEY;\n if (!apiKey) {\n throw new Error('OPENAI_API_KEY is required for LLM grading');\n }\n\n const openai = createOpenAI({ apiKey });\n const model = config.model || resolveDefaultOpenAiChatModelId();\n const temperature = config.temperature ?? 0.1;\n\n return {\n async grade(content: string, criterion: string, rubric?: string) {\n const prompt = `You are an expert evaluator. Grade the following content on the criterion \"${criterion}\".\n\n${rubric ? 
`Rubric: ${rubric}\\n` : ''}\nContent to evaluate:\n\"\"\"\n${content}\n\"\"\"\n\nProvide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: GradeSchema,\n prompt,\n temperature,\n });\n\n return result.object;\n },\n\n async gradeMultiple(content: string, criteria: EvalCriterion[]) {\n const criteriaDescription = criteria\n .map((c) => `- ${c.name}: ${c.description} (weight: ${c.weight})`)\n .join('\\n');\n\n const prompt = `You are an expert evaluator. Grade the following content on multiple criteria.\n\nCriteria:\n${criteriaDescription}\n\nContent to evaluate:\n\"\"\"\n${content}\n\"\"\"\n\nFor each criterion, provide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: MultiCriteriaGradeSchema,\n prompt,\n temperature,\n });\n\n return result.object.scores.map((s) => ({\n criterion: s.criterion,\n score: s.score,\n reasoning: s.reasoning,\n }));\n },\n\n async gradeRelevance(query: string, content: string) {\n const prompt = `You are an expert evaluator. 
Grade how relevant the following content is to the given query.\n\nQuery: \"${query}\"\n\nContent:\n\"\"\"\n${content}\n\"\"\"\n\nA score of 100 means the content directly and completely answers the query.\nA score of 0 means the content is completely irrelevant.\n\nProvide a score from 0-100 and explain your reasoning.`;\n\n const result = await generateObject({\n model: openai(model),\n schema: GradeSchema,\n prompt,\n temperature,\n });\n\n return {\n score: result.object.score,\n reasoning: result.object.reasoning,\n };\n },\n };\n}\n\n// ============================================================================\n// CONVENIENCE FUNCTIONS\n// ============================================================================\n\n/**\n * Grade content using a one-off grader instance.\n */\nexport async function gradeWithLLM(\n content: string,\n criterion: string,\n config: LLMGraderConfig = {}\n): Promise<{ score: number; reasoning: string; examples?: string[] }> {\n const grader = createLLMGrader(config);\n return grader.grade(content, criterion);\n}\n\n/**\n * Grade relevance using a one-off grader instance.\n */\nexport async function gradeRelevanceWithLLM(\n query: string,\n content: string,\n config: LLMGraderConfig = {}\n): Promise<{ score: number; reasoning: string }> {\n const grader = createLLMGrader(config);\n return grader.gradeRelevance(query, content);\n}\n","/**\n * Metric Calculation Helpers\n *\n * Pure functions for calculating scores and metrics.\n */\n\n/**\n * Calculate a weighted score from individual scores and weights.\n *\n * @param scores - Array of { score, weight } objects\n * @returns Weighted average score (0-100)\n */\nexport function calculateWeightedScore(\n scores: Array<{ score: number; weight: number }>\n): number {\n if (scores.length === 0) return 0;\n\n const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);\n if (totalWeight === 0) return 0;\n\n const weightedSum = scores.reduce((sum, s) => sum + s.score * s.weight, 
0);\n return Math.round(weightedSum / totalWeight);\n}\n\n/**\n * Calculate percentage of found items vs expected items.\n *\n * @param found - Number of items found\n * @param expected - Number of items expected\n * @returns Percentage (0-100)\n */\nexport function calculatePercentage(found: number, expected: number): number {\n if (expected === 0) return 100; // Nothing expected, consider it perfect\n return Math.round((found / expected) * 100);\n}\n\n/**\n * Calculate the average of an array of numbers.\n *\n * @param numbers - Array of numbers\n * @returns Average value\n */\nexport function average(numbers: number[]): number {\n if (numbers.length === 0) return 0;\n return numbers.reduce((sum, n) => sum + n, 0) / numbers.length;\n}\n\n/**\n * Calculate precision: true positives / (true positives + false positives).\n *\n * @param truePositives - Number of correct positive predictions\n * @param falsePositives - Number of incorrect positive predictions\n * @returns Precision (0-100)\n */\nexport function calculatePrecision(\n truePositives: number,\n falsePositives: number\n): number {\n const total = truePositives + falsePositives;\n if (total === 0) return 100;\n return Math.round((truePositives / total) * 100);\n}\n\n/**\n * Calculate recall: true positives / (true positives + false negatives).\n *\n * @param truePositives - Number of correct positive predictions\n * @param falseNegatives - Number of missed positive predictions\n * @returns Recall (0-100)\n */\nexport function calculateRecall(\n truePositives: number,\n falseNegatives: number\n): number {\n const total = truePositives + falseNegatives;\n if (total === 0) return 100;\n return Math.round((truePositives / total) * 100);\n}\n\n/**\n * Calculate F1 score: harmonic mean of precision and recall.\n *\n * @param precision - Precision value (0-100)\n * @param recall - Recall value (0-100)\n * @returns F1 score (0-100)\n */\nexport function calculateF1(precision: number, recall: number): number {\n if 
(precision + recall === 0) return 0;\n return Math.round((2 * precision * recall) / (precision + recall));\n}\n\n/**\n * Clamp a value between min and max.\n */\nexport function clamp(value: number, min: number, max: number): number {\n return Math.max(min, Math.min(max, value));\n}\n\n/**\n * Normalize a score to 0-100 range.\n */\nexport function normalizeScore(score: number): number {\n return clamp(Math.round(score), 0, 100);\n}\n","/**\n * Report Formatting Utilities\n *\n * Format eval results for different output targets.\n */\n\nimport type { EvalResult, ReportOptions, EvalEvidence } from '../types.js';\n\n// ============================================================================\n// CONSOLE REPORTER\n// ============================================================================\n\n/**\n * Format an eval result for console output.\n */\nexport function formatConsoleReport(\n result: EvalResult,\n options: Partial<ReportOptions> = {}\n): string {\n const lines: string[] = [];\n const { includeEvidence = true } = options;\n\n // Header\n const status = result.passed ? '✓ PASSED' : '✗ FAILED';\n const statusColor = result.passed ? 
'\\x1b[32m' : '\\x1b[31m';\n const reset = '\\x1b[0m';\n\n lines.push('');\n lines.push('═'.repeat(60));\n lines.push(`${statusColor}${status}${reset} - Overall Score: ${result.overallScore}/100`);\n lines.push('═'.repeat(60));\n\n // Summary\n lines.push('');\n lines.push(`Summary: ${result.summary}`);\n lines.push(`Duration: ${result.duration}ms`);\n\n // Individual scores\n lines.push('');\n lines.push('Scores:');\n for (const [name, score] of Object.entries(result.scores)) {\n const bar = createProgressBar(score, 20);\n const formattedName = formatScoreName(name);\n lines.push(` ${formattedName.padEnd(20)} ${bar} ${score}/100`);\n }\n\n // Evidence (if requested)\n if (includeEvidence && result.evidence.length > 0) {\n lines.push('');\n lines.push('Evidence:');\n for (const evidence of result.evidence) {\n lines.push(` [${evidence.criterion}] (${evidence.score}/100)`);\n lines.push(` ${evidence.reasoning}`);\n if (evidence.examples && evidence.examples.length > 0) {\n for (const example of evidence.examples.slice(0, 3)) {\n lines.push(` - ${example}`);\n }\n }\n }\n }\n\n lines.push('');\n lines.push('─'.repeat(60));\n\n return lines.join('\\n');\n}\n\n/**\n * Create a text progress bar.\n */\nfunction createProgressBar(value: number, width: number): string {\n const filled = Math.round((value / 100) * width);\n const empty = width - filled;\n return `[${'█'.repeat(filled)}${'░'.repeat(empty)}]`;\n}\n\n/**\n * Format a score name for display (camelCase -> Title Case).\n */\nfunction formatScoreName(name: string): string {\n return name\n .replace(/([A-Z])/g, ' $1')\n .replace(/^./, (str) => str.toUpperCase())\n .trim();\n}\n\n// ============================================================================\n// JSON REPORTER\n// ============================================================================\n\n/**\n * Format an eval result as JSON.\n */\nexport function formatJsonReport(\n result: EvalResult,\n options: Partial<ReportOptions> = {}\n): string {\n 
const { includeEvidence = true, includeRawData = false } = options;\n\n const output: Record<string, unknown> = {\n passed: result.passed,\n overallScore: result.overallScore,\n scores: result.scores,\n summary: result.summary,\n duration: result.duration,\n };\n\n if (includeEvidence) {\n output.evidence = result.evidence;\n }\n\n // Include any additional properties from extended result types\n for (const [key, value] of Object.entries(result)) {\n if (\n !['passed', 'overallScore', 'scores', 'evidence', 'summary', 'duration'].includes(key) &&\n (includeRawData || !isRawData(value))\n ) {\n output[key] = value;\n }\n }\n\n return JSON.stringify(output, null, 2);\n}\n\n/**\n * Check if a value looks like raw data (large arrays/objects).\n */\nfunction isRawData(value: unknown): boolean {\n if (Array.isArray(value) && value.length > 10) return true;\n if (typeof value === 'object' && value !== null) {\n const keys = Object.keys(value);\n if (keys.length > 20) return true;\n }\n return false;\n}\n\n// ============================================================================\n// SUMMARY GENERATION\n// ============================================================================\n\n/**\n * Generate a human-readable summary from scores.\n */\nexport function generateSummary(\n scores: Record<string, number>,\n thresholds: { good: number; acceptable: number } = { good: 80, acceptable: 60 }\n): string {\n const entries = Object.entries(scores);\n const avgScore = entries.reduce((sum, [, score]) => sum + score, 0) / entries.length;\n\n const goodMetrics = entries.filter(([, score]) => score >= thresholds.good).map(([name]) => name);\n const poorMetrics = entries.filter(([, score]) => score < thresholds.acceptable).map(([name]) => name);\n\n const parts: string[] = [];\n\n if (avgScore >= thresholds.good) {\n parts.push('Strong overall performance');\n } else if (avgScore >= thresholds.acceptable) {\n parts.push('Acceptable performance with room for improvement');\n } 
else {\n parts.push('Performance below acceptable thresholds');\n }\n\n if (goodMetrics.length > 0) {\n parts.push(`Strong: ${goodMetrics.map(formatScoreName).join(', ')}`);\n }\n\n if (poorMetrics.length > 0) {\n parts.push(`Needs improvement: ${poorMetrics.map(formatScoreName).join(', ')}`);\n }\n\n return parts.join('. ') + '.';\n}\n\n// ============================================================================\n// PRINT HELPERS\n// ============================================================================\n\n/**\n * Print an eval result to the console.\n */\nexport function printReport(result: EvalResult, options: Partial<ReportOptions> = {}): void {\n const format = options.format || 'console';\n\n if (format === 'json') {\n console.log(formatJsonReport(result, options));\n } else {\n console.log(formatConsoleReport(result, options));\n }\n}\n","/**\n * Agent Eval - Layer 3\n *\n * Evaluates end-to-end agent behavior by running multi-turn\n * conversation scenarios and grading the responses.\n */\n\nimport { createLLMGrader } from '../utils/llm-grader.js';\nimport { average, normalizeScore, calculateWeightedScore } from '../utils/metrics.js';\nimport { generateSummary } from '../utils/reporters.js';\nimport type { EvalEvidence } from '../types.js';\nimport type {\n AgentEvalConfig,\n AgentEvalResult,\n AgentTestScenario,\n ScenarioResult,\n ConversationTurn,\n AgentResponse,\n AgentContext,\n AgentFunction,\n} from './types.js';\n\nexport type {\n AgentEvalConfig,\n AgentEvalResult,\n AgentTestScenario,\n ScenarioEvaluation,\n ScenarioResult,\n ConversationTurn,\n AgentResponse,\n} from './types.js';\n\n// ============================================================================\n// MAIN EVALUATION FUNCTION\n// ============================================================================\n\n/**\n * Evaluate agent behavior by running test scenarios.\n */\nexport async function evaluateAgent(\n config: AgentEvalConfig\n): Promise<AgentEvalResult> {\n const 
startTime = Date.now();\n\n if (config.scenarios.length === 0) {\n throw new Error('At least one scenario is required for agent eval');\n }\n\n const grader = createLLMGrader({\n openaiApiKey: config.openaiApiKey,\n model: config.graderConfig?.model,\n temperature: config.graderConfig?.temperature,\n });\n\n // Run each scenario\n const scenarioResults: ScenarioResult[] = [];\n\n for (const scenario of config.scenarios) {\n const result = await runScenario(scenario, config, grader);\n scenarioResults.push(result);\n }\n\n // Calculate aggregate scores\n const passedScenarios = scenarioResults.filter((r) => r.passed);\n const accuracy = normalizeScore((passedScenarios.length / scenarioResults.length) * 100);\n\n // Calculate average scores from graded scenarios\n const gradedResults = scenarioResults.filter((r) => r.evaluation.evidence.length > 0);\n const relevanceScores = gradedResults.flatMap((r) =>\n r.evaluation.evidence.filter((e) => e.criterion === 'relevance').map((e) => e.score)\n );\n const completenessScores = gradedResults.flatMap((r) =>\n r.evaluation.evidence.filter((e) => e.criterion === 'completeness').map((e) => e.score)\n );\n const helpfulnessScores = gradedResults.flatMap((r) =>\n r.evaluation.evidence.filter((e) => e.criterion === 'helpfulness').map((e) => e.score)\n );\n\n const relevance = relevanceScores.length > 0 ? normalizeScore(average(relevanceScores)) : 0;\n const completeness = completenessScores.length > 0 ? normalizeScore(average(completenessScores)) : 0;\n const helpfulness = helpfulnessScores.length > 0 ? 
normalizeScore(average(helpfulnessScores)) : 0;\n\n const scores = { accuracy, relevance, completeness, helpfulness };\n\n const overallScore = calculateWeightedScore([\n { score: accuracy, weight: 0.30 },\n { score: relevance, weight: 0.25 },\n { score: completeness, weight: 0.25 },\n { score: helpfulness, weight: 0.20 },\n ]);\n\n // Build evidence\n const evidence: EvalEvidence[] = [\n {\n criterion: 'accuracy',\n score: accuracy,\n reasoning: `${passedScenarios.length}/${scenarioResults.length} scenarios passed.`,\n examples: scenarioResults.filter((r) => !r.passed).map((r) => r.scenario.name).slice(0, 3),\n },\n {\n criterion: 'relevance',\n score: relevance,\n reasoning: `Average answer relevance across ${gradedResults.length} graded scenarios.`,\n },\n {\n criterion: 'completeness',\n score: completeness,\n reasoning: `Average answer completeness across ${gradedResults.length} graded scenarios.`,\n },\n {\n criterion: 'helpfulness',\n score: helpfulness,\n reasoning: `Average helpfulness across ${gradedResults.length} graded scenarios.`,\n },\n ];\n\n return {\n passed: overallScore >= 70,\n overallScore,\n scores,\n evidence,\n summary: generateSummary(scores),\n duration: Date.now() - startTime,\n scenarioResults,\n };\n}\n\n// ============================================================================\n// SCENARIO RUNNER\n// ============================================================================\n\n/**\n * Run a single scenario and evaluate the result.\n */\nasync function runScenario(\n scenario: AgentTestScenario,\n config: AgentEvalConfig,\n grader: ReturnType<typeof createLLMGrader>\n): Promise<ScenarioResult> {\n const startTime = Date.now();\n const maxTurns = scenario.maxTurns || config.maxTurns || 5;\n const timeout = config.timeout || 60000;\n\n const conversation: ConversationTurn[] = [];\n let currentMessage = scenario.initialQuery;\n let context: AgentContext = {\n sessionId: `eval_${Date.now()}`,\n conversationHistory: [],\n };\n let 
lastResponse: AgentResponse | null = null;\n let turn = 0;\n\n try {\n while (turn < maxTurns) {\n turn++;\n\n // Call the agent\n const response = await callAgent(config.agentEndpoint, currentMessage, context, timeout);\n lastResponse = response;\n\n // Record the conversation\n conversation.push({\n turn,\n userMessage: currentMessage,\n agentResponse: response,\n });\n\n // Update context\n context = {\n ...context,\n previousContext: response.context,\n previousIntent: response.intent,\n conversationHistory: [\n ...(context.conversationHistory || []),\n { role: 'user' as const, content: currentMessage },\n { role: 'assistant' as const, content: response.answer || response.followUpQuestion || '' },\n ],\n };\n\n // Check outcome\n if (response.outcome === 'answer') {\n break; // Got an answer, done\n }\n\n if (response.outcome === 'blocked' || response.outcome === 'out_of_scope') {\n break; // Terminal state\n }\n\n if (response.outcome === 'follow_up') {\n // Generate response to follow-up question\n const followUpQuestion = response.followUpQuestion || '';\n const responseToFollowUp = generateFollowUpResponse(followUpQuestion, scenario);\n\n if (!responseToFollowUp) {\n // No response configured, end the conversation\n break;\n }\n\n currentMessage = responseToFollowUp;\n }\n }\n\n // Evaluate the result\n const evaluation = await evaluateScenarioResult(\n scenario,\n lastResponse,\n conversation,\n grader\n );\n\n // Determine if passed\n const outcomeMatch = scenario.expectedOutcome\n ? lastResponse?.outcome === scenario.expectedOutcome\n : true;\n\n const passed = outcomeMatch && evaluation.passed;\n\n return {\n scenario,\n passed,\n turns: turn,\n finalOutcome: lastResponse?.outcome || 'error',\n finalAnswer: lastResponse?.outcome === 'answer' ? 
lastResponse.answer || null : null,\n evaluation,\n conversation,\n duration: Date.now() - startTime,\n };\n } catch (error) {\n return {\n scenario,\n passed: false,\n turns: turn,\n finalOutcome: 'error',\n finalAnswer: null,\n evaluation: {\n passed: false,\n score: 0,\n evidence: [\n {\n criterion: 'error',\n score: 0,\n reasoning: error instanceof Error ? error.message : String(error),\n },\n ],\n },\n conversation,\n duration: Date.now() - startTime,\n error: error instanceof Error ? error.message : String(error),\n };\n }\n}\n\n// ============================================================================\n// AGENT CALLING\n// ============================================================================\n\n/**\n * Call the agent (either via HTTP or direct function).\n */\nasync function callAgent(\n endpoint: string | AgentFunction,\n message: string,\n context: AgentContext,\n timeout: number\n): Promise<AgentResponse> {\n if (typeof endpoint === 'function') {\n return endpoint(message, context);\n }\n\n // HTTP call\n const controller = new AbortController();\n const timeoutId = setTimeout(() => controller.abort(), timeout);\n\n try {\n const response = await fetch(endpoint, {\n method: 'POST',\n headers: { 'Content-Type': 'application/json' },\n body: JSON.stringify({\n message,\n session_id: context.sessionId,\n previous_context: context.previousContext,\n previous_intent: context.previousIntent,\n conversation_history: context.conversationHistory,\n }),\n signal: controller.signal,\n });\n\n if (!response.ok) {\n throw new Error(`Agent returned ${response.status}: ${response.statusText}`);\n }\n\n const data = await response.json() as Record<string, unknown>;\n\n return {\n outcome: (data.outcome as AgentResponse['outcome']) || 'answer',\n answer: data.answer as string | undefined,\n followUpQuestion: (data.followUpQuestion || data.follow_up_question) as string | undefined,\n options: data.options as AgentResponse['options'],\n context: data.context,\n 
intent: data.intent,\n trace: data.trace,\n sessionId: (data.session_id || data.sessionId) as string | undefined,\n };\n } finally {\n clearTimeout(timeoutId);\n }\n}\n\n// ============================================================================\n// FOLLOW-UP RESPONSE GENERATION\n// ============================================================================\n\n/**\n * Generate a response to a follow-up question based on scenario config.\n */\nfunction generateFollowUpResponse(\n question: string,\n scenario: AgentTestScenario\n): string | null {\n if (!scenario.followUpResponses) {\n return null;\n }\n\n // Check each pattern\n for (const [pattern, response] of Object.entries(scenario.followUpResponses)) {\n if (question.toLowerCase().includes(pattern.toLowerCase())) {\n return response;\n }\n }\n\n return null;\n}\n\n// ============================================================================\n// SCENARIO EVALUATION\n// ============================================================================\n\n/**\n * Evaluate the result of a scenario.\n */\nasync function evaluateScenarioResult(\n scenario: AgentTestScenario,\n response: AgentResponse | null,\n conversation: ConversationTurn[],\n grader: ReturnType<typeof createLLMGrader>\n): Promise<{ passed: boolean; score: number; evidence: EvalEvidence[] }> {\n const evidence: EvalEvidence[] = [];\n\n if (!response || response.outcome !== 'answer' || !response.answer) {\n // No answer to evaluate\n if (scenario.expectedOutcome && scenario.expectedOutcome !== 'answer') {\n // Expected non-answer outcome\n return {\n passed: response?.outcome === scenario.expectedOutcome,\n score: response?.outcome === scenario.expectedOutcome ? 100 : 0,\n evidence: [\n {\n criterion: 'outcomeMatch',\n score: response?.outcome === scenario.expectedOutcome ? 
100 : 0,\n reasoning: `Expected ${scenario.expectedOutcome}, got ${response?.outcome || 'no response'}.`,\n },\n ],\n };\n }\n\n return {\n passed: false,\n score: 0,\n evidence: [\n {\n criterion: 'noAnswer',\n score: 0,\n reasoning: `Expected an answer but got ${response?.outcome || 'no response'}.`,\n },\n ],\n };\n }\n\n const answer = response.answer;\n let totalScore = 0;\n let criteriaCount = 0;\n\n // Check mustContain\n if (scenario.evaluation.mustContain) {\n const found = scenario.evaluation.mustContain.filter((s) =>\n answer.toLowerCase().includes(s.toLowerCase())\n );\n const score = normalizeScore((found.length / scenario.evaluation.mustContain.length) * 100);\n totalScore += score;\n criteriaCount++;\n\n evidence.push({\n criterion: 'mustContain',\n score,\n reasoning: `Found ${found.length}/${scenario.evaluation.mustContain.length} required terms.`,\n examples: scenario.evaluation.mustContain.filter(\n (s) => !answer.toLowerCase().includes(s.toLowerCase())\n ),\n });\n }\n\n // Check mustNotContain\n if (scenario.evaluation.mustNotContain) {\n const found = scenario.evaluation.mustNotContain.filter((s) =>\n answer.toLowerCase().includes(s.toLowerCase())\n );\n const score = normalizeScore(((scenario.evaluation.mustNotContain.length - found.length) / scenario.evaluation.mustNotContain.length) * 100);\n totalScore += score;\n criteriaCount++;\n\n evidence.push({\n criterion: 'mustNotContain',\n score,\n reasoning: `Found ${found.length} forbidden terms.`,\n examples: found,\n });\n }\n\n // LLM grading for relevance, completeness, helpfulness\n const gradingPrompt = `\nQuery: ${scenario.initialQuery}\nAnswer: ${answer}\n${scenario.evaluation.rubric ? `Rubric: ${scenario.evaluation.rubric}` : ''}\n`;\n\n // Grade relevance\n const relevanceResult = await grader.grade(\n gradingPrompt,\n 'relevance',\n 'How relevant is the answer to the query? 
100 = directly and completely addresses the query.'\n );\n evidence.push({\n criterion: 'relevance',\n score: relevanceResult.score,\n reasoning: relevanceResult.reasoning,\n });\n totalScore += relevanceResult.score;\n criteriaCount++;\n\n // Grade completeness\n const completenessResult = await grader.grade(\n gradingPrompt,\n 'completeness',\n 'How complete is the answer? 100 = fully addresses all aspects of the query.'\n );\n evidence.push({\n criterion: 'completeness',\n score: completenessResult.score,\n reasoning: completenessResult.reasoning,\n });\n totalScore += completenessResult.score;\n criteriaCount++;\n\n // Grade helpfulness\n const helpfulnessResult = await grader.grade(\n gradingPrompt,\n 'helpfulness',\n 'How helpful and actionable is the answer? 100 = provides clear, actionable guidance.'\n );\n evidence.push({\n criterion: 'helpfulness',\n score: helpfulnessResult.score,\n reasoning: helpfulnessResult.reasoning,\n });\n totalScore += helpfulnessResult.score;\n criteriaCount++;\n\n const avgScore = criteriaCount > 0 ? totalScore / criteriaCount : 0;\n\n return {\n passed: avgScore >= 70,\n score: normalizeScore(avgScore),\n evidence,\n };\n}\n"]}