@agtlantis/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +198 -0
- package/LICENSE +21 -0
- package/README.md +496 -0
- package/dist/cli.js +4709 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +3998 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2738 -0
- package/dist/index.d.ts +2738 -0
- package/dist/index.js +3868 -0
- package/dist/index.js.map +1 -0
- package/package.json +101 -0
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
*No unreleased changes.*
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## [0.3.0] - 2026-01-09
|
|
15
|
+
|
|
16
|
+
### Breaking Changes
|
|
17
|
+
|
|
18
|
+
#### Result Types Refactoring
|
|
19
|
+
|
|
20
|
+
- **Discriminated Union** — `TestResultWithIteration` is now split into four distinct types
|
|
21
|
+
- `SingleTurnResult` — result of a single run
|
|
22
|
+
- `SingleTurnIteratedResult` — result of an iterated run (includes iteration statistics)
|
|
23
|
+
- `MultiTurnResult` — result of a multi-turn conversation
|
|
24
|
+
- `MultiTurnIteratedResult` — result of an iterated multi-turn run
|
|
25
|
+
- **`kind` discriminator** — a `kind` field was added to every result type
|
|
26
|
+
- **Type guard changes** — the existing `hasIterationData()`, `hasMultiTurnData()`, and `hasMultiTurnIterationData()` were removed
|
|
27
|
+
- **New type guards** — added `isSingleTurnResult()`, `isMultiTurnResult()`, and `isIteratedResult()`
|
|
28
|
+
|
|
29
|
+
### Migration Guide
|
|
30
|
+
|
|
31
|
+
```typescript
|
|
32
|
+
// Before (v0.2.x)
|
|
33
|
+
if (hasIterationData(result)) {
|
|
34
|
+
console.log(result.iterationStats)
|
|
35
|
+
}
|
|
36
|
+
if (hasMultiTurnIterationData(result)) {
|
|
37
|
+
console.log(result.multiTurnIterationStats)
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
// After (v0.3.0)
|
|
41
|
+
if (isIteratedResult(result)) {
|
|
42
|
+
console.log(result.iterationStats)
|
|
43
|
+
}
|
|
44
|
+
if (result.kind === 'multi-turn-iterated') {
|
|
45
|
+
console.log(result.multiTurnIterationStats)
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
// Or use a switch statement (recommended)
|
|
49
|
+
switch (result.kind) {
|
|
50
|
+
case 'single-turn':
|
|
51
|
+
// Single run
|
|
52
|
+
break
|
|
53
|
+
case 'single-turn-iterated':
|
|
54
|
+
// Iterated run (iterationStats guaranteed)
|
|
55
|
+
break
|
|
56
|
+
case 'multi-turn':
|
|
57
|
+
// Multi-turn conversation (conversationHistory guaranteed)
|
|
58
|
+
break
|
|
59
|
+
case 'multi-turn-iterated':
|
|
60
|
+
// Multi-turn iterated run (all data guaranteed)
|
|
61
|
+
break
|
|
62
|
+
}
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Technical Details
|
|
66
|
+
|
|
67
|
+
- **822 tests** passing with Vitest 4.x
|
|
68
|
+
- Type-safe discriminated union with exhaustive checking support
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## [0.2.0] - 2026-01-08
|
|
73
|
+
|
|
74
|
+
### Added
|
|
75
|
+
|
|
76
|
+
#### Cost Calculation (Phase 10)
|
|
77
|
+
- **Built-in pricing tables** — Pre-configured pricing for OpenAI, Gemini, and Anthropic models (January 2025 prices)
|
|
78
|
+
- **Per-component cost tracking** — Separate cost breakdown for Agent, Judge, and Improver
|
|
79
|
+
- **Auto-detection** — Automatic provider detection from model names (gpt-*, gemini-*, claude-*)
|
|
80
|
+
- **Custom pricing** — Override pricing tables with custom configuration
|
|
81
|
+
- **`calculateCost()`** — Calculate costs from token usage
|
|
82
|
+
- **`detectProvider()`** — Detect LLM provider from model name
|
|
83
|
+
- **`getModelPricing()`** — Get pricing for specific models
|
|
84
|
+
- **`buildCostBreakdown()`** — Build cost breakdown with automatic total calculation
|
|
85
|
+
|
|
86
|
+
#### Metadata Pattern
|
|
87
|
+
- **`ComponentMetadata`** — Base metadata type with `tokenUsage` and `model`
|
|
88
|
+
- **`JudgeMetadata`** — Metadata returned from Judge evaluation
|
|
89
|
+
- **`ImproverMetadata`** — Metadata returned from Improver analysis
|
|
90
|
+
|
|
91
|
+
### Changed
|
|
92
|
+
|
|
93
|
+
- **`Improver.improve()`** — Now returns `ImproveResult` instead of `Suggestion[]` (**BREAKING CHANGE**)
|
|
94
|
+
```typescript
|
|
95
|
+
// Before
|
|
96
|
+
const suggestions = await improver.improve(prompt, results)
|
|
97
|
+
|
|
98
|
+
// After
|
|
99
|
+
const { suggestions, metadata } = await improver.improve(prompt, results)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
- **`JudgeResult`** — Now includes optional `metadata` field with token usage
|
|
103
|
+
- **`MetricsResult`** — Added `costBreakdown?: CostBreakdown` field
|
|
104
|
+
- **`EvalSuiteConfig`** — Added `pricing?: PricingConfig` option
|
|
105
|
+
- **`EvalConfig` (CLI)** — Added `pricing?: PricingConfig` option
|
|
106
|
+
|
|
107
|
+
### Fixed
|
|
108
|
+
|
|
109
|
+
- **Export `ImproveResult`** — New return type from `improve()` is now properly exported
|
|
110
|
+
- **Export metadata types** — `ComponentMetadata`, `JudgeMetadata`, `ImproverMetadata` now exported from main entry
|
|
111
|
+
|
|
112
|
+
### Technical Details
|
|
113
|
+
|
|
114
|
+
- **798 tests** passing with Vitest 4.x
|
|
115
|
+
- **ESM 99KB + CJS 106KB + DTS 102KB** build output
|
|
116
|
+
- New pricing module: `src/pricing/`
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## [0.1.0] - 2026-01-07
|
|
121
|
+
|
|
122
|
+
### Added
|
|
123
|
+
|
|
124
|
+
#### Core Features
|
|
125
|
+
- **EvalSuite** — Main evaluation runner with concurrent test execution
|
|
126
|
+
- **Judge** — LLM-as-Judge evaluation with customizable criteria
|
|
127
|
+
- **Improver** — AI-powered prompt improvement suggestions
|
|
128
|
+
- **Reporter** — Markdown report generation and comparison
|
|
129
|
+
|
|
130
|
+
#### LLM Support
|
|
131
|
+
- **OpenAI** — Full support via Vercel AI SDK
|
|
132
|
+
- **Gemini** — Google Gemini support via Vercel AI SDK
|
|
133
|
+
- **Structured Output** — `generateObject()` for type-safe LLM responses
|
|
134
|
+
- **JSON Mode** — Automatic JSON mode for Judge/Improver
|
|
135
|
+
|
|
136
|
+
#### Evaluation Criteria
|
|
137
|
+
- **Built-in criteria** — `accuracy()`, `consistency()`, `relevance()`
|
|
138
|
+
- **Schema validation** — `validateSchema()` with Zod schemas
|
|
139
|
+
- **Custom criteria** — Define your own evaluation criteria
|
|
140
|
+
- **Weighted scoring** — Assign different weights to criteria
|
|
141
|
+
|
|
142
|
+
#### Multi-Turn Testing
|
|
143
|
+
- **MultiTurnTestCase** — Test complex conversation flows
|
|
144
|
+
- **Termination conditions** — `fieldEquals()`, `fieldIsSet()`, `afterTurns()`
|
|
145
|
+
- **Composite conditions** — `and()`, `or()`, `not()` combinators
|
|
146
|
+
- **Natural language conditions** — LLM-based termination evaluation
|
|
147
|
+
- **AI simulated users** — `aiUser()` for automated user simulation
|
|
148
|
+
- **Dynamic personas** — Change user behavior during conversation
|
|
149
|
+
|
|
150
|
+
#### Test Iterations
|
|
151
|
+
- **Statistical analysis** — Mean, std dev, min/max, pass rate
|
|
152
|
+
- **Multi-turn statistics** — Avg turns, termination distribution
|
|
153
|
+
- **Representative selection** — Auto-select result closest to mean
|
|
154
|
+
|
|
155
|
+
#### File Context
|
|
156
|
+
- **File loading** — `loadFile()`, `loadFiles()` with glob patterns
|
|
157
|
+
- **File content** — Include files in test cases for evaluation
|
|
158
|
+
- **Size limits** — Configurable file size limits
|
|
159
|
+
|
|
160
|
+
#### Prompt Repository
|
|
161
|
+
- **File-based** — YAML file storage with versioning
|
|
162
|
+
- **SQLite-based** — Database storage for production
|
|
163
|
+
- **Template compilation** — Mustache-style templates
|
|
164
|
+
|
|
165
|
+
#### CLI
|
|
166
|
+
- **`agent-eval run`** — Run evaluations from command line
|
|
167
|
+
- **TypeScript config** — `defineConfig()` with full type safety
|
|
168
|
+
- **Environment files** — Automatic `.env` loading
|
|
169
|
+
- **Verbose mode** — Detailed progress output
|
|
170
|
+
|
|
171
|
+
#### Testing Utilities
|
|
172
|
+
- **MockLLMClient** — Mock LLM for unit testing
|
|
173
|
+
- **RecordingMockLLMClient** — Record and verify LLM calls
|
|
174
|
+
- **MockAgent** — Mock agent for testing
|
|
175
|
+
- **MockJudge** — Mock judge for testing
|
|
176
|
+
- **MockImprover** — Mock improver for testing
|
|
177
|
+
|
|
178
|
+
#### Error Handling
|
|
179
|
+
- **EvalError** — Structured errors with codes
|
|
180
|
+
- **Error codes** — Categorized error types for handling
|
|
181
|
+
- **Error context** — Additional debugging information
|
|
182
|
+
|
|
183
|
+
### Technical Details
|
|
184
|
+
|
|
185
|
+
- **649 tests** passing with Vitest 4.x
|
|
186
|
+
- **ESM + CJS** dual package output
|
|
187
|
+
- **TypeScript** with strict mode
|
|
188
|
+
- **Vercel AI SDK** 6.x for LLM integration
|
|
189
|
+
- **Zod** peer dependency for schema validation
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
## Future Plans
|
|
194
|
+
|
|
195
|
+
- Web Interface for evaluation dashboards
|
|
196
|
+
- Additional LLM providers (Anthropic client integration)
|
|
197
|
+
- Evaluation history and trends
|
|
198
|
+
- Plugin system for custom evaluators
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 hakzzong
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
# @agtlantis/eval
|
|
2
|
+
|
|
3
|
+
> Unit testing for AI Agents
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Traditional unit tests are deterministic — same input, same output, pass or fail. AI Agents are non-deterministic. The same prompt can produce different outputs, and "correctness" is often a spectrum, not a binary.
|
|
8
|
+
|
|
9
|
+
@agtlantis/eval embraces this reality with LLM-as-Judge evaluation, statistical iterations, and multi-turn conversation testing.
|
|
10
|
+
|
|
11
|
+
## Features
|
|
12
|
+
|
|
13
|
+
- **LLM-as-Judge Evaluation** — Use AI to evaluate AI with customizable criteria
|
|
14
|
+
- **Multi-turn Conversations** — Test complex dialogues with termination conditions
|
|
15
|
+
- **AI Simulated Users** — Automatically generate realistic user inputs with personas
|
|
16
|
+
- **Statistical Iterations** — Run tests multiple times and analyze results statistically
|
|
17
|
+
- **Cost Tracking** — Built-in pricing tables with per-component cost breakdown
|
|
18
|
+
- **Schema Validation** — Validate outputs with Zod schemas (binary pass/fail)
|
|
19
|
+
- **Hybrid Evaluation** — Combine programmatic and LLM-based criteria
|
|
20
|
+
- **Prompt Improvement** — Get AI-generated suggestions to improve your agent's prompts
|
|
21
|
+
- **CLI Runner** — Run evaluations from the command line with TypeScript configs
|
|
22
|
+
- **Markdown Reports** — Generate detailed, human-readable evaluation reports
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
Not published to npm yet. Use from the monorepo:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
git clone <repo-url>
|
|
30
|
+
cd agtlantis
|
|
31
|
+
pnpm install
|
|
32
|
+
pnpm build
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
> **Note:** `zod` is a required peer dependency for schema validation features.
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
### 1. Define Your Agent
|
|
40
|
+
|
|
41
|
+
```typescript
|
|
42
|
+
import type { EvalAgent, AgentPrompt } from '@agtlantis/eval'
|
|
43
|
+
|
|
44
|
+
interface QAInput {
|
|
45
|
+
question: string
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
interface QAOutput {
|
|
49
|
+
answer: string
|
|
50
|
+
confidence: 'high' | 'medium' | 'low'
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
const qaAgent: EvalAgent<QAInput, QAOutput> = {
|
|
54
|
+
config: {
|
|
55
|
+
name: 'qa-agent',
|
|
56
|
+
description: 'A Q&A agent that answers questions accurately',
|
|
57
|
+
},
|
|
58
|
+
prompt: {
|
|
59
|
+
id: 'qa-prompt',
|
|
60
|
+
version: '1.0.0',
|
|
61
|
+
system: 'You are a helpful Q&A assistant. Answer questions accurately and concisely.',
|
|
62
|
+
renderUserPrompt: (input) => input.question,
|
|
63
|
+
},
|
|
64
|
+
execute: async (input) => {
|
|
65
|
+
// Your actual LLM call here
|
|
66
|
+
return {
|
|
67
|
+
result: { answer: 'The answer...', confidence: 'high' },
|
|
68
|
+
metadata: { tokenUsage: { input: 10, output: 20, total: 30 } },
|
|
69
|
+
}
|
|
70
|
+
},
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### 2. Create a Judge
|
|
75
|
+
|
|
76
|
+
```typescript
|
|
77
|
+
import { createOpenAIProvider } from '@agtlantis/core'
|
|
78
|
+
import { createJudge, accuracy, relevance } from '@agtlantis/eval'
|
|
79
|
+
|
|
80
|
+
const provider = createOpenAIProvider({
|
|
81
|
+
apiKey: process.env.OPENAI_API_KEY!,
|
|
82
|
+
}).withDefaultModel('gpt-4o-mini')
|
|
83
|
+
|
|
84
|
+
const judge = createJudge({
|
|
85
|
+
provider,
|
|
86
|
+
criteria: [
|
|
87
|
+
accuracy({ weight: 2 }),
|
|
88
|
+
relevance(),
|
|
89
|
+
{
|
|
90
|
+
id: 'conciseness',
|
|
91
|
+
name: 'Conciseness',
|
|
92
|
+
description: 'The answer is brief and to the point',
|
|
93
|
+
},
|
|
94
|
+
],
|
|
95
|
+
passThreshold: 70,
|
|
96
|
+
})
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### 3. Run Evaluations
|
|
100
|
+
|
|
101
|
+
```typescript
|
|
102
|
+
import { createEvalSuite, reportToMarkdown } from '@agtlantis/eval'
|
|
103
|
+
|
|
104
|
+
const suite = createEvalSuite({
|
|
105
|
+
agent: qaAgent,
|
|
106
|
+
judge,
|
|
107
|
+
agentDescription: 'Q&A agent that answers general knowledge questions',
|
|
108
|
+
})
|
|
109
|
+
|
|
110
|
+
const testCases = [
|
|
111
|
+
{ id: 'capitals', input: { question: 'What is the capital of France?' } },
|
|
112
|
+
{ id: 'math', input: { question: 'What is 2 + 2?' } },
|
|
113
|
+
{ id: 'history', input: { question: 'Who wrote Romeo and Juliet?' } },
|
|
114
|
+
]
|
|
115
|
+
|
|
116
|
+
const report = await suite.run(testCases, { iterations: 5, concurrency: 2 })
|
|
117
|
+
|
|
118
|
+
console.log(reportToMarkdown(report))
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Core Concepts
|
|
122
|
+
|
|
123
|
+
### Evaluation Criteria
|
|
124
|
+
|
|
125
|
+
Criteria define how your agent's outputs are evaluated. There are two types:
|
|
126
|
+
|
|
127
|
+
#### LLM-Evaluated Criteria (0-100 score)
|
|
128
|
+
|
|
129
|
+
```typescript
|
|
130
|
+
import { accuracy, consistency, relevance } from '@agtlantis/eval'
|
|
131
|
+
|
|
132
|
+
const criteria = [
|
|
133
|
+
accuracy({ weight: 2 }),
|
|
134
|
+
consistency(),
|
|
135
|
+
relevance(),
|
|
136
|
+
{
|
|
137
|
+
id: 'custom-criterion',
|
|
138
|
+
name: 'Domain Expertise',
|
|
139
|
+
description: 'Shows deep understanding of the subject matter',
|
|
140
|
+
weight: 1.5,
|
|
141
|
+
},
|
|
142
|
+
]
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
#### Programmatic Criteria (Binary: 0 or 100)
|
|
146
|
+
|
|
147
|
+
```typescript
|
|
148
|
+
import { accuracy, schema } from '@agtlantis/eval'
|
|
149
|
+
import { z } from 'zod'
|
|
150
|
+
|
|
151
|
+
const OutputSchema = z.object({
|
|
152
|
+
answer: z.string().min(1),
|
|
153
|
+
confidence: z.enum(['high', 'medium', 'low']),
|
|
154
|
+
sources: z.array(z.string()).optional(),
|
|
155
|
+
})
|
|
156
|
+
|
|
157
|
+
const criteria = [
|
|
158
|
+
schema({ schema: OutputSchema, weight: 2 }),
|
|
159
|
+
accuracy(),
|
|
160
|
+
]
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### Test Iterations
|
|
164
|
+
|
|
165
|
+
LLM outputs are non-deterministic. Run tests multiple times for statistical reliability:
|
|
166
|
+
|
|
167
|
+
```typescript
|
|
168
|
+
const report = await suite.run(testCases, {
|
|
169
|
+
iterations: 5,
|
|
170
|
+
concurrency: 3,
|
|
171
|
+
})
|
|
172
|
+
|
|
173
|
+
console.log(report.results[0].iterationStats)
|
|
174
|
+
// {
|
|
175
|
+
// iterations: 5,
|
|
176
|
+
// passCount: 4,
|
|
177
|
+
// passRate: 0.8,
|
|
178
|
+
// mean: 82.4,
|
|
179
|
+
// stdDev: 5.2,
|
|
180
|
+
// min: 75,
|
|
181
|
+
// max: 90,
|
|
182
|
+
// }
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### Multi-Turn Conversations
|
|
186
|
+
|
|
187
|
+
Test agents that require multiple interaction turns:
|
|
188
|
+
|
|
189
|
+
```typescript
|
|
190
|
+
import {
|
|
191
|
+
fieldEquals,
|
|
192
|
+
afterTurns,
|
|
193
|
+
type MultiTurnTestCase,
|
|
194
|
+
} from '@agtlantis/eval'
|
|
195
|
+
|
|
196
|
+
interface BookingInput {
|
|
197
|
+
message: string
|
|
198
|
+
conversationHistory?: Array<{ role: string; content: string }>
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
interface BookingOutput {
|
|
202
|
+
reply: string
|
|
203
|
+
booking: {
|
|
204
|
+
status: 'pending' | 'confirmed' | 'cancelled'
|
|
205
|
+
date?: string
|
|
206
|
+
guests?: number
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
const testCase: MultiTurnTestCase<BookingInput, BookingOutput> = {
|
|
211
|
+
id: 'complete-booking',
|
|
212
|
+
input: { message: 'I want to make a reservation' },
|
|
213
|
+
multiTurn: {
|
|
214
|
+
followUpInputs: [
|
|
215
|
+
{ input: { message: 'Tomorrow at 7pm' }, description: 'Provide date' },
|
|
216
|
+
{ input: { message: '4 guests' }, description: 'Provide party size' },
|
|
217
|
+
{ input: { message: 'Confirm please' }, description: 'Confirm booking' },
|
|
218
|
+
],
|
|
219
|
+
terminationConditions: [
|
|
220
|
+
fieldEquals('booking.status', 'confirmed'),
|
|
221
|
+
afterTurns(10),
|
|
222
|
+
],
|
|
223
|
+
onConditionMet: 'pass',
|
|
224
|
+
onMaxTurnsReached: 'fail',
|
|
225
|
+
},
|
|
226
|
+
}
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### AI Simulated Users
|
|
230
|
+
|
|
231
|
+
Let AI play the user role with customizable personas:
|
|
232
|
+
|
|
233
|
+
```typescript
|
|
234
|
+
import { aiUser, type MultiTurnTestCase } from '@agtlantis/eval'
|
|
235
|
+
|
|
236
|
+
const friendlyCustomerPrompt = `You are a friendly, cooperative customer.
|
|
237
|
+
- Answer questions clearly and politely
|
|
238
|
+
- Provide information naturally over multiple turns
|
|
239
|
+
- Use casual, conversational language`
|
|
240
|
+
|
|
241
|
+
const testCase: MultiTurnTestCase<BookingInput, BookingOutput> = {
|
|
242
|
+
id: 'ai-friendly-booking',
|
|
243
|
+
input: { message: 'Hi, I need to book a table' },
|
|
244
|
+
multiTurn: {
|
|
245
|
+
followUpInputs: [
|
|
246
|
+
{
|
|
247
|
+
input: aiUser({
|
|
248
|
+
provider,
|
|
249
|
+
systemPrompt: friendlyCustomerPrompt,
|
|
250
|
+
formatHistory: (ctx) =>
|
|
251
|
+
ctx.history.map(h => `Agent: ${h.output.reply}`).join('\n'),
|
|
252
|
+
buildInput: (response, ctx) => ({
|
|
253
|
+
message: response,
|
|
254
|
+
conversationHistory: ctx.history,
|
|
255
|
+
}),
|
|
256
|
+
}),
|
|
257
|
+
description: 'AI friendly customer',
|
|
258
|
+
turns: Infinity,
|
|
259
|
+
},
|
|
260
|
+
],
|
|
261
|
+
terminationConditions: [
|
|
262
|
+
fieldEquals('booking.status', 'confirmed'),
|
|
263
|
+
afterTurns(8),
|
|
264
|
+
],
|
|
265
|
+
},
|
|
266
|
+
}
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
Dynamic personas that change based on conversation progress:
|
|
270
|
+
|
|
271
|
+
```typescript
|
|
272
|
+
aiUser({
|
|
273
|
+
provider,
|
|
274
|
+
systemPrompt: (ctx) => {
|
|
275
|
+
if (ctx.currentTurn <= 2) return 'You are patient and friendly.'
|
|
276
|
+
if (ctx.currentTurn <= 5) return 'You are becoming impatient.'
|
|
277
|
+
return 'You are very rushed and want quick answers.'
|
|
278
|
+
},
|
|
279
|
+
// ...
|
|
280
|
+
})
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
### Cost Tracking
|
|
284
|
+
|
|
285
|
+
Track LLM costs with built-in pricing tables:
|
|
286
|
+
|
|
287
|
+
```typescript
|
|
288
|
+
import {
|
|
289
|
+
createEvalSuite,
|
|
290
|
+
addCostsToResults,
|
|
291
|
+
DEFAULT_PRICING_CONFIG,
|
|
292
|
+
} from '@agtlantis/eval'
|
|
293
|
+
|
|
294
|
+
const suite = createEvalSuite({
|
|
295
|
+
agent: qaAgent,
|
|
296
|
+
judge,
|
|
297
|
+
agentDescription: 'Q&A agent',
|
|
298
|
+
})
|
|
299
|
+
|
|
300
|
+
const report = await suite.run(testCases)
|
|
301
|
+
|
|
302
|
+
// Add cost breakdown to results using pricing config
|
|
303
|
+
const resultsWithCost = addCostsToResults(report.results, DEFAULT_PRICING_CONFIG)
|
|
304
|
+
|
|
305
|
+
for (const result of resultsWithCost) {
|
|
306
|
+
const cost = result.metrics.costBreakdown
|
|
307
|
+
console.log(`Test: ${result.testCase.id}`)
|
|
308
|
+
console.log(` Agent cost: $${cost.agent?.toFixed(6)}`)
|
|
309
|
+
console.log(` Judge cost: $${cost.judge?.toFixed(6)}`)
|
|
310
|
+
console.log(` Total cost: $${cost.total?.toFixed(6)}`)
|
|
311
|
+
}
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
### Prompt Improvement
|
|
315
|
+
|
|
316
|
+
Get AI-generated suggestions to improve your agent:
|
|
317
|
+
|
|
318
|
+
```typescript
|
|
319
|
+
import { createImprover, applyPromptSuggestions } from '@agtlantis/eval'
|
|
320
|
+
|
|
321
|
+
const improver = createImprover({
|
|
322
|
+
provider,
|
|
323
|
+
})
|
|
324
|
+
|
|
325
|
+
const suite = createEvalSuite({
|
|
326
|
+
agent: qaAgent,
|
|
327
|
+
judge,
|
|
328
|
+
improver,
|
|
329
|
+
agentDescription: 'Q&A agent',
|
|
330
|
+
})
|
|
331
|
+
|
|
332
|
+
const report = await suite.run(testCases)
|
|
333
|
+
|
|
334
|
+
for (const suggestion of report.suggestions) {
|
|
335
|
+
console.log(`[${suggestion.priority}] ${suggestion.type}`)
|
|
336
|
+
console.log(`Reasoning: ${suggestion.reasoning}`)
|
|
337
|
+
suggestion.approved = true
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
const result = applyPromptSuggestions(
|
|
341
|
+
qaAgent.prompt,
|
|
342
|
+
report.suggestions.filter(s => s.approved),
|
|
343
|
+
{ bumpVersion: 'minor' }
|
|
344
|
+
)
|
|
345
|
+
|
|
346
|
+
console.log(`Applied ${result.appliedCount} suggestions`)
|
|
347
|
+
console.log(`New version: ${result.prompt.version}`)
|
|
348
|
+
```
|
|
349
|
+
|
|
350
|
+
## CLI Usage
|
|
351
|
+
|
|
352
|
+
Run evaluations from the command line:
|
|
353
|
+
|
|
354
|
+
```bash
|
|
355
|
+
npx agent-eval run
|
|
356
|
+
npx agent-eval run ./my-config.ts
|
|
357
|
+
npx agent-eval run -v -c 3 -i 5 -o ./reports/eval.md
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
### CLI Options
|
|
361
|
+
|
|
362
|
+
| Option | Description | Default |
|
|
363
|
+
|--------|-------------|---------|
|
|
364
|
+
| `-o, --output <path>` | Report output path | `./reports/eval-{timestamp}.md` |
|
|
365
|
+
| `-e, --env-file <path>` | Environment file path | `.env` |
|
|
366
|
+
| `-v, --verbose` | Verbose output mode | `false` |
|
|
367
|
+
| `-c, --concurrency <n>` | Concurrent executions | `1` |
|
|
368
|
+
| `-i, --iterations <n>` | Iterations per test | `1` |
|
|
369
|
+
| `--no-report` | Skip saving report | `false` |
|
|
370
|
+
|
|
371
|
+
### Config File
|
|
372
|
+
|
|
373
|
+
Create `agent-eval.config.ts`:
|
|
374
|
+
|
|
375
|
+
```typescript
|
|
376
|
+
import { defineConfig, accuracy, relevance } from '@agtlantis/eval'
|
|
377
|
+
import { myAgent } from './src/agent'
|
|
378
|
+
|
|
379
|
+
export default defineConfig({
|
|
380
|
+
name: 'My Agent Evaluation',
|
|
381
|
+
agentDescription: 'Helpful assistant that answers questions',
|
|
382
|
+
agent: myAgent,
|
|
383
|
+
|
|
384
|
+
llm: {
|
|
385
|
+
provider: 'openai',
|
|
386
|
+
apiKey: process.env.OPENAI_API_KEY,
|
|
387
|
+
defaultModel: 'gpt-4o-mini',
|
|
388
|
+
},
|
|
389
|
+
|
|
390
|
+
judge: {
|
|
391
|
+
criteria: [accuracy({ weight: 2 }), relevance()],
|
|
392
|
+
passThreshold: 70,
|
|
393
|
+
},
|
|
394
|
+
|
|
395
|
+
testCases: [
|
|
396
|
+
{ id: 'test-1', input: { question: 'What is TypeScript?' } },
|
|
397
|
+
{ id: 'test-2', input: { question: 'Explain async/await' } },
|
|
398
|
+
],
|
|
399
|
+
|
|
400
|
+
output: {
|
|
401
|
+
dir: './reports',
|
|
402
|
+
filename: 'evaluation-report.md',
|
|
403
|
+
},
|
|
404
|
+
|
|
405
|
+
run: {
|
|
406
|
+
concurrency: 3,
|
|
407
|
+
iterations: 1,
|
|
408
|
+
},
|
|
409
|
+
})
|
|
410
|
+
```
|
|
411
|
+
|
|
412
|
+
## LLM Providers
|
|
413
|
+
|
|
414
|
+
Providers are imported from `@agtlantis/core`:
|
|
415
|
+
|
|
416
|
+
```typescript
|
|
417
|
+
import { createOpenAIProvider, createGoogleProvider } from '@agtlantis/core'
|
|
418
|
+
|
|
419
|
+
const openai = createOpenAIProvider({
|
|
420
|
+
apiKey: process.env.OPENAI_API_KEY!,
|
|
421
|
+
}).withDefaultModel('gpt-4o')
|
|
422
|
+
|
|
423
|
+
const google = createGoogleProvider({
|
|
424
|
+
apiKey: process.env.GOOGLE_AI_API_KEY!,
|
|
425
|
+
}).withDefaultModel('gemini-2.0-flash')
|
|
426
|
+
```
|
|
427
|
+
|
|
428
|
+
## Testing Utilities
|
|
429
|
+
|
|
430
|
+
### Mock Provider
|
|
431
|
+
|
|
432
|
+
```typescript
|
|
433
|
+
import { mock } from '@agtlantis/eval'
|
|
434
|
+
|
|
435
|
+
const mockProvider = mock({
|
|
436
|
+
response: JSON.stringify({
|
|
437
|
+
verdicts: [{ criterionId: 'accuracy', score: 85, reasoning: 'Good', passed: true }],
|
|
438
|
+
}),
|
|
439
|
+
})
|
|
440
|
+
|
|
441
|
+
const judge = createJudge({ provider: mockProvider, ... })
|
|
442
|
+
```
|
|
443
|
+
|
|
444
|
+
### Mock Agent
|
|
445
|
+
|
|
446
|
+
```typescript
|
|
447
|
+
import { createMockAgent } from '@agtlantis/eval'
|
|
448
|
+
|
|
449
|
+
const mockAgent = createMockAgent({
|
|
450
|
+
name: 'test-agent',
|
|
451
|
+
defaultOutput: { answer: 'Test answer', confidence: 'high' },
|
|
452
|
+
})
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
## Error Handling
|
|
456
|
+
|
|
457
|
+
All errors are wrapped in `EvalError` with error codes:
|
|
458
|
+
|
|
459
|
+
```typescript
|
|
460
|
+
import { EvalError, EvalErrorCode } from '@agtlantis/eval'
|
|
461
|
+
|
|
462
|
+
try {
|
|
463
|
+
await judge.evaluate({ ... })
|
|
464
|
+
} catch (error) {
|
|
465
|
+
if (error instanceof EvalError) {
|
|
466
|
+
switch (error.code) {
|
|
467
|
+
case EvalErrorCode.LLM_API_ERROR:
|
|
468
|
+
console.error('LLM API failed:', error.message)
|
|
469
|
+
break
|
|
470
|
+
case EvalErrorCode.VERDICT_PARSE_ERROR:
|
|
471
|
+
console.error('Failed to parse verdict:', error.context)
|
|
472
|
+
break
|
|
473
|
+
}
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
```
|
|
477
|
+
|
|
478
|
+
## Documentation
|
|
479
|
+
|
|
480
|
+
For detailed guides and API reference, see the [docs](./docs/) folder:
|
|
481
|
+
|
|
482
|
+
- [Getting Started](./docs/getting-started.md)
|
|
483
|
+
- [API Reference](./docs/api/README.md)
|
|
484
|
+
|
|
485
|
+
## Examples
|
|
486
|
+
|
|
487
|
+
Check the [`examples/`](./examples/) directory:
|
|
488
|
+
|
|
489
|
+
- **Q&A Agent** — Basic evaluation example
|
|
490
|
+
- **Multi-turn Booking Agent** — Complex conversation testing
|
|
491
|
+
- **AI Simulated User** — Automated user simulation with personas
|
|
492
|
+
- **Full Pipeline** — Complete evaluation workflow with improvement
|
|
493
|
+
|
|
494
|
+
## License
|
|
495
|
+
|
|
496
|
+
MIT
|