vitest-evals 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -65
- package/dist/index.d.mts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +307 -76
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +306 -76
- package/dist/index.mjs.map +1 -1
- package/dist/scorers/index.d.mts +2 -1
- package/dist/scorers/index.d.ts +2 -1
- package/dist/scorers/index.js +322 -73
- package/dist/scorers/index.js.map +1 -1
- package/dist/scorers/index.mjs +316 -72
- package/dist/scorers/index.mjs.map +1 -1
- package/dist/scorers/structuredOutputScorer.d.mts +3 -0
- package/dist/scorers/structuredOutputScorer.d.ts +3 -0
- package/dist/scorers/structuredOutputScorer.js +299 -0
- package/dist/scorers/structuredOutputScorer.js.map +1 -0
- package/dist/scorers/structuredOutputScorer.mjs +273 -0
- package/dist/scorers/structuredOutputScorer.mjs.map +1 -0
- package/dist/scorers/toolCallScorer.d.mts +123 -31
- package/dist/scorers/toolCallScorer.d.ts +123 -31
- package/dist/scorers/toolCallScorer.js +161 -70
- package/dist/scorers/toolCallScorer.js.map +1 -1
- package/dist/scorers/toolCallScorer.mjs +162 -71
- package/dist/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/scorers/utils.d.mts +103 -0
- package/dist/scorers/utils.d.ts +103 -0
- package/dist/scorers/utils.js +176 -0
- package/dist/scorers/utils.js.map +1 -0
- package/dist/scorers/utils.mjs +146 -0
- package/dist/scorers/utils.mjs.map +1 -0
- package/package.json +13 -10
package/README.md
CHANGED
|
@@ -16,16 +16,18 @@ import { describeEval } from "vitest-evals";
|
|
|
16
16
|
describeEval("capital cities", {
|
|
17
17
|
data: async () => [
|
|
18
18
|
{ input: "What is the capital of France?", expected: "Paris" },
|
|
19
|
-
{ input: "What is the capital of Japan?", expected: "Tokyo" }
|
|
19
|
+
{ input: "What is the capital of Japan?", expected: "Tokyo" },
|
|
20
20
|
],
|
|
21
21
|
task: async (input) => {
|
|
22
22
|
const response = await queryLLM(input);
|
|
23
23
|
return response; // Simple string return
|
|
24
24
|
},
|
|
25
|
-
scorers: [
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
25
|
+
scorers: [
|
|
26
|
+
async ({ output, expected }) => ({
|
|
27
|
+
score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
|
|
28
|
+
}),
|
|
29
|
+
],
|
|
30
|
+
threshold: 0.8,
|
|
29
31
|
});
|
|
30
32
|
```
|
|
31
33
|
|
|
@@ -58,15 +60,15 @@ import { ToolCallScorer } from "vitest-evals/scorers/toolCallScorer";
|
|
|
58
60
|
|
|
59
61
|
describeEval("tool usage", {
|
|
60
62
|
data: async () => [
|
|
61
|
-
{ input: "Search weather", expectedTools: [{ name: "weather_api" }] }
|
|
63
|
+
{ input: "Search weather", expectedTools: [{ name: "weather_api" }] },
|
|
62
64
|
],
|
|
63
65
|
task: weatherTask,
|
|
64
|
-
scorers: [ToolCallScorer()]
|
|
66
|
+
scorers: [ToolCallScorer()],
|
|
65
67
|
});
|
|
66
68
|
|
|
67
69
|
// Custom scorer
|
|
68
70
|
const LengthScorer = async ({ output }) => ({
|
|
69
|
-
score: output.length > 50 ? 1.0 : 0.0
|
|
71
|
+
score: output.length > 50 ? 1.0 : 0.0,
|
|
70
72
|
});
|
|
71
73
|
|
|
72
74
|
// TypeScript scorer with custom options
|
|
@@ -77,46 +79,54 @@ interface CustomOptions extends BaseScorerOptions {
|
|
|
77
79
|
}
|
|
78
80
|
|
|
79
81
|
const TypedScorer: ScoreFn<CustomOptions> = async (opts) => ({
|
|
80
|
-
score: opts.output.length >= opts.minLength ? 1.0 : 0.0
|
|
82
|
+
score: opts.output.length >= opts.minLength ? 1.0 : 0.0,
|
|
81
83
|
});
|
|
82
84
|
```
|
|
83
85
|
|
|
84
86
|
### Built-in Scorers
|
|
85
87
|
|
|
86
88
|
#### ToolCallScorer
|
|
89
|
+
|
|
87
90
|
Evaluates if the expected tools were called with correct arguments.
|
|
88
91
|
|
|
89
92
|
```javascript
|
|
90
93
|
// Basic usage - strict matching, any order
|
|
91
94
|
describeEval("search test", {
|
|
92
|
-
data: async () => [
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
95
|
+
data: async () => [
|
|
96
|
+
{
|
|
97
|
+
input: "Find Italian restaurants",
|
|
98
|
+
expectedTools: [
|
|
99
|
+
{ name: "search", arguments: { type: "restaurant" } },
|
|
100
|
+
{ name: "filter", arguments: { cuisine: "italian" } },
|
|
101
|
+
],
|
|
102
|
+
},
|
|
103
|
+
],
|
|
99
104
|
task: myTask,
|
|
100
|
-
scorers: [ToolCallScorer()]
|
|
105
|
+
scorers: [ToolCallScorer()],
|
|
101
106
|
});
|
|
102
107
|
|
|
103
108
|
// Strict evaluation - exact order and parameters
|
|
104
|
-
scorers: [
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
109
|
+
scorers: [
|
|
110
|
+
ToolCallScorer({
|
|
111
|
+
ordered: true, // Tools must be in exact order
|
|
112
|
+
params: "strict", // Parameters must match exactly
|
|
113
|
+
}),
|
|
114
|
+
];
|
|
108
115
|
|
|
109
116
|
// Flexible evaluation
|
|
110
|
-
scorers: [
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
117
|
+
scorers: [
|
|
118
|
+
ToolCallScorer({
|
|
119
|
+
requireAll: false, // Partial matches give partial credit
|
|
120
|
+
allowExtras: false, // No additional tools allowed
|
|
121
|
+
}),
|
|
122
|
+
];
|
|
114
123
|
```
|
|
115
124
|
|
|
116
125
|
**Default behavior:**
|
|
126
|
+
|
|
117
127
|
- Strict parameter matching (exact equality required)
|
|
118
128
|
- Any order allowed
|
|
119
|
-
- Extra tools allowed
|
|
129
|
+
- Extra tools allowed
|
|
120
130
|
- All expected tools required
|
|
121
131
|
|
|
122
132
|
## AI SDK Integration
|
|
@@ -126,17 +136,20 @@ See [`src/ai-sdk-integration.test.ts`](src/ai-sdk-integration.test.ts) for a com
|
|
|
126
136
|
Transform provider responses to our format:
|
|
127
137
|
|
|
128
138
|
```javascript
|
|
129
|
-
|
|
130
|
-
|
|
139
|
+
const { text, steps } = await generateText({
|
|
140
|
+
model: openai("gpt-4o"),
|
|
141
|
+
prompt: input,
|
|
142
|
+
tools: { myTool: myToolDefinition },
|
|
143
|
+
});
|
|
144
|
+
|
|
131
145
|
return {
|
|
132
146
|
result: text,
|
|
133
|
-
toolCalls:
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
}))
|
|
147
|
+
toolCalls: steps
|
|
148
|
+
.flatMap((step) => step.toolCalls)
|
|
149
|
+
.map((call) => ({
|
|
150
|
+
name: call.toolName,
|
|
151
|
+
arguments: call.args,
|
|
152
|
+
})),
|
|
140
153
|
};
|
|
141
154
|
```
|
|
142
155
|
|
|
@@ -154,9 +167,9 @@ import { Factuality, ClosedQA } from "autoevals";
|
|
|
154
167
|
scorers: [
|
|
155
168
|
Factuality, // LLM-based factuality checking
|
|
156
169
|
ClosedQA.partial({
|
|
157
|
-
criteria: "Does the answer mention Paris?"
|
|
158
|
-
})
|
|
159
|
-
]
|
|
170
|
+
criteria: "Does the answer mention Paris?",
|
|
171
|
+
}),
|
|
172
|
+
];
|
|
160
173
|
```
|
|
161
174
|
|
|
162
175
|
#### Custom LLM-based Factuality Scorer
|
|
@@ -164,18 +177,18 @@ scorers: [
|
|
|
164
177
|
Here's an example of implementing your own LLM-based factuality scorer using the Vercel AI SDK:
|
|
165
178
|
|
|
166
179
|
```javascript
|
|
167
|
-
import { generateObject } from
|
|
168
|
-
import { openai } from
|
|
169
|
-
import { z } from
|
|
170
|
-
|
|
171
|
-
const Factuality = (model = openai(
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
180
|
+
import { generateObject } from "ai";
|
|
181
|
+
import { openai } from "@ai-sdk/openai";
|
|
182
|
+
import { z } from "zod";
|
|
183
|
+
|
|
184
|
+
const Factuality = (model = openai("gpt-4o")) => async ({ input, output, expected }) => {
|
|
185
|
+
if (!expected) {
|
|
186
|
+
return { score: 1.0, metadata: { rationale: "No expected answer" } };
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
const { object } = await generateObject({
|
|
190
|
+
model,
|
|
191
|
+
prompt: `
|
|
179
192
|
Compare the factual content of the submitted answer with the expert answer.
|
|
180
193
|
|
|
181
194
|
Question: ${input}
|
|
@@ -189,21 +202,21 @@ const Factuality = (model = openai('gpt-4o')) => async ({ input, output, expecte
|
|
|
189
202
|
(D) Contradicts expert answer
|
|
190
203
|
(E) Different but factually equivalent
|
|
191
204
|
`,
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
205
|
+
schema: z.object({
|
|
206
|
+
answer: z.enum(["A", "B", "C", "D", "E"]),
|
|
207
|
+
rationale: z.string(),
|
|
208
|
+
}),
|
|
209
|
+
});
|
|
210
|
+
|
|
211
|
+
const scores = { A: 0.4, B: 0.6, C: 1, D: 0, E: 1 };
|
|
212
|
+
return {
|
|
213
|
+
score: scores[object.answer],
|
|
214
|
+
metadata: { rationale: object.rationale, answer: object.answer },
|
|
215
|
+
};
|
|
202
216
|
};
|
|
203
|
-
};
|
|
204
217
|
|
|
205
218
|
// Usage
|
|
206
|
-
scorers: [Factuality()]
|
|
219
|
+
scorers: [Factuality()];
|
|
207
220
|
```
|
|
208
221
|
|
|
209
222
|
### Skip Tests Conditionally
|
|
@@ -217,14 +230,18 @@ describeEval("gpt-4 tests", {
|
|
|
217
230
|
|
|
218
231
|
### Existing Test Suites
|
|
219
232
|
|
|
233
|
+
For integration with existing Vitest test suites, you can use the `.toEval()` matcher:
|
|
234
|
+
|
|
235
|
+
> **⚠️ Deprecated**: The `.toEval()` helper is deprecated. Use `describeEval()` instead for better test organization and multiple scorers support. We may consider bringing back a similar check, but it's currently too limited for many scorer implementations.
|
|
236
|
+
|
|
220
237
|
```javascript
|
|
221
238
|
import "vitest-evals";
|
|
222
239
|
|
|
223
240
|
test("capital check", () => {
|
|
224
241
|
const simpleFactuality = async ({ output, expected }) => ({
|
|
225
|
-
score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0
|
|
242
|
+
score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
|
|
226
243
|
});
|
|
227
|
-
|
|
244
|
+
|
|
228
245
|
expect("What is the capital of France?").toEval(
|
|
229
246
|
"Paris",
|
|
230
247
|
answerQuestion,
|
|
@@ -234,6 +251,25 @@ test("capital check", () => {
|
|
|
234
251
|
});
|
|
235
252
|
```
|
|
236
253
|
|
|
254
|
+
**Recommended migration** to `describeEval()`:
|
|
255
|
+
|
|
256
|
+
```javascript
|
|
257
|
+
import { describeEval } from "vitest-evals";
|
|
258
|
+
|
|
259
|
+
describeEval("capital check", {
|
|
260
|
+
data: async () => [
|
|
261
|
+
{ input: "What is the capital of France?", expected: "Paris" },
|
|
262
|
+
],
|
|
263
|
+
task: answerQuestion,
|
|
264
|
+
scorers: [
|
|
265
|
+
async ({ output, expected }) => ({
|
|
266
|
+
score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
|
|
267
|
+
}),
|
|
268
|
+
],
|
|
269
|
+
threshold: 0.8,
|
|
270
|
+
});
|
|
271
|
+
```
|
|
272
|
+
|
|
237
273
|
## Configuration
|
|
238
274
|
|
|
239
275
|
### Separate Eval Configuration
|
|
@@ -264,4 +300,4 @@ vitest --config=vitest.evals.config.ts
|
|
|
264
300
|
```shell
|
|
265
301
|
npm install
|
|
266
302
|
npm test
|
|
267
|
-
```
|
|
303
|
+
```
|
package/dist/index.d.mts
CHANGED
|
@@ -1,2 +1,3 @@
|
|
|
1
1
|
import 'vitest';
|
|
2
|
-
export { B as BaseScorerOptions, E as EvalMatchers,
|
|
2
|
+
export { B as BaseScorerOptions, E as EvalMatchers, e as Score, f as ScoreFn, b as StructuredOutputScorer, S as StructuredOutputScorerOptions, d as TaskFn, c as TaskResult, g as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, h as describeEval, i as formatScores, w as wrapText } from './scorers/toolCallScorer.mjs';
|
|
3
|
+
import './scorers/utils.mjs';
|
package/dist/index.d.ts
CHANGED
|
@@ -1,2 +1,3 @@
|
|
|
1
1
|
import 'vitest';
|
|
2
|
-
export { B as BaseScorerOptions, E as EvalMatchers,
|
|
2
|
+
export { B as BaseScorerOptions, E as EvalMatchers, e as Score, f as ScoreFn, b as StructuredOutputScorer, S as StructuredOutputScorerOptions, d as TaskFn, c as TaskResult, g as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, h as describeEval, i as formatScores, w as wrapText } from './scorers/toolCallScorer.js';
|
|
3
|
+
import './scorers/utils.js';
|