vitest-evals 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -16,16 +16,18 @@ import { describeEval } from "vitest-evals";
16
16
  describeEval("capital cities", {
17
17
  data: async () => [
18
18
  { input: "What is the capital of France?", expected: "Paris" },
19
- { input: "What is the capital of Japan?", expected: "Tokyo" }
19
+ { input: "What is the capital of Japan?", expected: "Tokyo" },
20
20
  ],
21
21
  task: async (input) => {
22
22
  const response = await queryLLM(input);
23
23
  return response; // Simple string return
24
24
  },
25
- scorers: [async ({ output, expected }) => ({
26
- score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0
27
- })],
28
- threshold: 0.8
25
+ scorers: [
26
+ async ({ output, expected }) => ({
27
+ score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
28
+ }),
29
+ ],
30
+ threshold: 0.8,
29
31
  });
30
32
  ```
31
33
 
@@ -58,15 +60,15 @@ import { ToolCallScorer } from "vitest-evals/scorers/toolCallScorer";
58
60
 
59
61
  describeEval("tool usage", {
60
62
  data: async () => [
61
- { input: "Search weather", expectedTools: [{ name: "weather_api" }] }
63
+ { input: "Search weather", expectedTools: [{ name: "weather_api" }] },
62
64
  ],
63
65
  task: weatherTask,
64
- scorers: [ToolCallScorer()]
66
+ scorers: [ToolCallScorer()],
65
67
  });
66
68
 
67
69
  // Custom scorer
68
70
  const LengthScorer = async ({ output }) => ({
69
- score: output.length > 50 ? 1.0 : 0.0
71
+ score: output.length > 50 ? 1.0 : 0.0,
70
72
  });
71
73
 
72
74
  // TypeScript scorer with custom options
@@ -77,46 +79,54 @@ interface CustomOptions extends BaseScorerOptions {
77
79
  }
78
80
 
79
81
  const TypedScorer: ScoreFn<CustomOptions> = async (opts) => ({
80
- score: opts.output.length >= opts.minLength ? 1.0 : 0.0
82
+ score: opts.output.length >= opts.minLength ? 1.0 : 0.0,
81
83
  });
82
84
  ```
83
85
 
84
86
  ### Built-in Scorers
85
87
 
86
88
  #### ToolCallScorer
89
+
87
90
  Evaluates if the expected tools were called with correct arguments.
88
91
 
89
92
  ```javascript
90
93
  // Basic usage - strict matching, any order
91
94
  describeEval("search test", {
92
- data: async () => [{
93
- input: "Find Italian restaurants",
94
- expectedTools: [
95
- { name: "search", arguments: { type: "restaurant" } },
96
- { name: "filter", arguments: { cuisine: "italian" } }
97
- ]
98
- }],
95
+ data: async () => [
96
+ {
97
+ input: "Find Italian restaurants",
98
+ expectedTools: [
99
+ { name: "search", arguments: { type: "restaurant" } },
100
+ { name: "filter", arguments: { cuisine: "italian" } },
101
+ ],
102
+ },
103
+ ],
99
104
  task: myTask,
100
- scorers: [ToolCallScorer()]
105
+ scorers: [ToolCallScorer()],
101
106
  });
102
107
 
103
108
  // Strict evaluation - exact order and parameters
104
- scorers: [ToolCallScorer({
105
- ordered: true, // Tools must be in exact order
106
- params: "strict" // Parameters must match exactly
107
- })]
109
+ scorers: [
110
+ ToolCallScorer({
111
+ ordered: true, // Tools must be in exact order
112
+ params: "strict", // Parameters must match exactly
113
+ }),
114
+ ];
108
115
 
109
116
  // Flexible evaluation
110
- scorers: [ToolCallScorer({
111
- requireAll: false, // Partial matches give partial credit
112
- allowExtras: false // No additional tools allowed
113
- })]
117
+ scorers: [
118
+ ToolCallScorer({
119
+ requireAll: false, // Partial matches give partial credit
120
+ allowExtras: false, // No additional tools allowed
121
+ }),
122
+ ];
114
123
  ```
115
124
 
116
125
  **Default behavior:**
126
+
117
127
  - Strict parameter matching (exact equality required)
118
128
  - Any order allowed
119
- - Extra tools allowed
129
+ - Extra tools allowed
120
130
  - All expected tools required
121
131
 
122
132
  ## AI SDK Integration
@@ -126,17 +136,20 @@ See [`src/ai-sdk-integration.test.ts`](src/ai-sdk-integration.test.ts) for a com
126
136
  Transform provider responses to our format:
127
137
 
128
138
  ```javascript
129
- // Vercel AI SDK
130
- const { text, toolCalls, toolResults } = await generateText(...);
139
+ const { text, steps } = await generateText({
140
+ model: openai("gpt-4o"),
141
+ prompt: input,
142
+ tools: { myTool: myToolDefinition },
143
+ });
144
+
131
145
  return {
132
146
  result: text,
133
- toolCalls: toolCalls?.map((call, i) => ({
134
- id: call.toolCallId,
135
- name: call.toolName,
136
- arguments: call.args,
137
- result: toolResults?.[i]?.result,
138
- status: toolResults?.[i]?.error ? 'failed' : 'completed'
139
- }))
147
+ toolCalls: steps
148
+ .flatMap((step) => step.toolCalls)
149
+ .map((call) => ({
150
+ name: call.toolName,
151
+ arguments: call.args,
152
+ })),
140
153
  };
141
154
  ```
142
155
 
@@ -154,9 +167,9 @@ import { Factuality, ClosedQA } from "autoevals";
154
167
  scorers: [
155
168
  Factuality, // LLM-based factuality checking
156
169
  ClosedQA.partial({
157
- criteria: "Does the answer mention Paris?"
158
- })
159
- ]
170
+ criteria: "Does the answer mention Paris?",
171
+ }),
172
+ ];
160
173
  ```
161
174
 
162
175
  #### Custom LLM-based Factuality Scorer
@@ -164,18 +177,18 @@ scorers: [
164
177
  Here's an example of implementing your own LLM-based factuality scorer using the Vercel AI SDK:
165
178
 
166
179
  ```javascript
167
- import { generateObject } from 'ai';
168
- import { openai } from '@ai-sdk/openai';
169
- import { z } from 'zod';
170
-
171
- const Factuality = (model = openai('gpt-4o')) => async ({ input, output, expected }) => {
172
- if (!expected) {
173
- return { score: 1.0, metadata: { rationale: "No expected answer" } };
174
- }
175
-
176
- const { object } = await generateObject({
177
- model,
178
- prompt: `
180
+ import { generateObject } from "ai";
181
+ import { openai } from "@ai-sdk/openai";
182
+ import { z } from "zod";
183
+
184
+ const Factuality = (model = openai("gpt-4o")) => async ({ input, output, expected }) => {
185
+ if (!expected) {
186
+ return { score: 1.0, metadata: { rationale: "No expected answer" } };
187
+ }
188
+
189
+ const { object } = await generateObject({
190
+ model,
191
+ prompt: `
179
192
  Compare the factual content of the submitted answer with the expert answer.
180
193
 
181
194
  Question: ${input}
@@ -189,21 +202,21 @@ const Factuality = (model = openai('gpt-4o')) => async ({ input, output, expecte
189
202
  (D) Contradicts expert answer
190
203
  (E) Different but factually equivalent
191
204
  `,
192
- schema: z.object({
193
- answer: z.enum(['A', 'B', 'C', 'D', 'E']),
194
- rationale: z.string()
195
- })
196
- });
197
-
198
- const scores = { A: 0.4, B: 0.6, C: 1, D: 0, E: 1 };
199
- return {
200
- score: scores[object.answer],
201
- metadata: { rationale: object.rationale, answer: object.answer }
205
+ schema: z.object({
206
+ answer: z.enum(["A", "B", "C", "D", "E"]),
207
+ rationale: z.string(),
208
+ }),
209
+ });
210
+
211
+ const scores = { A: 0.4, B: 0.6, C: 1, D: 0, E: 1 };
212
+ return {
213
+ score: scores[object.answer],
214
+ metadata: { rationale: object.rationale, answer: object.answer },
215
+ };
202
216
  };
203
- };
204
217
 
205
218
  // Usage
206
- scorers: [Factuality()]
219
+ scorers: [Factuality()];
207
220
  ```
208
221
 
209
222
  ### Skip Tests Conditionally
@@ -217,14 +230,18 @@ describeEval("gpt-4 tests", {
217
230
 
218
231
  ### Existing Test Suites
219
232
 
233
+ For integration with existing Vitest test suites, you can use the `.toEval()` matcher:
234
+
235
+ > **⚠️ Deprecated**: The `.toEval()` helper is deprecated. Use `describeEval()` instead for better test organization and multiple scorers support. We may consider bringing back a similar check, but it's currently too limited for many scorer implementations.
236
+
220
237
  ```javascript
221
238
  import "vitest-evals";
222
239
 
223
240
  test("capital check", () => {
224
241
  const simpleFactuality = async ({ output, expected }) => ({
225
- score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0
242
+ score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
226
243
  });
227
-
244
+
228
245
  expect("What is the capital of France?").toEval(
229
246
  "Paris",
230
247
  answerQuestion,
@@ -234,6 +251,25 @@ test("capital check", () => {
234
251
  });
235
252
  ```
236
253
 
254
+ **Recommended migration** to `describeEval()`:
255
+
256
+ ```javascript
257
+ import { describeEval } from "vitest-evals";
258
+
259
+ describeEval("capital check", {
260
+ data: async () => [
261
+ { input: "What is the capital of France?", expected: "Paris" },
262
+ ],
263
+ task: answerQuestion,
264
+ scorers: [
265
+ async ({ output, expected }) => ({
266
+ score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
267
+ }),
268
+ ],
269
+ threshold: 0.8,
270
+ });
271
+ ```
272
+
237
273
  ## Configuration
238
274
 
239
275
  ### Separate Eval Configuration
@@ -264,4 +300,4 @@ vitest --config=vitest.evals.config.ts
264
300
  ```shell
265
301
  npm install
266
302
  npm test
267
- ```
303
+ ```
package/dist/index.d.mts CHANGED
@@ -1,2 +1,3 @@
1
1
  import 'vitest';
2
- export { B as BaseScorerOptions, E as EvalMatchers, S as Score, c as ScoreFn, b as TaskFn, a as TaskResult, d as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, e as describeEval, f as formatScores, w as wrapText } from './scorers/toolCallScorer.mjs';
2
+ export { B as BaseScorerOptions, E as EvalMatchers, e as Score, f as ScoreFn, b as StructuredOutputScorer, S as StructuredOutputScorerOptions, d as TaskFn, c as TaskResult, g as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, h as describeEval, i as formatScores, w as wrapText } from './scorers/toolCallScorer.mjs';
3
+ import './scorers/utils.mjs';
package/dist/index.d.ts CHANGED
@@ -1,2 +1,3 @@
1
1
  import 'vitest';
2
- export { B as BaseScorerOptions, E as EvalMatchers, S as Score, c as ScoreFn, b as TaskFn, a as TaskResult, d as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, e as describeEval, f as formatScores, w as wrapText } from './scorers/toolCallScorer.js';
2
+ export { B as BaseScorerOptions, E as EvalMatchers, e as Score, f as ScoreFn, b as StructuredOutputScorer, S as StructuredOutputScorerOptions, d as TaskFn, c as TaskResult, g as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, h as describeEval, i as formatScores, w as wrapText } from './scorers/toolCallScorer.js';
3
+ import './scorers/utils.js';