vitest-evals 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +211 -172
- package/dist/index.d.mts +2 -98
- package/dist/index.d.ts +2 -98
- package/dist/index.js +270 -11
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +269 -11
- package/dist/index.mjs.map +1 -1
- package/dist/scorers/index.d.mts +2 -0
- package/dist/scorers/index.d.ts +2 -0
- package/dist/scorers/index.js +282 -0
- package/dist/scorers/index.js.map +1 -0
- package/dist/scorers/index.mjs +256 -0
- package/dist/scorers/index.mjs.map +1 -0
- package/dist/scorers/toolCallScorer.d.mts +240 -0
- package/dist/scorers/toolCallScorer.d.ts +240 -0
- package/dist/scorers/toolCallScorer.js +280 -0
- package/dist/scorers/toolCallScorer.js.map +1 -0
- package/dist/scorers/toolCallScorer.mjs +256 -0
- package/dist/scorers/toolCallScorer.mjs.map +1 -0
- package/package.json +16 -4
- package/dist/compatibility.test.d.mts +0 -2
- package/dist/compatibility.test.d.ts +0 -2
- package/dist/compatibility.test.js +0 -45009
- package/dist/compatibility.test.js.map +0 -1
- package/dist/compatibility.test.mjs +0 -45864
- package/dist/compatibility.test.mjs.map +0 -1
- package/dist/formatScores.test.d.mts +0 -2
- package/dist/formatScores.test.d.ts +0 -2
- package/dist/formatScores.test.js +0 -195
- package/dist/formatScores.test.js.map +0 -1
- package/dist/formatScores.test.mjs +0 -194
- package/dist/formatScores.test.mjs.map +0 -1
- package/dist/wrapText.test.d.mts +0 -2
- package/dist/wrapText.test.d.ts +0 -2
- package/dist/wrapText.test.js +0 -162
- package/dist/wrapText.test.js.map +0 -1
- package/dist/wrapText.test.mjs +0 -161
- package/dist/wrapText.test.mjs.map +0 -1
package/README.md
CHANGED
|
@@ -1,186 +1,246 @@
|
|
|
1
1
|
# vitest-evals
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Evaluate LLM outputs using the familiar Vitest testing framework.
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
## Use
|
|
5
|
+
## Installation
|
|
8
6
|
|
|
9
7
|
```shell
|
|
10
8
|
npm install -D vitest-evals
|
|
11
9
|
```
|
|
12
10
|
|
|
13
|
-
|
|
11
|
+
## Quick Start
|
|
14
12
|
|
|
15
13
|
```javascript
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
14
|
+
import { describeEval } from "vitest-evals";
|
|
15
|
+
|
|
16
|
+
describeEval("capital cities", {
|
|
17
|
+
data: async () => [
|
|
18
|
+
{ input: "What is the capital of France?", expected: "Paris" },
|
|
19
|
+
{ input: "What is the capital of Japan?", expected: "Tokyo" }
|
|
20
|
+
],
|
|
21
|
+
task: async (input) => {
|
|
22
|
+
const response = await queryLLM(input);
|
|
23
|
+
return response; // Simple string return
|
|
24
|
+
},
|
|
25
|
+
scorers: [async ({ output, expected }) => ({
|
|
26
|
+
score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0
|
|
27
|
+
})],
|
|
28
|
+
threshold: 0.8
|
|
29
|
+
});
|
|
24
30
|
```
|
|
25
31
|
|
|
26
|
-
|
|
27
|
-
|
|
32
|
+
## Tasks
|
|
33
|
+
|
|
34
|
+
Tasks process inputs and return outputs. Two formats are supported:
|
|
28
35
|
|
|
29
36
|
```javascript
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
37
|
+
// Simple: just return a string
|
|
38
|
+
const task = async (input) => "response";
|
|
39
|
+
|
|
40
|
+
// With tool tracking: return a TaskResult
|
|
41
|
+
const task = async (input) => ({
|
|
42
|
+
result: "response",
|
|
43
|
+
toolCalls: [
|
|
44
|
+
{ name: "search", arguments: { query: "..." }, result: {...} }
|
|
45
|
+
]
|
|
46
|
+
});
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Scorers
|
|
50
|
+
|
|
51
|
+
Scorers evaluate outputs and return a score (0-1). Use built-in scorers or create your own:
|
|
41
52
|
|
|
42
|
-
|
|
53
|
+
```javascript
|
|
54
|
+
// Built-in scorer
|
|
55
|
+
import { ToolCallScorer } from "vitest-evals";
|
|
56
|
+
// Or import individually
|
|
57
|
+
import { ToolCallScorer } from "vitest-evals/scorers/toolCallScorer";
|
|
58
|
+
|
|
59
|
+
describeEval("tool usage", {
|
|
60
|
+
data: async () => [
|
|
61
|
+
{ input: "Search weather", expectedTools: [{ name: "weather_api" }] }
|
|
62
|
+
],
|
|
63
|
+
task: weatherTask,
|
|
64
|
+
scorers: [ToolCallScorer()]
|
|
65
|
+
});
|
|
43
66
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
67
|
+
// Custom scorer
|
|
68
|
+
const LengthScorer = async ({ output }) => ({
|
|
69
|
+
score: output.length > 50 ? 1.0 : 0.0
|
|
70
|
+
});
|
|
47
71
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
// ok with a 60% score (see the implementation for why).
|
|
51
|
-
threshold: 0.6,
|
|
72
|
+
// TypeScript scorer with custom options
|
|
73
|
+
import { type ScoreFn, type BaseScorerOptions } from "vitest-evals";
|
|
52
74
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
75
|
+
interface CustomOptions extends BaseScorerOptions {
|
|
76
|
+
minLength: number;
|
|
77
|
+
}
|
|
56
78
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
})
|
|
79
|
+
const TypedScorer: ScoreFn<CustomOptions> = async (opts) => ({
|
|
80
|
+
score: opts.output.length >= opts.minLength ? 1.0 : 0.0
|
|
81
|
+
});
|
|
61
82
|
```
|
|
62
83
|
|
|
63
|
-
###
|
|
84
|
+
### Built-in Scorers
|
|
85
|
+
|
|
86
|
+
#### ToolCallScorer
|
|
87
|
+
Evaluates if the expected tools were called with correct arguments.
|
|
64
88
|
|
|
65
89
|
```javascript
|
|
66
|
-
//
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
0.8
|
|
78
|
-
);
|
|
79
|
-
});
|
|
90
|
+
// Basic usage - strict matching, any order
|
|
91
|
+
describeEval("search test", {
|
|
92
|
+
data: async () => [{
|
|
93
|
+
input: "Find Italian restaurants",
|
|
94
|
+
expectedTools: [
|
|
95
|
+
{ name: "search", arguments: { type: "restaurant" } },
|
|
96
|
+
{ name: "filter", arguments: { cuisine: "italian" } }
|
|
97
|
+
]
|
|
98
|
+
}],
|
|
99
|
+
task: myTask,
|
|
100
|
+
scorers: [ToolCallScorer()]
|
|
80
101
|
});
|
|
102
|
+
|
|
103
|
+
// Strict evaluation - exact order and parameters
|
|
104
|
+
scorers: [ToolCallScorer({
|
|
105
|
+
ordered: true, // Tools must be in exact order
|
|
106
|
+
params: "strict" // Parameters must match exactly
|
|
107
|
+
})]
|
|
108
|
+
|
|
109
|
+
// Flexible evaluation
|
|
110
|
+
scorers: [ToolCallScorer({
|
|
111
|
+
requireAll: false, // Partial matches give partial credit
|
|
112
|
+
allowExtras: false // No additional tools allowed
|
|
113
|
+
})]
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
**Default behavior:**
|
|
117
|
+
- Strict parameter matching (exact equality required)
|
|
118
|
+
- Any order allowed
|
|
119
|
+
- Extra tools allowed
|
|
120
|
+
- All expected tools required
|
|
121
|
+
|
|
122
|
+
## AI SDK Integration
|
|
123
|
+
|
|
124
|
+
See [`src/ai-sdk-integration.test.ts`](src/ai-sdk-integration.test.ts) for a complete example with the Vercel AI SDK.
|
|
125
|
+
|
|
126
|
+
Transform provider responses to our format:
|
|
127
|
+
|
|
128
|
+
```javascript
|
|
129
|
+
// Vercel AI SDK
|
|
130
|
+
const { text, toolCalls, toolResults } = await generateText(...);
|
|
131
|
+
return {
|
|
132
|
+
result: text,
|
|
133
|
+
toolCalls: toolCalls?.map((call, i) => ({
|
|
134
|
+
id: call.toolCallId,
|
|
135
|
+
name: call.toolName,
|
|
136
|
+
arguments: call.args,
|
|
137
|
+
result: toolResults?.[i]?.result,
|
|
138
|
+
status: toolResults?.[i]?.error ? 'failed' : 'completed'
|
|
139
|
+
}))
|
|
140
|
+
};
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## Advanced Usage
|
|
144
|
+
|
|
145
|
+
### Advanced Scorers
|
|
146
|
+
|
|
147
|
+
#### Using autoevals
|
|
148
|
+
|
|
149
|
+
For sophisticated evaluation, use autoevals scorers:
|
|
150
|
+
|
|
151
|
+
```javascript
|
|
152
|
+
import { Factuality, ClosedQA } from "autoevals";
|
|
153
|
+
|
|
154
|
+
scorers: [
|
|
155
|
+
Factuality, // LLM-based factuality checking
|
|
156
|
+
ClosedQA.partial({
|
|
157
|
+
criteria: "Does the answer mention Paris?"
|
|
158
|
+
})
|
|
159
|
+
]
|
|
81
160
|
```
|
|
82
161
|
|
|
83
|
-
|
|
162
|
+
#### Custom LLM-based Factuality Scorer
|
|
84
163
|
|
|
85
|
-
|
|
164
|
+
Here's an example of implementing your own LLM-based factuality scorer using the Vercel AI SDK:
|
|
86
165
|
|
|
87
166
|
```javascript
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
}) => {
|
|
167
|
+
import { generateObject } from 'ai';
|
|
168
|
+
import { openai } from '@ai-sdk/openai';
|
|
169
|
+
import { z } from 'zod';
|
|
170
|
+
|
|
171
|
+
const Factuality = (model = openai('gpt-4o')) => async ({ input, output, expected }) => {
|
|
172
|
+
if (!expected) {
|
|
173
|
+
return { score: 1.0, metadata: { rationale: "No expected answer" } };
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
const { object } = await generateObject({
|
|
177
|
+
model,
|
|
178
|
+
prompt: `
|
|
179
|
+
Compare the factual content of the submitted answer with the expert answer.
|
|
180
|
+
|
|
181
|
+
Question: ${input}
|
|
182
|
+
Expert: ${expected}
|
|
183
|
+
Submission: ${output}
|
|
184
|
+
|
|
185
|
+
Options:
|
|
186
|
+
(A) Subset of expert answer
|
|
187
|
+
(B) Superset of expert answer
|
|
188
|
+
(C) Same content as expert
|
|
189
|
+
(D) Contradicts expert answer
|
|
190
|
+
(E) Different but factually equivalent
|
|
191
|
+
`,
|
|
192
|
+
schema: z.object({
|
|
193
|
+
answer: z.enum(['A', 'B', 'C', 'D', 'E']),
|
|
194
|
+
rationale: z.string()
|
|
195
|
+
})
|
|
196
|
+
});
|
|
197
|
+
|
|
198
|
+
const scores = { A: 0.4, B: 0.6, C: 1, D: 0, E: 1 };
|
|
93
199
|
return {
|
|
94
|
-
score:
|
|
200
|
+
score: scores[object.answer],
|
|
201
|
+
metadata: { rationale: object.rationale, answer: object.answer }
|
|
95
202
|
};
|
|
96
203
|
};
|
|
204
|
+
|
|
205
|
+
// Usage
|
|
206
|
+
scorers: [Factuality()]
|
|
97
207
|
```
|
|
98
208
|
|
|
99
|
-
|
|
100
|
-
on the model, enabling you to evaluate against multiple models:
|
|
101
|
-
|
|
102
|
-
````javascript
|
|
103
|
-
import { generateObject, type LanguageModel } from "ai";
|
|
104
|
-
import { z } from "zod";
|
|
105
|
-
|
|
106
|
-
/**
|
|
107
|
-
* A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`.
|
|
108
|
-
*
|
|
109
|
-
* @param model - The language model to utilize (via `ai`).
|
|
110
|
-
*
|
|
111
|
-
* @example
|
|
112
|
-
* ```javascript
|
|
113
|
-
* import { openai } from "@ai-sdk/openai";
|
|
114
|
-
*
|
|
115
|
-
* scorers: [Factuality(openai("gpt-4o"))]
|
|
116
|
-
* ```
|
|
117
|
-
*/
|
|
118
|
-
export function Factuality(model: LanguageModel) {
|
|
119
|
-
return async Factuality(opts: {
|
|
120
|
-
input: string;
|
|
121
|
-
output: string;
|
|
122
|
-
expected?: string;
|
|
123
|
-
}) => {
|
|
124
|
-
const { object } = await generateObject({
|
|
125
|
-
model,
|
|
126
|
-
/**
|
|
127
|
-
* Prompt implementation from `autoevals`:
|
|
128
|
-
*
|
|
129
|
-
* {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
|
|
130
|
-
*/
|
|
131
|
-
prompt: `
|
|
132
|
-
You are comparing a submitted answer to an expert answer on a given question. Here is the data:
|
|
133
|
-
[BEGIN DATA]
|
|
134
|
-
************
|
|
135
|
-
[Question]: ${opts.input}
|
|
136
|
-
************
|
|
137
|
-
[Expert]: ${opts.expected}
|
|
138
|
-
************
|
|
139
|
-
[Submission]: ${opts.output}
|
|
140
|
-
************
|
|
141
|
-
[END DATA]
|
|
142
|
-
|
|
143
|
-
Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
|
|
144
|
-
The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
|
|
145
|
-
(A) The submitted answer is a subset of the expert answer and is fully consistent with it.
|
|
146
|
-
(B) The submitted answer is a superset of the expert answer and is fully consistent with it.
|
|
147
|
-
(C) The submitted answer contains all the same details as the expert answer.
|
|
148
|
-
(D) There is a disagreement between the submitted answer and the expert answer.
|
|
149
|
-
(E) The answers differ, but these differences don't matter from the perspective of factuality.
|
|
150
|
-
`,
|
|
151
|
-
schema: z.object({
|
|
152
|
-
answer: z.enum(["A", "B", "C", "D", "E"]).describe("Your selection."),
|
|
153
|
-
rationale: z
|
|
154
|
-
.string()
|
|
155
|
-
.describe("Why you chose this answer. Be very detailed."),
|
|
156
|
-
}),
|
|
157
|
-
});
|
|
158
|
-
|
|
159
|
-
const scores = {
|
|
160
|
-
A: 0.4,
|
|
161
|
-
B: 0.6,
|
|
162
|
-
C: 1,
|
|
163
|
-
D: 0,
|
|
164
|
-
E: 1,
|
|
165
|
-
};
|
|
166
|
-
|
|
167
|
-
return {
|
|
168
|
-
score: scores[object.answer],
|
|
169
|
-
metadata: {
|
|
170
|
-
rationale: object.rationale,
|
|
171
|
-
},
|
|
172
|
-
};
|
|
173
|
-
};
|
|
174
|
-
}
|
|
175
|
-
````
|
|
209
|
+
### Skip Tests Conditionally
|
|
176
210
|
|
|
177
|
-
|
|
211
|
+
```javascript
|
|
212
|
+
describeEval("gpt-4 tests", {
|
|
213
|
+
skipIf: () => !process.env.OPENAI_API_KEY,
|
|
214
|
+
// ...
|
|
215
|
+
});
|
|
216
|
+
```
|
|
178
217
|
|
|
179
|
-
|
|
218
|
+
### Existing Test Suites
|
|
219
|
+
|
|
220
|
+
```javascript
|
|
221
|
+
import "vitest-evals";
|
|
222
|
+
|
|
223
|
+
test("capital check", () => {
|
|
224
|
+
const simpleFactuality = async ({ output, expected }) => ({
|
|
225
|
+
score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0
|
|
226
|
+
});
|
|
227
|
+
|
|
228
|
+
expect("What is the capital of France?").toEval(
|
|
229
|
+
"Paris",
|
|
230
|
+
answerQuestion,
|
|
231
|
+
simpleFactuality,
|
|
232
|
+
0.8
|
|
233
|
+
);
|
|
234
|
+
});
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## Configuration
|
|
238
|
+
|
|
239
|
+
### Separate Eval Configuration
|
|
240
|
+
|
|
241
|
+
Create `vitest.evals.config.ts`:
|
|
180
242
|
|
|
181
243
|
```javascript
|
|
182
|
-
// vitest.evals.config.ts
|
|
183
|
-
/// <reference types="vitest" />
|
|
184
244
|
import { defineConfig } from "vitest/config";
|
|
185
245
|
import defaultConfig from "./vitest.config";
|
|
186
246
|
|
|
@@ -188,41 +248,20 @@ export default defineConfig({
|
|
|
188
248
|
...defaultConfig,
|
|
189
249
|
test: {
|
|
190
250
|
...defaultConfig.test,
|
|
191
|
-
|
|
192
|
-
include: ["src/**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}"],
|
|
251
|
+
include: ["src/**/*.eval.{js,ts}"],
|
|
193
252
|
},
|
|
194
253
|
});
|
|
195
254
|
```
|
|
196
255
|
|
|
197
|
-
|
|
256
|
+
Run evals separately:
|
|
198
257
|
|
|
199
258
|
```shell
|
|
200
259
|
vitest --config=vitest.evals.config.ts
|
|
201
260
|
```
|
|
202
261
|
|
|
203
|
-
It's recommended to add this to your `package.json`, such as under an `eval` helper:
|
|
204
|
-
|
|
205
|
-
```javascript
|
|
206
|
-
// package.json
|
|
207
|
-
{
|
|
208
|
-
// ...
|
|
209
|
-
"scripts": {
|
|
210
|
-
// ...
|
|
211
|
-
"eval": "vitest --config=vitest.evals.config.ts",
|
|
212
|
-
}
|
|
213
|
-
}
|
|
214
|
-
```
|
|
215
|
-
|
|
216
|
-
You can then run your evals using `npm run eval`.
|
|
217
|
-
|
|
218
262
|
## Development
|
|
219
263
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
```
|
|
225
|
-
|
|
226
|
-
```javascript
|
|
227
|
-
pnpm test
|
|
228
|
-
```
|
|
264
|
+
```shell
|
|
265
|
+
npm install
|
|
266
|
+
npm test
|
|
267
|
+
```
|
package/dist/index.d.mts
CHANGED
|
@@ -1,98 +1,2 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
3
|
-
type TaskFn = (input: string) => Promise<string>;
|
|
4
|
-
type Score = {
|
|
5
|
-
score: number | null;
|
|
6
|
-
metadata?: {
|
|
7
|
-
rationale?: string;
|
|
8
|
-
output?: string;
|
|
9
|
-
};
|
|
10
|
-
};
|
|
11
|
-
type ScoreFn = (opts: {
|
|
12
|
-
input: string;
|
|
13
|
-
output: string;
|
|
14
|
-
expected?: string;
|
|
15
|
-
}) => Promise<Score> | Score;
|
|
16
|
-
type ToEval<R = unknown> = (expected: string, taskFn: TaskFn, scoreFn: ScoreFn, threshold?: number) => Promise<R>;
|
|
17
|
-
interface EvalMatchers<R = unknown> {
|
|
18
|
-
toEval: ToEval<R>;
|
|
19
|
-
}
|
|
20
|
-
declare module "vitest" {
|
|
21
|
-
interface Assertion<T = any> extends EvalMatchers<T> {
|
|
22
|
-
}
|
|
23
|
-
interface AsymmetricMatchersContaining extends EvalMatchers {
|
|
24
|
-
}
|
|
25
|
-
interface TaskMeta {
|
|
26
|
-
eval?: {
|
|
27
|
-
scores: (Score & {
|
|
28
|
-
name: string;
|
|
29
|
-
})[];
|
|
30
|
-
avgScore: number;
|
|
31
|
-
};
|
|
32
|
-
}
|
|
33
|
-
}
|
|
34
|
-
/**
|
|
35
|
-
* Creates a test suite for evaluating language model outputs.
|
|
36
|
-
*
|
|
37
|
-
* @param name - The name of the test suite
|
|
38
|
-
* @param options - Configuration options
|
|
39
|
-
* @param options.data - Async function that returns an array of test cases with input and expected values
|
|
40
|
-
* @param options.task - Function that processes the input and returns the model output
|
|
41
|
-
* @param options.skipIf - Optional function that determines if tests should be skipped
|
|
42
|
-
* @param options.scorers - Array of scoring functions that evaluate model outputs
|
|
43
|
-
* @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0
|
|
44
|
-
* @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)
|
|
45
|
-
*
|
|
46
|
-
* @example
|
|
47
|
-
* ```javascript
|
|
48
|
-
* describeEval("capital cities test", {
|
|
49
|
-
* data: async () => [{
|
|
50
|
-
* input: "What is the capital of France?",
|
|
51
|
-
* expected: "Paris"
|
|
52
|
-
* }],
|
|
53
|
-
* task: async (input) => {
|
|
54
|
-
* // Query LLM here
|
|
55
|
-
* return "Paris";
|
|
56
|
-
* },
|
|
57
|
-
* scorers: [checkFactuality],
|
|
58
|
-
* threshold: 0.8
|
|
59
|
-
* });
|
|
60
|
-
* ```
|
|
61
|
-
*/
|
|
62
|
-
declare function describeEval(name: string, { data, task, skipIf, scorers, threshold, timeout, }: {
|
|
63
|
-
data: () => Promise<{
|
|
64
|
-
input: string;
|
|
65
|
-
expected: string;
|
|
66
|
-
}[]>;
|
|
67
|
-
task: TaskFn;
|
|
68
|
-
skipIf?: () => boolean;
|
|
69
|
-
scorers: ScoreFn[];
|
|
70
|
-
threshold?: number | null;
|
|
71
|
-
timeout?: number;
|
|
72
|
-
}): vitest.SuiteCollector<object>;
|
|
73
|
-
declare function formatScores(scores: (Score & {
|
|
74
|
-
name: string;
|
|
75
|
-
})[]): string;
|
|
76
|
-
/**
|
|
77
|
-
* Wraps text to fit within a specified width, breaking at word boundaries.
|
|
78
|
-
*
|
|
79
|
-
* @param text - The text to wrap
|
|
80
|
-
* @param width - The maximum width in characters (default: 80)
|
|
81
|
-
* @returns The wrapped text with line breaks
|
|
82
|
-
*
|
|
83
|
-
* @example
|
|
84
|
-
* ```javascript
|
|
85
|
-
* const wrapped = wrapText("This is a very long text that needs to be wrapped to fit within an 80 character width.", 20);
|
|
86
|
-
* console.log(wrapped);
|
|
87
|
-
* // Output:
|
|
88
|
-
* // This is a very
|
|
89
|
-
* // long text that
|
|
90
|
-
* // needs to be
|
|
91
|
-
* // wrapped to fit
|
|
92
|
-
* // within an 80
|
|
93
|
-
* // character width.
|
|
94
|
-
* ```
|
|
95
|
-
*/
|
|
96
|
-
declare function wrapText(text: string, width?: number): string;
|
|
97
|
-
|
|
98
|
-
export { type EvalMatchers, type Score, type ScoreFn, type TaskFn, type ToEval, describeEval, formatScores, wrapText };
|
|
1
|
+
import 'vitest';
|
|
2
|
+
export { B as BaseScorerOptions, E as EvalMatchers, S as Score, c as ScoreFn, b as TaskFn, a as TaskResult, d as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, e as describeEval, f as formatScores, w as wrapText } from './scorers/toolCallScorer.mjs';
|
package/dist/index.d.ts
CHANGED
|
@@ -1,98 +1,2 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
3
|
-
type TaskFn = (input: string) => Promise<string>;
|
|
4
|
-
type Score = {
|
|
5
|
-
score: number | null;
|
|
6
|
-
metadata?: {
|
|
7
|
-
rationale?: string;
|
|
8
|
-
output?: string;
|
|
9
|
-
};
|
|
10
|
-
};
|
|
11
|
-
type ScoreFn = (opts: {
|
|
12
|
-
input: string;
|
|
13
|
-
output: string;
|
|
14
|
-
expected?: string;
|
|
15
|
-
}) => Promise<Score> | Score;
|
|
16
|
-
type ToEval<R = unknown> = (expected: string, taskFn: TaskFn, scoreFn: ScoreFn, threshold?: number) => Promise<R>;
|
|
17
|
-
interface EvalMatchers<R = unknown> {
|
|
18
|
-
toEval: ToEval<R>;
|
|
19
|
-
}
|
|
20
|
-
declare module "vitest" {
|
|
21
|
-
interface Assertion<T = any> extends EvalMatchers<T> {
|
|
22
|
-
}
|
|
23
|
-
interface AsymmetricMatchersContaining extends EvalMatchers {
|
|
24
|
-
}
|
|
25
|
-
interface TaskMeta {
|
|
26
|
-
eval?: {
|
|
27
|
-
scores: (Score & {
|
|
28
|
-
name: string;
|
|
29
|
-
})[];
|
|
30
|
-
avgScore: number;
|
|
31
|
-
};
|
|
32
|
-
}
|
|
33
|
-
}
|
|
34
|
-
/**
|
|
35
|
-
* Creates a test suite for evaluating language model outputs.
|
|
36
|
-
*
|
|
37
|
-
* @param name - The name of the test suite
|
|
38
|
-
* @param options - Configuration options
|
|
39
|
-
* @param options.data - Async function that returns an array of test cases with input and expected values
|
|
40
|
-
* @param options.task - Function that processes the input and returns the model output
|
|
41
|
-
* @param options.skipIf - Optional function that determines if tests should be skipped
|
|
42
|
-
* @param options.scorers - Array of scoring functions that evaluate model outputs
|
|
43
|
-
* @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0
|
|
44
|
-
* @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)
|
|
45
|
-
*
|
|
46
|
-
* @example
|
|
47
|
-
* ```javascript
|
|
48
|
-
* describeEval("capital cities test", {
|
|
49
|
-
* data: async () => [{
|
|
50
|
-
* input: "What is the capital of France?",
|
|
51
|
-
* expected: "Paris"
|
|
52
|
-
* }],
|
|
53
|
-
* task: async (input) => {
|
|
54
|
-
* // Query LLM here
|
|
55
|
-
* return "Paris";
|
|
56
|
-
* },
|
|
57
|
-
* scorers: [checkFactuality],
|
|
58
|
-
* threshold: 0.8
|
|
59
|
-
* });
|
|
60
|
-
* ```
|
|
61
|
-
*/
|
|
62
|
-
declare function describeEval(name: string, { data, task, skipIf, scorers, threshold, timeout, }: {
|
|
63
|
-
data: () => Promise<{
|
|
64
|
-
input: string;
|
|
65
|
-
expected: string;
|
|
66
|
-
}[]>;
|
|
67
|
-
task: TaskFn;
|
|
68
|
-
skipIf?: () => boolean;
|
|
69
|
-
scorers: ScoreFn[];
|
|
70
|
-
threshold?: number | null;
|
|
71
|
-
timeout?: number;
|
|
72
|
-
}): vitest.SuiteCollector<object>;
|
|
73
|
-
declare function formatScores(scores: (Score & {
|
|
74
|
-
name: string;
|
|
75
|
-
})[]): string;
|
|
76
|
-
/**
|
|
77
|
-
* Wraps text to fit within a specified width, breaking at word boundaries.
|
|
78
|
-
*
|
|
79
|
-
* @param text - The text to wrap
|
|
80
|
-
* @param width - The maximum width in characters (default: 80)
|
|
81
|
-
* @returns The wrapped text with line breaks
|
|
82
|
-
*
|
|
83
|
-
* @example
|
|
84
|
-
* ```javascript
|
|
85
|
-
* const wrapped = wrapText("This is a very long text that needs to be wrapped to fit within an 80 character width.", 20);
|
|
86
|
-
* console.log(wrapped);
|
|
87
|
-
* // Output:
|
|
88
|
-
* // This is a very
|
|
89
|
-
* // long text that
|
|
90
|
-
* // needs to be
|
|
91
|
-
* // wrapped to fit
|
|
92
|
-
* // within an 80
|
|
93
|
-
* // character width.
|
|
94
|
-
* ```
|
|
95
|
-
*/
|
|
96
|
-
declare function wrapText(text: string, width?: number): string;
|
|
97
|
-
|
|
98
|
-
export { type EvalMatchers, type Score, type ScoreFn, type TaskFn, type ToEval, describeEval, formatScores, wrapText };
|
|
1
|
+
import 'vitest';
|
|
2
|
+
export { B as BaseScorerOptions, E as EvalMatchers, S as Score, c as ScoreFn, b as TaskFn, a as TaskResult, d as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, e as describeEval, f as formatScores, w as wrapText } from './scorers/toolCallScorer.js';
|