vitest-evals 0.1.1 → 0.1.3
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/README.md +139 -70
- package/dist/index.d.mts +38 -5
- package/dist/index.d.ts +38 -5
- package/dist/index.js +84 -10
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +85 -10
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -1
package/README.md
CHANGED
@@ -1,8 +1,8 @@
 # vitest-evals
 
-This project is a
+This project is a prototype of extending vitest to support basic _Evals_ functionality. Evals are a type of testing most commonly deployed to _evaluate_ the results of calls to language models. This lets you use them with a pattern of testing you're already familiar with, one that works well with your existing continuous integration toolchain.
 
-This is heavily inspired by [Evalite](https://www.evalite.dev/), but opts for a vitest-native approach to maximize the compatibility of the existing ecosystem.
+This is heavily inspired by [Evalite](https://www.evalite.dev/), but opts for a vitest-native approach to maximize compatibility with the existing ecosystem. This means you can use it with your existing toolchain, including reporting such as code coverage and xunit.
 
 ## Use
 
@@ -10,104 +10,173 @@ This is heavily inspired by [Evalite](https://www.evalite.dev/), but opts for a
 npm install -D vitest-evals
 ```
 
-
+You've likely already got a mechanism for passing the user input into your model, for example:
 
 ```javascript
-
+async function answerQuestion(prompt: string) {
+  const model = openai("gpt-4o");
+  const { text } = await generateText({
+    model,
+    prompt,
+  });
+  return text;
+}
+```
+
+You'll use this as the `task` within your evals. You then simply need to define a set of scenarios
+and a way to validate whether the LLM is responding as you desire:
+
+```javascript
+import { describeEval } from "vitest-evals";
+import { Factuality } from "autoevals";
+
 describeEval("my evals", {
   data: async () => {
+    // The scenarios you wish to evaluate
     return [{
       input: "What is the capital of France?",
      expected: "Paris",
     }];
   },
-
-
-
-
-
-  scorers: [
-
-    //
+
+  task: answerQuestion,
+
+  // Scorers determine whether the response was acceptable - in this case we're using
+  // a secondary LLM prompt to judge the response of the first.
+  scorers: [Factuality],
+
+  // The threshold required for the average score for this eval to pass. This will be
+  // based on the scorers you've provided, and in the case of Factuality, we might be
+  // ok with a 60% score (see the implementation for why).
+  threshold: 0.6,
+
+  // The timeout for each test. Defaults to 10s. You may need to increase this if your model
+  // provider has high latency or you're using a large number of scorers.
+  // timeout: 10000,
+
+  // A check to determine if these tests should run. This is helpful to control tests so they only
+  // run in certain situations, for example when a model provider's API key is defined.
+  // skipIf: () => !process.env.OPENAI_API_KEY
 })
 ```
 
 ### Existing Test Suites
 
 ```javascript
-// import vitest-evals to expose expect().toEval()
+// import `vitest-evals` to expose `expect().toEval()`
+// This can also be done via the `setupFiles` pattern in `vitest`.
 import "vitest-evals";
-import {
-
-function askTheLLM(input: string) {
-  // TODO: query an LLM using a factuality checker
-  const output = 'Paris';
-  return output;
-}
+import { Factuality } from "autoevals";
 
 describe("my test suite", () => {
   it("kind of works", async () => {
-    expect("What is the capital of France?").toEval("Paris",
+    await expect("What is the capital of France?").toEval("Paris", answerQuestion, Factuality, 0.8);
   });
 });
 ```
 
-###
+### Scoring
+
+Scorers are compatible with the `autoevals` interface, but are also simple to implement on your own:
 
 ```javascript
-export const
+export const Contains = async (opts: {
   input: string;
   expected: string;
   output: string;
 }) => {
-
-
-  /**
-   * Prompt taken from autoevals:
-   *
-   * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
-   */
-  prompt: `
-You are comparing a submitted answer to an expert answer on a given question. Here is the data:
-[BEGIN DATA]
-************
-[Question]: ${opts.input}
-************
-[Expert]: ${opts.expected}
-************
-[Submission]: ${opts.output}
-************
-[END DATA]
-
-Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
-The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
-(A) The submitted answer is a subset of the expert answer and is fully consistent with it.
-(B) The submitted answer is a superset of the expert answer and is fully consistent with it.
-(C) The submitted answer contains all the same details as the expert answer.
-(D) There is a disagreement between the submitted answer and the expert answer.
-(E) The answers differ, but these differences don't matter from the perspective of factuality.
-`,
-  schema: z.object({
-    answer: z.enum(["A", "B", "C", "D", "E"]).describe("Your selection."),
-    rationale: z
-      .string()
-      .describe("Why you chose this answer. Be very detailed."),
-  }),
-});
-
-const scores = {
-  A: 0.4,
-  B: 0.6,
-  C: 1,
-  D: 0,
-  E: 1,
+  return {
+    score: opts.output.indexOf(opts.expected) !== -1 ? 1.0 : 0.0,
   };
+}
+```
 
-
-
-
-
-
+For something more realistic, here's a reimplementation of the Factuality scorer from `autoevals`, with some flexibility
+in the choice of model, enabling you to evaluate against multiple models:
+
+```javascript
+import { generateObject, type LanguageModel } from "ai";
+import { z } from "zod";
+
+/**
+ * A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`.
+ *
+ * @param model - The language model to utilize (via `ai`).
+ *
+ * @example
+ * ```javascript
+ * import { openai } from "@ai-sdk/openai";
+ *
+ * scorers: [Factuality(openai("gpt-4o"))]
+ * ```
+ */
+export function Factuality(model: LanguageModel) {
+  return async (opts: {
+    input: string;
+    expected: string;
+    output: string;
+  }) => {
+    const { object } = await generateObject({
+      model,
+      /**
+       * Prompt implementation from `autoevals`:
+       *
+       * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
+       */
+      prompt: `
+You are comparing a submitted answer to an expert answer on a given question. Here is the data:
+[BEGIN DATA]
+************
+[Question]: ${opts.input}
+************
+[Expert]: ${opts.expected}
+************
+[Submission]: ${opts.output}
+************
+[END DATA]
+
+Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
+The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
+(A) The submitted answer is a subset of the expert answer and is fully consistent with it.
+(B) The submitted answer is a superset of the expert answer and is fully consistent with it.
+(C) The submitted answer contains all the same details as the expert answer.
+(D) There is a disagreement between the submitted answer and the expert answer.
+(E) The answers differ, but these differences don't matter from the perspective of factuality.
+`,
+      schema: z.object({
+        answer: z.enum(["A", "B", "C", "D", "E"]).describe("Your selection."),
+        rationale: z
+          .string()
+          .describe("Why you chose this answer. Be very detailed."),
+      }),
+    });
+
+    const scores = {
+      A: 0.4,
+      B: 0.6,
+      C: 1,
+      D: 0,
+      E: 1,
+    };
+
+    return {
+      score: scores[object.answer],
+      metadata: {
+        rationale: object.rationale,
+      },
+    };
   };
-}
+}
+```
+
+## Development
+
+Nothing fancy here.
+
+```bash
+pnpm install
+```
+
+```bash
+pnpm test
 ```
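The README mentions slotting into an existing toolchain (code coverage, xunit) without showing the wiring. Below is a minimal sketch, assuming vitest's built-in `junit` reporter and V8 coverage provider; the `*.eval.ts` glob and report path are made-up conventions, not part of the package.

```typescript
// vitest.config.ts — hypothetical wiring for CI reporting of evals.
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    include: ["**/*.eval.ts"], // assumed naming convention for eval files
    reporters: ["default", "junit"], // junit/xunit output for existing CI tooling
    outputFile: { junit: "./reports/evals-junit.xml" },
    coverage: {
      provider: "v8", // coverage works as for any other vitest suite
    },
  },
});
```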
package/dist/index.d.mts
CHANGED
@@ -4,14 +4,15 @@ type TaskFn = (input: string) => Promise<string>;
 type Score = {
     score: number | null;
     metadata?: {
-        rationale
+        rationale?: string;
+        output?: string;
     };
 };
 type ScoreFn = (opts: {
     input: string;
-    expected: string;
     output: string;
-
+    expected?: string;
+}) => Promise<Score> | Score;
 type ToEval<R = unknown> = (expected: string, taskFn: TaskFn, scoreFn: ScoreFn, threshold?: number) => Promise<R>;
 interface EvalMatchers<R = unknown> {
     toEval: ToEval<R>;
@@ -22,15 +23,47 @@ declare module "vitest" {
     interface AsymmetricMatchersContaining extends EvalMatchers {
     }
 }
-
+/**
+ * Creates a test suite for evaluating language model outputs.
+ *
+ * @param name - The name of the test suite
+ * @param options - Configuration options
+ * @param options.data - Async function that returns an array of test cases with input and expected values
+ * @param options.task - Function that processes the input and returns the model output
+ * @param options.skipIf - Optional function that determines if tests should be skipped
+ * @param options.scorers - Array of scoring functions that evaluate model outputs
+ * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0
+ * @param options.timeout - Test timeout in milliseconds, defaults to 10000
+ *
+ * @example
+ * ```javascript
+ * describeEval("capital cities test", {
+ *   data: async () => [{
+ *     input: "What is the capital of France?",
+ *     expected: "Paris"
+ *   }],
+ *   task: async (input) => {
+ *     // Query LLM here
+ *     return "Paris";
+ *   },
+ *   scorers: [checkFactuality],
+ *   threshold: 0.8
+ * });
+ * ```
+ */
+declare function describeEval(name: string, { data, task, skipIf, scorers, threshold, timeout, }: {
     data: () => Promise<{
         input: string;
         expected: string;
     }[]>;
     task: TaskFn;
+    skipIf?: () => boolean;
     scorers: ScoreFn[];
     threshold?: number | null;
     timeout?: number;
 }): vitest.SuiteCollector<object>;
+declare function formatScores(scores: (Score & {
+    name: string;
+})[]): string;
 
-export { type EvalMatchers, type Score, type ScoreFn, type TaskFn, type ToEval, describeEval };
+export { type EvalMatchers, type Score, type ScoreFn, type TaskFn, type ToEval, describeEval, formatScores };
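Per the updated `ScoreFn` type above, `expected` is now optional, so a scorer that only inspects the raw output type-checks cleanly. A minimal sketch against these declarations (the `NotEmpty` name is illustrative, not part of the package):

```typescript
import type { Score, ScoreFn } from "vitest-evals";

// A scorer that ignores `expected` entirely, which the loosened
// signature now permits; it also fills the new metadata.output field.
export const NotEmpty: ScoreFn = ({ output }): Score => ({
  score: output.trim().length > 0 ? 1.0 : 0.0,
  metadata: {
    rationale: "Passes whenever the model returns a non-empty reply.",
    output,
  },
});
```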
package/dist/index.d.ts
CHANGED
@@ -4,14 +4,15 @@ type TaskFn = (input: string) => Promise<string>;
 type Score = {
     score: number | null;
     metadata?: {
-        rationale
+        rationale?: string;
+        output?: string;
     };
 };
 type ScoreFn = (opts: {
     input: string;
-    expected: string;
     output: string;
-
+    expected?: string;
+}) => Promise<Score> | Score;
 type ToEval<R = unknown> = (expected: string, taskFn: TaskFn, scoreFn: ScoreFn, threshold?: number) => Promise<R>;
 interface EvalMatchers<R = unknown> {
     toEval: ToEval<R>;
@@ -22,15 +23,47 @@ declare module "vitest" {
     interface AsymmetricMatchersContaining extends EvalMatchers {
     }
 }
-
+/**
+ * Creates a test suite for evaluating language model outputs.
+ *
+ * @param name - The name of the test suite
+ * @param options - Configuration options
+ * @param options.data - Async function that returns an array of test cases with input and expected values
+ * @param options.task - Function that processes the input and returns the model output
+ * @param options.skipIf - Optional function that determines if tests should be skipped
+ * @param options.scorers - Array of scoring functions that evaluate model outputs
+ * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0
+ * @param options.timeout - Test timeout in milliseconds, defaults to 10000
+ *
+ * @example
+ * ```javascript
+ * describeEval("capital cities test", {
+ *   data: async () => [{
+ *     input: "What is the capital of France?",
+ *     expected: "Paris"
+ *   }],
+ *   task: async (input) => {
+ *     // Query LLM here
+ *     return "Paris";
+ *   },
+ *   scorers: [checkFactuality],
+ *   threshold: 0.8
+ * });
+ * ```
+ */
+declare function describeEval(name: string, { data, task, skipIf, scorers, threshold, timeout, }: {
     data: () => Promise<{
         input: string;
         expected: string;
     }[]>;
     task: TaskFn;
+    skipIf?: () => boolean;
     scorers: ScoreFn[];
     threshold?: number | null;
     timeout?: number;
 }): vitest.SuiteCollector<object>;
+declare function formatScores(scores: (Score & {
+    name: string;
+})[]): string;
 
-export { type EvalMatchers, type Score, type ScoreFn, type TaskFn, type ToEval, describeEval };
+export { type EvalMatchers, type Score, type ScoreFn, type TaskFn, type ToEval, describeEval, formatScores };
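These declarations mirror `index.d.mts`. The notable new option in both is `skipIf`; a sketch of how it might be used, following the commented-out example in the README (the placeholder task is illustrative only):

```typescript
import { describeEval } from "vitest-evals";
import { Factuality } from "autoevals";

describeEval("capital cities (gated)", {
  data: async () => [
    { input: "What is the capital of France?", expected: "Paris" },
  ],
  // Placeholder task; a real one would call a language model.
  task: async () => "Paris",
  scorers: [Factuality],
  threshold: 0.6,
  // Skip the whole suite when no provider key is configured.
  skipIf: () => !process.env.OPENAI_API_KEY,
});
```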
package/dist/index.js
CHANGED
@@ -1,8 +1,25 @@
 "use strict";
 var __defProp = Object.defineProperty;
+var __defProps = Object.defineProperties;
 var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
+var __getOwnPropDescs = Object.getOwnPropertyDescriptors;
 var __getOwnPropNames = Object.getOwnPropertyNames;
+var __getOwnPropSymbols = Object.getOwnPropertySymbols;
 var __hasOwnProp = Object.prototype.hasOwnProperty;
+var __propIsEnum = Object.prototype.propertyIsEnumerable;
+var __defNormalProp = (obj, key, value) => key in obj ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value }) : obj[key] = value;
+var __spreadValues = (a, b) => {
+  for (var prop in b || (b = {}))
+    if (__hasOwnProp.call(b, prop))
+      __defNormalProp(a, prop, b[prop]);
+  if (__getOwnPropSymbols)
+    for (var prop of __getOwnPropSymbols(b)) {
+      if (__propIsEnum.call(b, prop))
+        __defNormalProp(a, prop, b[prop]);
+    }
+  return a;
+};
+var __spreadProps = (a, b) => __defProps(a, __getOwnPropDescs(b));
 var __export = (target, all) => {
   for (var name in all)
     __defProp(target, name, { get: all[name], enumerable: true });
@@ -40,18 +57,45 @@ var __async = (__this, __arguments, generator) => {
 // src/index.ts
 var index_exports = {};
 __export(index_exports, {
-  describeEval: () => describeEval
+  describeEval: () => describeEval,
+  formatScores: () => formatScores
 });
 module.exports = __toCommonJS(index_exports);
 var import_vitest = require("vitest");
 var import_vitest2 = require("vitest");
 import_vitest.expect.extend({
+  /**
+   * Evaluates a language model output against an expected answer using a scoring function.
+   *
+   * @param expected - The expected (ground truth) answer
+   * @param taskFn - Async function that processes the input and returns the model output
+   * @param scoreFn - Function that evaluates the model output against the expected answer
+   * @param threshold - Minimum acceptable score (0-1), defaults to 1.0
+   *
+   * @example
+   * ```javascript
+   * test("checks capital of France", async () => {
+   *   expect("What is the capital of France?").toEval(
+   *     "Paris",
+   *     async (input) => {
+   *       // Query LLM here
+   *       return "Paris";
+   *     },
+   *     checkFactuality,
+   *     0.8
+   *   );
+   * });
+   * ```
+   */
   toEval: function toEval(input, expected, taskFn, scoreFn, threshold = 1) {
     return __async(this, null, function* () {
       var _a;
      const { isNot } = this;
      const output = yield taskFn(input);
-
+      let result = scoreFn({ input, expected, output });
+      if (result instanceof Promise) {
+        result = yield result;
+      }
      return {
        pass: ((_a = result.score) != null ? _a : 0) >= threshold,
        message: () => `Score: ${result.score} ${isNot ? "<" : ">"} ${threshold}
@@ -63,6 +107,7 @@ ${result.metadata ? `Rationale: ${result.metadata.rationale}` : ""}`
 function describeEval(name, {
   data,
   task,
+  skipIf,
   scorers,
   threshold = 1,
   // increase default test timeout as 5s is usually not enough for
@@ -70,14 +115,27 @@ function describeEval(name, {
   timeout = 1e4
 }) {
   return (0, import_vitest.describe)(name, () => __async(this, null, function* () {
+    const testFn = skipIf ? import_vitest.test.skipIf(skipIf()) : import_vitest.test;
    for (const { input, expected } of yield data()) {
-      (
+      testFn(
        input,
+        {
+          timeout
+        },
        () => __async(this, null, function* () {
          const output = yield task(input);
          const scores = yield Promise.all(
-            scorers.map((scorer) =>
+            scorers.map((scorer) => {
+              const result = scorer({ input, expected, output });
+              if (result instanceof Promise) {
+                return result;
+              }
+              return new Promise((resolve) => resolve(result));
+            })
          );
+          const scoresWithName = scores.map((s, i) => __spreadProps(__spreadValues({}, s), {
+            name: scorers[i].name
+          }));
          const avgScore = scores.reduce((acc, s) => {
            var _a;
            return acc + ((_a = s.score) != null ? _a : 0);
@@ -86,19 +144,35 @@ function describeEval(name, {
            (0, import_vitest.assert)(
              avgScore >= threshold,
              `Score: ${avgScore} below threshold: ${threshold}
-Output: ${output}
+Output: ${output}
+${formatScores(
+                scoresWithName
+              )}`
            );
          }
-        })
-        {
-          timeout
-        }
+        })
      );
    }
  }));
}
+function formatScores(scores) {
+  return scores.sort((a, b) => {
+    var _a, _b;
+    return ((_a = a.score) != null ? _a : 0) - ((_b = b.score) != null ? _b : 0);
+  }).map((s) => {
+    var _a, _b, _c, _d;
+    const scoreLine = `${s.name || "Unknown"} [${((_a = s.score) != null ? _a : 0).toFixed(1)}]`;
+    if (((_b = s.score) != null ? _b : 0) < 1 && ((_c = s.metadata) == null ? void 0 : _c.rationale) || ((_d = s.metadata) == null ? void 0 : _d.output)) {
+      return `${scoreLine}${s.metadata.rationale ? `
+Rationale: ${s.metadata.rationale}` : ""}${s.metadata.output ? `
+Output: ${s.metadata.output}` : ""}`;
+    }
+    return scoreLine;
+  }).join("\n\n");
+}
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
-  describeEval
+  describeEval,
+  formatScores
 });
 //# sourceMappingURL=index.js.map
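Based on the `formatScores` implementation above (sorted ascending by score, with rationale and captured output appended for imperfect scores), a failing eval's assertion message would include a block like this sketch; the sample scores are made up for illustration:

```typescript
import { formatScores } from "vitest-evals";

// Sample input shaped like (Score & { name: string })[].
const message = formatScores([
  { name: "Contains", score: 1.0 },
  {
    name: "Factuality",
    score: 0.4,
    metadata: { rationale: "The answer is a subset of the expert answer." },
  },
]);

console.log(message);
// Factuality [0.4]
// Rationale: The answer is a subset of the expert answer.
//
// Contains [1.0]
```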
package/dist/index.js.map
CHANGED
@@ -1 +1 @@
-{"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { assert, describe, expect,
+
{"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { assert, describe, expect, test } from \"vitest\";\nimport \"vitest\";\n\nexport type TaskFn = (input: string) => Promise<string>;\n\nexport type Score = {\n score: number | null;\n metadata?: {\n rationale?: string;\n output?: string;\n };\n};\n\nexport type ScoreFn = (opts: {\n input: string;\n output: string;\n expected?: string;\n}) => Promise<Score> | Score;\n\nexport type ToEval<R = unknown> = (\n expected: string,\n taskFn: TaskFn,\n scoreFn: ScoreFn,\n threshold?: number,\n) => Promise<R>;\n\nexport interface EvalMatchers<R = unknown> {\n toEval: ToEval<R>;\n}\n\ndeclare module \"vitest\" {\n interface Assertion<T = any> extends EvalMatchers<T> {}\n interface AsymmetricMatchersContaining extends EvalMatchers {}\n}\n\nexpect.extend({\n /**\n * Evaluates a language model output against an expected answer using a scoring function.\n *\n * @param expected - The expected (ground truth) answer\n * @param taskFn - Async function that processes the input and returns the model output\n * @param scoreFn - Function that evaluates the model output against the expected answer\n * @param threshold - Minimum acceptable score (0-1), defaults to 1.0\n *\n * @example\n * ```javascript\n * test(\"checks capital of France\", async () => {\n * expect(\"What is the capital of France?\").toEval(\n * \"Paris\",\n * async (input) => {\n * // Query LLM here\n * return \"Paris\";\n * },\n * checkFactuality,\n * 0.8\n * );\n * });\n * ```\n */\n toEval: async function toEval(\n input: string,\n expected: string,\n taskFn: TaskFn,\n scoreFn: ScoreFn,\n threshold = 1.0,\n ) {\n const { isNot } = this;\n\n const output = await taskFn(input);\n\n let result = scoreFn({ input, expected, output });\n if (result instanceof Promise) {\n result = await result;\n }\n\n return {\n pass: (result.score ?? 0) >= threshold,\n message: () =>\n `Score: ${result.score} ${isNot ? \"<\" : \">\"} ${threshold}\\n${\n result.metadata ? 
`Rationale: ${result.metadata.rationale}` : \"\"\n }`,\n };\n },\n});\n\n/**\n * Creates a test suite for evaluating language model outputs.\n *\n * @param name - The name of the test suite\n * @param options - Configuration options\n * @param options.data - Async function that returns an array of test cases with input and expected values\n * @param options.task - Function that processes the input and returns the model output\n * @param options.skipIf - Optional function that determines if tests should be skipped\n * @param options.scorers - Array of scoring functions that evaluate model outputs\n * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0\n * @param options.timeout - Test timeout in milliseconds, defaults to 10000\n *\n * @example\n * ```javascript\n * describeEval(\"capital cities test\", {\n * data: async () => [{\n * input: \"What is the capital of France?\",\n * expected: \"Paris\"\n * }],\n * task: async (input) => {\n * // Query LLM here\n * return \"Paris\";\n * },\n * scorers: [checkFactuality],\n * threshold: 0.8\n * });\n * ```\n */\nexport function describeEval(\n name: string,\n {\n data,\n task,\n skipIf,\n scorers,\n threshold = 1.0,\n // increase default test timeout as 5s is usually not enough for\n // a single factuality check\n timeout = 10000,\n }: {\n data: () => Promise<{ input: string; expected: string }[]>;\n task: TaskFn;\n skipIf?: () => boolean;\n scorers: ScoreFn[];\n threshold?: number | null;\n timeout?: number;\n },\n) {\n return describe(name, async () => {\n const testFn = skipIf ? test.skipIf(skipIf()) : test;\n // TODO: should data just be a generator?\n for (const { input, expected } of await data()) {\n testFn(\n input,\n {\n timeout,\n },\n async () => {\n const output = await task(input);\n\n const scores = await Promise.all(\n scorers.map((scorer) => {\n const result = scorer({ input, expected, output });\n if (result instanceof Promise) {\n return result;\n }\n return new Promise<Score>((resolve) => resolve(result));\n }),\n );\n const scoresWithName = scores.map((s, i) => ({\n ...s,\n name: scorers[i].name,\n }));\n\n const avgScore =\n scores.reduce((acc, s) => acc + (s.score ?? 0), 0) / scores.length;\n if (threshold) {\n assert(\n avgScore >= threshold,\n `Score: ${avgScore} below threshold: ${threshold}\\nOutput: ${output}\\n${formatScores(\n scoresWithName,\n )}`,\n );\n }\n },\n );\n }\n });\n}\n\nexport function formatScores(scores: (Score & { name: string })[]) {\n return scores\n .sort((a, b) => (a.score ?? 0) - (b.score ?? 0))\n .map((s) => {\n const scoreLine = `${s.name || \"Unknown\"} [${(s.score ?? 0).toFixed(1)}]`;\n if (\n ((s.score ?? 0) < 1.0 && s.metadata?.rationale) ||\n s.metadata?.output\n ) {\n return `${scoreLine}${\n s.metadata.rationale ? `\\nRationale: ${s.metadata.rationale}` : \"\"\n }${s.metadata.output ? 
`\\nOutput: ${s.metadata.output}` : \"\"}`;\n }\n return scoreLine;\n })\n .join(\"\\n\\n\");\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,oBAA+C;AAC/C,IAAAA,iBAAO;AAkCP,qBAAO,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAwBZ,QAAQ,SAAe,OACrB,OACA,UACA,QACA,SACA,YAAY,GACZ;AAAA;AAjEJ;AAkEI,YAAM,EAAE,MAAM,IAAI;AAElB,YAAM,SAAS,MAAM,OAAO,KAAK;AAEjC,UAAI,SAAS,QAAQ,EAAE,OAAO,UAAU,OAAO,CAAC;AAChD,UAAI,kBAAkB,SAAS;AAC7B,iBAAS,MAAM;AAAA,MACjB;AAEA,aAAO;AAAA,QACL,QAAO,YAAO,UAAP,YAAgB,MAAM;AAAA,QAC7B,SAAS,MACP,UAAU,OAAO,KAAK,IAAI,QAAQ,MAAM,GAAG,IAAI,SAAS;AAAA,EACtD,OAAO,WAAW,cAAc,OAAO,SAAS,SAAS,KAAK,EAChE;AAAA,MACJ;AAAA,IACF;AAAA;AACF,CAAC;AA8BM,SAAS,aACd,MACA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,YAAY;AAAA;AAAA;AAAA,EAGZ,UAAU;AACZ,GAQA;AACA,aAAO,wBAAS,MAAM,MAAY;AAChC,UAAM,SAAS,SAAS,mBAAK,OAAO,OAAO,CAAC,IAAI;AAEhD,eAAW,EAAE,OAAO,SAAS,KAAK,MAAM,KAAK,GAAG;AAC9C;AAAA,QACE;AAAA,QACA;AAAA,UACE;AAAA,QACF;AAAA,QACA,MAAY;AACV,gBAAM,SAAS,MAAM,KAAK,KAAK;AAE/B,gBAAM,SAAS,MAAM,QAAQ;AAAA,YAC3B,QAAQ,IAAI,CAAC,WAAW;AACtB,oBAAM,SAAS,OAAO,EAAE,OAAO,UAAU,OAAO,CAAC;AACjD,kBAAI,kBAAkB,SAAS;AAC7B,uBAAO;AAAA,cACT;AACA,qBAAO,IAAI,QAAe,CAAC,YAAY,QAAQ,MAAM,CAAC;AAAA,YACxD,CAAC;AAAA,UACH;AACA,gBAAM,iBAAiB,OAAO,IAAI,CAAC,GAAG,MAAO,iCACxC,IADwC;AAAA,YAE3C,MAAM,QAAQ,CAAC,EAAE;AAAA,UACnB,EAAE;AAEF,gBAAM,WACJ,OAAO,OAAO,CAAC,KAAK,MAAG;AAhKnC;AAgKsC,2BAAO,OAAE,UAAF,YAAW;AAAA,aAAI,CAAC,IAAI,OAAO;AAC9D,cAAI,WAAW;AACb;AAAA,cACE,YAAY;AAAA,cACZ,UAAU,QAAQ,qBAAqB,SAAS;AAAA,UAAa,MAAM;AAAA,EAAK;AAAA,gBACtE;AAAA,cACF,CAAC;AAAA,YACH;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAAA,EACF,EAAC;AACH;AAEO,SAAS,aAAa,QAAsC;AACjE,SAAO,OACJ,KAAK,CAAC,GAAG,MAAG;AAjLjB;AAiLqB,oBAAE,UAAF,YAAW,OAAM,OAAE,UAAF,YAAW;AAAA,GAAE,EAC9C,IAAI,CAAC,MAAM;AAlLhB;AAmLM,UAAM,YAAY,GAAG,EAAE,QAAQ,SAAS,OAAM,OAAE,UAAF,YAAW,GAAG,QAAQ,CAAC,CAAC;AACtE,UACI,OAAE,UAAF,YAAW,KAAK,OAAO,OAAE,aAAF,mBAAY,gBACrC,OAAE,aAAF,mBAAY,SACZ;AACA,aAAO,GAAG,SAAS,GACjB,EAAE,SAAS,YAAY;AAAA,aAAgB,EAAE,SAAS,SAAS,KAAK,EAClE,GAAG,EAAE,SAAS,SAAS;AAAA,UAAa,EAAE,SAAS,MAAM,KAAK,EAAE;AAAA,IAC9D;AACA,WAAO;AAAA,EACT,CAAC,EACA,KAAK,MAAM;AAChB;","names":["import_vitest"]}
package/dist/index.mjs
CHANGED
@@ -1,3 +1,22 @@
+var __defProp = Object.defineProperty;
+var __defProps = Object.defineProperties;
+var __getOwnPropDescs = Object.getOwnPropertyDescriptors;
+var __getOwnPropSymbols = Object.getOwnPropertySymbols;
+var __hasOwnProp = Object.prototype.hasOwnProperty;
+var __propIsEnum = Object.prototype.propertyIsEnumerable;
+var __defNormalProp = (obj, key, value) => key in obj ? __defProp(obj, key, { enumerable: true, configurable: true, writable: true, value }) : obj[key] = value;
+var __spreadValues = (a, b) => {
+  for (var prop in b || (b = {}))
+    if (__hasOwnProp.call(b, prop))
+      __defNormalProp(a, prop, b[prop]);
+  if (__getOwnPropSymbols)
+    for (var prop of __getOwnPropSymbols(b)) {
+      if (__propIsEnum.call(b, prop))
+        __defNormalProp(a, prop, b[prop]);
+    }
+  return a;
+};
+var __spreadProps = (a, b) => __defProps(a, __getOwnPropDescs(b));
 var __async = (__this, __arguments, generator) => {
   return new Promise((resolve, reject) => {
     var fulfilled = (value) => {
@@ -20,15 +39,41 @@ var __async = (__this, __arguments, generator) => {
 };
 
 // src/index.ts
-import { assert, describe, expect,
+import { assert, describe, expect, test } from "vitest";
 import "vitest";
 expect.extend({
+  /**
+   * Evaluates a language model output against an expected answer using a scoring function.
+   *
+   * @param expected - The expected (ground truth) answer
+   * @param taskFn - Async function that processes the input and returns the model output
+   * @param scoreFn - Function that evaluates the model output against the expected answer
+   * @param threshold - Minimum acceptable score (0-1), defaults to 1.0
+   *
+   * @example
+   * ```javascript
+   * test("checks capital of France", async () => {
+   *   expect("What is the capital of France?").toEval(
+   *     "Paris",
+   *     async (input) => {
+   *       // Query LLM here
+   *       return "Paris";
+   *     },
+   *     checkFactuality,
+   *     0.8
+   *   );
+   * });
+   * ```
+   */
   toEval: function toEval(input, expected, taskFn, scoreFn, threshold = 1) {
     return __async(this, null, function* () {
       var _a;
      const { isNot } = this;
      const output = yield taskFn(input);
-
+      let result = scoreFn({ input, expected, output });
+      if (result instanceof Promise) {
+        result = yield result;
+      }
      return {
        pass: ((_a = result.score) != null ? _a : 0) >= threshold,
        message: () => `Score: ${result.score} ${isNot ? "<" : ">"} ${threshold}
@@ -40,6 +85,7 @@ ${result.metadata ? `Rationale: ${result.metadata.rationale}` : ""}`
 function describeEval(name, {
   data,
   task,
+  skipIf,
   scorers,
   threshold = 1,
   // increase default test timeout as 5s is usually not enough for
@@ -47,14 +93,27 @@ function describeEval(name, {
   timeout = 1e4
 }) {
   return describe(name, () => __async(this, null, function* () {
+    const testFn = skipIf ? test.skipIf(skipIf()) : test;
    for (const { input, expected } of yield data()) {
-
+      testFn(
        input,
+        {
+          timeout
+        },
        () => __async(this, null, function* () {
          const output = yield task(input);
          const scores = yield Promise.all(
-            scorers.map((scorer) =>
+            scorers.map((scorer) => {
+              const result = scorer({ input, expected, output });
+              if (result instanceof Promise) {
+                return result;
+              }
+              return new Promise((resolve) => resolve(result));
+            })
          );
+          const scoresWithName = scores.map((s, i) => __spreadProps(__spreadValues({}, s), {
+            name: scorers[i].name
+          }));
          const avgScore = scores.reduce((acc, s) => {
            var _a;
            return acc + ((_a = s.score) != null ? _a : 0);
@@ -63,18 +122,34 @@ function describeEval(name, {
            assert(
              avgScore >= threshold,
              `Score: ${avgScore} below threshold: ${threshold}
-Output: ${output}
+Output: ${output}
+${formatScores(
+                scoresWithName
+              )}`
            );
          }
-        })
-        {
-          timeout
-        }
+        })
      );
    }
  }));
}
+function formatScores(scores) {
+  return scores.sort((a, b) => {
+    var _a, _b;
+    return ((_a = a.score) != null ? _a : 0) - ((_b = b.score) != null ? _b : 0);
+  }).map((s) => {
+    var _a, _b, _c, _d;
+    const scoreLine = `${s.name || "Unknown"} [${((_a = s.score) != null ? _a : 0).toFixed(1)}]`;
+    if (((_b = s.score) != null ? _b : 0) < 1 && ((_c = s.metadata) == null ? void 0 : _c.rationale) || ((_d = s.metadata) == null ? void 0 : _d.output)) {
+      return `${scoreLine}${s.metadata.rationale ? `
+Rationale: ${s.metadata.rationale}` : ""}${s.metadata.output ? `
+Output: ${s.metadata.output}` : ""}`;
+    }
+    return scoreLine;
+  }).join("\n\n");
+}
 export {
-  describeEval
+  describeEval,
+  formatScores
 };
 //# sourceMappingURL=index.mjs.map
package/dist/index.mjs.map
CHANGED
@@ -1 +1 @@
-{"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { assert, describe, expect,
+
{"version":3,"sources":["../src/index.ts"],"sourcesContent":["import { assert, describe, expect, test } from \"vitest\";\nimport \"vitest\";\n\nexport type TaskFn = (input: string) => Promise<string>;\n\nexport type Score = {\n score: number | null;\n metadata?: {\n rationale?: string;\n output?: string;\n };\n};\n\nexport type ScoreFn = (opts: {\n input: string;\n output: string;\n expected?: string;\n}) => Promise<Score> | Score;\n\nexport type ToEval<R = unknown> = (\n expected: string,\n taskFn: TaskFn,\n scoreFn: ScoreFn,\n threshold?: number,\n) => Promise<R>;\n\nexport interface EvalMatchers<R = unknown> {\n toEval: ToEval<R>;\n}\n\ndeclare module \"vitest\" {\n interface Assertion<T = any> extends EvalMatchers<T> {}\n interface AsymmetricMatchersContaining extends EvalMatchers {}\n}\n\nexpect.extend({\n /**\n * Evaluates a language model output against an expected answer using a scoring function.\n *\n * @param expected - The expected (ground truth) answer\n * @param taskFn - Async function that processes the input and returns the model output\n * @param scoreFn - Function that evaluates the model output against the expected answer\n * @param threshold - Minimum acceptable score (0-1), defaults to 1.0\n *\n * @example\n * ```javascript\n * test(\"checks capital of France\", async () => {\n * expect(\"What is the capital of France?\").toEval(\n * \"Paris\",\n * async (input) => {\n * // Query LLM here\n * return \"Paris\";\n * },\n * checkFactuality,\n * 0.8\n * );\n * });\n * ```\n */\n toEval: async function toEval(\n input: string,\n expected: string,\n taskFn: TaskFn,\n scoreFn: ScoreFn,\n threshold = 1.0,\n ) {\n const { isNot } = this;\n\n const output = await taskFn(input);\n\n let result = scoreFn({ input, expected, output });\n if (result instanceof Promise) {\n result = await result;\n }\n\n return {\n pass: (result.score ?? 0) >= threshold,\n message: () =>\n `Score: ${result.score} ${isNot ? \"<\" : \">\"} ${threshold}\\n${\n result.metadata ? 
`Rationale: ${result.metadata.rationale}` : \"\"\n }`,\n };\n },\n});\n\n/**\n * Creates a test suite for evaluating language model outputs.\n *\n * @param name - The name of the test suite\n * @param options - Configuration options\n * @param options.data - Async function that returns an array of test cases with input and expected values\n * @param options.task - Function that processes the input and returns the model output\n * @param options.skipIf - Optional function that determines if tests should be skipped\n * @param options.scorers - Array of scoring functions that evaluate model outputs\n * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0\n * @param options.timeout - Test timeout in milliseconds, defaults to 10000\n *\n * @example\n * ```javascript\n * describeEval(\"capital cities test\", {\n * data: async () => [{\n * input: \"What is the capital of France?\",\n * expected: \"Paris\"\n * }],\n * task: async (input) => {\n * // Query LLM here\n * return \"Paris\";\n * },\n * scorers: [checkFactuality],\n * threshold: 0.8\n * });\n * ```\n */\nexport function describeEval(\n name: string,\n {\n data,\n task,\n skipIf,\n scorers,\n threshold = 1.0,\n // increase default test timeout as 5s is usually not enough for\n // a single factuality check\n timeout = 10000,\n }: {\n data: () => Promise<{ input: string; expected: string }[]>;\n task: TaskFn;\n skipIf?: () => boolean;\n scorers: ScoreFn[];\n threshold?: number | null;\n timeout?: number;\n },\n) {\n return describe(name, async () => {\n const testFn = skipIf ? test.skipIf(skipIf()) : test;\n // TODO: should data just be a generator?\n for (const { input, expected } of await data()) {\n testFn(\n input,\n {\n timeout,\n },\n async () => {\n const output = await task(input);\n\n const scores = await Promise.all(\n scorers.map((scorer) => {\n const result = scorer({ input, expected, output });\n if (result instanceof Promise) {\n return result;\n }\n return new Promise<Score>((resolve) => resolve(result));\n }),\n );\n const scoresWithName = scores.map((s, i) => ({\n ...s,\n name: scorers[i].name,\n }));\n\n const avgScore =\n scores.reduce((acc, s) => acc + (s.score ?? 0), 0) / scores.length;\n if (threshold) {\n assert(\n avgScore >= threshold,\n `Score: ${avgScore} below threshold: ${threshold}\\nOutput: ${output}\\n${formatScores(\n scoresWithName,\n )}`,\n );\n }\n },\n );\n }\n });\n}\n\nexport function formatScores(scores: (Score & { name: string })[]) {\n return scores\n .sort((a, b) => (a.score ?? 0) - (b.score ?? 0))\n .map((s) => {\n const scoreLine = `${s.name || \"Unknown\"} [${(s.score ?? 0).toFixed(1)}]`;\n if (\n ((s.score ?? 0) < 1.0 && s.metadata?.rationale) ||\n s.metadata?.output\n ) {\n return `${scoreLine}${\n s.metadata.rationale ? `\\nRationale: ${s.metadata.rationale}` : \"\"\n }${s.metadata.output ? 
`\\nOutput: ${s.metadata.output}` : \"\"}`;\n }\n return scoreLine;\n })\n .join(\"\\n\\n\");\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,SAAS,QAAQ,UAAU,QAAQ,YAAY;AAC/C,OAAO;AAkCP,OAAO,OAAO;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EAwBZ,QAAQ,SAAe,OACrB,OACA,UACA,QACA,SACA,YAAY,GACZ;AAAA;AAjEJ;AAkEI,YAAM,EAAE,MAAM,IAAI;AAElB,YAAM,SAAS,MAAM,OAAO,KAAK;AAEjC,UAAI,SAAS,QAAQ,EAAE,OAAO,UAAU,OAAO,CAAC;AAChD,UAAI,kBAAkB,SAAS;AAC7B,iBAAS,MAAM;AAAA,MACjB;AAEA,aAAO;AAAA,QACL,QAAO,YAAO,UAAP,YAAgB,MAAM;AAAA,QAC7B,SAAS,MACP,UAAU,OAAO,KAAK,IAAI,QAAQ,MAAM,GAAG,IAAI,SAAS;AAAA,EACtD,OAAO,WAAW,cAAc,OAAO,SAAS,SAAS,KAAK,EAChE;AAAA,MACJ;AAAA,IACF;AAAA;AACF,CAAC;AA8BM,SAAS,aACd,MACA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA,YAAY;AAAA;AAAA;AAAA,EAGZ,UAAU;AACZ,GAQA;AACA,SAAO,SAAS,MAAM,MAAY;AAChC,UAAM,SAAS,SAAS,KAAK,OAAO,OAAO,CAAC,IAAI;AAEhD,eAAW,EAAE,OAAO,SAAS,KAAK,MAAM,KAAK,GAAG;AAC9C;AAAA,QACE;AAAA,QACA;AAAA,UACE;AAAA,QACF;AAAA,QACA,MAAY;AACV,gBAAM,SAAS,MAAM,KAAK,KAAK;AAE/B,gBAAM,SAAS,MAAM,QAAQ;AAAA,YAC3B,QAAQ,IAAI,CAAC,WAAW;AACtB,oBAAM,SAAS,OAAO,EAAE,OAAO,UAAU,OAAO,CAAC;AACjD,kBAAI,kBAAkB,SAAS;AAC7B,uBAAO;AAAA,cACT;AACA,qBAAO,IAAI,QAAe,CAAC,YAAY,QAAQ,MAAM,CAAC;AAAA,YACxD,CAAC;AAAA,UACH;AACA,gBAAM,iBAAiB,OAAO,IAAI,CAAC,GAAG,MAAO,iCACxC,IADwC;AAAA,YAE3C,MAAM,QAAQ,CAAC,EAAE;AAAA,UACnB,EAAE;AAEF,gBAAM,WACJ,OAAO,OAAO,CAAC,KAAK,MAAG;AAhKnC;AAgKsC,2BAAO,OAAE,UAAF,YAAW;AAAA,aAAI,CAAC,IAAI,OAAO;AAC9D,cAAI,WAAW;AACb;AAAA,cACE,YAAY;AAAA,cACZ,UAAU,QAAQ,qBAAqB,SAAS;AAAA,UAAa,MAAM;AAAA,EAAK;AAAA,gBACtE;AAAA,cACF,CAAC;AAAA,YACH;AAAA,UACF;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAAA,EACF,EAAC;AACH;AAEO,SAAS,aAAa,QAAsC;AACjE,SAAO,OACJ,KAAK,CAAC,GAAG,MAAG;AAjLjB;AAiLqB,oBAAE,UAAF,YAAW,OAAM,OAAE,UAAF,YAAW;AAAA,GAAE,EAC9C,IAAI,CAAC,MAAM;AAlLhB;AAmLM,UAAM,YAAY,GAAG,EAAE,QAAQ,SAAS,OAAM,OAAE,UAAF,YAAW,GAAG,QAAQ,CAAC,CAAC;AACtE,UACI,OAAE,UAAF,YAAW,KAAK,OAAO,OAAE,aAAF,mBAAY,gBACrC,OAAE,aAAF,mBAAY,SACZ;AACA,aAAO,GAAG,SAAS,GACjB,EAAE,SAAS,YAAY;AAAA,aAAgB,EAAE,SAAS,SAAS,KAAK,EAClE,GAAG,EAAE,SAAS,SAAS;AAAA,UAAa,EAAE,SAAS,MAAM,KAAK,EAAE;AAAA,IAC9D;AACA,WAAO;AAAA,EACT,CAAC,EACA,KAAK,MAAM;AAChB;","names":[]}
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "vitest-evals",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.3",
|
|
4
4
|
"sideEffects": false,
|
|
5
5
|
"types": "./dist/index.d.ts",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -37,6 +37,7 @@
|
|
|
37
37
|
},
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"@biomejs/biome": "^1.9.4",
|
|
40
|
+
"autoevals": "^0.0.127",
|
|
40
41
|
"lint-staged": "^15.5.0",
|
|
41
42
|
"simple-git-hooks": "^2.12.1",
|
|
42
43
|
"tsup": "^8.4.0",
|