vitest-evals 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +100 -122
- package/dist/evaluate/index.d.mts +23 -0
- package/dist/evaluate/index.d.ts +23 -0
- package/dist/evaluate/index.js +189 -0
- package/dist/evaluate/index.js.map +1 -0
- package/dist/evaluate/index.mjs +163 -0
- package/dist/evaluate/index.mjs.map +1 -0
- package/dist/index.d.mts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +33 -23
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +41 -24
- package/dist/index.mjs.map +1 -1
- package/dist/reporter.js +3 -2
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs +3 -2
- package/dist/reporter.mjs.map +1 -1
- package/dist/scorers/index.d.mts +2 -1
- package/dist/scorers/index.d.ts +2 -1
- package/dist/scorers/structuredOutputScorer.d.mts +2 -1
- package/dist/scorers/structuredOutputScorer.d.ts +2 -1
- package/dist/scorers/toolCallScorer.d.mts +6 -23
- package/dist/scorers/toolCallScorer.d.ts +6 -23
- package/dist/wrapText.d.mts +23 -0
- package/dist/wrapText.d.ts +23 -0
- package/dist/wrapText.js +50 -0
- package/dist/wrapText.js.map +1 -0
- package/dist/wrapText.mjs +25 -0
- package/dist/wrapText.mjs.map +1 -0
- package/package.json +27 -11
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# vitest-evals
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
End-to-end evaluation framework for AI agents, built on Vitest.
|
|
4
4
|
|
|
5
5
|
## Installation
|
|
6
6
|
|
|
@@ -13,14 +13,14 @@ npm install -D vitest-evals
|
|
|
13
13
|
```javascript
|
|
14
14
|
import { describeEval } from "vitest-evals";
|
|
15
15
|
|
|
16
|
-
describeEval("
|
|
16
|
+
describeEval("deploy agent", {
|
|
17
17
|
data: async () => [
|
|
18
|
-
{ input: "
|
|
19
|
-
{ input: "
|
|
18
|
+
{ input: "Deploy the latest release to production", expected: "deployed" },
|
|
19
|
+
{ input: "Roll back the last deploy", expected: "rolled back" },
|
|
20
20
|
],
|
|
21
21
|
task: async (input) => {
|
|
22
|
-
const response = await
|
|
23
|
-
return response;
|
|
22
|
+
const response = await myAgent.run(input);
|
|
23
|
+
return response;
|
|
24
24
|
},
|
|
25
25
|
scorers: [
|
|
26
26
|
async ({ output, expected }) => ({
|
|
@@ -48,50 +48,49 @@ const task = async (input) => ({
|
|
|
48
48
|
});
|
|
49
49
|
```
|
|
50
50
|
|
|
51
|
-
##
|
|
51
|
+
## Test Data
|
|
52
52
|
|
|
53
|
-
|
|
53
|
+
Each test case requires an `input` field. Use `name` to give tests a descriptive label:
|
|
54
54
|
|
|
55
55
|
```javascript
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
56
|
+
data: async () => [
|
|
57
|
+
{ name: "simple deploy", input: "Deploy to staging" },
|
|
58
|
+
{ name: "deploy with rollback", input: "Deploy to prod, roll back if errors" },
|
|
59
|
+
],
|
|
60
|
+
```
|
|
60
61
|
|
|
61
|
-
|
|
62
|
-
data: async () => [
|
|
63
|
-
{ input: "Search weather", expectedTools: [{ name: "weather_api" }] },
|
|
64
|
-
],
|
|
65
|
-
task: weatherTask,
|
|
66
|
-
scorers: [ToolCallScorer()],
|
|
67
|
-
});
|
|
62
|
+
Additional fields (like `expected`, `expectedTools`) are passed through to scorers.
|
|
68
63
|
|
|
69
|
-
|
|
70
|
-
const LengthScorer = async ({ output }) => ({
|
|
71
|
-
score: output.length > 50 ? 1.0 : 0.0,
|
|
72
|
-
});
|
|
64
|
+
## Lifecycle Hooks
|
|
73
65
|
|
|
74
|
-
|
|
75
|
-
import { type ScoreFn, type BaseScorerOptions } from "vitest-evals";
|
|
76
|
-
|
|
77
|
-
interface CustomOptions extends BaseScorerOptions {
|
|
78
|
-
minLength: number;
|
|
79
|
-
}
|
|
66
|
+
Use `beforeEach` and `afterEach` for setup and teardown:
|
|
80
67
|
|
|
81
|
-
|
|
82
|
-
|
|
68
|
+
```javascript
|
|
69
|
+
describeEval("agent with database", {
|
|
70
|
+
beforeEach: async () => {
|
|
71
|
+
await db.seed();
|
|
72
|
+
},
|
|
73
|
+
afterEach: async () => {
|
|
74
|
+
await db.clean();
|
|
75
|
+
},
|
|
76
|
+
data: async () => [{ input: "Find recent errors" }],
|
|
77
|
+
task: myAgentTask,
|
|
78
|
+
scorers: [async ({ output }) => ({ score: output.includes("error") ? 1.0 : 0.0 })],
|
|
83
79
|
});
|
|
84
80
|
```
|
|
85
81
|
|
|
86
|
-
|
|
82
|
+
## Scorers
|
|
87
83
|
|
|
88
|
-
|
|
84
|
+
Scorers evaluate outputs and return a score (0-1). Use built-in scorers or create your own.
|
|
85
|
+
|
|
86
|
+
### ToolCallScorer
|
|
89
87
|
|
|
90
88
|
Evaluates if the expected tools were called with correct arguments.
|
|
91
89
|
|
|
92
90
|
```javascript
|
|
93
|
-
|
|
94
|
-
|
|
91
|
+
import { ToolCallScorer } from "vitest-evals";
|
|
92
|
+
|
|
93
|
+
describeEval("tool usage", {
|
|
95
94
|
data: async () => [
|
|
96
95
|
{
|
|
97
96
|
input: "Find Italian restaurants",
|
|
@@ -105,21 +104,11 @@ describeEval("search test", {
|
|
|
105
104
|
scorers: [ToolCallScorer()],
|
|
106
105
|
});
|
|
107
106
|
|
|
108
|
-
// Strict
|
|
109
|
-
scorers: [
|
|
110
|
-
ToolCallScorer({
|
|
111
|
-
ordered: true, // Tools must be in exact order
|
|
112
|
-
params: "strict", // Parameters must match exactly
|
|
113
|
-
}),
|
|
114
|
-
];
|
|
107
|
+
// Strict order and parameters
|
|
108
|
+
scorers: [ToolCallScorer({ ordered: true, params: "strict" })];
|
|
115
109
|
|
|
116
110
|
// Flexible evaluation
|
|
117
|
-
scorers: [
|
|
118
|
-
ToolCallScorer({
|
|
119
|
-
requireAll: false, // Partial matches give partial credit
|
|
120
|
-
allowExtras: false, // No additional tools allowed
|
|
121
|
-
}),
|
|
122
|
-
];
|
|
111
|
+
scorers: [ToolCallScorer({ requireAll: false, allowExtras: false })];
|
|
123
112
|
```
|
|
124
113
|
|
|
125
114
|
**Default behavior:**
|
|
@@ -129,6 +118,63 @@ scorers: [
|
|
|
129
118
|
- Extra tools allowed
|
|
130
119
|
- All expected tools required
|
|
131
120
|
|
|
121
|
+
### StructuredOutputScorer
|
|
122
|
+
|
|
123
|
+
Evaluates if the output matches expected structured data (JSON).
|
|
124
|
+
|
|
125
|
+
```javascript
|
|
126
|
+
import { StructuredOutputScorer } from "vitest-evals";
|
|
127
|
+
|
|
128
|
+
describeEval("query generation", {
|
|
129
|
+
data: async () => [
|
|
130
|
+
{
|
|
131
|
+
input: "Show me errors from today",
|
|
132
|
+
expected: {
|
|
133
|
+
dataset: "errors",
|
|
134
|
+
query: "",
|
|
135
|
+
sort: "-timestamp",
|
|
136
|
+
timeRange: { statsPeriod: "24h" },
|
|
137
|
+
},
|
|
138
|
+
},
|
|
139
|
+
],
|
|
140
|
+
task: myTask,
|
|
141
|
+
scorers: [StructuredOutputScorer()],
|
|
142
|
+
});
|
|
143
|
+
|
|
144
|
+
// Fuzzy matching
|
|
145
|
+
scorers: [StructuredOutputScorer({ match: "fuzzy" })];
|
|
146
|
+
|
|
147
|
+
// Custom validation
|
|
148
|
+
scorers: [
|
|
149
|
+
StructuredOutputScorer({
|
|
150
|
+
match: (expected, actual, key) => {
|
|
151
|
+
if (key === "age") return actual >= 18 && actual <= 100;
|
|
152
|
+
return expected === actual;
|
|
153
|
+
},
|
|
154
|
+
}),
|
|
155
|
+
];
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Custom Scorers
|
|
159
|
+
|
|
160
|
+
```javascript
|
|
161
|
+
// Inline scorer
|
|
162
|
+
const LengthScorer = async ({ output }) => ({
|
|
163
|
+
score: output.length > 50 ? 1.0 : 0.0,
|
|
164
|
+
});
|
|
165
|
+
|
|
166
|
+
// TypeScript scorer with custom options
|
|
167
|
+
import { type ScoreFn, type BaseScorerOptions } from "vitest-evals";
|
|
168
|
+
|
|
169
|
+
interface CustomOptions extends BaseScorerOptions {
|
|
170
|
+
minLength: number;
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
const TypedScorer: ScoreFn<CustomOptions> = async (opts) => ({
|
|
174
|
+
score: opts.output.length >= opts.minLength ? 1.0 : 0.0,
|
|
175
|
+
});
|
|
176
|
+
```
|
|
177
|
+
|
|
132
178
|
## AI SDK Integration
|
|
133
179
|
|
|
134
180
|
See [`src/ai-sdk-integration.test.ts`](src/ai-sdk-integration.test.ts) for a complete example with the Vercel AI SDK.
|
|
@@ -155,70 +201,21 @@ return {
|
|
|
155
201
|
|
|
156
202
|
## Advanced Usage
|
|
157
203
|
|
|
158
|
-
###
|
|
159
|
-
|
|
160
|
-
#### Using autoevals
|
|
204
|
+
### Using autoevals
|
|
161
205
|
|
|
162
|
-
For
|
|
206
|
+
For evaluation using the autoevals library:
|
|
163
207
|
|
|
164
208
|
```javascript
|
|
165
209
|
import { Factuality, ClosedQA } from "autoevals";
|
|
166
210
|
|
|
167
211
|
scorers: [
|
|
168
|
-
Factuality,
|
|
212
|
+
Factuality,
|
|
169
213
|
ClosedQA.partial({
|
|
170
214
|
criteria: "Does the answer mention Paris?",
|
|
171
215
|
}),
|
|
172
216
|
];
|
|
173
217
|
```
|
|
174
218
|
|
|
175
|
-
#### Custom LLM-based Factuality Scorer
|
|
176
|
-
|
|
177
|
-
Here's an example of implementing your own LLM-based factuality scorer using the Vercel AI SDK:
|
|
178
|
-
|
|
179
|
-
```javascript
|
|
180
|
-
import { generateObject } from "ai";
|
|
181
|
-
import { openai } from "@ai-sdk/openai";
|
|
182
|
-
import { z } from "zod";
|
|
183
|
-
|
|
184
|
-
const Factuality = (model = openai("gpt-4o")) => async ({ input, output, expected }) => {
|
|
185
|
-
if (!expected) {
|
|
186
|
-
return { score: 1.0, metadata: { rationale: "No expected answer" } };
|
|
187
|
-
}
|
|
188
|
-
|
|
189
|
-
const { object } = await generateObject({
|
|
190
|
-
model,
|
|
191
|
-
prompt: `
|
|
192
|
-
Compare the factual content of the submitted answer with the expert answer.
|
|
193
|
-
|
|
194
|
-
Question: ${input}
|
|
195
|
-
Expert: ${expected}
|
|
196
|
-
Submission: ${output}
|
|
197
|
-
|
|
198
|
-
Options:
|
|
199
|
-
(A) Subset of expert answer
|
|
200
|
-
(B) Superset of expert answer
|
|
201
|
-
(C) Same content as expert
|
|
202
|
-
(D) Contradicts expert answer
|
|
203
|
-
(E) Different but factually equivalent
|
|
204
|
-
`,
|
|
205
|
-
schema: z.object({
|
|
206
|
-
answer: z.enum(["A", "B", "C", "D", "E"]),
|
|
207
|
-
rationale: z.string(),
|
|
208
|
-
}),
|
|
209
|
-
});
|
|
210
|
-
|
|
211
|
-
const scores = { A: 0.4, B: 0.6, C: 1, D: 0, E: 1 };
|
|
212
|
-
return {
|
|
213
|
-
score: scores[object.answer],
|
|
214
|
-
metadata: { rationale: object.rationale, answer: object.answer },
|
|
215
|
-
};
|
|
216
|
-
};
|
|
217
|
-
|
|
218
|
-
// Usage
|
|
219
|
-
scorers: [Factuality()];
|
|
220
|
-
```
|
|
221
|
-
|
|
222
219
|
### Skip Tests Conditionally
|
|
223
220
|
|
|
224
221
|
```javascript
|
|
@@ -232,7 +229,7 @@ describeEval("gpt-4 tests", {
|
|
|
232
229
|
|
|
233
230
|
For integration with existing Vitest test suites, you can use the `.toEval()` matcher:
|
|
234
231
|
|
|
235
|
-
>
|
|
232
|
+
> **Deprecated**: The `.toEval()` helper is deprecated. Use `describeEval()` instead for better test organization and multiple scorers support.
|
|
236
233
|
|
|
237
234
|
```javascript
|
|
238
235
|
import "vitest-evals";
|
|
@@ -251,25 +248,6 @@ test("capital check", () => {
|
|
|
251
248
|
});
|
|
252
249
|
```
|
|
253
250
|
|
|
254
|
-
**Recommended migration** to `describeEval()`:
|
|
255
|
-
|
|
256
|
-
```javascript
|
|
257
|
-
import { describeEval } from "vitest-evals";
|
|
258
|
-
|
|
259
|
-
describeEval("capital check", {
|
|
260
|
-
data: async () => [
|
|
261
|
-
{ input: "What is the capital of France?", expected: "Paris" },
|
|
262
|
-
],
|
|
263
|
-
task: answerQuestion,
|
|
264
|
-
scorers: [
|
|
265
|
-
async ({ output, expected }) => ({
|
|
266
|
-
score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
|
|
267
|
-
}),
|
|
268
|
-
],
|
|
269
|
-
threshold: 0.8,
|
|
270
|
-
});
|
|
271
|
-
```
|
|
272
|
-
|
|
273
251
|
## Configuration
|
|
274
252
|
|
|
275
253
|
### Separate Eval Configuration
|
|
@@ -298,6 +276,6 @@ vitest --config=vitest.evals.config.ts
|
|
|
298
276
|
## Development
|
|
299
277
|
|
|
300
278
|
```shell
|
|
301
|
-
|
|
302
|
-
|
|
279
|
+
pnpm install
|
|
280
|
+
pnpm test
|
|
303
281
|
```
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { generateObject } from 'ai';
|
|
2
|
+
|
|
3
|
+
type LanguageModel = Parameters<typeof generateObject>[0]["model"];
|
|
4
|
+
declare function configure(opts: {
|
|
5
|
+
model: LanguageModel;
|
|
6
|
+
}): void;
|
|
7
|
+
interface EvaluateOptions {
|
|
8
|
+
task: () => Promise<string>;
|
|
9
|
+
criteria: string;
|
|
10
|
+
threshold?: number;
|
|
11
|
+
}
|
|
12
|
+
interface TestTaskContext {
|
|
13
|
+
task: {
|
|
14
|
+
meta: Record<string, any>;
|
|
15
|
+
};
|
|
16
|
+
}
|
|
17
|
+
/** @internal Core evaluation logic, exported for testing. */
|
|
18
|
+
declare function _evaluate(ctx: TestTaskContext, opts: EvaluateOptions): Promise<void>;
|
|
19
|
+
declare function evaluate(name: string, opts: EvaluateOptions & {
|
|
20
|
+
timeout?: number;
|
|
21
|
+
}): void;
|
|
22
|
+
|
|
23
|
+
export { _evaluate, configure, evaluate };
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { generateObject } from 'ai';
|
|
2
|
+
|
|
3
|
+
type LanguageModel = Parameters<typeof generateObject>[0]["model"];
|
|
4
|
+
declare function configure(opts: {
|
|
5
|
+
model: LanguageModel;
|
|
6
|
+
}): void;
|
|
7
|
+
interface EvaluateOptions {
|
|
8
|
+
task: () => Promise<string>;
|
|
9
|
+
criteria: string;
|
|
10
|
+
threshold?: number;
|
|
11
|
+
}
|
|
12
|
+
interface TestTaskContext {
|
|
13
|
+
task: {
|
|
14
|
+
meta: Record<string, any>;
|
|
15
|
+
};
|
|
16
|
+
}
|
|
17
|
+
/** @internal Core evaluation logic, exported for testing. */
|
|
18
|
+
declare function _evaluate(ctx: TestTaskContext, opts: EvaluateOptions): Promise<void>;
|
|
19
|
+
declare function evaluate(name: string, opts: EvaluateOptions & {
|
|
20
|
+
timeout?: number;
|
|
21
|
+
}): void;
|
|
22
|
+
|
|
23
|
+
export { _evaluate, configure, evaluate };
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
var __async = (__this, __arguments, generator) => {
|
|
20
|
+
return new Promise((resolve, reject) => {
|
|
21
|
+
var fulfilled = (value) => {
|
|
22
|
+
try {
|
|
23
|
+
step(generator.next(value));
|
|
24
|
+
} catch (e) {
|
|
25
|
+
reject(e);
|
|
26
|
+
}
|
|
27
|
+
};
|
|
28
|
+
var rejected = (value) => {
|
|
29
|
+
try {
|
|
30
|
+
step(generator.throw(value));
|
|
31
|
+
} catch (e) {
|
|
32
|
+
reject(e);
|
|
33
|
+
}
|
|
34
|
+
};
|
|
35
|
+
var step = (x) => x.done ? resolve(x.value) : Promise.resolve(x.value).then(fulfilled, rejected);
|
|
36
|
+
step((generator = generator.apply(__this, __arguments)).next());
|
|
37
|
+
});
|
|
38
|
+
};
|
|
39
|
+
|
|
40
|
+
// src/evaluate/index.ts
|
|
41
|
+
var evaluate_exports = {};
|
|
42
|
+
__export(evaluate_exports, {
|
|
43
|
+
_evaluate: () => _evaluate,
|
|
44
|
+
configure: () => configure,
|
|
45
|
+
evaluate: () => evaluate
|
|
46
|
+
});
|
|
47
|
+
module.exports = __toCommonJS(evaluate_exports);
|
|
48
|
+
var import_ai = require("ai");
|
|
49
|
+
var import_zod = require("zod");
|
|
50
|
+
var import_vitest = require("vitest");
|
|
51
|
+
|
|
52
|
+
// src/wrapText.ts
|
|
53
|
+
function wrapText(text, width = 80) {
|
|
54
|
+
if (!text || text.length <= width) {
|
|
55
|
+
return text;
|
|
56
|
+
}
|
|
57
|
+
const words = text.split(/\s+/);
|
|
58
|
+
const lines = [];
|
|
59
|
+
let currentLine = "";
|
|
60
|
+
for (const word of words) {
|
|
61
|
+
if (currentLine.length + word.length + 1 > width) {
|
|
62
|
+
lines.push(currentLine.trim());
|
|
63
|
+
currentLine = word;
|
|
64
|
+
} else {
|
|
65
|
+
currentLine += (currentLine ? " " : "") + word;
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
if (currentLine) {
|
|
69
|
+
lines.push(currentLine);
|
|
70
|
+
}
|
|
71
|
+
return lines.join("\n");
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
// src/evaluate/index.ts
|
|
75
|
+
var defaultModel;
|
|
76
|
+
function configure(opts) {
|
|
77
|
+
defaultModel = opts.model;
|
|
78
|
+
}
|
|
79
|
+
var EVAL_SYSTEM = `You are assessing a submitted output based on a given criterion. Ignore differences in style, grammar, punctuation, or length. Focus only on whether the criterion is met.`;
|
|
80
|
+
var EVAL_PROMPT = (output, criteria) => `<submission>
|
|
81
|
+
${output}
|
|
82
|
+
</submission>
|
|
83
|
+
|
|
84
|
+
<criteria>
|
|
85
|
+
${criteria}
|
|
86
|
+
</criteria>
|
|
87
|
+
|
|
88
|
+
Does the submission meet the criteria? Select one option:
|
|
89
|
+
(A) The criteria is fully met with no issues
|
|
90
|
+
(B) The criteria is mostly met with minor gaps
|
|
91
|
+
(C) The criteria is partially met with notable gaps
|
|
92
|
+
(D) The criteria is barely met or only tangentially addressed
|
|
93
|
+
(E) The criteria is not met at all`;
|
|
94
|
+
var CHOICE_SCORES = {
|
|
95
|
+
A: 1,
|
|
96
|
+
B: 0.75,
|
|
97
|
+
C: 0.5,
|
|
98
|
+
D: 0.25,
|
|
99
|
+
E: 0
|
|
100
|
+
};
|
|
101
|
+
function _evaluate(ctx, opts) {
|
|
102
|
+
return __async(this, null, function* () {
|
|
103
|
+
var _a;
|
|
104
|
+
if (!defaultModel) {
|
|
105
|
+
throw new Error(
|
|
106
|
+
"No model configured. Call configure({ model }) before using evaluate."
|
|
107
|
+
);
|
|
108
|
+
}
|
|
109
|
+
let output;
|
|
110
|
+
try {
|
|
111
|
+
output = yield opts.task();
|
|
112
|
+
} catch (error) {
|
|
113
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
114
|
+
ctx.task.meta.eval = {
|
|
115
|
+
scores: [
|
|
116
|
+
{
|
|
117
|
+
score: 0,
|
|
118
|
+
name: "evaluate",
|
|
119
|
+
metadata: { rationale: `Task failed: ${errorMessage}` }
|
|
120
|
+
}
|
|
121
|
+
],
|
|
122
|
+
avgScore: 0
|
|
123
|
+
};
|
|
124
|
+
throw error;
|
|
125
|
+
}
|
|
126
|
+
let object;
|
|
127
|
+
try {
|
|
128
|
+
({ object } = yield (0, import_ai.generateObject)({
|
|
129
|
+
model: defaultModel,
|
|
130
|
+
schema: import_zod.z.object({
|
|
131
|
+
answer: import_zod.z.enum(["A", "B", "C", "D", "E"]),
|
|
132
|
+
rationale: import_zod.z.string()
|
|
133
|
+
}),
|
|
134
|
+
system: EVAL_SYSTEM,
|
|
135
|
+
prompt: EVAL_PROMPT(output, opts.criteria)
|
|
136
|
+
}));
|
|
137
|
+
} catch (error) {
|
|
138
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
139
|
+
ctx.task.meta.eval = {
|
|
140
|
+
scores: [
|
|
141
|
+
{
|
|
142
|
+
score: 0,
|
|
143
|
+
name: "evaluate",
|
|
144
|
+
metadata: { rationale: `Judge failed: ${errorMessage}` }
|
|
145
|
+
}
|
|
146
|
+
],
|
|
147
|
+
avgScore: 0
|
|
148
|
+
};
|
|
149
|
+
throw error;
|
|
150
|
+
}
|
|
151
|
+
const score = CHOICE_SCORES[object.answer];
|
|
152
|
+
const threshold = (_a = opts.threshold) != null ? _a : 1;
|
|
153
|
+
ctx.task.meta.eval = {
|
|
154
|
+
scores: [
|
|
155
|
+
{
|
|
156
|
+
score,
|
|
157
|
+
name: "evaluate",
|
|
158
|
+
metadata: { rationale: object.rationale, answer: object.answer }
|
|
159
|
+
}
|
|
160
|
+
],
|
|
161
|
+
avgScore: score
|
|
162
|
+
};
|
|
163
|
+
if (score < threshold) {
|
|
164
|
+
(0, import_vitest.assert)(
|
|
165
|
+
false,
|
|
166
|
+
`Score: ${score} (${object.answer}) below threshold: ${threshold}
|
|
167
|
+
|
|
168
|
+
## Output:
|
|
169
|
+
${wrapText(output)}
|
|
170
|
+
|
|
171
|
+
## Rationale:
|
|
172
|
+
${wrapText(object.rationale)}`
|
|
173
|
+
);
|
|
174
|
+
}
|
|
175
|
+
});
|
|
176
|
+
}
|
|
177
|
+
function evaluate(name, opts) {
|
|
178
|
+
var _a;
|
|
179
|
+
(0, import_vitest.test)(name, { timeout: (_a = opts.timeout) != null ? _a : 6e4 }, (_0) => __async(null, [_0], function* ({ task: testTask }) {
|
|
180
|
+
yield _evaluate({ task: testTask }, opts);
|
|
181
|
+
}));
|
|
182
|
+
}
|
|
183
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
184
|
+
0 && (module.exports = {
|
|
185
|
+
_evaluate,
|
|
186
|
+
configure,
|
|
187
|
+
evaluate
|
|
188
|
+
});
|
|
189
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/evaluate/index.ts","../../src/wrapText.ts"],"sourcesContent":["import { generateObject } from \"ai\";\nimport { z } from \"zod\";\nimport { assert, test } from \"vitest\";\nimport { wrapText } from \"../wrapText\";\n\ntype LanguageModel = Parameters<typeof generateObject>[0][\"model\"];\n\nlet defaultModel: LanguageModel | undefined;\n\nexport function configure(opts: { model: LanguageModel }) {\n defaultModel = opts.model;\n}\n\nconst EVAL_SYSTEM = `You are assessing a submitted output based on a given criterion. Ignore differences in style, grammar, punctuation, or length. Focus only on whether the criterion is met.`;\n\nconst EVAL_PROMPT = (output: string, criteria: string) => `<submission>\n${output}\n</submission>\n\n<criteria>\n${criteria}\n</criteria>\n\nDoes the submission meet the criteria? Select one option:\n(A) The criteria is fully met with no issues\n(B) The criteria is mostly met with minor gaps\n(C) The criteria is partially met with notable gaps\n(D) The criteria is barely met or only tangentially addressed\n(E) The criteria is not met at all`;\n\nconst CHOICE_SCORES: Record<string, number> = {\n A: 1.0,\n B: 0.75,\n C: 0.5,\n D: 0.25,\n E: 0.0,\n};\n\ninterface EvaluateOptions {\n task: () => Promise<string>;\n criteria: string;\n threshold?: number;\n}\n\ninterface TestTaskContext {\n task: { meta: Record<string, any> };\n}\n\n/** @internal Core evaluation logic, exported for testing. */\nexport async function _evaluate(\n ctx: TestTaskContext,\n opts: EvaluateOptions,\n): Promise<void> {\n if (!defaultModel) {\n throw new Error(\n \"No model configured. Call configure({ model }) before using evaluate.\",\n );\n }\n\n let output: string;\n try {\n output = await opts.task();\n } catch (error) {\n const errorMessage = error instanceof Error ? error.message : String(error);\n ctx.task.meta.eval = {\n scores: [\n {\n score: 0,\n name: \"evaluate\",\n metadata: { rationale: `Task failed: ${errorMessage}` },\n },\n ],\n avgScore: 0,\n };\n throw error;\n }\n\n let object: { answer: string; rationale: string };\n try {\n ({ object } = await generateObject({\n model: defaultModel,\n schema: z.object({\n answer: z.enum([\"A\", \"B\", \"C\", \"D\", \"E\"]),\n rationale: z.string(),\n }),\n system: EVAL_SYSTEM,\n prompt: EVAL_PROMPT(output, opts.criteria),\n }));\n } catch (error) {\n const errorMessage = error instanceof Error ? error.message : String(error);\n ctx.task.meta.eval = {\n scores: [\n {\n score: 0,\n name: \"evaluate\",\n metadata: { rationale: `Judge failed: ${errorMessage}` },\n },\n ],\n avgScore: 0,\n };\n throw error;\n }\n\n const score = CHOICE_SCORES[object.answer];\n const threshold = opts.threshold ?? 1.0;\n\n ctx.task.meta.eval = {\n scores: [\n {\n score,\n name: \"evaluate\",\n metadata: { rationale: object.rationale, answer: object.answer },\n },\n ],\n avgScore: score,\n };\n\n if (score < threshold) {\n assert(\n false,\n `Score: ${score} (${object.answer}) below threshold: ${threshold}\\n\\n## Output:\\n${wrapText(output)}\\n\\n## Rationale:\\n${wrapText(object.rationale)}`,\n );\n }\n}\n\nexport function evaluate(\n name: string,\n opts: EvaluateOptions & { timeout?: number },\n) {\n test(name, { timeout: opts.timeout ?? 60000 }, async ({ task: testTask }) => {\n await _evaluate({ task: testTask }, opts);\n });\n}\n","/**\n * Wraps text to fit within a specified width, breaking at word boundaries.\n *\n * @param text - The text to wrap\n * @param width - The maximum width in characters (default: 80)\n * @returns The wrapped text with line breaks\n *\n * @example\n * ```javascript\n * const wrapped = wrapText(\"This is a very long text that needs to be wrapped to fit within an 80 character width.\", 20);\n * console.log(wrapped);\n * // Output:\n * // This is a very\n * // long text that\n * // needs to be\n * // wrapped to fit\n * // within an 80\n * // character width.\n * ```\n */\nexport function wrapText(text: string, width = 80): string {\n if (!text || text.length <= width) {\n return text;\n }\n\n const words = text.split(/\\s+/);\n const lines: string[] = [];\n let currentLine = \"\";\n\n for (const word of words) {\n // If adding this word would exceed the width, start a new line\n if (currentLine.length + word.length + 1 > width) {\n lines.push(currentLine.trim());\n currentLine = word;\n } else {\n // Add the word to the current line\n currentLine += (currentLine ? \" \" : \"\") + word;\n }\n }\n\n // Add the last line if it's not empty\n if (currentLine) {\n lines.push(currentLine);\n }\n\n return lines.join(\"\\n\");\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,gBAA+B;AAC/B,iBAAkB;AAClB,oBAA6B;;;ACkBtB,SAAS,SAAS,MAAc,QAAQ,IAAY;AACzD,MAAI,CAAC,QAAQ,KAAK,UAAU,OAAO;AACjC,WAAO;AAAA,EACT;AAEA,QAAM,QAAQ,KAAK,MAAM,KAAK;AAC9B,QAAM,QAAkB,CAAC;AACzB,MAAI,cAAc;AAElB,aAAW,QAAQ,OAAO;AAExB,QAAI,YAAY,SAAS,KAAK,SAAS,IAAI,OAAO;AAChD,YAAM,KAAK,YAAY,KAAK,CAAC;AAC7B,oBAAc;AAAA,IAChB,OAAO;AAEL,sBAAgB,cAAc,MAAM,MAAM;AAAA,IAC5C;AAAA,EACF;AAGA,MAAI,aAAa;AACf,UAAM,KAAK,WAAW;AAAA,EACxB;AAEA,SAAO,MAAM,KAAK,IAAI;AACxB;;;ADvCA,IAAI;AAEG,SAAS,UAAU,MAAgC;AACxD,iBAAe,KAAK;AACtB;AAEA,IAAM,cAAc;AAEpB,IAAM,cAAc,CAAC,QAAgB,aAAqB;AAAA,EACxD,MAAM;AAAA;AAAA;AAAA;AAAA,EAIN,QAAQ;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAUV,IAAM,gBAAwC;AAAA,EAC5C,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AACL;AAaA,SAAsB,UACpB,KACA,MACe;AAAA;AApDjB;AAqDE,QAAI,CAAC,cAAc;AACjB,YAAM,IAAI;AAAA,QACR;AAAA,MACF;AAAA,IACF;AAEA,QAAI;AACJ,QAAI;AACF,eAAS,MAAM,KAAK,KAAK;AAAA,IAC3B,SAAS,OAAO;AACd,YAAM,eAAe,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1E,UAAI,KAAK,KAAK,OAAO;AAAA,QACnB,QAAQ;AAAA,UACN;AAAA,YACE,OAAO;AAAA,YACP,MAAM;AAAA,YACN,UAAU,EAAE,WAAW,gBAAgB,YAAY,GAAG;AAAA,UACxD;AAAA,QACF;AAAA,QACA,UAAU;AAAA,MACZ;AACA,YAAM;AAAA,IACR;AAEA,QAAI;AACJ,QAAI;AACF,OAAC,EAAE,OAAO,IAAI,UAAM,0BAAe;AAAA,QACjC,OAAO;AAAA,QACP,QAAQ,aAAE,OAAO;AAAA,UACf,QAAQ,aAAE,KAAK,CAAC,KAAK,KAAK,KAAK,KAAK,GAAG,CAAC;AAAA,UACxC,WAAW,aAAE,OAAO;AAAA,QACtB,CAAC;AAAA,QACD,QAAQ;AAAA,QACR,QAAQ,YAAY,QAAQ,KAAK,QAAQ;AAAA,MAC3C,CAAC;AAAA,IACH,SAAS,OAAO;AACd,YAAM,eAAe,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC1E,UAAI,KAAK,KAAK,OAAO;AAAA,QACnB,QAAQ;AAAA,UACN;AAAA,YACE,OAAO;AAAA,YACP,MAAM;AAAA,YACN,UAAU,EAAE,WAAW,iBAAiB,YAAY,GAAG;AAAA,UACzD;AAAA,QACF;AAAA,QACA,UAAU;AAAA,MACZ;AACA,YAAM;AAAA,IACR;AAEA,UAAM,QAAQ,cAAc,OAAO,MAAM;AACzC,UAAM,aAAY,UAAK,cAAL,YAAkB;AAEpC,QAAI,KAAK,KAAK,OAAO;AAAA,MACnB,QAAQ;AAAA,QACN;AAAA,UACE;AAAA,UACA,MAAM;AAAA,UACN,UAAU,EAAE,WAAW,OAAO,WAAW,QAAQ,OAAO,OAAO;AAAA,QACjE;AAAA,MACF;AAAA,MACA,UAAU;AAAA,IACZ;AAEA,QAAI,QAAQ,WAAW;AACrB;AAAA,QACE;AAAA,QACA,UAAU,KAAK,KAAK,OAAO,MAAM,sBAAsB,SAAS;AAAA;AAAA;AAAA,EAAmB,SAAS,MAAM,CAAC;AAAA;AAAA;AAAA,EAAsB,SAAS,OAAO,SAAS,CAAC;AAAA,MACrJ;AAAA,IACF;AAAA,EACF;AAAA;AAEO,SAAS,SACd,MACA,MACA;AAhIF;AAiIE,0BAAK,MAAM,EAAE,UAAS,UAAK,YAAL,YAAgB,IAAM,GAAG,CAAO,OAAuB,eAAvB,KAAuB,WAAvB,EAAE,MAAM,SAAS,GAAM;AAC3E,UAAM,UAAU,EAAE,MAAM,SAAS,GAAG,IAAI;AAAA,EAC1C,EAAC;AACH;","names":[]}
|