vitest-evals 0.7.0 → 0.9.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +167 -236
- package/dist/harness.d.mts +118 -0
- package/dist/harness.d.ts +118 -0
- package/dist/harness.js +167 -0
- package/dist/harness.js.map +1 -0
- package/dist/harness.mjs +126 -0
- package/dist/harness.mjs.map +1 -0
- package/dist/index.d.mts +114 -3
- package/dist/index.d.ts +114 -3
- package/dist/index.js +682 -398
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +672 -407
- package/dist/index.mjs.map +1 -1
- package/dist/internal/matchers.d.mts +41 -0
- package/dist/internal/matchers.d.ts +41 -0
- package/dist/internal/matchers.js +206 -0
- package/dist/internal/matchers.js.map +1 -0
- package/dist/internal/matchers.mjs +176 -0
- package/dist/internal/matchers.mjs.map +1 -0
- package/dist/internal/scoring.d.mts +18 -0
- package/dist/internal/scoring.d.ts +18 -0
- package/dist/internal/scoring.js +19 -0
- package/dist/internal/scoring.js.map +1 -0
- package/dist/internal/scoring.mjs +1 -0
- package/dist/internal/scoring.mjs.map +1 -0
- package/dist/internal/structuredOutputScorer.d.mts +16 -0
- package/dist/internal/structuredOutputScorer.d.ts +16 -0
- package/dist/{scorers → internal}/structuredOutputScorer.js +94 -80
- package/dist/internal/structuredOutputScorer.js.map +1 -0
- package/dist/{scorers → internal}/structuredOutputScorer.mjs +93 -80
- package/dist/internal/structuredOutputScorer.mjs.map +1 -0
- package/dist/internal/toolCallScorer.d.mts +20 -0
- package/dist/internal/toolCallScorer.d.ts +20 -0
- package/dist/{scorers → internal}/toolCallScorer.js +100 -134
- package/dist/internal/toolCallScorer.js.map +1 -0
- package/dist/internal/toolCallScorer.mjs +310 -0
- package/dist/internal/toolCallScorer.mjs.map +1 -0
- package/dist/judges/index.d.mts +8 -0
- package/dist/judges/index.d.ts +8 -0
- package/dist/judges/index.js +550 -0
- package/dist/judges/index.js.map +1 -0
- package/dist/judges/index.mjs +522 -0
- package/dist/judges/index.mjs.map +1 -0
- package/dist/judges/structuredOutputJudge.d.mts +20 -0
- package/dist/judges/structuredOutputJudge.d.ts +20 -0
- package/dist/judges/structuredOutputJudge.js +340 -0
- package/dist/judges/structuredOutputJudge.js.map +1 -0
- package/dist/judges/structuredOutputJudge.mjs +313 -0
- package/dist/judges/structuredOutputJudge.mjs.map +1 -0
- package/dist/judges/toolCallJudge.d.mts +22 -0
- package/dist/judges/toolCallJudge.d.ts +22 -0
- package/dist/judges/toolCallJudge.js +357 -0
- package/dist/judges/toolCallJudge.js.map +1 -0
- package/dist/judges/toolCallJudge.mjs +330 -0
- package/dist/judges/toolCallJudge.mjs.map +1 -0
- package/dist/judges/types.d.mts +39 -0
- package/dist/judges/types.d.ts +39 -0
- package/dist/judges/types.js +19 -0
- package/dist/judges/types.js.map +1 -0
- package/dist/judges/types.mjs +1 -0
- package/dist/judges/types.mjs.map +1 -0
- package/dist/{evaluate → legacy/evaluate}/index.d.mts +7 -0
- package/dist/{evaluate → legacy/evaluate}/index.d.ts +7 -0
- package/dist/{evaluate → legacy/evaluate}/index.js +65 -89
- package/dist/legacy/evaluate/index.js.map +1 -0
- package/dist/legacy/evaluate/index.mjs +138 -0
- package/dist/legacy/evaluate/index.mjs.map +1 -0
- package/dist/legacy/format.d.mts +16 -0
- package/dist/legacy/format.d.ts +16 -0
- package/dist/legacy/format.js +69 -0
- package/dist/legacy/format.js.map +1 -0
- package/dist/legacy/format.mjs +43 -0
- package/dist/legacy/format.mjs.map +1 -0
- package/dist/legacy/scorers/index.d.mts +4 -0
- package/dist/legacy/scorers/index.d.ts +4 -0
- package/dist/{scorers → legacy/scorers}/index.js +164 -162
- package/dist/legacy/scorers/index.js.map +1 -0
- package/dist/{scorers → legacy/scorers}/index.mjs +163 -164
- package/dist/legacy/scorers/index.mjs.map +1 -0
- package/dist/legacy/scorers/structuredOutputScorer.d.mts +20 -0
- package/dist/legacy/scorers/structuredOutputScorer.d.ts +20 -0
- package/dist/legacy/scorers/structuredOutputScorer.js +320 -0
- package/dist/legacy/scorers/structuredOutputScorer.js.map +1 -0
- package/dist/legacy/scorers/structuredOutputScorer.mjs +293 -0
- package/dist/legacy/scorers/structuredOutputScorer.mjs.map +1 -0
- package/dist/legacy/scorers/toolCallScorer.d.mts +23 -0
- package/dist/legacy/scorers/toolCallScorer.d.ts +23 -0
- package/dist/{scorers/toolCallScorer.mjs → legacy/scorers/toolCallScorer.js} +129 -134
- package/dist/legacy/scorers/toolCallScorer.js.map +1 -0
- package/dist/legacy/scorers/toolCallScorer.mjs +315 -0
- package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -0
- package/dist/legacy/scorers/utils.d.mts +1 -0
- package/dist/legacy/scorers/utils.d.ts +1 -0
- package/dist/{scorers → legacy/scorers}/utils.js +73 -41
- package/dist/legacy/scorers/utils.js.map +1 -0
- package/dist/{scorers → legacy/scorers}/utils.mjs +71 -41
- package/dist/legacy/scorers/utils.mjs.map +1 -0
- package/dist/legacy/shared.d.mts +31 -0
- package/dist/legacy/shared.d.ts +31 -0
- package/dist/legacy/shared.js +19 -0
- package/dist/legacy/shared.js.map +1 -0
- package/dist/legacy/shared.mjs +1 -0
- package/dist/legacy/shared.mjs.map +1 -0
- package/dist/legacy.d.mts +34 -0
- package/dist/legacy.d.ts +34 -0
- package/dist/legacy.js +751 -0
- package/dist/legacy.js.map +1 -0
- package/dist/legacy.mjs +727 -0
- package/dist/legacy.mjs.map +1 -0
- package/dist/replay.d.mts +60 -0
- package/dist/replay.d.ts +60 -0
- package/dist/replay.js +228 -0
- package/dist/replay.js.map +1 -0
- package/dist/replay.mjs +201 -0
- package/dist/replay.mjs.map +1 -0
- package/dist/reporter.d.mts +46 -3
- package/dist/reporter.d.ts +46 -3
- package/dist/reporter.js +518 -10
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs +518 -10
- package/dist/reporter.mjs.map +1 -1
- package/package.json +29 -61
- package/dist/evaluate/index.js.map +0 -1
- package/dist/evaluate/index.mjs +0 -163
- package/dist/evaluate/index.mjs.map +0 -1
- package/dist/scorers/index.d.mts +0 -4
- package/dist/scorers/index.d.ts +0 -4
- package/dist/scorers/index.js.map +0 -1
- package/dist/scorers/index.mjs.map +0 -1
- package/dist/scorers/structuredOutputScorer.d.mts +0 -4
- package/dist/scorers/structuredOutputScorer.d.ts +0 -4
- package/dist/scorers/structuredOutputScorer.js.map +0 -1
- package/dist/scorers/structuredOutputScorer.mjs.map +0 -1
- package/dist/scorers/toolCallScorer.d.mts +0 -315
- package/dist/scorers/toolCallScorer.d.ts +0 -315
- package/dist/scorers/toolCallScorer.js.map +0 -1
- package/dist/scorers/toolCallScorer.mjs.map +0 -1
- package/dist/scorers/utils.d.mts +0 -103
- package/dist/scorers/utils.d.ts +0 -103
- package/dist/scorers/utils.js.map +0 -1
- package/dist/scorers/utils.mjs.map +0 -1
package/README.md
CHANGED
|
@@ -1,281 +1,212 @@
|
|
|
1
1
|
# vitest-evals
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Harness-backed AI testing on top of Vitest.
|
|
4
4
|
|
|
5
|
-
##
|
|
5
|
+
## Install
|
|
6
6
|
|
|
7
|
-
```
|
|
7
|
+
```sh
|
|
8
8
|
npm install -D vitest-evals
|
|
9
9
|
```
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
Install a first-party harness package for the runtime you want to test:
|
|
12
12
|
|
|
13
|
-
```
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
data: async () => [
|
|
18
|
-
{ input: "Deploy the latest release to production", expected: "deployed" },
|
|
19
|
-
{ input: "Roll back the last deploy", expected: "rolled back" },
|
|
20
|
-
],
|
|
21
|
-
task: async (input) => {
|
|
22
|
-
const response = await myAgent.run(input);
|
|
23
|
-
return response;
|
|
24
|
-
},
|
|
25
|
-
scorers: [
|
|
26
|
-
async ({ output, expected }) => ({
|
|
27
|
-
score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
|
|
28
|
-
}),
|
|
29
|
-
],
|
|
30
|
-
threshold: 0.8,
|
|
31
|
-
});
|
|
13
|
+
```sh
|
|
14
|
+
npm install -D @vitest-evals/harness-pi-ai
|
|
15
|
+
# or
|
|
16
|
+
npm install -D @vitest-evals/harness-ai-sdk
|
|
32
17
|
```
|
|
33
18
|
|
|
34
|
-
##
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
19
|
+
## Core Model
|
|
20
|
+
|
|
21
|
+
- `describeEval(...)` binds exactly one harness to a suite
|
|
22
|
+
- the suite callback receives a fixture-backed Vitest `it`
|
|
23
|
+
- `run(input, { metadata? })` executes the harness explicitly and returns a
|
|
24
|
+
normalized `HarnessRun`
|
|
25
|
+
- the returned `result.output` is the app-facing value you assert on directly
|
|
26
|
+
- the returned `result.session` is the canonical JSON-serializable trace for
|
|
27
|
+
reporting, replay, tool assertions, and judges
|
|
28
|
+
- per-run judge inputs should usually live under `metadata`
|
|
29
|
+
- suite-level `judges` are optional and run automatically after each `run(...)`
|
|
30
|
+
- suite-level `judgeThreshold` controls fail-on-score for those automatic judges
|
|
31
|
+
- explicit judge assertions use
|
|
32
|
+
`await expect(result).toSatisfyJudge(judge, context)`
|
|
33
|
+
|
|
34
|
+
## Explicit Run Example
|
|
35
|
+
|
|
36
|
+
```ts
|
|
37
|
+
import { expect } from "vitest";
|
|
38
|
+
import { piAiHarness } from "@vitest-evals/harness-pi-ai";
|
|
39
|
+
import {
|
|
40
|
+
describeEval,
|
|
41
|
+
namedJudge,
|
|
42
|
+
toolCalls,
|
|
43
|
+
type JudgeContext,
|
|
44
|
+
} from "vitest-evals";
|
|
45
|
+
import { createRefundAgent } from "../src/refundAgent";
|
|
46
|
+
|
|
47
|
+
type RefundEvalMetadata = {
|
|
48
|
+
expectedStatus: "approved" | "denied";
|
|
49
|
+
expectedTools: string[];
|
|
50
|
+
};
|
|
65
51
|
|
|
66
|
-
|
|
52
|
+
const FactualityJudge = namedJudge(
|
|
53
|
+
"FactualityJudge",
|
|
54
|
+
async ({
|
|
55
|
+
input,
|
|
56
|
+
output,
|
|
57
|
+
metadata,
|
|
58
|
+
}: JudgeContext<string, RefundEvalMetadata>) => {
|
|
59
|
+
const verdict = await judgeFactuality({
|
|
60
|
+
question: input,
|
|
61
|
+
answer: output,
|
|
62
|
+
expectedStatus: metadata.expectedStatus,
|
|
63
|
+
});
|
|
64
|
+
|
|
65
|
+
return {
|
|
66
|
+
score: verdict.score,
|
|
67
|
+
metadata: {
|
|
68
|
+
rationale: verdict.rationale,
|
|
69
|
+
},
|
|
70
|
+
};
|
|
71
|
+
},
|
|
72
|
+
);
|
|
67
73
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
74
|
+
describeEval(
|
|
75
|
+
"refund agent",
|
|
76
|
+
{
|
|
77
|
+
harness: piAiHarness({
|
|
78
|
+
createAgent: () => createRefundAgent(),
|
|
79
|
+
}),
|
|
80
|
+
judges: [FactualityJudge],
|
|
72
81
|
},
|
|
73
|
-
|
|
74
|
-
|
|
82
|
+
(it) => {
|
|
83
|
+
it("approves a refundable invoice", async ({ run }) => {
|
|
84
|
+
const result = await run("Refund invoice inv_123", {
|
|
85
|
+
metadata: {
|
|
86
|
+
expectedStatus: "approved",
|
|
87
|
+
expectedTools: ["lookupInvoice", "createRefund"],
|
|
88
|
+
},
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
expect(result.output).toMatchObject({ status: "approved" });
|
|
92
|
+
expect(toolCalls(result.session).map((call) => call.name)).toEqual([
|
|
93
|
+
"lookupInvoice",
|
|
94
|
+
"createRefund",
|
|
95
|
+
]);
|
|
96
|
+
});
|
|
75
97
|
},
|
|
76
|
-
|
|
77
|
-
task: myAgentTask,
|
|
78
|
-
scorers: [async ({ output }) => ({ score: output.includes("error") ? 1.0 : 0.0 })],
|
|
79
|
-
});
|
|
98
|
+
);
|
|
80
99
|
```
|
|
81
100
|
|
|
82
|
-
##
|
|
83
|
-
|
|
84
|
-
Scorers evaluate outputs and return a score (0-1). Use built-in scorers or create your own.
|
|
85
|
-
|
|
86
|
-
### ToolCallScorer
|
|
101
|
+
## Table-Driven Vitest Style
|
|
87
102
|
|
|
88
|
-
|
|
103
|
+
If you want case tables, use Vitest's own `it.for(...)` and call `run(...)`
|
|
104
|
+
inside the test body:
|
|
89
105
|
|
|
90
|
-
```
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
describeEval("tool usage", {
|
|
94
|
-
data: async () => [
|
|
106
|
+
```ts
|
|
107
|
+
describeEval("refund agent", { harness }, (it) => {
|
|
108
|
+
it.for([
|
|
95
109
|
{
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
{ name: "filter", arguments: { cuisine: "italian" } },
|
|
100
|
-
],
|
|
110
|
+
name: "approves refundable invoice",
|
|
111
|
+
input: "Refund invoice inv_123",
|
|
112
|
+
expectedStatus: "approved",
|
|
101
113
|
},
|
|
102
|
-
],
|
|
103
|
-
task: myTask,
|
|
104
|
-
scorers: [ToolCallScorer()],
|
|
105
|
-
});
|
|
106
|
-
|
|
107
|
-
// Strict order and parameters
|
|
108
|
-
scorers: [ToolCallScorer({ ordered: true, params: "strict" })];
|
|
109
|
-
|
|
110
|
-
// Flexible evaluation
|
|
111
|
-
scorers: [ToolCallScorer({ requireAll: false, allowExtras: false })];
|
|
112
|
-
```
|
|
113
|
-
|
|
114
|
-
**Default behavior:**
|
|
115
|
-
|
|
116
|
-
- Strict parameter matching (exact equality required)
|
|
117
|
-
- Any order allowed
|
|
118
|
-
- Extra tools allowed
|
|
119
|
-
- All expected tools required
|
|
120
|
-
|
|
121
|
-
### StructuredOutputScorer
|
|
122
|
-
|
|
123
|
-
Evaluates if the output matches expected structured data (JSON).
|
|
124
|
-
|
|
125
|
-
```javascript
|
|
126
|
-
import { StructuredOutputScorer } from "vitest-evals";
|
|
127
|
-
|
|
128
|
-
describeEval("query generation", {
|
|
129
|
-
data: async () => [
|
|
130
114
|
{
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
query: "",
|
|
135
|
-
sort: "-timestamp",
|
|
136
|
-
timeRange: { statsPeriod: "24h" },
|
|
137
|
-
},
|
|
115
|
+
name: "denies non-refundable invoice",
|
|
116
|
+
input: "Refund invoice inv_404",
|
|
117
|
+
expectedStatus: "denied",
|
|
138
118
|
},
|
|
139
|
-
],
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
});
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
scorers: [
|
|
149
|
-
StructuredOutputScorer({
|
|
150
|
-
match: (expected, actual, key) => {
|
|
151
|
-
if (key === "age") return actual >= 18 && actual <= 100;
|
|
152
|
-
return expected === actual;
|
|
153
|
-
},
|
|
154
|
-
}),
|
|
155
|
-
];
|
|
156
|
-
```
|
|
157
|
-
|
|
158
|
-
### Custom Scorers
|
|
159
|
-
|
|
160
|
-
```javascript
|
|
161
|
-
// Inline scorer
|
|
162
|
-
const LengthScorer = async ({ output }) => ({
|
|
163
|
-
score: output.length > 50 ? 1.0 : 0.0,
|
|
164
|
-
});
|
|
165
|
-
|
|
166
|
-
// TypeScript scorer with custom options
|
|
167
|
-
import { type ScoreFn, type BaseScorerOptions } from "vitest-evals";
|
|
168
|
-
|
|
169
|
-
interface CustomOptions extends BaseScorerOptions {
|
|
170
|
-
minLength: number;
|
|
171
|
-
}
|
|
172
|
-
|
|
173
|
-
const TypedScorer: ScoreFn<CustomOptions> = async (opts) => ({
|
|
174
|
-
score: opts.output.length >= opts.minLength ? 1.0 : 0.0,
|
|
119
|
+
])("$name", async ({ input, ...metadata }, { run }) => {
|
|
120
|
+
const result = await run(input, {
|
|
121
|
+
metadata,
|
|
122
|
+
});
|
|
123
|
+
|
|
124
|
+
expect(result.output).toMatchObject({
|
|
125
|
+
status: metadata.expectedStatus,
|
|
126
|
+
});
|
|
127
|
+
});
|
|
175
128
|
});
|
|
176
129
|
```
|
|
177
130
|
|
|
178
|
-
##
|
|
179
|
-
|
|
180
|
-
See [`src/ai-sdk-integration.test.ts`](src/ai-sdk-integration.test.ts) for a complete example with the Vercel AI SDK.
|
|
131
|
+
## Existing Agents
|
|
181
132
|
|
|
182
|
-
|
|
133
|
+
For an existing agent, the intended contract is:
|
|
183
134
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
tools: { myTool: myToolDefinition },
|
|
189
|
-
});
|
|
190
|
-
|
|
191
|
-
return {
|
|
192
|
-
result: text,
|
|
193
|
-
toolCalls: steps
|
|
194
|
-
.flatMap((step) => step.toolCalls)
|
|
195
|
-
.map((call) => ({
|
|
196
|
-
name: call.toolName,
|
|
197
|
-
arguments: call.args,
|
|
198
|
-
})),
|
|
199
|
-
};
|
|
200
|
-
```
|
|
135
|
+
- pass the agent instance or per-test factory through the harness
|
|
136
|
+
- optionally pass `run` when the app entrypoint is not `run(input, runtime)`
|
|
137
|
+
- let the harness infer native tools from the existing agent by default
|
|
138
|
+
- only pass an explicit `tools` override when the agent hides its tool surface
|
|
201
139
|
|
|
202
|
-
|
|
140
|
+
The harness owns normalization, diagnostics, tool capture, replay plumbing, and
|
|
141
|
+
reporter-facing artifacts. Your app just needs one runtime seam where those
|
|
142
|
+
wrapped pieces can be injected.
|
|
203
143
|
|
|
204
|
-
|
|
144
|
+
For the Pi-specific harness, output/session/usage normalization should usually
|
|
145
|
+
be inferred automatically. Treat low-level normalization callbacks as an escape
|
|
146
|
+
hatch, not part of the primary authoring path.
|
|
205
147
|
|
|
206
|
-
|
|
148
|
+
## Judge Matchers
|
|
207
149
|
|
|
208
|
-
|
|
209
|
-
|
|
150
|
+
Use the matcher when a judge should behave like a normal Vitest assertion.
|
|
151
|
+
In practice, this is usually most useful for factuality, rubric, or grounded
|
|
152
|
+
answer checks:
|
|
210
153
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
ClosedQA.partial({
|
|
214
|
-
criteria: "Does the answer mention Paris?",
|
|
215
|
-
}),
|
|
216
|
-
];
|
|
154
|
+
```ts
|
|
155
|
+
await expect(result).toSatisfyJudge(FactualityJudge);
|
|
217
156
|
```
|
|
218
157
|
|
|
219
|
-
|
|
158
|
+
For lower-level cases, the matcher also accepts raw values and synthetic judge
|
|
159
|
+
context:
|
|
220
160
|
|
|
221
|
-
```
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
// ...
|
|
161
|
+
```ts
|
|
162
|
+
await expect({ status: "approved" }).toSatisfyJudge(MyJudge, {
|
|
163
|
+
inputValue: "Refund invoice inv_123",
|
|
225
164
|
});
|
|
226
165
|
```
|
|
227
166
|
|
|
228
|
-
|
|
167
|
+
If you are writing a custom judge, wrap it with `namedJudge(...)` so reporter
|
|
168
|
+
output uses a stable label:
|
|
229
169
|
|
|
230
|
-
|
|
170
|
+
```ts
|
|
171
|
+
import { namedJudge } from "vitest-evals";
|
|
231
172
|
|
|
232
|
-
|
|
173
|
+
const FactualityJudge = namedJudge(
|
|
174
|
+
"FactualityJudge",
|
|
175
|
+
async ({ output }) => {
|
|
176
|
+
const answer = output;
|
|
177
|
+
const verdict = await judgeFactuality(answer);
|
|
233
178
|
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
});
|
|
241
|
-
|
|
242
|
-
expect("What is the capital of France?").toEval(
|
|
243
|
-
"Paris",
|
|
244
|
-
answerQuestion,
|
|
245
|
-
simpleFactuality,
|
|
246
|
-
0.8
|
|
247
|
-
);
|
|
248
|
-
});
|
|
249
|
-
```
|
|
250
|
-
|
|
251
|
-
## Configuration
|
|
252
|
-
|
|
253
|
-
### Separate Eval Configuration
|
|
254
|
-
|
|
255
|
-
Create `vitest.evals.config.ts`:
|
|
256
|
-
|
|
257
|
-
```javascript
|
|
258
|
-
import { defineConfig } from "vitest/config";
|
|
259
|
-
import defaultConfig from "./vitest.config";
|
|
260
|
-
|
|
261
|
-
export default defineConfig({
|
|
262
|
-
...defaultConfig,
|
|
263
|
-
test: {
|
|
264
|
-
...defaultConfig.test,
|
|
265
|
-
include: ["src/**/*.eval.{js,ts}"],
|
|
179
|
+
return {
|
|
180
|
+
score: verdict.score,
|
|
181
|
+
metadata: {
|
|
182
|
+
rationale: verdict.rationale,
|
|
183
|
+
},
|
|
184
|
+
};
|
|
266
185
|
},
|
|
267
|
-
|
|
186
|
+
);
|
|
268
187
|
```
|
|
269
188
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
189
|
+
For a `HarnessRun`, `toSatisfyJudge(...)` passes `result.output` as `output`.
|
|
190
|
+
For raw values or normalized sessions, the matcher infers the best available
|
|
191
|
+
output from the received value. Structured or programmatic result checks should
|
|
192
|
+
usually assert on `result.output` directly. When a judge needs richer context,
|
|
193
|
+
type it with `JudgeContext` and read `inputValue`, `metadata`, `toolCalls`, or
|
|
194
|
+
`session` from there.
|
|
195
|
+
|
|
196
|
+
When you only need deterministic contract checks, built-ins such as
|
|
197
|
+
`StructuredOutputJudge()` and `ToolCallJudge()` are still available. The primary
|
|
198
|
+
documentation examples intentionally use factuality/rubric judges because those
|
|
199
|
+
match the product's LLM-as-a-judge direction.
|
|
200
|
+
|
|
201
|
+
## Legacy Compatibility
|
|
202
|
+
|
|
203
|
+
The root package is harness-first and judge-first. Legacy scorer-first suites
|
|
204
|
+
and `evaluate(...)` live under `vitest-evals/legacy`.
|
|
205
|
+
|
|
206
|
+
```ts
|
|
207
|
+
import {
|
|
208
|
+
describeEval,
|
|
209
|
+
StructuredOutputScorer,
|
|
210
|
+
ToolCallScorer,
|
|
211
|
+
} from "vitest-evals/legacy";
|
|
281
212
|
```
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
type JsonPrimitive = string | number | boolean | null;
|
|
2
|
+
type JsonValue = JsonPrimitive | JsonValue[] | {
|
|
3
|
+
[key: string]: JsonValue;
|
|
4
|
+
};
|
|
5
|
+
type ToolCallRecord = {
|
|
6
|
+
id?: string;
|
|
7
|
+
name: string;
|
|
8
|
+
arguments?: Record<string, JsonValue>;
|
|
9
|
+
result?: JsonValue;
|
|
10
|
+
error?: {
|
|
11
|
+
message: string;
|
|
12
|
+
type?: string;
|
|
13
|
+
[key: string]: JsonValue | undefined;
|
|
14
|
+
};
|
|
15
|
+
startedAt?: string;
|
|
16
|
+
finishedAt?: string;
|
|
17
|
+
durationMs?: number;
|
|
18
|
+
metadata?: Record<string, JsonValue>;
|
|
19
|
+
};
|
|
20
|
+
type NormalizedMessage = {
|
|
21
|
+
role: "system" | "user" | "assistant" | "tool";
|
|
22
|
+
content?: JsonValue;
|
|
23
|
+
toolCalls?: ToolCallRecord[];
|
|
24
|
+
metadata?: Record<string, JsonValue>;
|
|
25
|
+
};
|
|
26
|
+
type UsageSummary = {
|
|
27
|
+
provider?: string;
|
|
28
|
+
model?: string;
|
|
29
|
+
inputTokens?: number;
|
|
30
|
+
outputTokens?: number;
|
|
31
|
+
reasoningTokens?: number;
|
|
32
|
+
totalTokens?: number;
|
|
33
|
+
estimatedCost?: number;
|
|
34
|
+
toolCalls?: number;
|
|
35
|
+
retries?: number;
|
|
36
|
+
metadata?: Record<string, JsonValue>;
|
|
37
|
+
};
|
|
38
|
+
type TimingSummary = {
|
|
39
|
+
totalMs?: number;
|
|
40
|
+
metadata?: Record<string, JsonValue>;
|
|
41
|
+
};
|
|
42
|
+
type NormalizedSession = {
|
|
43
|
+
messages: NormalizedMessage[];
|
|
44
|
+
outputText?: string;
|
|
45
|
+
provider?: string;
|
|
46
|
+
model?: string;
|
|
47
|
+
metadata?: Record<string, JsonValue>;
|
|
48
|
+
};
|
|
49
|
+
type HarnessRun = {
|
|
50
|
+
session: NormalizedSession;
|
|
51
|
+
output?: JsonValue;
|
|
52
|
+
usage: UsageSummary;
|
|
53
|
+
timings?: TimingSummary;
|
|
54
|
+
artifacts?: Record<string, JsonValue>;
|
|
55
|
+
errors: Array<Record<string, JsonValue>>;
|
|
56
|
+
};
|
|
57
|
+
type HarnessPromptOptions = {
|
|
58
|
+
system?: string;
|
|
59
|
+
metadata?: Record<string, JsonValue>;
|
|
60
|
+
};
|
|
61
|
+
type HarnessPrompt = (input: string, options?: HarnessPromptOptions) => Promise<string>;
|
|
62
|
+
type HarnessRuntime = {
|
|
63
|
+
prompt: HarnessPrompt;
|
|
64
|
+
};
|
|
65
|
+
type HarnessRunError = Error & {
|
|
66
|
+
vitestEvalsRun: HarnessRun;
|
|
67
|
+
};
|
|
68
|
+
type HarnessMetadata = Record<string, unknown>;
|
|
69
|
+
type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
70
|
+
metadata: Readonly<TMetadata>;
|
|
71
|
+
task: {
|
|
72
|
+
meta: Record<string, unknown>;
|
|
73
|
+
};
|
|
74
|
+
signal?: AbortSignal;
|
|
75
|
+
artifacts: Record<string, JsonValue>;
|
|
76
|
+
setArtifact: (name: string, value: JsonValue) => void;
|
|
77
|
+
};
|
|
78
|
+
type Harness<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
79
|
+
name: string;
|
|
80
|
+
prompt?: HarnessPrompt;
|
|
81
|
+
run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun>;
|
|
82
|
+
};
|
|
83
|
+
/** Returns true when a value exposes a callable method with the given name. */
|
|
84
|
+
declare function hasCallableMethod(value: unknown, methodName: string): boolean;
|
|
85
|
+
/** Normalizes an unknown value into the JSON-safe shape used by harness runs. */
|
|
86
|
+
declare function toJsonValue(value: unknown): JsonValue | undefined;
|
|
87
|
+
/** Drops non-JSON properties from a record while preserving valid values. */
|
|
88
|
+
declare function normalizeRecord(value: Record<string, unknown>): Record<string, JsonValue>;
|
|
89
|
+
/** Normalizes metadata and omits the field entirely when nothing survives. */
|
|
90
|
+
declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
|
|
91
|
+
/** Converts arbitrary content into the JSON-safe message content shape. */
|
|
92
|
+
declare function normalizeContent(value: unknown): JsonValue;
|
|
93
|
+
/** Flattens every recorded tool call from a normalized session. */
|
|
94
|
+
declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
|
|
95
|
+
/** Filters normalized session messages by role. */
|
|
96
|
+
declare function messagesByRole(session: NormalizedSession, role: NormalizedMessage["role"]): NormalizedMessage[];
|
|
97
|
+
/** Returns every normalized system message from a session. */
|
|
98
|
+
declare function systemMessages(session: NormalizedSession): NormalizedMessage[];
|
|
99
|
+
/** Returns every normalized user message from a session. */
|
|
100
|
+
declare function userMessages(session: NormalizedSession): NormalizedMessage[];
|
|
101
|
+
/** Returns every normalized assistant message from a session. */
|
|
102
|
+
declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
|
|
103
|
+
/** Returns every normalized tool message from a session. */
|
|
104
|
+
declare function toolMessages(session: NormalizedSession): NormalizedMessage[];
|
|
105
|
+
/** Attaches a partial or complete harness run to an arbitrary thrown error. */
|
|
106
|
+
declare function attachHarnessRunToError(error: unknown, run: HarnessRun): HarnessRunError;
|
|
107
|
+
/** Reads an attached harness run back off a previously wrapped error value. */
|
|
108
|
+
declare function getHarnessRunFromError(error: unknown): HarnessRun | undefined;
|
|
109
|
+
/** Returns true when a value matches the normalized `HarnessRun` contract. */
|
|
110
|
+
declare function isHarnessRun(value: unknown): value is HarnessRun;
|
|
111
|
+
/** Returns true when a value matches the normalized session contract. */
|
|
112
|
+
declare function isNormalizedSession(value: unknown): value is NormalizedSession;
|
|
113
|
+
/** Reuses pre-normalized harness errors when a runtime already returns them. */
|
|
114
|
+
declare function resolveHarnessRunErrors(result: unknown): Array<Record<string, JsonValue>>;
|
|
115
|
+
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
116
|
+
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
117
|
+
|
|
118
|
+
export { type Harness, type HarnessContext, type HarnessMetadata, type HarnessPrompt, type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type HarnessRuntime, type JsonPrimitive, type JsonValue, type NormalizedMessage, type NormalizedSession, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
type JsonPrimitive = string | number | boolean | null;
|
|
2
|
+
type JsonValue = JsonPrimitive | JsonValue[] | {
|
|
3
|
+
[key: string]: JsonValue;
|
|
4
|
+
};
|
|
5
|
+
type ToolCallRecord = {
|
|
6
|
+
id?: string;
|
|
7
|
+
name: string;
|
|
8
|
+
arguments?: Record<string, JsonValue>;
|
|
9
|
+
result?: JsonValue;
|
|
10
|
+
error?: {
|
|
11
|
+
message: string;
|
|
12
|
+
type?: string;
|
|
13
|
+
[key: string]: JsonValue | undefined;
|
|
14
|
+
};
|
|
15
|
+
startedAt?: string;
|
|
16
|
+
finishedAt?: string;
|
|
17
|
+
durationMs?: number;
|
|
18
|
+
metadata?: Record<string, JsonValue>;
|
|
19
|
+
};
|
|
20
|
+
type NormalizedMessage = {
|
|
21
|
+
role: "system" | "user" | "assistant" | "tool";
|
|
22
|
+
content?: JsonValue;
|
|
23
|
+
toolCalls?: ToolCallRecord[];
|
|
24
|
+
metadata?: Record<string, JsonValue>;
|
|
25
|
+
};
|
|
26
|
+
type UsageSummary = {
|
|
27
|
+
provider?: string;
|
|
28
|
+
model?: string;
|
|
29
|
+
inputTokens?: number;
|
|
30
|
+
outputTokens?: number;
|
|
31
|
+
reasoningTokens?: number;
|
|
32
|
+
totalTokens?: number;
|
|
33
|
+
estimatedCost?: number;
|
|
34
|
+
toolCalls?: number;
|
|
35
|
+
retries?: number;
|
|
36
|
+
metadata?: Record<string, JsonValue>;
|
|
37
|
+
};
|
|
38
|
+
type TimingSummary = {
|
|
39
|
+
totalMs?: number;
|
|
40
|
+
metadata?: Record<string, JsonValue>;
|
|
41
|
+
};
|
|
42
|
+
type NormalizedSession = {
|
|
43
|
+
messages: NormalizedMessage[];
|
|
44
|
+
outputText?: string;
|
|
45
|
+
provider?: string;
|
|
46
|
+
model?: string;
|
|
47
|
+
metadata?: Record<string, JsonValue>;
|
|
48
|
+
};
|
|
49
|
+
type HarnessRun = {
|
|
50
|
+
session: NormalizedSession;
|
|
51
|
+
output?: JsonValue;
|
|
52
|
+
usage: UsageSummary;
|
|
53
|
+
timings?: TimingSummary;
|
|
54
|
+
artifacts?: Record<string, JsonValue>;
|
|
55
|
+
errors: Array<Record<string, JsonValue>>;
|
|
56
|
+
};
|
|
57
|
+
type HarnessPromptOptions = {
|
|
58
|
+
system?: string;
|
|
59
|
+
metadata?: Record<string, JsonValue>;
|
|
60
|
+
};
|
|
61
|
+
type HarnessPrompt = (input: string, options?: HarnessPromptOptions) => Promise<string>;
|
|
62
|
+
type HarnessRuntime = {
|
|
63
|
+
prompt: HarnessPrompt;
|
|
64
|
+
};
|
|
65
|
+
type HarnessRunError = Error & {
|
|
66
|
+
vitestEvalsRun: HarnessRun;
|
|
67
|
+
};
|
|
68
|
+
type HarnessMetadata = Record<string, unknown>;
|
|
69
|
+
type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
70
|
+
metadata: Readonly<TMetadata>;
|
|
71
|
+
task: {
|
|
72
|
+
meta: Record<string, unknown>;
|
|
73
|
+
};
|
|
74
|
+
signal?: AbortSignal;
|
|
75
|
+
artifacts: Record<string, JsonValue>;
|
|
76
|
+
setArtifact: (name: string, value: JsonValue) => void;
|
|
77
|
+
};
|
|
78
|
+
type Harness<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
79
|
+
name: string;
|
|
80
|
+
prompt?: HarnessPrompt;
|
|
81
|
+
run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun>;
|
|
82
|
+
};
|
|
83
|
+
/** Returns true when a value exposes a callable method with the given name. */
|
|
84
|
+
declare function hasCallableMethod(value: unknown, methodName: string): boolean;
|
|
85
|
+
/** Normalizes an unknown value into the JSON-safe shape used by harness runs. */
|
|
86
|
+
declare function toJsonValue(value: unknown): JsonValue | undefined;
|
|
87
|
+
/** Drops non-JSON properties from a record while preserving valid values. */
|
|
88
|
+
declare function normalizeRecord(value: Record<string, unknown>): Record<string, JsonValue>;
|
|
89
|
+
/** Normalizes metadata and omits the field entirely when nothing survives. */
|
|
90
|
+
declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
|
|
91
|
+
/** Converts arbitrary content into the JSON-safe message content shape. */
|
|
92
|
+
declare function normalizeContent(value: unknown): JsonValue;
|
|
93
|
+
/** Flattens every recorded tool call from a normalized session. */
|
|
94
|
+
declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
|
|
95
|
+
/** Filters normalized session messages by role. */
|
|
96
|
+
declare function messagesByRole(session: NormalizedSession, role: NormalizedMessage["role"]): NormalizedMessage[];
|
|
97
|
+
/** Returns every normalized system message from a session. */
|
|
98
|
+
declare function systemMessages(session: NormalizedSession): NormalizedMessage[];
|
|
99
|
+
/** Returns every normalized user message from a session. */
|
|
100
|
+
declare function userMessages(session: NormalizedSession): NormalizedMessage[];
|
|
101
|
+
/** Returns every normalized assistant message from a session. */
|
|
102
|
+
declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
|
|
103
|
+
/** Returns every normalized tool message from a session. */
|
|
104
|
+
declare function toolMessages(session: NormalizedSession): NormalizedMessage[];
|
|
105
|
+
/** Attaches a partial or complete harness run to an arbitrary thrown error. */
|
|
106
|
+
declare function attachHarnessRunToError(error: unknown, run: HarnessRun): HarnessRunError;
|
|
107
|
+
/** Reads an attached harness run back off a previously wrapped error value. */
|
|
108
|
+
declare function getHarnessRunFromError(error: unknown): HarnessRun | undefined;
|
|
109
|
+
/** Returns true when a value matches the normalized `HarnessRun` contract. */
|
|
110
|
+
declare function isHarnessRun(value: unknown): value is HarnessRun;
|
|
111
|
+
/** Returns true when a value matches the normalized session contract. */
|
|
112
|
+
declare function isNormalizedSession(value: unknown): value is NormalizedSession;
|
|
113
|
+
/** Reuses pre-normalized harness errors when a runtime already returns them. */
|
|
114
|
+
declare function resolveHarnessRunErrors(result: unknown): Array<Record<string, JsonValue>>;
|
|
115
|
+
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
116
|
+
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
117
|
+
|
|
118
|
+
export { type Harness, type HarnessContext, type HarnessMetadata, type HarnessPrompt, type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type HarnessRuntime, type JsonPrimitive, type JsonValue, type NormalizedMessage, type NormalizedSession, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
|