vitest-evals 0.9.0-beta.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +113 -80
- package/dist/harness.d.mts +341 -20
- package/dist/harness.d.ts +341 -20
- package/dist/harness.js +8 -0
- package/dist/harness.js.map +1 -1
- package/dist/harness.mjs +7 -0
- package/dist/harness.mjs.map +1 -1
- package/dist/index.d.mts +188 -17
- package/dist/index.d.ts +188 -17
- package/dist/index.js +308 -21
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +304 -21
- package/dist/index.mjs.map +1 -1
- package/dist/internal/matchers.d.mts +41 -3
- package/dist/internal/matchers.d.ts +41 -3
- package/dist/internal/matchers.js.map +1 -1
- package/dist/internal/matchers.mjs.map +1 -1
- package/dist/internal/scoring.d.mts +3 -3
- package/dist/internal/scoring.d.ts +3 -3
- package/dist/internal/scoring.js.map +1 -1
- package/dist/internal/structuredOutputScorer.d.mts +4 -0
- package/dist/internal/structuredOutputScorer.d.ts +4 -0
- package/dist/internal/structuredOutputScorer.js.map +1 -1
- package/dist/internal/structuredOutputScorer.mjs.map +1 -1
- package/dist/internal/toolCallScorer.d.mts +6 -0
- package/dist/internal/toolCallScorer.d.ts +6 -0
- package/dist/internal/toolCallScorer.js +42 -2
- package/dist/internal/toolCallScorer.js.map +1 -1
- package/dist/internal/toolCallScorer.mjs +42 -2
- package/dist/internal/toolCallScorer.mjs.map +1 -1
- package/dist/judges/factualityJudge.d.mts +151 -0
- package/dist/judges/factualityJudge.d.ts +151 -0
- package/dist/judges/factualityJudge.js +235 -0
- package/dist/judges/factualityJudge.js.map +1 -0
- package/dist/judges/factualityJudge.mjs +208 -0
- package/dist/judges/factualityJudge.mjs.map +1 -0
- package/dist/judges/index.d.mts +5 -3
- package/dist/judges/index.d.ts +5 -3
- package/dist/judges/index.js +447 -7
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/index.mjs +443 -6
- package/dist/judges/index.mjs.map +1 -1
- package/dist/judges/judgeHarness.d.mts +122 -0
- package/dist/judges/judgeHarness.d.ts +122 -0
- package/dist/judges/judgeHarness.js +303 -0
- package/dist/judges/judgeHarness.js.map +1 -0
- package/dist/judges/judgeHarness.mjs +274 -0
- package/dist/judges/judgeHarness.mjs.map +1 -0
- package/dist/judges/structuredOutputJudge.d.mts +55 -4
- package/dist/judges/structuredOutputJudge.d.ts +55 -4
- package/dist/judges/structuredOutputJudge.js.map +1 -1
- package/dist/judges/structuredOutputJudge.mjs.map +1 -1
- package/dist/judges/toolCallJudge.d.mts +57 -6
- package/dist/judges/toolCallJudge.d.ts +57 -6
- package/dist/judges/toolCallJudge.js +42 -2
- package/dist/judges/toolCallJudge.js.map +1 -1
- package/dist/judges/toolCallJudge.mjs +42 -2
- package/dist/judges/toolCallJudge.mjs.map +1 -1
- package/dist/judges/types.d.mts +100 -8
- package/dist/judges/types.d.ts +100 -8
- package/dist/judges/types.js.map +1 -1
- package/dist/legacy/scorers/index.js +42 -2
- package/dist/legacy/scorers/index.js.map +1 -1
- package/dist/legacy/scorers/index.mjs +42 -2
- package/dist/legacy/scorers/index.mjs.map +1 -1
- package/dist/legacy/scorers/structuredOutputScorer.js.map +1 -1
- package/dist/legacy/scorers/structuredOutputScorer.mjs.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.js +42 -2
- package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.mjs +42 -2
- package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/legacy/scorers/utils.js.map +1 -1
- package/dist/legacy/scorers/utils.mjs.map +1 -1
- package/dist/legacy.js +56 -3
- package/dist/legacy.js.map +1 -1
- package/dist/legacy.mjs +56 -3
- package/dist/legacy.mjs.map +1 -1
- package/dist/replay.js +1 -1
- package/dist/replay.js.map +1 -1
- package/dist/replay.mjs +1 -1
- package/dist/replay.mjs.map +1 -1
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs.map +1 -1
- package/package.json +13 -1
package/README.md
CHANGED
|
@@ -2,6 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
Harness-backed AI testing on top of Vitest.
|
|
4
4
|
|
|
5
|
+
Use this package README for the core authoring model. For a guided setup path,
|
|
6
|
+
runtime-specific harness examples, replay, and GitHub Actions reporting, start
|
|
7
|
+
with the docs site: `https://vitest-evals.sentry.dev/docs`.
|
|
8
|
+
|
|
5
9
|
## Install
|
|
6
10
|
|
|
7
11
|
```sh
|
|
@@ -40,55 +44,28 @@ workflow.
|
|
|
40
44
|
- every judge receives `JudgeContext` with typed `input`, typed `output`, the
|
|
41
45
|
normalized run/session, tool calls, and metadata; `output` is only optional
|
|
42
46
|
when the harness output type includes `undefined`
|
|
43
|
-
- judges own their prompt, rubric,
|
|
44
|
-
`
|
|
45
|
-
when multiple judges share setup
|
|
47
|
+
- judges own their prompt, rubric, and parsing; LLM-backed judges use
|
|
48
|
+
`ctx.runJudge(...)` from a configured `judgeHarness`
|
|
46
49
|
- explicit judge assertions use
|
|
47
50
|
`await expect(result).toSatisfyJudge(judge, context)`
|
|
48
51
|
|
|
49
52
|
## Explicit Run Example
|
|
50
53
|
|
|
51
54
|
```ts
|
|
55
|
+
import { getModel } from "@mariozechner/pi-ai";
|
|
52
56
|
import { expect } from "vitest";
|
|
53
|
-
import { piAiHarness } from "@vitest-evals/harness-pi-ai";
|
|
57
|
+
import { piAiHarness, piAiJudgeHarness } from "@vitest-evals/harness-pi-ai";
|
|
54
58
|
import {
|
|
55
|
-
createJudge,
|
|
56
59
|
describeEval,
|
|
60
|
+
FactualityJudge,
|
|
57
61
|
toolCalls,
|
|
58
|
-
type JudgeContext,
|
|
59
62
|
} from "vitest-evals";
|
|
60
63
|
import { createRefundAgent } from "../src/refundAgent";
|
|
61
64
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
};
|
|
66
|
-
|
|
67
|
-
type RefundOutput = {
|
|
68
|
-
status: "approved" | "denied";
|
|
69
|
-
};
|
|
70
|
-
|
|
71
|
-
const FactualityJudge = createJudge(
|
|
72
|
-
"FactualityJudge",
|
|
73
|
-
async ({
|
|
74
|
-
input,
|
|
75
|
-
output,
|
|
76
|
-
metadata,
|
|
77
|
-
}: JudgeContext<string, RefundOutput, RefundEvalMetadata>) => {
|
|
78
|
-
const verdict = await judgeFactuality({
|
|
79
|
-
question: input,
|
|
80
|
-
answer: output,
|
|
81
|
-
expectedStatus: metadata.expectedStatus,
|
|
82
|
-
});
|
|
83
|
-
|
|
84
|
-
return {
|
|
85
|
-
score: verdict.score,
|
|
86
|
-
metadata: {
|
|
87
|
-
rationale: verdict.rationale,
|
|
88
|
-
},
|
|
89
|
-
};
|
|
90
|
-
},
|
|
91
|
-
);
|
|
65
|
+
const judgeHarness = piAiJudgeHarness({
|
|
66
|
+
model: getModel("anthropic", "claude-sonnet-4-5"),
|
|
67
|
+
temperature: 0,
|
|
68
|
+
});
|
|
92
69
|
|
|
93
70
|
describeEval(
|
|
94
71
|
"refund agent",
|
|
@@ -96,12 +73,15 @@ describeEval(
|
|
|
96
73
|
harness: piAiHarness({
|
|
97
74
|
agent: () => createRefundAgent(),
|
|
98
75
|
}),
|
|
99
|
-
|
|
76
|
+
judgeHarness,
|
|
77
|
+
judges: [FactualityJudge()],
|
|
78
|
+
judgeThreshold: 0.6,
|
|
100
79
|
},
|
|
101
80
|
(it) => {
|
|
102
81
|
it("approves a refundable invoice", async ({ run }) => {
|
|
103
82
|
const result = await run("Refund invoice inv_123", {
|
|
104
83
|
metadata: {
|
|
84
|
+
expected: "The refund request is approved.",
|
|
105
85
|
expectedStatus: "approved",
|
|
106
86
|
expectedTools: ["lookupInvoice", "createRefund"],
|
|
107
87
|
},
|
|
@@ -153,7 +133,7 @@ Use Vitest JSON as the eval report artifact. It preserves the `meta` field that
|
|
|
153
133
|
contains eval scores and normalized harness runs.
|
|
154
134
|
|
|
155
135
|
```sh
|
|
156
|
-
vitest run evals \
|
|
136
|
+
vitest run --config vitest.evals.config.ts \
|
|
157
137
|
--reporter=vitest-evals/reporter \
|
|
158
138
|
--reporter=json \
|
|
159
139
|
--outputFile.json=vitest-results.json
|
|
@@ -210,6 +190,7 @@ When generics are needed, use `createHarness<Input, Output, Metadata>(...)`.
|
|
|
210
190
|
import {
|
|
211
191
|
createHarness,
|
|
212
192
|
createJudge,
|
|
193
|
+
createJudgeHarness,
|
|
213
194
|
describeEval,
|
|
214
195
|
type JudgeContext,
|
|
215
196
|
} from "vitest-evals";
|
|
@@ -255,14 +236,25 @@ const appHarness = createHarness<AppEvalInput, AppOutput, AppEvalMetadata>({
|
|
|
255
236
|
},
|
|
256
237
|
});
|
|
257
238
|
|
|
239
|
+
const judgeHarness = createJudgeHarness({
|
|
240
|
+
name: "app-rubric-judge-model",
|
|
241
|
+
run: async ({ prompt }, { signal }) =>
|
|
242
|
+
promptJudgeModel({ prompt, signal }),
|
|
243
|
+
});
|
|
244
|
+
|
|
258
245
|
const AppRubricJudge = createJudge(
|
|
259
246
|
"AppRubricJudge",
|
|
260
247
|
async (ctx: JudgeContext<AppEvalInput, AppOutput, AppEvalMetadata>) => {
|
|
261
|
-
|
|
248
|
+
if (!ctx.runJudge) {
|
|
249
|
+
throw new Error("AppRubricJudge requires a configured judgeHarness.");
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
const verdict = await ctx.runJudge({
|
|
262
253
|
prompt: formatRubricPrompt({
|
|
263
254
|
output: ctx.output,
|
|
264
255
|
criteria: ctx.input.criteria,
|
|
265
256
|
}),
|
|
257
|
+
responseFormat: { type: "json" },
|
|
266
258
|
});
|
|
267
259
|
|
|
268
260
|
return parseRubricVerdict(verdict);
|
|
@@ -273,6 +265,7 @@ describeEval(
|
|
|
273
265
|
"app behavior",
|
|
274
266
|
{
|
|
275
267
|
harness: appHarness,
|
|
268
|
+
judgeHarness,
|
|
276
269
|
judges: [AppRubricJudge],
|
|
277
270
|
judgeThreshold: 0.75,
|
|
278
271
|
},
|
|
@@ -328,11 +321,26 @@ In practice, this is usually most useful for factuality, rubric, or grounded
|
|
|
328
321
|
answer checks:
|
|
329
322
|
|
|
330
323
|
```ts
|
|
331
|
-
|
|
324
|
+
import { openai } from "@ai-sdk/openai";
|
|
325
|
+
import { aiSdkJudgeHarness } from "@vitest-evals/harness-ai-sdk";
|
|
326
|
+
import { expect } from "vitest";
|
|
327
|
+
import { FactualityJudge } from "vitest-evals";
|
|
328
|
+
|
|
329
|
+
const judgeHarness = aiSdkJudgeHarness({
|
|
330
|
+
model: openai("gpt-4.1-mini"),
|
|
331
|
+
temperature: 0,
|
|
332
|
+
});
|
|
333
|
+
const factualityJudge = FactualityJudge({ judgeHarness });
|
|
334
|
+
|
|
335
|
+
await expect(result).toSatisfyJudge(factualityJudge, {
|
|
336
|
+
expected: "Paris is the capital of France.",
|
|
337
|
+
threshold: 0.6,
|
|
338
|
+
});
|
|
332
339
|
```
|
|
333
340
|
|
|
334
341
|
For lower-level cases, the matcher also accepts raw values and synthetic judge
|
|
335
|
-
context
|
|
342
|
+
context. Pass every context field the judge needs when the value did not come
|
|
343
|
+
from the eval fixture's `run(...)`:
|
|
336
344
|
|
|
337
345
|
```ts
|
|
338
346
|
await expect({ status: "approved" }).toSatisfyJudge(MyJudge, {
|
|
@@ -340,35 +348,75 @@ await expect({ status: "approved" }).toSatisfyJudge(MyJudge, {
|
|
|
340
348
|
});
|
|
341
349
|
```
|
|
342
350
|
|
|
343
|
-
Use
|
|
344
|
-
|
|
351
|
+
Use the built-in factuality judge when you want a model-backed factuality grade
|
|
352
|
+
over the normalized run:
|
|
345
353
|
|
|
346
354
|
```ts
|
|
347
|
-
import {
|
|
355
|
+
import { openai } from "@ai-sdk/openai";
|
|
356
|
+
import { aiSdkJudgeHarness } from "@vitest-evals/harness-ai-sdk";
|
|
357
|
+
import { FactualityJudge } from "vitest-evals";
|
|
348
358
|
|
|
349
|
-
const
|
|
350
|
-
"
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
359
|
+
export const judgeHarness = aiSdkJudgeHarness({
|
|
360
|
+
model: openai("gpt-4.1-mini"),
|
|
361
|
+
temperature: 0,
|
|
362
|
+
});
|
|
363
|
+
export const factualityJudge = FactualityJudge({ judgeHarness });
|
|
364
|
+
```
|
|
354
365
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
366
|
+
For custom judge providers, create a dedicated judge harness with the same
|
|
367
|
+
prompt contract:
|
|
368
|
+
|
|
369
|
+
```ts
|
|
370
|
+
import {
|
|
371
|
+
createJudgeHarness,
|
|
372
|
+
FactualityJudge,
|
|
373
|
+
type JudgeHarness,
|
|
374
|
+
} from "vitest-evals";
|
|
375
|
+
import { callJudgeModel } from "./judgeModel";
|
|
376
|
+
|
|
377
|
+
export const judgeHarness: JudgeHarness = createJudgeHarness({
|
|
378
|
+
name: "factuality-judge-model",
|
|
379
|
+
run: async ({ system, prompt }, { signal }) =>
|
|
380
|
+
callJudgeModel({ system, prompt, signal }),
|
|
381
|
+
});
|
|
382
|
+
|
|
383
|
+
export const factualityJudge = FactualityJudge({ judgeHarness });
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
Configure that judge harness once and reuse the same judge with any app
|
|
387
|
+
harness:
|
|
388
|
+
|
|
389
|
+
```ts
|
|
390
|
+
import { describeEval } from "vitest-evals";
|
|
391
|
+
import { aiSdkRefundHarness } from "./aiSdkRefundHarness";
|
|
392
|
+
import { piRefundHarness } from "./piRefundHarness";
|
|
393
|
+
import { factualityJudge } from "./sharedJudges";
|
|
394
|
+
|
|
395
|
+
describeEval("ai sdk refund agent", {
|
|
396
|
+
harness: aiSdkRefundHarness,
|
|
397
|
+
judges: [factualityJudge],
|
|
398
|
+
});
|
|
399
|
+
|
|
400
|
+
describeEval("pi refund agent", {
|
|
401
|
+
harness: piRefundHarness,
|
|
402
|
+
judges: [factualityJudge],
|
|
403
|
+
});
|
|
363
404
|
```
|
|
364
405
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
406
|
+
Use `createJudge(...)` for custom judges so reporter output gets a stable
|
|
407
|
+
label. Custom LLM-backed judges should provide their own judge prompt, rubric
|
|
408
|
+
text, and parser, then call `ctx.runJudge(...)` for the provider-specific model
|
|
409
|
+
request. Bind a reusable default with `createJudge({ name, judgeHarness,
|
|
410
|
+
assess })` or pass `judgeHarness` on the matcher or suite. Core curries the
|
|
411
|
+
matcher, judge, or explicit suite `judgeHarness` into `ctx.runJudge(...)` with the
|
|
412
|
+
current run's abort signal. Matcher options win over a judge default, and a
|
|
413
|
+
judge default wins over the suite default. Explicit matcher calls can also
|
|
414
|
+
reuse a single unambiguous judge-level harness from the suite's automatic
|
|
415
|
+
judges, but automatic judges do not inherit inferred harnesses from sibling
|
|
416
|
+
judges. That inference requires those judges to share the same judge harness
|
|
417
|
+
instance. Leave `judgeHarness` unset for suites that only use deterministic
|
|
418
|
+
judges. Calling `harness.run(...)` from a judge executes the application again,
|
|
419
|
+
so use that only when a second run is intentional.
|
|
372
420
|
|
|
373
421
|
For an `EvalHarnessRun` returned by fixture `run(...)`,
|
|
374
422
|
`toSatisfyJudge(...)` uses the run's typed `output` and reuses the registered
|
|
@@ -386,19 +434,4 @@ When a judge needs richer normalized context or the configured suite harness,
|
|
|
386
434
|
type it with `JudgeContext`.
|
|
387
435
|
|
|
388
436
|
When you only need deterministic contract checks, built-ins such as
|
|
389
|
-
`StructuredOutputJudge()` and `ToolCallJudge()` are still available.
|
|
390
|
-
documentation examples intentionally use factuality/rubric judges because those
|
|
391
|
-
match the product's LLM-as-a-judge direction.
|
|
392
|
-
|
|
393
|
-
## Legacy Compatibility
|
|
394
|
-
|
|
395
|
-
The root package is harness-first and judge-first. Legacy scorer-first suites
|
|
396
|
-
and `evaluate(...)` live under `vitest-evals/legacy`.
|
|
397
|
-
|
|
398
|
-
```ts
|
|
399
|
-
import {
|
|
400
|
-
describeEval,
|
|
401
|
-
StructuredOutputScorer,
|
|
402
|
-
ToolCallScorer,
|
|
403
|
-
} from "vitest-evals/legacy";
|
|
404
|
-
```
|
|
437
|
+
`StructuredOutputJudge()` and `ToolCallJudge()` are still available.
|