vitest-evals 0.9.0-beta.1 → 0.9.0-beta.2
This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their public registry.
- package/README.md +168 -1
- package/dist/judges/types.d.mts +3 -1
- package/dist/judges/types.d.ts +3 -1
- package/dist/judges/types.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -14,6 +14,8 @@ Install a first-party harness package for the runtime you want to test:
|
|
|
14
14
|
npm install -D @vitest-evals/harness-pi-ai
|
|
15
15
|
# or
|
|
16
16
|
npm install -D @vitest-evals/harness-ai-sdk
|
|
17
|
+
# or
|
|
18
|
+
npm install -D @vitest-evals/harness-openai-agents
|
|
17
19
|
```
|
|
18
20
|
|
|
19
21
|
## Core Model
|
|
@@ -25,7 +27,9 @@ npm install -D @vitest-evals/harness-ai-sdk
|
|
|
25
27
|
- the returned `result.output` is the app-facing value you assert on directly
|
|
26
28
|
- the returned `result.session` is the canonical JSON-serializable trace for
|
|
27
29
|
reporting, replay, tool assertions, and judges
|
|
28
|
-
-
|
|
30
|
+
- scenario-specific judge criteria can live in `inputValue`; use `metadata` for
|
|
31
|
+
per-run expectations or harness configuration that are not part of the
|
|
32
|
+
scenario payload
|
|
29
33
|
- suite-level `judges` are optional and run automatically after each `run(...)`
|
|
30
34
|
- suite-level `judgeThreshold` controls fail-on-score for those automatic judges
|
|
31
35
|
- every judge receives `JudgeContext`, including the configured `harness` with
|
|
@@ -144,10 +148,173 @@ The harness owns normalization, diagnostics, tool capture, replay plumbing, and
|
|
|
144
148
|
reporter-facing artifacts. Your app just needs one runtime seam where those
|
|
145
149
|
wrapped pieces can be injected.
|
|
146
150
|
|
|
151
|
+
Replay opt-in belongs on the harness, via `toolReplay`, while replay mode and
|
|
152
|
+
recording directory can live in Vitest environment config. Tool definitions
|
|
153
|
+
should stay free of VCR policy.
|
|
154
|
+
|
|
147
155
|
For the Pi-specific harness, output/session/usage normalization should usually
|
|
148
156
|
be inferred automatically. Treat low-level normalization callbacks as an escape
|
|
149
157
|
hatch, not part of the primary authoring path.
|
|
150
158
|
|
|
159
|
+
For OpenAI Agents SDK apps, use
|
|
160
|
+
`@vitest-evals/harness-openai-agents` with an existing `Agent` or
|
|
161
|
+
`createAgent()` factory and a `Runner` / `createRunner()` callback. The harness
|
|
162
|
+
calls `Runner.run(agent, input, options)` by default and exposes the same
|
|
163
|
+
normalization and replay hooks when the app needs a custom entrypoint or
|
|
164
|
+
structured domain output mapping.
|
|
165
|
+
|
|
166
|
+
## Custom App Harnesses
|
|
167
|
+
|
|
168
|
+
First-party harness packages are conveniences, not the only supported path. If
|
|
169
|
+
you need to test a full application flow, define a harness that runs your app
|
|
170
|
+
through its normal entrypoint and returns a normalized `HarnessRun`. The same
|
|
171
|
+
harness should also expose `prompt`, which LLM-backed judges can reuse through
|
|
172
|
+
`JudgeContext.harness.prompt`.
|
|
173
|
+
|
|
174
|
+
```ts
|
|
175
|
+
import {
|
|
176
|
+
describeEval,
|
|
177
|
+
namedJudge,
|
|
178
|
+
type JudgeContext,
|
|
179
|
+
} from "vitest-evals";
|
|
180
|
+
import {
|
|
181
|
+
normalizeContent,
|
|
182
|
+
normalizeMetadata,
|
|
183
|
+
toJsonValue,
|
|
184
|
+
type Harness,
|
|
185
|
+
type HarnessRun,
|
|
186
|
+
} from "vitest-evals/harness";
|
|
187
|
+
|
|
188
|
+
type AppEvent = {
|
|
189
|
+
type: string;
|
|
190
|
+
payload: Record<string, unknown>;
|
|
191
|
+
};
|
|
192
|
+
|
|
193
|
+
type AppEvalInput = {
|
|
194
|
+
events: AppEvent[];
|
|
195
|
+
criteria: {
|
|
196
|
+
contract: string;
|
|
197
|
+
pass: string[];
|
|
198
|
+
fail?: string[];
|
|
199
|
+
};
|
|
200
|
+
};
|
|
201
|
+
|
|
202
|
+
const appHarness: Harness<AppEvalInput> = {
|
|
203
|
+
name: "custom-app",
|
|
204
|
+
prompt: (input, options) => promptJudgeModel(input, options),
|
|
205
|
+
run: async (input, context): Promise<HarnessRun> => {
|
|
206
|
+
const result = await replayAppEvents(input.events, {
|
|
207
|
+
signal: context.signal,
|
|
208
|
+
});
|
|
209
|
+
const output = {
|
|
210
|
+
replies: result.replies,
|
|
211
|
+
sideEffects: result.sideEffects,
|
|
212
|
+
};
|
|
213
|
+
|
|
214
|
+
return {
|
|
215
|
+
output: toJsonValue(output),
|
|
216
|
+
session: {
|
|
217
|
+
messages: [
|
|
218
|
+
...input.events.map((event) => ({
|
|
219
|
+
role: "user" as const,
|
|
220
|
+
content: normalizeContent(event),
|
|
221
|
+
})),
|
|
222
|
+
...result.replies.map((reply) => ({
|
|
223
|
+
role: "assistant" as const,
|
|
224
|
+
content: normalizeContent(reply.text),
|
|
225
|
+
metadata: normalizeMetadata({
|
|
226
|
+
target: reply.target,
|
|
227
|
+
}),
|
|
228
|
+
})),
|
|
229
|
+
],
|
|
230
|
+
outputText: result.replies.map((reply) => reply.text).join("\n\n"),
|
|
231
|
+
metadata: normalizeMetadata({
|
|
232
|
+
replyCount: result.replies.length,
|
|
233
|
+
}),
|
|
234
|
+
},
|
|
235
|
+
usage: {},
|
|
236
|
+
artifacts:
|
|
237
|
+
Object.keys(context.artifacts).length > 0
|
|
238
|
+
? context.artifacts
|
|
239
|
+
: undefined,
|
|
240
|
+
errors: [],
|
|
241
|
+
};
|
|
242
|
+
},
|
|
243
|
+
};
|
|
244
|
+
|
|
245
|
+
const AppRubricJudge = namedJudge(
|
|
246
|
+
"AppRubricJudge",
|
|
247
|
+
async (
|
|
248
|
+
ctx: JudgeContext<AppEvalInput, Record<string, unknown>, typeof appHarness>,
|
|
249
|
+
) => {
|
|
250
|
+
const verdict = await ctx.harness.prompt(
|
|
251
|
+
formatRubricPrompt({
|
|
252
|
+
output: ctx.output,
|
|
253
|
+
criteria: ctx.inputValue.criteria,
|
|
254
|
+
}),
|
|
255
|
+
{
|
|
256
|
+
metadata: {
|
|
257
|
+
judge: "AppRubricJudge",
|
|
258
|
+
},
|
|
259
|
+
},
|
|
260
|
+
);
|
|
261
|
+
|
|
262
|
+
return parseRubricVerdict(verdict);
|
|
263
|
+
},
|
|
264
|
+
);
|
|
265
|
+
|
|
266
|
+
describeEval(
|
|
267
|
+
"app behavior",
|
|
268
|
+
{
|
|
269
|
+
harness: appHarness,
|
|
270
|
+
judges: [AppRubricJudge],
|
|
271
|
+
judgeThreshold: 0.75,
|
|
272
|
+
},
|
|
273
|
+
(it) => {
|
|
274
|
+
it("handles an event flow", async ({ run }) => {
|
|
275
|
+
await run({
|
|
276
|
+
events: [
|
|
277
|
+
{
|
|
278
|
+
type: "message.created",
|
|
279
|
+
payload: {
|
|
280
|
+
text: "Summarize the current incident.",
|
|
281
|
+
},
|
|
282
|
+
},
|
|
283
|
+
],
|
|
284
|
+
criteria: {
|
|
285
|
+
contract: "The app posts one user-visible incident summary.",
|
|
286
|
+
pass: ["The reply names the incident status."],
|
|
287
|
+
fail: ["The reply exposes internal metadata."],
|
|
288
|
+
},
|
|
289
|
+
});
|
|
290
|
+
});
|
|
291
|
+
},
|
|
292
|
+
);
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
Use `Harness.run(...)` for the application under test and `Harness.prompt(...)`
|
|
296
|
+
for judge model calls. Calling `ctx.harness.run(...)` from inside a judge runs
|
|
297
|
+
the application a second time, so reserve that for judges that intentionally
|
|
298
|
+
need a second execution. Put criteria on `inputValue` when they are part of the
|
|
299
|
+
scenario itself; use per-run `metadata` for harness configuration or
|
|
300
|
+
expectations that are not part of the scenario payload. `session.outputText` is
|
|
301
|
+
the canonical text sent to judges, so define it deliberately when your app
|
|
302
|
+
returns structured artifacts.
|
|
303
|
+
|
|
304
|
+
Provider setup and rubric parsing stay in your harness and judge. The core
|
|
305
|
+
package only requires the judge to return a `JudgeResult` with a score and
|
|
306
|
+
optional metadata.
|
|
307
|
+
|
|
308
|
+
Automatic suite-level judges are a good fit when every `run(...)` should get
|
|
309
|
+
the same scoring. For cases where only some runs need an LLM judge, keep the
|
|
310
|
+
suite free of automatic judges and use an explicit matcher:
|
|
311
|
+
|
|
312
|
+
```ts
|
|
313
|
+
await expect(result).toSatisfyJudge(AppRubricJudge, {
|
|
314
|
+
threshold: 0.75,
|
|
315
|
+
});
|
|
316
|
+
```
|
|
317
|
+
|
|
151
318
|
## Judge Matchers
|
|
152
319
|
|
|
153
320
|
Use the matcher when a judge should behave like a normal Vitest assertion.
|
package/dist/judges/types.d.mts
CHANGED
|
@@ -11,7 +11,9 @@ type JudgeResult = {
|
|
|
11
11
|
/**
|
|
12
12
|
* Full normalized context passed to every judge.
|
|
13
13
|
*
|
|
14
|
-
*
|
|
14
|
+
* Scenario-owned judge criteria should live on `inputValue`. Use `metadata`
|
|
15
|
+
* for per-run expectations or harness configuration that are not part of the
|
|
16
|
+
* scenario payload.
|
|
15
17
|
*/
|
|
16
18
|
interface JudgeContext<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TMetadata> | undefined = Harness<TInput, TMetadata> | undefined> {
|
|
17
19
|
/** Canonical text input passed to judges for plain prompt evaluation. */
|
package/dist/judges/types.d.ts
CHANGED
|
@@ -11,7 +11,9 @@ type JudgeResult = {
|
|
|
11
11
|
/**
|
|
12
12
|
* Full normalized context passed to every judge.
|
|
13
13
|
*
|
|
14
|
-
*
|
|
14
|
+
* Scenario-owned judge criteria should live on `inputValue`. Use `metadata`
|
|
15
|
+
* for per-run expectations or harness configuration that are not part of the
|
|
16
|
+
* scenario payload.
|
|
15
17
|
*/
|
|
16
18
|
interface JudgeContext<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, THarness extends Harness<TInput, TMetadata> | undefined = Harness<TInput, TMetadata> | undefined> {
|
|
17
19
|
/** Canonical text input passed to judges for plain prompt evaluation. */
|
package/dist/judges/types.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../../src/judges/types.ts"],"sourcesContent":["import type {\n Harness,\n HarnessMetadata,\n HarnessRun,\n ToolCallRecord,\n} from \"../harness\";\n\n/** Score payload returned by a judge. */\nexport type JudgeResult = {\n score: number | null;\n metadata?: {\n rationale?: string;\n output?: unknown;\n } & Record<string, unknown>;\n};\n\n/**\n * Full normalized context passed to every judge.\n *\n *
|
|
1
|
+
{"version":3,"sources":["../../src/judges/types.ts"],"sourcesContent":["import type {\n Harness,\n HarnessMetadata,\n HarnessRun,\n ToolCallRecord,\n} from \"../harness\";\n\n/** Score payload returned by a judge. */\nexport type JudgeResult = {\n score: number | null;\n metadata?: {\n rationale?: string;\n output?: unknown;\n } & Record<string, unknown>;\n};\n\n/**\n * Full normalized context passed to every judge.\n *\n * Scenario-owned judge criteria should live on `inputValue`. Use `metadata`\n * for per-run expectations or harness configuration that are not part of the\n * scenario payload.\n */\nexport interface JudgeContext<\n TInput = unknown,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n THarness extends Harness<TInput, TMetadata> | undefined =\n | Harness<TInput, TMetadata>\n | undefined,\n> {\n /** Canonical text input passed to judges for plain prompt evaluation. */\n input: string;\n /** Canonical text response passed to judges for plain output evaluation. */\n output: string;\n /** Original non-string input value when the judge needs more than `input`. */\n inputValue: TInput;\n toolCalls: ToolCallRecord[];\n metadata: Readonly<TMetadata>;\n run: HarnessRun;\n session: HarnessRun[\"session\"];\n /** Harness associated with this judge context. */\n harness: THarness;\n}\n\n/** Convenience helper for judges that accept explicit per-call params. */\nexport type JudgeOptions<\n TParams extends Record<string, unknown> = Record<string, never>,\n TInput = unknown,\n TMetadata extends HarnessMetadata = HarnessMetadata,\n THarness extends Harness<TInput, TMetadata> | undefined =\n | Harness<TInput, TMetadata>\n | undefined,\n> = JudgeContext<TInput, TMetadata, THarness> & TParams;\n\n/** Judge function over the normalized judge context. */\nexport type JudgeFn<\n TOptions extends JudgeContext<any, any, any> = JudgeContext,\n> = (opts: TOptions) => Promise<JudgeResult> | JudgeResult;\n"],"mappings":";;;;;;;;;;;;;;;;AAAA;AAAA;","names":[]}
|