vitest-evals 0.9.0-beta.3 → 0.9.0-beta.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -97
- package/dist/harness.d.mts +59 -18
- package/dist/harness.d.ts +59 -18
- package/dist/harness.js +136 -1
- package/dist/harness.js.map +1 -1
- package/dist/harness.mjs +134 -1
- package/dist/harness.mjs.map +1 -1
- package/dist/index.d.mts +45 -29
- package/dist/index.d.ts +45 -29
- package/dist/index.js +293 -103
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +290 -102
- package/dist/index.mjs.map +1 -1
- package/dist/internal/matchers.d.mts +4 -0
- package/dist/internal/matchers.d.ts +4 -0
- package/dist/internal/matchers.js.map +1 -1
- package/dist/internal/matchers.mjs.map +1 -1
- package/dist/internal/structuredOutputScorer.js.map +1 -1
- package/dist/internal/structuredOutputScorer.mjs.map +1 -1
- package/dist/internal/toolCallScorer.js.map +1 -1
- package/dist/internal/toolCallScorer.mjs.map +1 -1
- package/dist/judges/index.d.mts +1 -1
- package/dist/judges/index.d.ts +1 -1
- package/dist/judges/index.js +37 -23
- package/dist/judges/index.js.map +1 -1
- package/dist/judges/index.mjs +37 -23
- package/dist/judges/index.mjs.map +1 -1
- package/dist/judges/structuredOutputJudge.d.mts +6 -3
- package/dist/judges/structuredOutputJudge.d.ts +6 -3
- package/dist/judges/structuredOutputJudge.js +11 -11
- package/dist/judges/structuredOutputJudge.js.map +1 -1
- package/dist/judges/structuredOutputJudge.mjs +11 -11
- package/dist/judges/structuredOutputJudge.mjs.map +1 -1
- package/dist/judges/toolCallJudge.d.mts +6 -3
- package/dist/judges/toolCallJudge.d.ts +6 -3
- package/dist/judges/toolCallJudge.js +26 -12
- package/dist/judges/toolCallJudge.js.map +1 -1
- package/dist/judges/toolCallJudge.mjs +26 -12
- package/dist/judges/toolCallJudge.mjs.map +1 -1
- package/dist/judges/types.d.mts +33 -16
- package/dist/judges/types.d.ts +33 -16
- package/dist/judges/types.js.map +1 -1
- package/dist/legacy/evaluate/index.d.mts +2 -0
- package/dist/legacy/evaluate/index.d.ts +2 -0
- package/dist/legacy/evaluate/index.js.map +1 -1
- package/dist/legacy/evaluate/index.mjs.map +1 -1
- package/dist/legacy/scorers/index.js.map +1 -1
- package/dist/legacy/scorers/index.mjs.map +1 -1
- package/dist/legacy/scorers/structuredOutputScorer.d.mts +2 -0
- package/dist/legacy/scorers/structuredOutputScorer.d.ts +2 -0
- package/dist/legacy/scorers/structuredOutputScorer.js.map +1 -1
- package/dist/legacy/scorers/structuredOutputScorer.mjs.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.d.mts +2 -0
- package/dist/legacy/scorers/toolCallScorer.d.ts +2 -0
- package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
- package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/legacy/scorers/utils.js.map +1 -1
- package/dist/legacy/scorers/utils.mjs.map +1 -1
- package/dist/legacy/shared.d.mts +6 -0
- package/dist/legacy/shared.d.ts +6 -0
- package/dist/legacy/shared.js.map +1 -1
- package/dist/legacy.d.mts +3 -0
- package/dist/legacy.d.ts +3 -0
- package/dist/legacy.js.map +1 -1
- package/dist/legacy.mjs.map +1 -1
- package/dist/replay.d.mts +7 -0
- package/dist/replay.d.ts +7 -0
- package/dist/replay.js.map +1 -1
- package/dist/replay.mjs.map +1 -1
- package/dist/reporter.d.mts +1 -0
- package/dist/reporter.d.ts +1 -0
- package/dist/reporter.js.map +1 -1
- package/dist/reporter.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -18,6 +18,12 @@ npm install -D @vitest-evals/harness-ai-sdk
|
|
|
18
18
|
npm install -D @vitest-evals/harness-openai-agents
|
|
19
19
|
```
|
|
20
20
|
|
|
21
|
+
For GitHub Actions summaries and annotations, install the JSON post-processor:
|
|
22
|
+
|
|
23
|
+
```sh
|
|
24
|
+
npm install -D @vitest-evals/github-reporter
|
|
25
|
+
```
|
|
26
|
+
|
|
21
27
|
## Core Model
|
|
22
28
|
|
|
23
29
|
- `describeEval(...)` binds exactly one harness to a suite
|
|
@@ -27,13 +33,18 @@ npm install -D @vitest-evals/harness-openai-agents
|
|
|
27
33
|
- the returned `result.output` is the app-facing value you assert on directly
|
|
28
34
|
- the returned `result.session` is the canonical JSON-serializable trace for
|
|
29
35
|
reporting, replay, tool assertions, and judges
|
|
30
|
-
- scenario-specific judge criteria can live in `
|
|
36
|
+
- scenario-specific judge criteria can live in `input`; use `metadata` for
|
|
31
37
|
per-run expectations or harness configuration that are not part of the
|
|
32
38
|
scenario payload
|
|
33
39
|
- suite-level `judges` are optional and run automatically after each `run(...)`
|
|
34
40
|
- suite-level `judgeThreshold` controls fail-on-score for those automatic judges
|
|
35
|
-
- every judge
|
|
36
|
-
|
|
41
|
+
- every judge is a named object with `assess(ctx)`
|
|
42
|
+
- every judge receives `JudgeContext` with typed `input`, typed `output`, the
|
|
43
|
+
normalized run/session, tool calls, and metadata; `output` is only optional
|
|
44
|
+
when the harness output type includes `undefined`
|
|
45
|
+
- judges own their prompt, rubric, model call, and parsing; use
|
|
46
|
+
`createJudge(...)` for custom judges and its provider-helper overload only
|
|
47
|
+
when multiple judges share setup
|
|
37
48
|
- explicit judge assertions use
|
|
38
49
|
`await expect(result).toSatisfyJudge(judge, context)`
|
|
39
50
|
|
|
@@ -43,25 +54,29 @@ npm install -D @vitest-evals/harness-openai-agents
|
|
|
43
54
|
import { expect } from "vitest";
|
|
44
55
|
import { piAiHarness } from "@vitest-evals/harness-pi-ai";
|
|
45
56
|
import {
|
|
57
|
+
createJudge,
|
|
46
58
|
describeEval,
|
|
47
|
-
namedJudge,
|
|
48
59
|
toolCalls,
|
|
49
60
|
type JudgeContext,
|
|
50
61
|
} from "vitest-evals";
|
|
51
|
-
import { createRefundAgent
|
|
62
|
+
import { createRefundAgent } from "../src/refundAgent";
|
|
52
63
|
|
|
53
64
|
type RefundEvalMetadata = {
|
|
54
65
|
expectedStatus: "approved" | "denied";
|
|
55
66
|
expectedTools: string[];
|
|
56
67
|
};
|
|
57
68
|
|
|
58
|
-
|
|
69
|
+
type RefundOutput = {
|
|
70
|
+
status: "approved" | "denied";
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
const FactualityJudge = createJudge(
|
|
59
74
|
"FactualityJudge",
|
|
60
75
|
async ({
|
|
61
76
|
input,
|
|
62
77
|
output,
|
|
63
78
|
metadata,
|
|
64
|
-
}: JudgeContext<string, RefundEvalMetadata>) => {
|
|
79
|
+
}: JudgeContext<string, RefundOutput, RefundEvalMetadata>) => {
|
|
65
80
|
const verdict = await judgeFactuality({
|
|
66
81
|
question: input,
|
|
67
82
|
answer: output,
|
|
@@ -81,8 +96,7 @@ describeEval(
|
|
|
81
96
|
"refund agent",
|
|
82
97
|
{
|
|
83
98
|
harness: piAiHarness({
|
|
84
|
-
|
|
85
|
-
prompt: judgePrompt,
|
|
99
|
+
agent: () => createRefundAgent(),
|
|
86
100
|
}),
|
|
87
101
|
judges: [FactualityJudge],
|
|
88
102
|
},
|
|
@@ -135,6 +149,24 @@ describeEval("refund agent", { harness }, (it) => {
|
|
|
135
149
|
});
|
|
136
150
|
```
|
|
137
151
|
|
|
152
|
+
## GitHub Actions Reporting
|
|
153
|
+
|
|
154
|
+
Use Vitest JSON as the eval report artifact. It preserves the `meta` field that
|
|
155
|
+
contains eval scores and normalized harness runs.
|
|
156
|
+
|
|
157
|
+
```sh
|
|
158
|
+
vitest run evals \
|
|
159
|
+
--reporter=vitest-evals/reporter \
|
|
160
|
+
--reporter=json \
|
|
161
|
+
--outputFile.json=vitest-results.json
|
|
162
|
+
|
|
163
|
+
vitest-evals-github-report
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
The GitHub reporter writes a job summary when `GITHUB_STEP_SUMMARY` is present,
|
|
167
|
+
emits short failure annotations in Actions, and can publish a separate Check Run
|
|
168
|
+
with `--check-run` when `checks: write` permission is configured.
|
|
169
|
+
|
|
138
170
|
## Existing Agents
|
|
139
171
|
|
|
140
172
|
For an existing agent, the intended contract is:
|
|
@@ -157,37 +189,31 @@ be inferred automatically. Treat low-level normalization callbacks as an escape
|
|
|
157
189
|
hatch, not part of the primary authoring path.
|
|
158
190
|
|
|
159
191
|
For OpenAI Agents SDK apps, use
|
|
160
|
-
`@vitest-evals/harness-openai-agents` with an existing `Agent` or
|
|
161
|
-
|
|
162
|
-
|
|
192
|
+
`@vitest-evals/harness-openai-agents` with an existing `Agent` or an `agent`
|
|
193
|
+
factory and a `Runner` or `runner` factory. The harness calls
|
|
194
|
+
`Runner.run(agent, input, options)` by default and exposes the same
|
|
163
195
|
normalization and replay hooks when the app needs a custom entrypoint or
|
|
164
196
|
structured domain output mapping.
|
|
165
197
|
|
|
166
198
|
## Custom App Harnesses
|
|
167
199
|
|
|
168
200
|
First-party harness packages are conveniences, not the only supported path. If
|
|
169
|
-
you need to test a full application flow,
|
|
170
|
-
through its normal entrypoint and
|
|
171
|
-
|
|
172
|
-
`
|
|
201
|
+
you need to test a full application flow, use `createHarness(...)` to run your
|
|
202
|
+
app through its normal entrypoint and return the app-facing output. Judges own
|
|
203
|
+
their prompt/rubric text separately from the system under test.
|
|
204
|
+
When generics are needed, use `createHarness<Input, Output, Metadata>(...)`.
|
|
173
205
|
|
|
174
206
|
```ts
|
|
175
207
|
import {
|
|
208
|
+
createHarness,
|
|
209
|
+
createJudge,
|
|
176
210
|
describeEval,
|
|
177
|
-
namedJudge,
|
|
178
211
|
type JudgeContext,
|
|
179
212
|
} from "vitest-evals";
|
|
180
|
-
import {
|
|
181
|
-
normalizeContent,
|
|
182
|
-
normalizeMetadata,
|
|
183
|
-
toJsonValue,
|
|
184
|
-
type Harness,
|
|
185
|
-
type HarnessRun,
|
|
186
|
-
} from "vitest-evals/harness";
|
|
187
213
|
|
|
188
214
|
type AppEvent = {
|
|
189
215
|
type: string;
|
|
190
|
-
payload: Record<string,
|
|
216
|
+
payload: Record<string, string>;
|
|
191
217
|
};
|
|
192
218
|
|
|
193
219
|
type AppEvalInput = {
|
|
@@ -199,65 +225,42 @@ type AppEvalInput = {
|
|
|
199
225
|
};
|
|
200
226
|
};
|
|
201
227
|
|
|
202
|
-
|
|
228
|
+
type AppEvalMetadata = Record<string, never>;
|
|
229
|
+
|
|
230
|
+
type AppOutput = {
|
|
231
|
+
replies: Array<{ text: string }>;
|
|
232
|
+
sideEffects: string[];
|
|
233
|
+
};
|
|
234
|
+
|
|
235
|
+
const appHarness = createHarness<AppEvalInput, AppOutput, AppEvalMetadata>({
|
|
203
236
|
name: "custom-app",
|
|
204
|
-
|
|
205
|
-
run: async (input, context): Promise<HarnessRun> => {
|
|
237
|
+
run: async ({ input, signal }) => {
|
|
206
238
|
const result = await replayAppEvents(input.events, {
|
|
207
|
-
signal
|
|
239
|
+
signal,
|
|
208
240
|
});
|
|
209
|
-
const output = {
|
|
210
|
-
replies: result.replies,
|
|
211
|
-
sideEffects: result.sideEffects,
|
|
212
|
-
};
|
|
213
241
|
|
|
214
242
|
return {
|
|
215
|
-
output:
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
})),
|
|
222
|
-
...result.replies.map((reply) => ({
|
|
223
|
-
role: "assistant" as const,
|
|
224
|
-
content: normalizeContent(reply.text),
|
|
225
|
-
metadata: normalizeMetadata({
|
|
226
|
-
target: reply.target,
|
|
227
|
-
}),
|
|
228
|
-
})),
|
|
229
|
-
],
|
|
230
|
-
outputText: result.replies.map((reply) => reply.text).join("\n\n"),
|
|
231
|
-
metadata: normalizeMetadata({
|
|
232
|
-
replyCount: result.replies.length,
|
|
233
|
-
}),
|
|
243
|
+
output: {
|
|
244
|
+
replies: result.replies,
|
|
245
|
+
sideEffects: result.sideEffects,
|
|
246
|
+
},
|
|
247
|
+
artifacts: {
|
|
248
|
+
replyCount: result.replies.length,
|
|
234
249
|
},
|
|
235
250
|
usage: {},
|
|
236
|
-
artifacts:
|
|
237
|
-
Object.keys(context.artifacts).length > 0
|
|
238
|
-
? context.artifacts
|
|
239
|
-
: undefined,
|
|
240
|
-
errors: [],
|
|
241
251
|
};
|
|
242
252
|
},
|
|
243
|
-
};
|
|
253
|
+
});
|
|
244
254
|
|
|
245
|
-
const AppRubricJudge =
|
|
255
|
+
const AppRubricJudge = createJudge(
|
|
246
256
|
"AppRubricJudge",
|
|
247
|
-
async (
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
const verdict = await ctx.harness.prompt(
|
|
251
|
-
formatRubricPrompt({
|
|
257
|
+
async (ctx: JudgeContext<AppEvalInput, AppOutput, AppEvalMetadata>) => {
|
|
258
|
+
const verdict = await promptJudgeModel({
|
|
259
|
+
prompt: formatRubricPrompt({
|
|
252
260
|
output: ctx.output,
|
|
253
|
-
criteria: ctx.
|
|
261
|
+
criteria: ctx.input.criteria,
|
|
254
262
|
}),
|
|
255
|
-
|
|
256
|
-
metadata: {
|
|
257
|
-
judge: "AppRubricJudge",
|
|
258
|
-
},
|
|
259
|
-
},
|
|
260
|
-
);
|
|
263
|
+
});
|
|
261
264
|
|
|
262
265
|
return parseRubricVerdict(verdict);
|
|
263
266
|
},
|
|
@@ -292,16 +295,16 @@ describeEval(
|
|
|
292
295
|
);
|
|
293
296
|
```
|
|
294
297
|
|
|
295
|
-
Use `Harness.run(...)` for the application under test
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
298
|
+
Use `Harness.run(...)` for the application under test. Calling
|
|
299
|
+
`ctx.harness.run(...)` from inside a judge runs the application a second time,
|
|
300
|
+
so reserve that for judges that intentionally need a second execution. Put
|
|
301
|
+
criteria on `input` when they are part of the scenario itself; use per-run
|
|
302
|
+
`metadata` for harness configuration or expectations that are not part of the
|
|
303
|
+
scenario payload. `createHarness(...)` builds a default user/assistant session
|
|
304
|
+
from `input` and typed `output`; return a full `HarnessRun` only when you need
|
|
305
|
+
exact session control.
|
|
303
306
|
|
|
304
|
-
Provider setup and rubric parsing stay in your
|
|
307
|
+
Provider setup and rubric parsing stay in your judge. The core
|
|
305
308
|
package only requires the judge to return a `JudgeResult` with a score and
|
|
306
309
|
optional metadata.
|
|
307
310
|
|
|
@@ -330,17 +333,17 @@ context:
|
|
|
330
333
|
|
|
331
334
|
```ts
|
|
332
335
|
await expect({ status: "approved" }).toSatisfyJudge(MyJudge, {
|
|
333
|
-
|
|
336
|
+
input: "Refund invoice inv_123",
|
|
334
337
|
});
|
|
335
338
|
```
|
|
336
339
|
|
|
337
|
-
|
|
338
|
-
|
|
340
|
+
Use `createJudge(...)` for custom judges so reporter output gets a stable
|
|
341
|
+
label:
|
|
339
342
|
|
|
340
343
|
```ts
|
|
341
|
-
import {
|
|
344
|
+
import { createJudge } from "vitest-evals";
|
|
342
345
|
|
|
343
|
-
const FactualityJudge =
|
|
346
|
+
const FactualityJudge = createJudge(
|
|
344
347
|
"FactualityJudge",
|
|
345
348
|
async ({ output }) => {
|
|
346
349
|
const answer = output;
|
|
@@ -356,21 +359,25 @@ const FactualityJudge = namedJudge(
|
|
|
356
359
|
);
|
|
357
360
|
```
|
|
358
361
|
|
|
359
|
-
LLM-backed judges
|
|
360
|
-
`
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
362
|
+
LLM-backed judges should provide their own judge prompt and rubric text.
|
|
363
|
+
`vitest-evals` does not prescribe a rubric schema, scoring scale, model
|
|
364
|
+
provider, or parser; those stay in the judge. When multiple judges share a
|
|
365
|
+
reusable judge-side provider helper, use the provider-helper overload of
|
|
366
|
+
`createJudge(...)` so run-scoped options such as abort signals stay curried.
|
|
367
|
+
Calling `harness.run(...)` from a judge executes the application again, so use
|
|
368
|
+
that only when a second run is intentional.
|
|
364
369
|
|
|
365
370
|
For an `EvalHarnessRun` returned by fixture `run(...)`,
|
|
366
|
-
`toSatisfyJudge(...)` uses the run's
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
371
|
+
`toSatisfyJudge(...)` uses the run's typed `output` and reuses the registered
|
|
372
|
+
input and metadata. It requires any custom judge params and rejects judges whose
|
|
373
|
+
output type cannot assess the received value. Inside an eval test,
|
|
374
|
+
matcher calls on registered output objects or session objects reuse that exact
|
|
375
|
+
run context when the value can be registered by reference, so
|
|
376
|
+
`expect(result.output).toSatisfyJudge(judge)` stays concise for structured
|
|
377
|
+
outputs. Other raw values fall back to the current test's most recent
|
|
378
|
+
`run(...)` context. For
|
|
372
379
|
manually-created runs or values outside an eval context, pass any required
|
|
373
|
-
`
|
|
380
|
+
`input`, `metadata`, or `harness` in matcher options. Structured or
|
|
374
381
|
programmatic result checks should usually assert on `result.output` directly.
|
|
375
382
|
When a judge needs richer normalized context or the configured suite harness,
|
|
376
383
|
type it with `JudgeContext`.
|
package/dist/harness.d.mts
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
|
+
/** Primitive scalar values allowed in normalized JSON-safe eval data. */
|
|
1
2
|
type JsonPrimitive = string | number | boolean | null;
|
|
3
|
+
/** JSON-safe value shape used by normalized sessions, artifacts, and errors. */
|
|
2
4
|
type JsonValue = JsonPrimitive | JsonValue[] | {
|
|
3
5
|
[key: string]: JsonValue;
|
|
4
6
|
};
|
|
7
|
+
/** Normalized record for one tool call observed during a harness run. */
|
|
5
8
|
type ToolCallRecord = {
|
|
6
9
|
id?: string;
|
|
7
10
|
name: string;
|
|
@@ -17,12 +20,14 @@ type ToolCallRecord = {
|
|
|
17
20
|
durationMs?: number;
|
|
18
21
|
metadata?: Record<string, JsonValue>;
|
|
19
22
|
};
|
|
23
|
+
/** Normalized message recorded in a harness session transcript. */
|
|
20
24
|
type NormalizedMessage = {
|
|
21
25
|
role: "system" | "user" | "assistant" | "tool";
|
|
22
26
|
content?: JsonValue;
|
|
23
27
|
toolCalls?: ToolCallRecord[];
|
|
24
28
|
metadata?: Record<string, JsonValue>;
|
|
25
29
|
};
|
|
30
|
+
/** Provider usage summary attached to a normalized harness run. */
|
|
26
31
|
type UsageSummary = {
|
|
27
32
|
provider?: string;
|
|
28
33
|
model?: string;
|
|
@@ -35,50 +40,82 @@ type UsageSummary = {
|
|
|
35
40
|
retries?: number;
|
|
36
41
|
metadata?: Record<string, JsonValue>;
|
|
37
42
|
};
|
|
43
|
+
/** Timing summary attached to a normalized harness run. */
|
|
38
44
|
type TimingSummary = {
|
|
39
45
|
totalMs?: number;
|
|
40
46
|
metadata?: Record<string, JsonValue>;
|
|
41
47
|
};
|
|
48
|
+
/** JSON-serializable transcript produced by the system under test. */
|
|
42
49
|
type NormalizedSession = {
|
|
43
50
|
messages: NormalizedMessage[];
|
|
44
|
-
outputText?: string;
|
|
45
51
|
provider?: string;
|
|
46
52
|
model?: string;
|
|
47
53
|
metadata?: Record<string, JsonValue>;
|
|
48
54
|
};
|
|
49
|
-
type
|
|
55
|
+
type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOutput ? {
|
|
56
|
+
output?: TOutput;
|
|
57
|
+
} : {
|
|
58
|
+
output: TOutput;
|
|
59
|
+
};
|
|
60
|
+
/** Normalized result returned by every harness execution. */
|
|
61
|
+
type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
|
|
50
62
|
session: NormalizedSession;
|
|
51
|
-
output?: JsonValue;
|
|
52
63
|
usage: UsageSummary;
|
|
53
64
|
timings?: TimingSummary;
|
|
54
65
|
artifacts?: Record<string, JsonValue>;
|
|
55
66
|
errors: Array<Record<string, JsonValue>>;
|
|
56
67
|
};
|
|
57
|
-
/**
|
|
58
|
-
type HarnessPromptOptions = {
|
|
59
|
-
system?: string;
|
|
60
|
-
metadata?: Record<string, JsonValue>;
|
|
61
|
-
};
|
|
62
|
-
/** Provider-agnostic prompt seam that judges can reuse from a harness. */
|
|
63
|
-
type HarnessPrompt = (input: string, options?: HarnessPromptOptions) => Promise<string>;
|
|
68
|
+
/** Error value with an attached partial or complete normalized harness run. */
|
|
64
69
|
type HarnessRunError = Error & {
|
|
65
70
|
vitestEvalsRun: HarnessRun;
|
|
66
71
|
};
|
|
72
|
+
/** Per-run metadata shape accepted by harnesses and eval tests. */
|
|
67
73
|
type HarnessMetadata = Record<string, unknown>;
|
|
74
|
+
/** Runtime context passed from the eval fixture into a harness run. */
|
|
68
75
|
type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
69
76
|
metadata: Readonly<TMetadata>;
|
|
70
|
-
task: {
|
|
71
|
-
meta: Record<string, unknown>;
|
|
72
|
-
};
|
|
73
77
|
signal?: AbortSignal;
|
|
74
78
|
artifacts: Record<string, JsonValue>;
|
|
75
79
|
setArtifact: (name: string, value: JsonValue) => void;
|
|
76
80
|
};
|
|
77
|
-
|
|
81
|
+
/** Adapter that executes the system under test and returns a normalized run. */
|
|
82
|
+
type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
83
|
+
name: string;
|
|
84
|
+
run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun<TOutput>>;
|
|
85
|
+
};
|
|
86
|
+
/** Value or promise accepted by lightweight harness callbacks. */
|
|
87
|
+
type MaybePromise<T> = T | Promise<T>;
|
|
88
|
+
/** Lightweight tool-call record accepted by `createHarness(...)` results. */
|
|
89
|
+
type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
|
|
90
|
+
arguments?: unknown;
|
|
91
|
+
result?: unknown;
|
|
92
|
+
error?: unknown;
|
|
93
|
+
metadata?: Record<string, unknown>;
|
|
94
|
+
};
|
|
95
|
+
/** Lightweight result shape normalized by `createHarness(...)`. */
|
|
96
|
+
type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
|
|
97
|
+
messages?: NormalizedMessage[];
|
|
98
|
+
toolCalls?: SimpleToolCallRecord[];
|
|
99
|
+
usage?: UsageSummary;
|
|
100
|
+
timings?: TimingSummary;
|
|
101
|
+
artifacts?: Record<string, unknown>;
|
|
102
|
+
metadata?: Record<string, unknown>;
|
|
103
|
+
errors?: unknown[];
|
|
104
|
+
};
|
|
105
|
+
/** Either a complete normalized run or a lightweight result to normalize. */
|
|
106
|
+
type HarnessResultLike<TOutput extends JsonValue | undefined = JsonValue | undefined> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;
|
|
107
|
+
/** Arguments passed to the `createHarness(...)` convenience callback. */
|
|
108
|
+
type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {
|
|
109
|
+
input: TInput;
|
|
110
|
+
metadata: Readonly<TMetadata>;
|
|
111
|
+
signal?: AbortSignal;
|
|
112
|
+
artifacts: HarnessContext<TMetadata>["artifacts"];
|
|
113
|
+
setArtifact: HarnessContext<TMetadata>["setArtifact"];
|
|
114
|
+
};
|
|
115
|
+
/** Options for creating a lightweight custom application harness. */
|
|
116
|
+
type CreateHarnessOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
78
117
|
name: string;
|
|
79
|
-
|
|
80
|
-
prompt: HarnessPrompt;
|
|
81
|
-
run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun>;
|
|
118
|
+
run: (args: CreateHarnessRunArgs<TInput, TMetadata>) => MaybePromise<HarnessResultLike<TOutput>>;
|
|
82
119
|
};
|
|
83
120
|
/** Returns true when a value exposes a callable method with the given name. */
|
|
84
121
|
declare function hasCallableMethod(value: unknown, methodName: string): boolean;
|
|
@@ -90,6 +127,10 @@ declare function normalizeRecord(value: Record<string, unknown>): Record<string,
|
|
|
90
127
|
declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
|
|
91
128
|
/** Converts arbitrary content into the JSON-safe message content shape. */
|
|
92
129
|
declare function normalizeContent(value: unknown): JsonValue;
|
|
130
|
+
/** Creates a harness from the common "run app code and return output" shape. */
|
|
131
|
+
declare function createHarness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata>(options: CreateHarnessOptions<TInput, TOutput, TMetadata>): Harness<TInput, TOutput, TMetadata>;
|
|
132
|
+
/** Normalizes a lightweight harness result into the reporter-facing run shape. */
|
|
133
|
+
declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
|
|
93
134
|
/** Flattens every recorded tool call from a normalized session. */
|
|
94
135
|
declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
|
|
95
136
|
/** Filters normalized session messages by role. */
|
|
@@ -115,4 +156,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
|
|
|
115
156
|
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
116
157
|
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
117
158
|
|
|
118
|
-
export { type
|
|
159
|
+
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type SimpleHarnessResult, type SimpleToolCallRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createHarness, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
|
package/dist/harness.d.ts
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
1
|
+
/** Primitive scalar values allowed in normalized JSON-safe eval data. */
|
|
1
2
|
type JsonPrimitive = string | number | boolean | null;
|
|
3
|
+
/** JSON-safe value shape used by normalized sessions, artifacts, and errors. */
|
|
2
4
|
type JsonValue = JsonPrimitive | JsonValue[] | {
|
|
3
5
|
[key: string]: JsonValue;
|
|
4
6
|
};
|
|
7
|
+
/** Normalized record for one tool call observed during a harness run. */
|
|
5
8
|
type ToolCallRecord = {
|
|
6
9
|
id?: string;
|
|
7
10
|
name: string;
|
|
@@ -17,12 +20,14 @@ type ToolCallRecord = {
|
|
|
17
20
|
durationMs?: number;
|
|
18
21
|
metadata?: Record<string, JsonValue>;
|
|
19
22
|
};
|
|
23
|
+
/** Normalized message recorded in a harness session transcript. */
|
|
20
24
|
type NormalizedMessage = {
|
|
21
25
|
role: "system" | "user" | "assistant" | "tool";
|
|
22
26
|
content?: JsonValue;
|
|
23
27
|
toolCalls?: ToolCallRecord[];
|
|
24
28
|
metadata?: Record<string, JsonValue>;
|
|
25
29
|
};
|
|
30
|
+
/** Provider usage summary attached to a normalized harness run. */
|
|
26
31
|
type UsageSummary = {
|
|
27
32
|
provider?: string;
|
|
28
33
|
model?: string;
|
|
@@ -35,50 +40,82 @@ type UsageSummary = {
|
|
|
35
40
|
retries?: number;
|
|
36
41
|
metadata?: Record<string, JsonValue>;
|
|
37
42
|
};
|
|
43
|
+
/** Timing summary attached to a normalized harness run. */
|
|
38
44
|
type TimingSummary = {
|
|
39
45
|
totalMs?: number;
|
|
40
46
|
metadata?: Record<string, JsonValue>;
|
|
41
47
|
};
|
|
48
|
+
/** JSON-serializable transcript produced by the system under test. */
|
|
42
49
|
type NormalizedSession = {
|
|
43
50
|
messages: NormalizedMessage[];
|
|
44
|
-
outputText?: string;
|
|
45
51
|
provider?: string;
|
|
46
52
|
model?: string;
|
|
47
53
|
metadata?: Record<string, JsonValue>;
|
|
48
54
|
};
|
|
49
|
-
type
|
|
55
|
+
type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOutput ? {
|
|
56
|
+
output?: TOutput;
|
|
57
|
+
} : {
|
|
58
|
+
output: TOutput;
|
|
59
|
+
};
|
|
60
|
+
/** Normalized result returned by every harness execution. */
|
|
61
|
+
type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
|
|
50
62
|
session: NormalizedSession;
|
|
51
|
-
output?: JsonValue;
|
|
52
63
|
usage: UsageSummary;
|
|
53
64
|
timings?: TimingSummary;
|
|
54
65
|
artifacts?: Record<string, JsonValue>;
|
|
55
66
|
errors: Array<Record<string, JsonValue>>;
|
|
56
67
|
};
|
|
57
|
-
/**
|
|
58
|
-
type HarnessPromptOptions = {
|
|
59
|
-
system?: string;
|
|
60
|
-
metadata?: Record<string, JsonValue>;
|
|
61
|
-
};
|
|
62
|
-
/** Provider-agnostic prompt seam that judges can reuse from a harness. */
|
|
63
|
-
type HarnessPrompt = (input: string, options?: HarnessPromptOptions) => Promise<string>;
|
|
68
|
+
/** Error value with an attached partial or complete normalized harness run. */
|
|
64
69
|
type HarnessRunError = Error & {
|
|
65
70
|
vitestEvalsRun: HarnessRun;
|
|
66
71
|
};
|
|
72
|
+
/** Per-run metadata shape accepted by harnesses and eval tests. */
|
|
67
73
|
type HarnessMetadata = Record<string, unknown>;
|
|
74
|
+
/** Runtime context passed from the eval fixture into a harness run. */
|
|
68
75
|
type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
69
76
|
metadata: Readonly<TMetadata>;
|
|
70
|
-
task: {
|
|
71
|
-
meta: Record<string, unknown>;
|
|
72
|
-
};
|
|
73
77
|
signal?: AbortSignal;
|
|
74
78
|
artifacts: Record<string, JsonValue>;
|
|
75
79
|
setArtifact: (name: string, value: JsonValue) => void;
|
|
76
80
|
};
|
|
77
|
-
|
|
81
|
+
/** Adapter that executes the system under test and returns a normalized run. */
|
|
82
|
+
type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
83
|
+
name: string;
|
|
84
|
+
run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun<TOutput>>;
|
|
85
|
+
};
|
|
86
|
+
/** Value or promise accepted by lightweight harness callbacks. */
|
|
87
|
+
type MaybePromise<T> = T | Promise<T>;
|
|
88
|
+
/** Lightweight tool-call record accepted by `createHarness(...)` results. */
|
|
89
|
+
type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
|
|
90
|
+
arguments?: unknown;
|
|
91
|
+
result?: unknown;
|
|
92
|
+
error?: unknown;
|
|
93
|
+
metadata?: Record<string, unknown>;
|
|
94
|
+
};
|
|
95
|
+
/** Lightweight result shape normalized by `createHarness(...)`. */
|
|
96
|
+
type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
|
|
97
|
+
messages?: NormalizedMessage[];
|
|
98
|
+
toolCalls?: SimpleToolCallRecord[];
|
|
99
|
+
usage?: UsageSummary;
|
|
100
|
+
timings?: TimingSummary;
|
|
101
|
+
artifacts?: Record<string, unknown>;
|
|
102
|
+
metadata?: Record<string, unknown>;
|
|
103
|
+
errors?: unknown[];
|
|
104
|
+
};
|
|
105
|
+
/** Either a complete normalized run or a lightweight result to normalize. */
|
|
106
|
+
type HarnessResultLike<TOutput extends JsonValue | undefined = JsonValue | undefined> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;
|
|
107
|
+
/** Arguments passed to the `createHarness(...)` convenience callback. */
|
|
108
|
+
type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {
|
|
109
|
+
input: TInput;
|
|
110
|
+
metadata: Readonly<TMetadata>;
|
|
111
|
+
signal?: AbortSignal;
|
|
112
|
+
artifacts: HarnessContext<TMetadata>["artifacts"];
|
|
113
|
+
setArtifact: HarnessContext<TMetadata>["setArtifact"];
|
|
114
|
+
};
|
|
115
|
+
/** Options for creating a lightweight custom application harness. */
|
|
116
|
+
type CreateHarnessOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
|
|
78
117
|
name: string;
|
|
79
|
-
|
|
80
|
-
prompt: HarnessPrompt;
|
|
81
|
-
run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun>;
|
|
118
|
+
run: (args: CreateHarnessRunArgs<TInput, TMetadata>) => MaybePromise<HarnessResultLike<TOutput>>;
|
|
82
119
|
};
|
|
83
120
|
/** Returns true when a value exposes a callable method with the given name. */
|
|
84
121
|
declare function hasCallableMethod(value: unknown, methodName: string): boolean;
|
|
@@ -90,6 +127,10 @@ declare function normalizeRecord(value: Record<string, unknown>): Record<string,
|
|
|
90
127
|
declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
|
|
91
128
|
/** Converts arbitrary content into the JSON-safe message content shape. */
|
|
92
129
|
declare function normalizeContent(value: unknown): JsonValue;
|
|
130
|
+
/** Creates a harness from the common "run app code and return output" shape. */
|
|
131
|
+
declare function createHarness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata>(options: CreateHarnessOptions<TInput, TOutput, TMetadata>): Harness<TInput, TOutput, TMetadata>;
|
|
132
|
+
/** Normalizes a lightweight harness result into the reporter-facing run shape. */
|
|
133
|
+
declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
|
|
93
134
|
/** Flattens every recorded tool call from a normalized session. */
|
|
94
135
|
declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
|
|
95
136
|
/** Filters normalized session messages by role. */
|
|
@@ -115,4 +156,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
|
|
|
115
156
|
/** Serializes an arbitrary thrown value into the normalized error shape. */
|
|
116
157
|
declare function serializeError(error: unknown): Record<string, JsonValue>;
|
|
117
158
|
|
|
118
|
-
export { type
|
|
159
|
+
export { type CreateHarnessOptions, type CreateHarnessRunArgs, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type SimpleHarnessResult, type SimpleToolCallRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createHarness, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
|