vitest-evals 0.9.0-beta.3 → 0.9.0-beta.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/README.md +104 -97
  2. package/dist/harness.d.mts +59 -18
  3. package/dist/harness.d.ts +59 -18
  4. package/dist/harness.js +136 -1
  5. package/dist/harness.js.map +1 -1
  6. package/dist/harness.mjs +134 -1
  7. package/dist/harness.mjs.map +1 -1
  8. package/dist/index.d.mts +45 -29
  9. package/dist/index.d.ts +45 -29
  10. package/dist/index.js +293 -103
  11. package/dist/index.js.map +1 -1
  12. package/dist/index.mjs +290 -102
  13. package/dist/index.mjs.map +1 -1
  14. package/dist/internal/matchers.d.mts +4 -0
  15. package/dist/internal/matchers.d.ts +4 -0
  16. package/dist/internal/matchers.js.map +1 -1
  17. package/dist/internal/matchers.mjs.map +1 -1
  18. package/dist/internal/structuredOutputScorer.js.map +1 -1
  19. package/dist/internal/structuredOutputScorer.mjs.map +1 -1
  20. package/dist/internal/toolCallScorer.js.map +1 -1
  21. package/dist/internal/toolCallScorer.mjs.map +1 -1
  22. package/dist/judges/index.d.mts +1 -1
  23. package/dist/judges/index.d.ts +1 -1
  24. package/dist/judges/index.js +37 -23
  25. package/dist/judges/index.js.map +1 -1
  26. package/dist/judges/index.mjs +37 -23
  27. package/dist/judges/index.mjs.map +1 -1
  28. package/dist/judges/structuredOutputJudge.d.mts +6 -3
  29. package/dist/judges/structuredOutputJudge.d.ts +6 -3
  30. package/dist/judges/structuredOutputJudge.js +11 -11
  31. package/dist/judges/structuredOutputJudge.js.map +1 -1
  32. package/dist/judges/structuredOutputJudge.mjs +11 -11
  33. package/dist/judges/structuredOutputJudge.mjs.map +1 -1
  34. package/dist/judges/toolCallJudge.d.mts +6 -3
  35. package/dist/judges/toolCallJudge.d.ts +6 -3
  36. package/dist/judges/toolCallJudge.js +26 -12
  37. package/dist/judges/toolCallJudge.js.map +1 -1
  38. package/dist/judges/toolCallJudge.mjs +26 -12
  39. package/dist/judges/toolCallJudge.mjs.map +1 -1
  40. package/dist/judges/types.d.mts +33 -16
  41. package/dist/judges/types.d.ts +33 -16
  42. package/dist/judges/types.js.map +1 -1
  43. package/dist/legacy/evaluate/index.d.mts +2 -0
  44. package/dist/legacy/evaluate/index.d.ts +2 -0
  45. package/dist/legacy/evaluate/index.js.map +1 -1
  46. package/dist/legacy/evaluate/index.mjs.map +1 -1
  47. package/dist/legacy/scorers/index.js.map +1 -1
  48. package/dist/legacy/scorers/index.mjs.map +1 -1
  49. package/dist/legacy/scorers/structuredOutputScorer.d.mts +2 -0
  50. package/dist/legacy/scorers/structuredOutputScorer.d.ts +2 -0
  51. package/dist/legacy/scorers/structuredOutputScorer.js.map +1 -1
  52. package/dist/legacy/scorers/structuredOutputScorer.mjs.map +1 -1
  53. package/dist/legacy/scorers/toolCallScorer.d.mts +2 -0
  54. package/dist/legacy/scorers/toolCallScorer.d.ts +2 -0
  55. package/dist/legacy/scorers/toolCallScorer.js.map +1 -1
  56. package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -1
  57. package/dist/legacy/scorers/utils.js.map +1 -1
  58. package/dist/legacy/scorers/utils.mjs.map +1 -1
  59. package/dist/legacy/shared.d.mts +6 -0
  60. package/dist/legacy/shared.d.ts +6 -0
  61. package/dist/legacy/shared.js.map +1 -1
  62. package/dist/legacy.d.mts +3 -0
  63. package/dist/legacy.d.ts +3 -0
  64. package/dist/legacy.js.map +1 -1
  65. package/dist/legacy.mjs.map +1 -1
  66. package/dist/replay.d.mts +7 -0
  67. package/dist/replay.d.ts +7 -0
  68. package/dist/replay.js.map +1 -1
  69. package/dist/replay.mjs.map +1 -1
  70. package/dist/reporter.d.mts +1 -0
  71. package/dist/reporter.d.ts +1 -0
  72. package/dist/reporter.js.map +1 -1
  73. package/dist/reporter.mjs.map +1 -1
  74. package/package.json +1 -1
package/README.md CHANGED
@@ -18,6 +18,12 @@ npm install -D @vitest-evals/harness-ai-sdk
18
18
  npm install -D @vitest-evals/harness-openai-agents
19
19
  ```
20
20
 
21
+ For GitHub Actions summaries and annotations, install the JSON post-processor:
22
+
23
+ ```sh
24
+ npm install -D @vitest-evals/github-reporter
25
+ ```
26
+
21
27
  ## Core Model
22
28
 
23
29
  - `describeEval(...)` binds exactly one harness to a suite
@@ -27,13 +33,18 @@ npm install -D @vitest-evals/harness-openai-agents
27
33
  - the returned `result.output` is the app-facing value you assert on directly
28
34
  - the returned `result.session` is the canonical JSON-serializable trace for
29
35
  reporting, replay, tool assertions, and judges
30
- - scenario-specific judge criteria can live in `inputValue`; use `metadata` for
36
+ - scenario-specific judge criteria can live in `input`; use `metadata` for
31
37
  per-run expectations or harness configuration that are not part of the
32
38
  scenario payload
33
39
  - suite-level `judges` are optional and run automatically after each `run(...)`
34
40
  - suite-level `judgeThreshold` controls fail-on-score for those automatic judges
35
- - every judge receives `JudgeContext`, including the configured `harness` with
36
- its required `prompt` function
41
+ - every judge is a named object with `assess(ctx)`
42
+ - every judge receives `JudgeContext` with typed `input`, typed `output`, the
43
+ normalized run/session, tool calls, and metadata; `output` is only optional
44
+ when the harness output type includes `undefined`
45
+ - judges own their prompt, rubric, model call, and parsing; use
46
+ `createJudge(...)` for custom judges and its provider-helper overload only
47
+ when multiple judges share setup
37
48
  - explicit judge assertions use
38
49
  `await expect(result).toSatisfyJudge(judge, context)`
39
50
 
@@ -43,25 +54,29 @@ npm install -D @vitest-evals/harness-openai-agents
43
54
  import { expect } from "vitest";
44
55
  import { piAiHarness } from "@vitest-evals/harness-pi-ai";
45
56
  import {
57
+ createJudge,
46
58
  describeEval,
47
- namedJudge,
48
59
  toolCalls,
49
60
  type JudgeContext,
50
61
  } from "vitest-evals";
51
- import { createRefundAgent, judgePrompt } from "../src/refundAgent";
62
+ import { createRefundAgent } from "../src/refundAgent";
52
63
 
53
64
  type RefundEvalMetadata = {
54
65
  expectedStatus: "approved" | "denied";
55
66
  expectedTools: string[];
56
67
  };
57
68
 
58
- const FactualityJudge = namedJudge(
69
+ type RefundOutput = {
70
+ status: "approved" | "denied";
71
+ };
72
+
73
+ const FactualityJudge = createJudge(
59
74
  "FactualityJudge",
60
75
  async ({
61
76
  input,
62
77
  output,
63
78
  metadata,
64
- }: JudgeContext<string, RefundEvalMetadata>) => {
79
+ }: JudgeContext<string, RefundOutput, RefundEvalMetadata>) => {
65
80
  const verdict = await judgeFactuality({
66
81
  question: input,
67
82
  answer: output,
@@ -81,8 +96,7 @@ describeEval(
81
96
  "refund agent",
82
97
  {
83
98
  harness: piAiHarness({
84
- createAgent: () => createRefundAgent(),
85
- prompt: judgePrompt,
99
+ agent: () => createRefundAgent(),
86
100
  }),
87
101
  judges: [FactualityJudge],
88
102
  },
@@ -135,6 +149,24 @@ describeEval("refund agent", { harness }, (it) => {
135
149
  });
136
150
  ```
137
151
 
152
+ ## GitHub Actions Reporting
153
+
154
+ Use Vitest JSON as the eval report artifact. It preserves the `meta` field that
155
+ contains eval scores and normalized harness runs.
156
+
157
+ ```sh
158
+ vitest run evals \
159
+ --reporter=vitest-evals/reporter \
160
+ --reporter=json \
161
+ --outputFile.json=vitest-results.json
162
+
163
+ vitest-evals-github-report
164
+ ```
165
+
166
+ The GitHub reporter writes a job summary when `GITHUB_STEP_SUMMARY` is present,
167
+ emits short failure annotations in Actions, and can publish a separate Check Run
168
+ with `--check-run` when `checks: write` permission is configured.
169
+
138
170
  ## Existing Agents
139
171
 
140
172
  For an existing agent, the intended contract is:
@@ -157,37 +189,31 @@ be inferred automatically. Treat low-level normalization callbacks as an escape
157
189
  hatch, not part of the primary authoring path.
158
190
 
159
191
  For OpenAI Agents SDK apps, use
160
- `@vitest-evals/harness-openai-agents` with an existing `Agent` or
161
- `createAgent()` factory and a `Runner` / `createRunner()` callback. The harness
162
- calls `Runner.run(agent, input, options)` by default and exposes the same
192
+ `@vitest-evals/harness-openai-agents` with an existing `Agent` or an `agent`
193
+ factory and a `Runner` or `runner` factory. The harness calls
194
+ `Runner.run(agent, input, options)` by default and exposes the same
163
195
  normalization and replay hooks when the app needs a custom entrypoint or
164
196
  structured domain output mapping.
165
197
 
166
198
  ## Custom App Harnesses
167
199
 
168
200
  First-party harness packages are conveniences, not the only supported path. If
169
- you need to test a full application flow, define a harness that runs your app
170
- through its normal entrypoint and returns a normalized `HarnessRun`. The same
171
- harness should also expose `prompt`, which LLM-backed judges can reuse through
172
- `JudgeContext.harness.prompt`.
201
+ you need to test a full application flow, use `createHarness(...)` to run your
202
+ app through its normal entrypoint and return the app-facing output. Judges own
203
+ their prompt/rubric text separately from the system under test.
204
+ When generics are needed, use `createHarness<Input, Output, Metadata>(...)`.
173
205
 
174
206
  ```ts
175
207
  import {
208
+ createHarness,
209
+ createJudge,
176
210
  describeEval,
177
- namedJudge,
178
211
  type JudgeContext,
179
212
  } from "vitest-evals";
180
- import {
181
- normalizeContent,
182
- normalizeMetadata,
183
- toJsonValue,
184
- type Harness,
185
- type HarnessRun,
186
- } from "vitest-evals/harness";
187
213
 
188
214
  type AppEvent = {
189
215
  type: string;
190
- payload: Record<string, unknown>;
216
+ payload: Record<string, string>;
191
217
  };
192
218
 
193
219
  type AppEvalInput = {
@@ -199,65 +225,42 @@ type AppEvalInput = {
199
225
  };
200
226
  };
201
227
 
202
- const appHarness: Harness<AppEvalInput> = {
228
+ type AppEvalMetadata = Record<string, never>;
229
+
230
+ type AppOutput = {
231
+ replies: Array<{ text: string }>;
232
+ sideEffects: string[];
233
+ };
234
+
235
+ const appHarness = createHarness<AppEvalInput, AppOutput, AppEvalMetadata>({
203
236
  name: "custom-app",
204
- prompt: (input, options) => promptJudgeModel(input, options),
205
- run: async (input, context): Promise<HarnessRun> => {
237
+ run: async ({ input, signal }) => {
206
238
  const result = await replayAppEvents(input.events, {
207
- signal: context.signal,
239
+ signal,
208
240
  });
209
- const output = {
210
- replies: result.replies,
211
- sideEffects: result.sideEffects,
212
- };
213
241
 
214
242
  return {
215
- output: toJsonValue(output),
216
- session: {
217
- messages: [
218
- ...input.events.map((event) => ({
219
- role: "user" as const,
220
- content: normalizeContent(event),
221
- })),
222
- ...result.replies.map((reply) => ({
223
- role: "assistant" as const,
224
- content: normalizeContent(reply.text),
225
- metadata: normalizeMetadata({
226
- target: reply.target,
227
- }),
228
- })),
229
- ],
230
- outputText: result.replies.map((reply) => reply.text).join("\n\n"),
231
- metadata: normalizeMetadata({
232
- replyCount: result.replies.length,
233
- }),
243
+ output: {
244
+ replies: result.replies,
245
+ sideEffects: result.sideEffects,
246
+ },
247
+ artifacts: {
248
+ replyCount: result.replies.length,
234
249
  },
235
250
  usage: {},
236
- artifacts:
237
- Object.keys(context.artifacts).length > 0
238
- ? context.artifacts
239
- : undefined,
240
- errors: [],
241
251
  };
242
252
  },
243
- };
253
+ });
244
254
 
245
- const AppRubricJudge = namedJudge(
255
+ const AppRubricJudge = createJudge(
246
256
  "AppRubricJudge",
247
- async (
248
- ctx: JudgeContext<AppEvalInput, Record<string, unknown>, typeof appHarness>,
249
- ) => {
250
- const verdict = await ctx.harness.prompt(
251
- formatRubricPrompt({
257
+ async (ctx: JudgeContext<AppEvalInput, AppOutput, AppEvalMetadata>) => {
258
+ const verdict = await promptJudgeModel({
259
+ prompt: formatRubricPrompt({
252
260
  output: ctx.output,
253
- criteria: ctx.inputValue.criteria,
261
+ criteria: ctx.input.criteria,
254
262
  }),
255
- {
256
- metadata: {
257
- judge: "AppRubricJudge",
258
- },
259
- },
260
- );
263
+ });
261
264
 
262
265
  return parseRubricVerdict(verdict);
263
266
  },
@@ -292,16 +295,16 @@ describeEval(
292
295
  );
293
296
  ```
294
297
 
295
- Use `Harness.run(...)` for the application under test and `Harness.prompt(...)`
296
- for judge model calls. Calling `ctx.harness.run(...)` from inside a judge runs
297
- the application a second time, so reserve that for judges that intentionally
298
- need a second execution. Put criteria on `inputValue` when they are part of the
299
- scenario itself; use per-run `metadata` for harness configuration or
300
- expectations that are not part of the scenario payload. `session.outputText` is
301
- the canonical text sent to judges, so define it deliberately when your app
302
- returns structured artifacts.
298
+ Use `Harness.run(...)` for the application under test. Calling
299
+ `ctx.harness.run(...)` from inside a judge runs the application a second time,
300
+ so reserve that for judges that intentionally need a second execution. Put
301
+ criteria on `input` when they are part of the scenario itself; use per-run
302
+ `metadata` for harness configuration or expectations that are not part of the
303
+ scenario payload. `createHarness(...)` builds a default user/assistant session
304
+ from `input` and typed `output`; return a full `HarnessRun` only when you need
305
+ exact session control.
303
306
 
304
- Provider setup and rubric parsing stay in your harness and judge. The core
307
+ Provider setup and rubric parsing stay in your judge. The core
305
308
  package only requires the judge to return a `JudgeResult` with a score and
306
309
  optional metadata.
307
310
 
@@ -330,17 +333,17 @@ context:
330
333
 
331
334
  ```ts
332
335
  await expect({ status: "approved" }).toSatisfyJudge(MyJudge, {
333
- inputValue: "Refund invoice inv_123",
336
+ input: "Refund invoice inv_123",
334
337
  });
335
338
  ```
336
339
 
337
- If you are writing a custom judge, wrap it with `namedJudge(...)` so reporter
338
- output uses a stable label:
340
+ Use `createJudge(...)` for custom judges so reporter output gets a stable
341
+ label:
339
342
 
340
343
  ```ts
341
- import { namedJudge } from "vitest-evals";
344
+ import { createJudge } from "vitest-evals";
342
345
 
343
- const FactualityJudge = namedJudge(
346
+ const FactualityJudge = createJudge(
344
347
  "FactualityJudge",
345
348
  async ({ output }) => {
346
349
  const answer = output;
@@ -356,21 +359,25 @@ const FactualityJudge = namedJudge(
356
359
  );
357
360
  ```
358
361
 
359
- LLM-backed judges can reuse the suite harness prompt by calling
360
- `harness.prompt(...)`. `vitest-evals` does not prescribe a rubric schema,
361
- scoring scale, model provider, or parser; those stay in the judge. Calling
362
- `harness.run(...)` from a judge executes the application again, so use that
363
- only when a second run is intentional.
362
+ LLM-backed judges should provide their own judge prompt and rubric text.
363
+ `vitest-evals` does not prescribe a rubric schema, scoring scale, model
364
+ provider, or parser; those stay in the judge. When multiple judges share a
365
+ reusable judge-side provider helper, use the provider-helper overload of
366
+ `createJudge(...)` so run-scoped options such as abort signals stay curried.
367
+ Calling `harness.run(...)` from a judge executes the application again, so use
368
+ that only when a second run is intentional.
364
369
 
365
370
  For an `EvalHarnessRun` returned by fixture `run(...)`,
366
- `toSatisfyJudge(...)` uses the run's canonical text output and reuses the
367
- registered input, metadata, and harness prompt. Inside an eval test,
368
- matcher calls on registered raw output or session objects reuse that exact run
369
- context; raw output values are serialized as the judge `output`, so
370
- `expect(result.output).toSatisfyJudge(judge)` stays concise. Other raw values
371
- fall back to the current test's most recent `run(...)` context. For
371
+ `toSatisfyJudge(...)` uses the run's typed `output` and reuses the registered
372
+ input and metadata. It requires any custom judge params and rejects judges whose
373
+ output type cannot assess the received value. Inside an eval test,
374
+ matcher calls on registered output objects or session objects reuse that exact
375
+ run context when the value can be registered by reference, so
376
+ `expect(result.output).toSatisfyJudge(judge)` stays concise for structured
377
+ outputs. Other raw values fall back to the current test's most recent
378
+ `run(...)` context. For
372
379
  manually-created runs or values outside an eval context, pass any required
373
- `inputValue`, `metadata`, or `harness` in matcher options. Structured or
380
+ `input`, `metadata`, or `harness` in matcher options. Structured or
374
381
  programmatic result checks should usually assert on `result.output` directly.
375
382
  When a judge needs richer normalized context or the configured suite harness,
376
383
  type it with `JudgeContext`.
package/dist/harness.d.mts CHANGED
@@ -1,7 +1,10 @@
1
+ /** Primitive scalar values allowed in normalized JSON-safe eval data. */
1
2
  type JsonPrimitive = string | number | boolean | null;
3
+ /** JSON-safe value shape used by normalized sessions, artifacts, and errors. */
2
4
  type JsonValue = JsonPrimitive | JsonValue[] | {
3
5
  [key: string]: JsonValue;
4
6
  };
7
+ /** Normalized record for one tool call observed during a harness run. */
5
8
  type ToolCallRecord = {
6
9
  id?: string;
7
10
  name: string;
@@ -17,12 +20,14 @@ type ToolCallRecord = {
17
20
  durationMs?: number;
18
21
  metadata?: Record<string, JsonValue>;
19
22
  };
23
+ /** Normalized message recorded in a harness session transcript. */
20
24
  type NormalizedMessage = {
21
25
  role: "system" | "user" | "assistant" | "tool";
22
26
  content?: JsonValue;
23
27
  toolCalls?: ToolCallRecord[];
24
28
  metadata?: Record<string, JsonValue>;
25
29
  };
30
+ /** Provider usage summary attached to a normalized harness run. */
26
31
  type UsageSummary = {
27
32
  provider?: string;
28
33
  model?: string;
@@ -35,50 +40,82 @@ type UsageSummary = {
35
40
  retries?: number;
36
41
  metadata?: Record<string, JsonValue>;
37
42
  };
43
+ /** Timing summary attached to a normalized harness run. */
38
44
  type TimingSummary = {
39
45
  totalMs?: number;
40
46
  metadata?: Record<string, JsonValue>;
41
47
  };
48
+ /** JSON-serializable transcript produced by the system under test. */
42
49
  type NormalizedSession = {
43
50
  messages: NormalizedMessage[];
44
- outputText?: string;
45
51
  provider?: string;
46
52
  model?: string;
47
53
  metadata?: Record<string, JsonValue>;
48
54
  };
49
- type HarnessRun = {
55
+ type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOutput ? {
56
+ output?: TOutput;
57
+ } : {
58
+ output: TOutput;
59
+ };
60
+ /** Normalized result returned by every harness execution. */
61
+ type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
50
62
  session: NormalizedSession;
51
- output?: JsonValue;
52
63
  usage: UsageSummary;
53
64
  timings?: TimingSummary;
54
65
  artifacts?: Record<string, JsonValue>;
55
66
  errors: Array<Record<string, JsonValue>>;
56
67
  };
57
- /** Optional provider-facing hints for harness prompt calls. */
58
- type HarnessPromptOptions = {
59
- system?: string;
60
- metadata?: Record<string, JsonValue>;
61
- };
62
- /** Provider-agnostic prompt seam that judges can reuse from a harness. */
63
- type HarnessPrompt = (input: string, options?: HarnessPromptOptions) => Promise<string>;
68
+ /** Error value with an attached partial or complete normalized harness run. */
64
69
  type HarnessRunError = Error & {
65
70
  vitestEvalsRun: HarnessRun;
66
71
  };
72
+ /** Per-run metadata shape accepted by harnesses and eval tests. */
67
73
  type HarnessMetadata = Record<string, unknown>;
74
+ /** Runtime context passed from the eval fixture into a harness run. */
68
75
  type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
69
76
  metadata: Readonly<TMetadata>;
70
- task: {
71
- meta: Record<string, unknown>;
72
- };
73
77
  signal?: AbortSignal;
74
78
  artifacts: Record<string, JsonValue>;
75
79
  setArtifact: (name: string, value: JsonValue) => void;
76
80
  };
77
- type Harness<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = {
81
+ /** Adapter that executes the system under test and returns a normalized run. */
82
+ type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
83
+ name: string;
84
+ run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun<TOutput>>;
85
+ };
86
+ /** Value or promise accepted by lightweight harness callbacks. */
87
+ type MaybePromise<T> = T | Promise<T>;
88
+ /** Lightweight tool-call record accepted by `createHarness(...)` results. */
89
+ type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
90
+ arguments?: unknown;
91
+ result?: unknown;
92
+ error?: unknown;
93
+ metadata?: Record<string, unknown>;
94
+ };
95
+ /** Lightweight result shape normalized by `createHarness(...)`. */
96
+ type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
97
+ messages?: NormalizedMessage[];
98
+ toolCalls?: SimpleToolCallRecord[];
99
+ usage?: UsageSummary;
100
+ timings?: TimingSummary;
101
+ artifacts?: Record<string, unknown>;
102
+ metadata?: Record<string, unknown>;
103
+ errors?: unknown[];
104
+ };
105
+ /** Either a complete normalized run or a lightweight result to normalize. */
106
+ type HarnessResultLike<TOutput extends JsonValue | undefined = JsonValue | undefined> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;
107
+ /** Arguments passed to the `createHarness(...)` convenience callback. */
108
+ type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {
109
+ input: TInput;
110
+ metadata: Readonly<TMetadata>;
111
+ signal?: AbortSignal;
112
+ artifacts: HarnessContext<TMetadata>["artifacts"];
113
+ setArtifact: HarnessContext<TMetadata>["setArtifact"];
114
+ };
115
+ /** Options for creating a lightweight custom application harness. */
116
+ type CreateHarnessOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
78
117
  name: string;
79
- /** Prompt seam reused by LLM-backed judges. */
80
- prompt: HarnessPrompt;
81
- run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun>;
118
+ run: (args: CreateHarnessRunArgs<TInput, TMetadata>) => MaybePromise<HarnessResultLike<TOutput>>;
82
119
  };
83
120
  /** Returns true when a value exposes a callable method with the given name. */
84
121
  declare function hasCallableMethod(value: unknown, methodName: string): boolean;
@@ -90,6 +127,10 @@ declare function normalizeRecord(value: Record<string, unknown>): Record<string,
90
127
  declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
91
128
  /** Converts arbitrary content into the JSON-safe message content shape. */
92
129
  declare function normalizeContent(value: unknown): JsonValue;
130
+ /** Creates a harness from the common "run app code and return output" shape. */
131
+ declare function createHarness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata>(options: CreateHarnessOptions<TInput, TOutput, TMetadata>): Harness<TInput, TOutput, TMetadata>;
132
+ /** Normalizes a lightweight harness result into the reporter-facing run shape. */
133
+ declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
93
134
  /** Flattens every recorded tool call from a normalized session. */
94
135
  declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
95
136
  /** Filters normalized session messages by role. */
@@ -115,4 +156,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
115
156
  /** Serializes an arbitrary thrown value into the normalized error shape. */
116
157
  declare function serializeError(error: unknown): Record<string, JsonValue>;
117
158
 
118
- export { type Harness, type HarnessContext, type HarnessMetadata, type HarnessPrompt, type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type NormalizedMessage, type NormalizedSession, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
159
+ export { type CreateHarnessOptions, type CreateHarnessRunArgs, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type SimpleHarnessResult, type SimpleToolCallRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createHarness, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
package/dist/harness.d.ts CHANGED
@@ -1,7 +1,10 @@
1
+ /** Primitive scalar values allowed in normalized JSON-safe eval data. */
1
2
  type JsonPrimitive = string | number | boolean | null;
3
+ /** JSON-safe value shape used by normalized sessions, artifacts, and errors. */
2
4
  type JsonValue = JsonPrimitive | JsonValue[] | {
3
5
  [key: string]: JsonValue;
4
6
  };
7
+ /** Normalized record for one tool call observed during a harness run. */
5
8
  type ToolCallRecord = {
6
9
  id?: string;
7
10
  name: string;
@@ -17,12 +20,14 @@ type ToolCallRecord = {
17
20
  durationMs?: number;
18
21
  metadata?: Record<string, JsonValue>;
19
22
  };
23
+ /** Normalized message recorded in a harness session transcript. */
20
24
  type NormalizedMessage = {
21
25
  role: "system" | "user" | "assistant" | "tool";
22
26
  content?: JsonValue;
23
27
  toolCalls?: ToolCallRecord[];
24
28
  metadata?: Record<string, JsonValue>;
25
29
  };
30
+ /** Provider usage summary attached to a normalized harness run. */
26
31
  type UsageSummary = {
27
32
  provider?: string;
28
33
  model?: string;
@@ -35,50 +40,82 @@ type UsageSummary = {
35
40
  retries?: number;
36
41
  metadata?: Record<string, JsonValue>;
37
42
  };
43
+ /** Timing summary attached to a normalized harness run. */
38
44
  type TimingSummary = {
39
45
  totalMs?: number;
40
46
  metadata?: Record<string, JsonValue>;
41
47
  };
48
+ /** JSON-serializable transcript produced by the system under test. */
42
49
  type NormalizedSession = {
43
50
  messages: NormalizedMessage[];
44
- outputText?: string;
45
51
  provider?: string;
46
52
  model?: string;
47
53
  metadata?: Record<string, JsonValue>;
48
54
  };
49
- type HarnessRun = {
55
+ type OutputField<TOutput extends JsonValue | undefined> = undefined extends TOutput ? {
56
+ output?: TOutput;
57
+ } : {
58
+ output: TOutput;
59
+ };
60
+ /** Normalized result returned by every harness execution. */
61
+ type HarnessRun<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
50
62
  session: NormalizedSession;
51
- output?: JsonValue;
52
63
  usage: UsageSummary;
53
64
  timings?: TimingSummary;
54
65
  artifacts?: Record<string, JsonValue>;
55
66
  errors: Array<Record<string, JsonValue>>;
56
67
  };
57
- /** Optional provider-facing hints for harness prompt calls. */
58
- type HarnessPromptOptions = {
59
- system?: string;
60
- metadata?: Record<string, JsonValue>;
61
- };
62
- /** Provider-agnostic prompt seam that judges can reuse from a harness. */
63
- type HarnessPrompt = (input: string, options?: HarnessPromptOptions) => Promise<string>;
68
+ /** Error value with an attached partial or complete normalized harness run. */
64
69
  type HarnessRunError = Error & {
65
70
  vitestEvalsRun: HarnessRun;
66
71
  };
72
+ /** Per-run metadata shape accepted by harnesses and eval tests. */
67
73
  type HarnessMetadata = Record<string, unknown>;
74
+ /** Runtime context passed from the eval fixture into a harness run. */
68
75
  type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
69
76
  metadata: Readonly<TMetadata>;
70
- task: {
71
- meta: Record<string, unknown>;
72
- };
73
77
  signal?: AbortSignal;
74
78
  artifacts: Record<string, JsonValue>;
75
79
  setArtifact: (name: string, value: JsonValue) => void;
76
80
  };
77
- type Harness<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = {
81
+ /** Adapter that executes the system under test and returns a normalized run. */
82
+ type Harness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
83
+ name: string;
84
+ run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun<TOutput>>;
85
+ };
86
+ /** Value or promise accepted by lightweight harness callbacks. */
87
+ type MaybePromise<T> = T | Promise<T>;
88
+ /** Lightweight tool-call record accepted by `createHarness(...)` results. */
89
+ type SimpleToolCallRecord = Omit<ToolCallRecord, "arguments" | "result" | "error" | "metadata"> & {
90
+ arguments?: unknown;
91
+ result?: unknown;
92
+ error?: unknown;
93
+ metadata?: Record<string, unknown>;
94
+ };
95
+ /** Lightweight result shape normalized by `createHarness(...)`. */
96
+ type SimpleHarnessResult<TOutput extends JsonValue | undefined = JsonValue | undefined> = OutputField<TOutput> & {
97
+ messages?: NormalizedMessage[];
98
+ toolCalls?: SimpleToolCallRecord[];
99
+ usage?: UsageSummary;
100
+ timings?: TimingSummary;
101
+ artifacts?: Record<string, unknown>;
102
+ metadata?: Record<string, unknown>;
103
+ errors?: unknown[];
104
+ };
105
+ /** Either a complete normalized run or a lightweight result to normalize. */
106
+ type HarnessResultLike<TOutput extends JsonValue | undefined = JsonValue | undefined> = HarnessRun<TOutput> | SimpleHarnessResult<TOutput>;
107
+ /** Arguments passed to the `createHarness(...)` convenience callback. */
108
+ type CreateHarnessRunArgs<TInput, TMetadata extends HarnessMetadata> = {
109
+ input: TInput;
110
+ metadata: Readonly<TMetadata>;
111
+ signal?: AbortSignal;
112
+ artifacts: HarnessContext<TMetadata>["artifacts"];
113
+ setArtifact: HarnessContext<TMetadata>["setArtifact"];
114
+ };
115
+ /** Options for creating a lightweight custom application harness. */
116
+ type CreateHarnessOptions<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata> = {
78
117
  name: string;
79
- /** Prompt seam reused by LLM-backed judges. */
80
- prompt: HarnessPrompt;
81
- run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun>;
118
+ run: (args: CreateHarnessRunArgs<TInput, TMetadata>) => MaybePromise<HarnessResultLike<TOutput>>;
82
119
  };
83
120
  /** Returns true when a value exposes a callable method with the given name. */
84
121
  declare function hasCallableMethod(value: unknown, methodName: string): boolean;
@@ -90,6 +127,10 @@ declare function normalizeRecord(value: Record<string, unknown>): Record<string,
90
127
  declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
91
128
  /** Converts arbitrary content into the JSON-safe message content shape. */
92
129
  declare function normalizeContent(value: unknown): JsonValue;
130
+ /** Creates a harness from the common "run app code and return output" shape. */
131
+ declare function createHarness<TInput = unknown, TOutput extends JsonValue | undefined = JsonValue | undefined, TMetadata extends HarnessMetadata = HarnessMetadata>(options: CreateHarnessOptions<TInput, TOutput, TMetadata>): Harness<TInput, TOutput, TMetadata>;
132
+ /** Normalizes a lightweight harness result into the reporter-facing run shape. */
133
+ declare function normalizeHarnessRun<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata, TOutput extends JsonValue | undefined = JsonValue | undefined>(input: TInput, result: HarnessResultLike<TOutput>, context?: HarnessContext<TMetadata>): HarnessRun<TOutput>;
93
134
  /** Flattens every recorded tool call from a normalized session. */
94
135
  declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
95
136
  /** Filters normalized session messages by role. */
@@ -115,4 +156,4 @@ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string,
115
156
  /** Serializes an arbitrary thrown value into the normalized error shape. */
116
157
  declare function serializeError(error: unknown): Record<string, JsonValue>;
117
158
 
118
- export { type Harness, type HarnessContext, type HarnessMetadata, type HarnessPrompt, type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type NormalizedMessage, type NormalizedSession, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
159
+ export { type CreateHarnessOptions, type CreateHarnessRunArgs, type Harness, type HarnessContext, type HarnessMetadata, type HarnessResultLike, type HarnessRun, type HarnessRunError, type JsonPrimitive, type JsonValue, type MaybePromise, type NormalizedMessage, type NormalizedSession, type SimpleHarnessResult, type SimpleToolCallRecord, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, createHarness, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeHarnessRun, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };