vitest-evals 0.8.0 → 0.9.0-beta.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. package/README.md +167 -236
  2. package/dist/harness.d.mts +118 -0
  3. package/dist/harness.d.ts +118 -0
  4. package/dist/harness.js +167 -0
  5. package/dist/harness.js.map +1 -0
  6. package/dist/harness.mjs +126 -0
  7. package/dist/harness.mjs.map +1 -0
  8. package/dist/index.d.mts +114 -3
  9. package/dist/index.d.ts +114 -3
  10. package/dist/index.js +682 -398
  11. package/dist/index.js.map +1 -1
  12. package/dist/index.mjs +672 -407
  13. package/dist/index.mjs.map +1 -1
  14. package/dist/internal/matchers.d.mts +41 -0
  15. package/dist/internal/matchers.d.ts +41 -0
  16. package/dist/internal/matchers.js +206 -0
  17. package/dist/internal/matchers.js.map +1 -0
  18. package/dist/internal/matchers.mjs +176 -0
  19. package/dist/internal/matchers.mjs.map +1 -0
  20. package/dist/internal/scoring.d.mts +18 -0
  21. package/dist/internal/scoring.d.ts +18 -0
  22. package/dist/internal/scoring.js +19 -0
  23. package/dist/internal/scoring.js.map +1 -0
  24. package/dist/internal/scoring.mjs +1 -0
  25. package/dist/internal/scoring.mjs.map +1 -0
  26. package/dist/internal/structuredOutputScorer.d.mts +16 -0
  27. package/dist/internal/structuredOutputScorer.d.ts +16 -0
  28. package/dist/{scorers → internal}/structuredOutputScorer.js +94 -80
  29. package/dist/internal/structuredOutputScorer.js.map +1 -0
  30. package/dist/{scorers → internal}/structuredOutputScorer.mjs +93 -80
  31. package/dist/internal/structuredOutputScorer.mjs.map +1 -0
  32. package/dist/internal/toolCallScorer.d.mts +20 -0
  33. package/dist/internal/toolCallScorer.d.ts +20 -0
  34. package/dist/{scorers → internal}/toolCallScorer.js +100 -134
  35. package/dist/internal/toolCallScorer.js.map +1 -0
  36. package/dist/internal/toolCallScorer.mjs +310 -0
  37. package/dist/internal/toolCallScorer.mjs.map +1 -0
  38. package/dist/judges/index.d.mts +8 -0
  39. package/dist/judges/index.d.ts +8 -0
  40. package/dist/judges/index.js +550 -0
  41. package/dist/judges/index.js.map +1 -0
  42. package/dist/judges/index.mjs +522 -0
  43. package/dist/judges/index.mjs.map +1 -0
  44. package/dist/judges/structuredOutputJudge.d.mts +20 -0
  45. package/dist/judges/structuredOutputJudge.d.ts +20 -0
  46. package/dist/judges/structuredOutputJudge.js +340 -0
  47. package/dist/judges/structuredOutputJudge.js.map +1 -0
  48. package/dist/judges/structuredOutputJudge.mjs +313 -0
  49. package/dist/judges/structuredOutputJudge.mjs.map +1 -0
  50. package/dist/judges/toolCallJudge.d.mts +22 -0
  51. package/dist/judges/toolCallJudge.d.ts +22 -0
  52. package/dist/judges/toolCallJudge.js +357 -0
  53. package/dist/judges/toolCallJudge.js.map +1 -0
  54. package/dist/judges/toolCallJudge.mjs +330 -0
  55. package/dist/judges/toolCallJudge.mjs.map +1 -0
  56. package/dist/judges/types.d.mts +39 -0
  57. package/dist/judges/types.d.ts +39 -0
  58. package/dist/judges/types.js +19 -0
  59. package/dist/judges/types.js.map +1 -0
  60. package/dist/judges/types.mjs +1 -0
  61. package/dist/judges/types.mjs.map +1 -0
  62. package/dist/{evaluate → legacy/evaluate}/index.d.mts +7 -0
  63. package/dist/{evaluate → legacy/evaluate}/index.d.ts +7 -0
  64. package/dist/{evaluate → legacy/evaluate}/index.js +65 -89
  65. package/dist/legacy/evaluate/index.js.map +1 -0
  66. package/dist/legacy/evaluate/index.mjs +138 -0
  67. package/dist/legacy/evaluate/index.mjs.map +1 -0
  68. package/dist/legacy/format.d.mts +16 -0
  69. package/dist/legacy/format.d.ts +16 -0
  70. package/dist/legacy/format.js +69 -0
  71. package/dist/legacy/format.js.map +1 -0
  72. package/dist/legacy/format.mjs +43 -0
  73. package/dist/legacy/format.mjs.map +1 -0
  74. package/dist/legacy/scorers/index.d.mts +4 -0
  75. package/dist/legacy/scorers/index.d.ts +4 -0
  76. package/dist/{scorers → legacy/scorers}/index.js +164 -162
  77. package/dist/legacy/scorers/index.js.map +1 -0
  78. package/dist/{scorers → legacy/scorers}/index.mjs +163 -164
  79. package/dist/legacy/scorers/index.mjs.map +1 -0
  80. package/dist/legacy/scorers/structuredOutputScorer.d.mts +20 -0
  81. package/dist/legacy/scorers/structuredOutputScorer.d.ts +20 -0
  82. package/dist/legacy/scorers/structuredOutputScorer.js +320 -0
  83. package/dist/legacy/scorers/structuredOutputScorer.js.map +1 -0
  84. package/dist/legacy/scorers/structuredOutputScorer.mjs +293 -0
  85. package/dist/legacy/scorers/structuredOutputScorer.mjs.map +1 -0
  86. package/dist/legacy/scorers/toolCallScorer.d.mts +23 -0
  87. package/dist/legacy/scorers/toolCallScorer.d.ts +23 -0
  88. package/dist/{scorers/toolCallScorer.mjs → legacy/scorers/toolCallScorer.js} +129 -134
  89. package/dist/legacy/scorers/toolCallScorer.js.map +1 -0
  90. package/dist/legacy/scorers/toolCallScorer.mjs +315 -0
  91. package/dist/legacy/scorers/toolCallScorer.mjs.map +1 -0
  92. package/dist/legacy/scorers/utils.d.mts +1 -0
  93. package/dist/legacy/scorers/utils.d.ts +1 -0
  94. package/dist/{scorers → legacy/scorers}/utils.js +73 -41
  95. package/dist/legacy/scorers/utils.js.map +1 -0
  96. package/dist/{scorers → legacy/scorers}/utils.mjs +71 -41
  97. package/dist/legacy/scorers/utils.mjs.map +1 -0
  98. package/dist/legacy/shared.d.mts +31 -0
  99. package/dist/legacy/shared.d.ts +31 -0
  100. package/dist/legacy/shared.js +19 -0
  101. package/dist/legacy/shared.js.map +1 -0
  102. package/dist/legacy/shared.mjs +1 -0
  103. package/dist/legacy/shared.mjs.map +1 -0
  104. package/dist/legacy.d.mts +34 -0
  105. package/dist/legacy.d.ts +34 -0
  106. package/dist/legacy.js +751 -0
  107. package/dist/legacy.js.map +1 -0
  108. package/dist/legacy.mjs +727 -0
  109. package/dist/legacy.mjs.map +1 -0
  110. package/dist/replay.d.mts +60 -0
  111. package/dist/replay.d.ts +60 -0
  112. package/dist/replay.js +228 -0
  113. package/dist/replay.js.map +1 -0
  114. package/dist/replay.mjs +201 -0
  115. package/dist/replay.mjs.map +1 -0
  116. package/dist/reporter.d.mts +42 -0
  117. package/dist/reporter.d.ts +42 -0
  118. package/dist/reporter.js +494 -10
  119. package/dist/reporter.js.map +1 -1
  120. package/dist/reporter.mjs +494 -10
  121. package/dist/reporter.mjs.map +1 -1
  122. package/package.json +28 -60
  123. package/dist/evaluate/index.js.map +0 -1
  124. package/dist/evaluate/index.mjs +0 -163
  125. package/dist/evaluate/index.mjs.map +0 -1
  126. package/dist/scorers/index.d.mts +0 -4
  127. package/dist/scorers/index.d.ts +0 -4
  128. package/dist/scorers/index.js.map +0 -1
  129. package/dist/scorers/index.mjs.map +0 -1
  130. package/dist/scorers/structuredOutputScorer.d.mts +0 -4
  131. package/dist/scorers/structuredOutputScorer.d.ts +0 -4
  132. package/dist/scorers/structuredOutputScorer.js.map +0 -1
  133. package/dist/scorers/structuredOutputScorer.mjs.map +0 -1
  134. package/dist/scorers/toolCallScorer.d.mts +0 -315
  135. package/dist/scorers/toolCallScorer.d.ts +0 -315
  136. package/dist/scorers/toolCallScorer.js.map +0 -1
  137. package/dist/scorers/toolCallScorer.mjs.map +0 -1
  138. package/dist/scorers/utils.d.mts +0 -103
  139. package/dist/scorers/utils.d.ts +0 -103
  140. package/dist/scorers/utils.js.map +0 -1
  141. package/dist/scorers/utils.mjs.map +0 -1
package/README.md CHANGED
@@ -1,281 +1,212 @@
1
1
  # vitest-evals
2
2
 
3
- End-to-end evaluation framework for AI agents, built on Vitest.
3
+ Harness-backed AI testing on top of Vitest.
4
4
 
5
- ## Installation
5
+ ## Install
6
6
 
7
- ```shell
7
+ ```sh
8
8
  npm install -D vitest-evals
9
9
  ```
10
10
 
11
- ## Quick Start
11
+ Install a first-party harness package for the runtime you want to test:
12
12
 
13
- ```javascript
14
- import { describeEval } from "vitest-evals";
15
-
16
- describeEval("deploy agent", {
17
- data: async () => [
18
- { input: "Deploy the latest release to production", expected: "deployed" },
19
- { input: "Roll back the last deploy", expected: "rolled back" },
20
- ],
21
- task: async (input) => {
22
- const response = await myAgent.run(input);
23
- return response;
24
- },
25
- scorers: [
26
- async ({ output, expected }) => ({
27
- score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
28
- }),
29
- ],
30
- threshold: 0.8,
31
- });
13
+ ```sh
14
+ npm install -D @vitest-evals/harness-pi-ai
15
+ # or
16
+ npm install -D @vitest-evals/harness-ai-sdk
32
17
  ```
33
18
 
34
- ## Tasks
35
-
36
- Tasks process inputs and return outputs. Two formats are supported:
37
-
38
- ```javascript
39
- // Simple: just return a string
40
- const task = async (input) => "response";
41
-
42
- // With tool tracking: return a TaskResult
43
- const task = async (input) => ({
44
- result: "response",
45
- toolCalls: [
46
- { name: "search", arguments: { query: "..." }, result: {...} }
47
- ]
48
- });
49
- ```
50
-
51
- ## Test Data
52
-
53
- Each test case requires an `input` field. Use `name` to give tests a descriptive label:
54
-
55
- ```javascript
56
- data: async () => [
57
- { name: "simple deploy", input: "Deploy to staging" },
58
- { name: "deploy with rollback", input: "Deploy to prod, roll back if errors" },
59
- ],
60
- ```
61
-
62
- Additional fields (like `expected`, `expectedTools`) are passed through to scorers.
63
-
64
- ## Lifecycle Hooks
19
+ ## Core Model
20
+
21
+ - `describeEval(...)` binds exactly one harness to a suite
22
+ - the suite callback receives a fixture-backed Vitest `it`
23
+ - `run(input, { metadata? })` executes the harness explicitly and returns a
24
+ normalized `HarnessRun`
25
+ - the returned `result.output` is the app-facing value you assert on directly
26
+ - the returned `result.session` is the canonical JSON-serializable trace for
27
+ reporting, replay, tool assertions, and judges
28
+ - per-run judge inputs should usually live under `metadata`
29
+ - suite-level `judges` are optional and run automatically after each `run(...)`
30
+ - suite-level `judgeThreshold` controls fail-on-score for those automatic judges
31
+ - explicit judge assertions use
32
+ `await expect(result).toSatisfyJudge(judge, context)`
33
+
34
+ ## Explicit Run Example
35
+
36
+ ```ts
37
+ import { expect } from "vitest";
38
+ import { piAiHarness } from "@vitest-evals/harness-pi-ai";
39
+ import {
40
+ describeEval,
41
+ namedJudge,
42
+ toolCalls,
43
+ type JudgeContext,
44
+ } from "vitest-evals";
45
+ import { createRefundAgent } from "../src/refundAgent";
46
+
47
+ type RefundEvalMetadata = {
48
+ expectedStatus: "approved" | "denied";
49
+ expectedTools: string[];
50
+ };
65
51
 
66
- Use `beforeEach` and `afterEach` for setup and teardown:
52
+ const FactualityJudge = namedJudge(
53
+ "FactualityJudge",
54
+ async ({
55
+ input,
56
+ output,
57
+ metadata,
58
+ }: JudgeContext<string, RefundEvalMetadata>) => {
59
+ const verdict = await judgeFactuality({
60
+ question: input,
61
+ answer: output,
62
+ expectedStatus: metadata.expectedStatus,
63
+ });
64
+
65
+ return {
66
+ score: verdict.score,
67
+ metadata: {
68
+ rationale: verdict.rationale,
69
+ },
70
+ };
71
+ },
72
+ );
67
73
 
68
- ```javascript
69
- describeEval("agent with database", {
70
- beforeEach: async () => {
71
- await db.seed();
74
+ describeEval(
75
+ "refund agent",
76
+ {
77
+ harness: piAiHarness({
78
+ createAgent: () => createRefundAgent(),
79
+ }),
80
+ judges: [FactualityJudge],
72
81
  },
73
- afterEach: async () => {
74
- await db.clean();
82
+ (it) => {
83
+ it("approves a refundable invoice", async ({ run }) => {
84
+ const result = await run("Refund invoice inv_123", {
85
+ metadata: {
86
+ expectedStatus: "approved",
87
+ expectedTools: ["lookupInvoice", "createRefund"],
88
+ },
89
+ });
90
+
91
+ expect(result.output).toMatchObject({ status: "approved" });
92
+ expect(toolCalls(result.session).map((call) => call.name)).toEqual([
93
+ "lookupInvoice",
94
+ "createRefund",
95
+ ]);
96
+ });
75
97
  },
76
- data: async () => [{ input: "Find recent errors" }],
77
- task: myAgentTask,
78
- scorers: [async ({ output }) => ({ score: output.includes("error") ? 1.0 : 0.0 })],
79
- });
98
+ );
80
99
  ```
81
100
 
82
- ## Scorers
83
-
84
- Scorers evaluate outputs and return a score (0-1). Use built-in scorers or create your own.
85
-
86
- ### ToolCallScorer
101
+ ## Table-Driven Vitest Style
87
102
 
88
- Evaluates if the expected tools were called with correct arguments.
103
+ If you want case tables, use Vitest's own `it.for(...)` and call `run(...)`
104
+ inside the test body:
89
105
 
90
- ```javascript
91
- import { ToolCallScorer } from "vitest-evals";
92
-
93
- describeEval("tool usage", {
94
- data: async () => [
106
+ ```ts
107
+ describeEval("refund agent", { harness }, (it) => {
108
+ it.for([
95
109
  {
96
- input: "Find Italian restaurants",
97
- expectedTools: [
98
- { name: "search", arguments: { type: "restaurant" } },
99
- { name: "filter", arguments: { cuisine: "italian" } },
100
- ],
110
+ name: "approves refundable invoice",
111
+ input: "Refund invoice inv_123",
112
+ expectedStatus: "approved",
101
113
  },
102
- ],
103
- task: myTask,
104
- scorers: [ToolCallScorer()],
105
- });
106
-
107
- // Strict order and parameters
108
- scorers: [ToolCallScorer({ ordered: true, params: "strict" })];
109
-
110
- // Flexible evaluation
111
- scorers: [ToolCallScorer({ requireAll: false, allowExtras: false })];
112
- ```
113
-
114
- **Default behavior:**
115
-
116
- - Strict parameter matching (exact equality required)
117
- - Any order allowed
118
- - Extra tools allowed
119
- - All expected tools required
120
-
121
- ### StructuredOutputScorer
122
-
123
- Evaluates if the output matches expected structured data (JSON).
124
-
125
- ```javascript
126
- import { StructuredOutputScorer } from "vitest-evals";
127
-
128
- describeEval("query generation", {
129
- data: async () => [
130
114
  {
131
- input: "Show me errors from today",
132
- expected: {
133
- dataset: "errors",
134
- query: "",
135
- sort: "-timestamp",
136
- timeRange: { statsPeriod: "24h" },
137
- },
115
+ name: "denies non-refundable invoice",
116
+ input: "Refund invoice inv_404",
117
+ expectedStatus: "denied",
138
118
  },
139
- ],
140
- task: myTask,
141
- scorers: [StructuredOutputScorer()],
142
- });
143
-
144
- // Fuzzy matching
145
- scorers: [StructuredOutputScorer({ match: "fuzzy" })];
146
-
147
- // Custom validation
148
- scorers: [
149
- StructuredOutputScorer({
150
- match: (expected, actual, key) => {
151
- if (key === "age") return actual >= 18 && actual <= 100;
152
- return expected === actual;
153
- },
154
- }),
155
- ];
156
- ```
157
-
158
- ### Custom Scorers
159
-
160
- ```javascript
161
- // Inline scorer
162
- const LengthScorer = async ({ output }) => ({
163
- score: output.length > 50 ? 1.0 : 0.0,
164
- });
165
-
166
- // TypeScript scorer with custom options
167
- import { type ScoreFn, type BaseScorerOptions } from "vitest-evals";
168
-
169
- interface CustomOptions extends BaseScorerOptions {
170
- minLength: number;
171
- }
172
-
173
- const TypedScorer: ScoreFn<CustomOptions> = async (opts) => ({
174
- score: opts.output.length >= opts.minLength ? 1.0 : 0.0,
119
+ ])("$name", async ({ input, ...metadata }, { run }) => {
120
+ const result = await run(input, {
121
+ metadata,
122
+ });
123
+
124
+ expect(result.output).toMatchObject({
125
+ status: metadata.expectedStatus,
126
+ });
127
+ });
175
128
  });
176
129
  ```
177
130
 
178
- ## AI SDK Integration
179
-
180
- See [`src/ai-sdk-integration.test.ts`](src/ai-sdk-integration.test.ts) for a complete example with the Vercel AI SDK.
131
+ ## Existing Agents
181
132
 
182
- Transform provider responses to our format:
133
+ For an existing agent, the intended contract is:
183
134
 
184
- ```javascript
185
- const { text, steps } = await generateText({
186
- model: openai("gpt-4o"),
187
- prompt: input,
188
- tools: { myTool: myToolDefinition },
189
- });
190
-
191
- return {
192
- result: text,
193
- toolCalls: steps
194
- .flatMap((step) => step.toolCalls)
195
- .map((call) => ({
196
- name: call.toolName,
197
- arguments: call.args,
198
- })),
199
- };
200
- ```
135
+ - pass the agent instance or per-test factory through the harness
136
+ - optionally pass `run` when the app entrypoint is not `run(input, runtime)`
137
+ - let the harness infer native tools from the existing agent by default
138
+ - only pass an explicit `tools` override when the agent hides its tool surface
201
139
 
202
- ## Advanced Usage
140
+ The harness owns normalization, diagnostics, tool capture, replay plumbing, and
141
+ reporter-facing artifacts. Your app just needs one runtime seam where those
142
+ wrapped pieces can be injected.
203
143
 
204
- ### Using autoevals
144
+ For the Pi-specific harness, output/session/usage normalization should usually
145
+ be inferred automatically. Treat low-level normalization callbacks as an escape
146
+ hatch, not part of the primary authoring path.
205
147
 
206
- For evaluation using the autoevals library:
148
+ ## Judge Matchers
207
149
 
208
- ```javascript
209
- import { Factuality, ClosedQA } from "autoevals";
150
+ Use the matcher when a judge should behave like a normal Vitest assertion.
151
+ In practice, this is usually most useful for factuality, rubric, or grounded
152
+ answer checks:
210
153
 
211
- scorers: [
212
- Factuality,
213
- ClosedQA.partial({
214
- criteria: "Does the answer mention Paris?",
215
- }),
216
- ];
154
+ ```ts
155
+ await expect(result).toSatisfyJudge(FactualityJudge);
217
156
  ```
218
157
 
219
- ### Skip Tests Conditionally
158
+ For lower-level cases, the matcher also accepts raw values and synthetic judge
159
+ context:
220
160
 
221
- ```javascript
222
- describeEval("gpt-4 tests", {
223
- skipIf: () => !process.env.OPENAI_API_KEY,
224
- // ...
161
+ ```ts
162
+ await expect({ status: "approved" }).toSatisfyJudge(MyJudge, {
163
+ inputValue: "Refund invoice inv_123",
225
164
  });
226
165
  ```
227
166
 
228
- ### Existing Test Suites
167
+ If you are writing a custom judge, wrap it with `namedJudge(...)` so reporter
168
+ output uses a stable label:
229
169
 
230
- For integration with existing Vitest test suites, you can use the `.toEval()` matcher:
170
+ ```ts
171
+ import { namedJudge } from "vitest-evals";
231
172
 
232
- > **Deprecated**: The `.toEval()` helper is deprecated. Use `describeEval()` instead for better test organization and multiple scorers support.
173
+ const FactualityJudge = namedJudge(
174
+ "FactualityJudge",
175
+ async ({ output }) => {
176
+ const answer = output;
177
+ const verdict = await judgeFactuality(answer);
233
178
 
234
- ```javascript
235
- import "vitest-evals";
236
-
237
- test("capital check", () => {
238
- const simpleFactuality = async ({ output, expected }) => ({
239
- score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
240
- });
241
-
242
- expect("What is the capital of France?").toEval(
243
- "Paris",
244
- answerQuestion,
245
- simpleFactuality,
246
- 0.8
247
- );
248
- });
249
- ```
250
-
251
- ## Configuration
252
-
253
- ### Separate Eval Configuration
254
-
255
- Create `vitest.evals.config.ts`:
256
-
257
- ```javascript
258
- import { defineConfig } from "vitest/config";
259
- import defaultConfig from "./vitest.config";
260
-
261
- export default defineConfig({
262
- ...defaultConfig,
263
- test: {
264
- ...defaultConfig.test,
265
- include: ["src/**/*.eval.{js,ts}"],
179
+ return {
180
+ score: verdict.score,
181
+ metadata: {
182
+ rationale: verdict.rationale,
183
+ },
184
+ };
266
185
  },
267
- });
186
+ );
268
187
  ```
269
188
 
270
- Run evals separately:
271
-
272
- ```shell
273
- vitest --config=vitest.evals.config.ts
274
- ```
275
-
276
- ## Development
277
-
278
- ```shell
279
- pnpm install
280
- pnpm test
189
+ For a `HarnessRun`, `toSatisfyJudge(...)` passes `result.output` as `output`.
190
+ For raw values or normalized sessions, the matcher infers the best available
191
+ output from the received value. Structured or programmatic result checks should
192
+ usually assert on `result.output` directly. When a judge needs richer context,
193
+ type it with `JudgeContext` and read `inputValue`, `metadata`, `toolCalls`, or
194
+ `session` from there.
195
+
196
+ When you only need deterministic contract checks, built-ins such as
197
+ `StructuredOutputJudge()` and `ToolCallJudge()` are still available. The primary
198
+ documentation examples intentionally use factuality/rubric judges because those
199
+ match the product's LLM-as-a-judge direction.
200
+
201
+ ## Legacy Compatibility
202
+
203
+ The root package is harness-first and judge-first. Legacy scorer-first suites
204
+ and `evaluate(...)` live under `vitest-evals/legacy`.
205
+
206
+ ```ts
207
+ import {
208
+ describeEval,
209
+ StructuredOutputScorer,
210
+ ToolCallScorer,
211
+ } from "vitest-evals/legacy";
281
212
  ```
@@ -0,0 +1,118 @@
1
+ type JsonPrimitive = string | number | boolean | null;
2
+ type JsonValue = JsonPrimitive | JsonValue[] | {
3
+ [key: string]: JsonValue;
4
+ };
5
+ type ToolCallRecord = {
6
+ id?: string;
7
+ name: string;
8
+ arguments?: Record<string, JsonValue>;
9
+ result?: JsonValue;
10
+ error?: {
11
+ message: string;
12
+ type?: string;
13
+ [key: string]: JsonValue | undefined;
14
+ };
15
+ startedAt?: string;
16
+ finishedAt?: string;
17
+ durationMs?: number;
18
+ metadata?: Record<string, JsonValue>;
19
+ };
20
+ type NormalizedMessage = {
21
+ role: "system" | "user" | "assistant" | "tool";
22
+ content?: JsonValue;
23
+ toolCalls?: ToolCallRecord[];
24
+ metadata?: Record<string, JsonValue>;
25
+ };
26
+ type UsageSummary = {
27
+ provider?: string;
28
+ model?: string;
29
+ inputTokens?: number;
30
+ outputTokens?: number;
31
+ reasoningTokens?: number;
32
+ totalTokens?: number;
33
+ estimatedCost?: number;
34
+ toolCalls?: number;
35
+ retries?: number;
36
+ metadata?: Record<string, JsonValue>;
37
+ };
38
+ type TimingSummary = {
39
+ totalMs?: number;
40
+ metadata?: Record<string, JsonValue>;
41
+ };
42
+ type NormalizedSession = {
43
+ messages: NormalizedMessage[];
44
+ outputText?: string;
45
+ provider?: string;
46
+ model?: string;
47
+ metadata?: Record<string, JsonValue>;
48
+ };
49
+ type HarnessRun = {
50
+ session: NormalizedSession;
51
+ output?: JsonValue;
52
+ usage: UsageSummary;
53
+ timings?: TimingSummary;
54
+ artifacts?: Record<string, JsonValue>;
55
+ errors: Array<Record<string, JsonValue>>;
56
+ };
57
+ type HarnessPromptOptions = {
58
+ system?: string;
59
+ metadata?: Record<string, JsonValue>;
60
+ };
61
+ type HarnessPrompt = (input: string, options?: HarnessPromptOptions) => Promise<string>;
62
+ type HarnessRuntime = {
63
+ prompt: HarnessPrompt;
64
+ };
65
+ type HarnessRunError = Error & {
66
+ vitestEvalsRun: HarnessRun;
67
+ };
68
+ type HarnessMetadata = Record<string, unknown>;
69
+ type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
70
+ metadata: Readonly<TMetadata>;
71
+ task: {
72
+ meta: Record<string, unknown>;
73
+ };
74
+ signal?: AbortSignal;
75
+ artifacts: Record<string, JsonValue>;
76
+ setArtifact: (name: string, value: JsonValue) => void;
77
+ };
78
+ type Harness<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = {
79
+ name: string;
80
+ prompt?: HarnessPrompt;
81
+ run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun>;
82
+ };
83
+ /** Returns true when a value exposes a callable method with the given name. */
84
+ declare function hasCallableMethod(value: unknown, methodName: string): boolean;
85
+ /** Normalizes an unknown value into the JSON-safe shape used by harness runs. */
86
+ declare function toJsonValue(value: unknown): JsonValue | undefined;
87
+ /** Drops non-JSON properties from a record while preserving valid values. */
88
+ declare function normalizeRecord(value: Record<string, unknown>): Record<string, JsonValue>;
89
+ /** Normalizes metadata and omits the field entirely when nothing survives. */
90
+ declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
91
+ /** Converts arbitrary content into the JSON-safe message content shape. */
92
+ declare function normalizeContent(value: unknown): JsonValue;
93
+ /** Flattens every recorded tool call from a normalized session. */
94
+ declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
95
+ /** Filters normalized session messages by role. */
96
+ declare function messagesByRole(session: NormalizedSession, role: NormalizedMessage["role"]): NormalizedMessage[];
97
+ /** Returns every normalized system message from a session. */
98
+ declare function systemMessages(session: NormalizedSession): NormalizedMessage[];
99
+ /** Returns every normalized user message from a session. */
100
+ declare function userMessages(session: NormalizedSession): NormalizedMessage[];
101
+ /** Returns every normalized assistant message from a session. */
102
+ declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
103
+ /** Returns every normalized tool message from a session. */
104
+ declare function toolMessages(session: NormalizedSession): NormalizedMessage[];
105
+ /** Attaches a partial or complete harness run to an arbitrary thrown error. */
106
+ declare function attachHarnessRunToError(error: unknown, run: HarnessRun): HarnessRunError;
107
+ /** Reads an attached harness run back off a previously wrapped error value. */
108
+ declare function getHarnessRunFromError(error: unknown): HarnessRun | undefined;
109
+ /** Returns true when a value matches the normalized `HarnessRun` contract. */
110
+ declare function isHarnessRun(value: unknown): value is HarnessRun;
111
+ /** Returns true when a value matches the normalized session contract. */
112
+ declare function isNormalizedSession(value: unknown): value is NormalizedSession;
113
+ /** Reuses pre-normalized harness errors when a runtime already returns them. */
114
+ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string, JsonValue>>;
115
+ /** Serializes an arbitrary thrown value into the normalized error shape. */
116
+ declare function serializeError(error: unknown): Record<string, JsonValue>;
117
+
118
+ export { type Harness, type HarnessContext, type HarnessMetadata, type HarnessPrompt, type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type HarnessRuntime, type JsonPrimitive, type JsonValue, type NormalizedMessage, type NormalizedSession, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };
@@ -0,0 +1,118 @@
1
+ type JsonPrimitive = string | number | boolean | null;
2
+ type JsonValue = JsonPrimitive | JsonValue[] | {
3
+ [key: string]: JsonValue;
4
+ };
5
+ type ToolCallRecord = {
6
+ id?: string;
7
+ name: string;
8
+ arguments?: Record<string, JsonValue>;
9
+ result?: JsonValue;
10
+ error?: {
11
+ message: string;
12
+ type?: string;
13
+ [key: string]: JsonValue | undefined;
14
+ };
15
+ startedAt?: string;
16
+ finishedAt?: string;
17
+ durationMs?: number;
18
+ metadata?: Record<string, JsonValue>;
19
+ };
20
+ type NormalizedMessage = {
21
+ role: "system" | "user" | "assistant" | "tool";
22
+ content?: JsonValue;
23
+ toolCalls?: ToolCallRecord[];
24
+ metadata?: Record<string, JsonValue>;
25
+ };
26
+ type UsageSummary = {
27
+ provider?: string;
28
+ model?: string;
29
+ inputTokens?: number;
30
+ outputTokens?: number;
31
+ reasoningTokens?: number;
32
+ totalTokens?: number;
33
+ estimatedCost?: number;
34
+ toolCalls?: number;
35
+ retries?: number;
36
+ metadata?: Record<string, JsonValue>;
37
+ };
38
+ type TimingSummary = {
39
+ totalMs?: number;
40
+ metadata?: Record<string, JsonValue>;
41
+ };
42
+ type NormalizedSession = {
43
+ messages: NormalizedMessage[];
44
+ outputText?: string;
45
+ provider?: string;
46
+ model?: string;
47
+ metadata?: Record<string, JsonValue>;
48
+ };
49
+ type HarnessRun = {
50
+ session: NormalizedSession;
51
+ output?: JsonValue;
52
+ usage: UsageSummary;
53
+ timings?: TimingSummary;
54
+ artifacts?: Record<string, JsonValue>;
55
+ errors: Array<Record<string, JsonValue>>;
56
+ };
57
+ type HarnessPromptOptions = {
58
+ system?: string;
59
+ metadata?: Record<string, JsonValue>;
60
+ };
61
+ type HarnessPrompt = (input: string, options?: HarnessPromptOptions) => Promise<string>;
62
+ type HarnessRuntime = {
63
+ prompt: HarnessPrompt;
64
+ };
65
+ type HarnessRunError = Error & {
66
+ vitestEvalsRun: HarnessRun;
67
+ };
68
+ type HarnessMetadata = Record<string, unknown>;
69
+ type HarnessContext<TMetadata extends HarnessMetadata = HarnessMetadata> = {
70
+ metadata: Readonly<TMetadata>;
71
+ task: {
72
+ meta: Record<string, unknown>;
73
+ };
74
+ signal?: AbortSignal;
75
+ artifacts: Record<string, JsonValue>;
76
+ setArtifact: (name: string, value: JsonValue) => void;
77
+ };
78
+ type Harness<TInput = unknown, TMetadata extends HarnessMetadata = HarnessMetadata> = {
79
+ name: string;
80
+ prompt?: HarnessPrompt;
81
+ run: (input: TInput, context: HarnessContext<TMetadata>) => Promise<HarnessRun>;
82
+ };
83
+ /** Returns true when a value exposes a callable method with the given name. */
84
+ declare function hasCallableMethod(value: unknown, methodName: string): boolean;
85
+ /** Normalizes an unknown value into the JSON-safe shape used by harness runs. */
86
+ declare function toJsonValue(value: unknown): JsonValue | undefined;
87
+ /** Drops non-JSON properties from a record while preserving valid values. */
88
+ declare function normalizeRecord(value: Record<string, unknown>): Record<string, JsonValue>;
89
+ /** Normalizes metadata and omits the field entirely when nothing survives. */
90
+ declare function normalizeMetadata(value: Record<string, unknown>): Record<string, JsonValue> | undefined;
91
+ /** Converts arbitrary content into the JSON-safe message content shape. */
92
+ declare function normalizeContent(value: unknown): JsonValue;
93
+ /** Flattens every recorded tool call from a normalized session. */
94
+ declare function toolCalls(session: NormalizedSession): ToolCallRecord[];
95
+ /** Filters normalized session messages by role. */
96
+ declare function messagesByRole(session: NormalizedSession, role: NormalizedMessage["role"]): NormalizedMessage[];
97
+ /** Returns every normalized system message from a session. */
98
+ declare function systemMessages(session: NormalizedSession): NormalizedMessage[];
99
+ /** Returns every normalized user message from a session. */
100
+ declare function userMessages(session: NormalizedSession): NormalizedMessage[];
101
+ /** Returns every normalized assistant message from a session. */
102
+ declare function assistantMessages(session: NormalizedSession): NormalizedMessage[];
103
+ /** Returns every normalized tool message from a session. */
104
+ declare function toolMessages(session: NormalizedSession): NormalizedMessage[];
105
+ /** Attaches a partial or complete harness run to an arbitrary thrown error. */
106
+ declare function attachHarnessRunToError(error: unknown, run: HarnessRun): HarnessRunError;
107
+ /** Reads an attached harness run back off a previously wrapped error value. */
108
+ declare function getHarnessRunFromError(error: unknown): HarnessRun | undefined;
109
+ /** Returns true when a value matches the normalized `HarnessRun` contract. */
110
+ declare function isHarnessRun(value: unknown): value is HarnessRun;
111
+ /** Returns true when a value matches the normalized session contract. */
112
+ declare function isNormalizedSession(value: unknown): value is NormalizedSession;
113
+ /** Reuses pre-normalized harness errors when a runtime already returns them. */
114
+ declare function resolveHarnessRunErrors(result: unknown): Array<Record<string, JsonValue>>;
115
+ /** Serializes an arbitrary thrown value into the normalized error shape. */
116
+ declare function serializeError(error: unknown): Record<string, JsonValue>;
117
+
118
+ export { type Harness, type HarnessContext, type HarnessMetadata, type HarnessPrompt, type HarnessPromptOptions, type HarnessRun, type HarnessRunError, type HarnessRuntime, type JsonPrimitive, type JsonValue, type NormalizedMessage, type NormalizedSession, type TimingSummary, type ToolCallRecord, type UsageSummary, assistantMessages, attachHarnessRunToError, getHarnessRunFromError, hasCallableMethod, isHarnessRun, isNormalizedSession, messagesByRole, normalizeContent, normalizeMetadata, normalizeRecord, resolveHarnessRunErrors, serializeError, systemMessages, toJsonValue, toolCalls, toolMessages, userMessages };