vitest-evals 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/README.md +206 -195
  2. package/dist/index.d.mts +2 -97
  3. package/dist/index.d.ts +2 -97
  4. package/dist/index.js +253 -8
  5. package/dist/index.js.map +1 -1
  6. package/dist/index.mjs +252 -8
  7. package/dist/index.mjs.map +1 -1
  8. package/dist/scorers/index.d.mts +2 -0
  9. package/dist/scorers/index.d.ts +2 -0
  10. package/dist/scorers/index.js +282 -0
  11. package/dist/scorers/index.js.map +1 -0
  12. package/dist/scorers/index.mjs +256 -0
  13. package/dist/scorers/index.mjs.map +1 -0
  14. package/dist/scorers/toolCallScorer.d.mts +240 -0
  15. package/dist/scorers/toolCallScorer.d.ts +240 -0
  16. package/dist/scorers/toolCallScorer.js +280 -0
  17. package/dist/scorers/toolCallScorer.js.map +1 -0
  18. package/dist/scorers/toolCallScorer.mjs +256 -0
  19. package/dist/scorers/toolCallScorer.mjs.map +1 -0
  20. package/package.json +11 -1
  21. package/dist/autoevals-compatibility.test.d.mts +0 -2
  22. package/dist/autoevals-compatibility.test.d.ts +0 -2
  23. package/dist/autoevals-compatibility.test.js +0 -45122
  24. package/dist/autoevals-compatibility.test.js.map +0 -1
  25. package/dist/autoevals-compatibility.test.mjs +0 -45977
  26. package/dist/autoevals-compatibility.test.mjs.map +0 -1
  27. package/dist/formatScores.test.d.mts +0 -2
  28. package/dist/formatScores.test.d.ts +0 -2
  29. package/dist/formatScores.test.js +0 -196
  30. package/dist/formatScores.test.js.map +0 -1
  31. package/dist/formatScores.test.mjs +0 -195
  32. package/dist/formatScores.test.mjs.map +0 -1
  33. package/dist/wrapText.test.d.mts +0 -2
  34. package/dist/wrapText.test.d.ts +0 -2
  35. package/dist/wrapText.test.js +0 -163
  36. package/dist/wrapText.test.js.map +0 -1
  37. package/dist/wrapText.test.mjs +0 -162
  38. package/dist/wrapText.test.mjs.map +0 -1
package/README.md CHANGED
@@ -1,214 +1,246 @@
1
1
  # vitest-evals
2
2
 
3
- This project is a prototype of extending vitest to support basic _Evals_ functionality. Evals are a type of testing that is most commonly deployed to _evaluate_ the results of calls to language models. This allows you to utilize them with a pattern of testing you're familiar with, working well with your existing continuous integration toolchain.
3
+ Evaluate LLM outputs using the familiar Vitest testing framework.
4
4
 
5
- This is heavily inspired by [Evalite](https://www.evalite.dev/), but opts for a vitest-native approach to maximize the compatibility of the existing ecosystem. This means you can use it with your existing toolchain, including reporting such as code coverage and xunit.
6
-
7
- ## Use
5
+ ## Installation
8
6
 
9
7
  ```shell
10
8
  npm install -D vitest-evals
11
9
  ```
12
10
 
13
- You've likely already got a mechanism for passing the user input into your model, for example:
11
+ ## Quick Start
14
12
 
15
13
  ```javascript
16
- async function answerQuestion(prompt: string) {
17
- const model = openai("gpt-4o");
18
- const { text } = await generateText({
19
- model,
20
- prompt,
21
- });
22
- return text;
23
- }
14
+ import { describeEval } from "vitest-evals";
15
+
16
+ describeEval("capital cities", {
17
+ data: async () => [
18
+ { input: "What is the capital of France?", expected: "Paris" },
19
+ { input: "What is the capital of Japan?", expected: "Tokyo" }
20
+ ],
21
+ task: async (input) => {
22
+ const response = await queryLLM(input);
23
+ return response; // Simple string return
24
+ },
25
+ scorers: [async ({ output, expected }) => ({
26
+ score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0
27
+ })],
28
+ threshold: 0.8
29
+ });
24
30
  ```
25
31
 
26
- You'll use this as the `task` within your evals, and then you simply need to define a set of scenarios
27
- and a way to validate if the LLM is responding as you desire:
32
+ ## Tasks
33
+
34
+ Tasks process inputs and return outputs. Two formats are supported:
28
35
 
29
36
  ```javascript
30
- import { describeEval } from "vitest-evals";
31
- import { Factuality } from "autoevals";
32
-
33
- describeEval("my evals", {
34
- data: async () => {
35
- // The scenarios you wish to evaluate
36
- return [
37
- {
38
- input: "What is the capital of France?",
39
- expected: "Paris",
40
- }
41
- ];
42
- },
37
+ // Simple: just return a string
38
+ const task = async (input) => "response";
39
+
40
+ // With tool tracking: return a TaskResult
41
+ const task = async (input) => ({
42
+ result: "response",
43
+ toolCalls: [
44
+ { name: "search", arguments: { query: "..." }, result: {...} }
45
+ ]
46
+ });
47
+ ```
48
+
49
+ ## Scorers
43
50
 
44
- task: answerQuestion,
51
+ Scorers evaluate outputs and return a score (0-1). Use built-in scorers or create your own:
45
52
 
46
- // Scorers determine if the response was acceptable - in this case we're using
47
- // a secondary LLM prompt to judge the response of the first.
48
- scorers: [Factuality],
53
+ ```javascript
54
+ // Built-in scorer
55
+ import { ToolCallScorer } from "vitest-evals";
56
+ // Or import individually
57
+ import { ToolCallScorer } from "vitest-evals/scorers/toolCallScorer";
58
+
59
+ describeEval("tool usage", {
60
+ data: async () => [
61
+ { input: "Search weather", expectedTools: [{ name: "weather_api" }] }
62
+ ],
63
+ task: weatherTask,
64
+ scorers: [ToolCallScorer()]
65
+ });
49
66
 
50
- // The threshold required for the average score for this eval to pass. This will be
51
- // based on the scorers you've provided, and in the case of Factuality, we might be
52
- // ok with a 60% score (see the implementation for why).
53
- threshold: 0.6,
67
+ // Custom scorer
68
+ const LengthScorer = async ({ output }) => ({
69
+ score: output.length > 50 ? 1.0 : 0.0
70
+ });
54
71
 
55
- // The timeout for each test. Defaults to 10s. You may need to increase this if your model
56
- // provider has high latency or you're using a large number of scorers.
57
- // timeout: 60000,
72
+ // TypeScript scorer with custom options
73
+ import { type ScoreFn, type BaseScorerOptions } from "vitest-evals";
74
+
75
+ interface CustomOptions extends BaseScorerOptions {
76
+ minLength: number;
77
+ }
58
78
 
59
- // A check to determine if these tests should run. This is helpful to control tests so they only
60
- // in certain situations, for example if a model providers API key is defined.
61
- // skipIf: () => !process.env.OPENAI_API_KEY
62
- })
79
+ const TypedScorer: ScoreFn<CustomOptions> = async (opts) => ({
80
+ score: opts.output.length >= opts.minLength ? 1.0 : 0.0
81
+ });
63
82
  ```
64
83
 
65
- ### Existing Test Suites
84
+ ### Built-in Scorers
85
+
86
+ #### ToolCallScorer
87
+ Evaluates if the expected tools were called with correct arguments.
66
88
 
67
89
  ```javascript
68
- // import `vitest-evals` to expose `expect().toEval()`
69
- // This can also be done via `setupFiles` pattern in `vitest`.
70
- import "vitest-evals";
71
- import { Factuality } from "autoevals";
72
-
73
- describe("my test suite", () => {
74
- it("kind of works", () => {
75
- expect("What is the capital of France?").toEval(
76
- "Paris",
77
- answerQuestion,
78
- Factuality,
79
- 0.8
80
- );
81
- });
90
+ // Basic usage - strict matching, any order
91
+ describeEval("search test", {
92
+ data: async () => [{
93
+ input: "Find Italian restaurants",
94
+ expectedTools: [
95
+ { name: "search", arguments: { type: "restaurant" } },
96
+ { name: "filter", arguments: { cuisine: "italian" } }
97
+ ]
98
+ }],
99
+ task: myTask,
100
+ scorers: [ToolCallScorer()]
82
101
  });
102
+
103
+ // Strict evaluation - exact order and parameters
104
+ scorers: [ToolCallScorer({
105
+ ordered: true, // Tools must be in exact order
106
+ params: "strict" // Parameters must match exactly
107
+ })]
108
+
109
+ // Flexible evaluation
110
+ scorers: [ToolCallScorer({
111
+ requireAll: false, // Partial matches give partial credit
112
+ allowExtras: false // No additional tools allowed
113
+ })]
114
+ ```
115
+
116
+ **Default behavior:**
117
+ - Strict parameter matching (exact equality required)
118
+ - Any order allowed
119
+ - Extra tools allowed
120
+ - All expected tools required
121
+
122
+ ## AI SDK Integration
123
+
124
+ See [`src/ai-sdk-integration.test.ts`](src/ai-sdk-integration.test.ts) for a complete example with the Vercel AI SDK.
125
+
126
+ Transform provider responses to our format:
127
+
128
+ ```javascript
129
+ // Vercel AI SDK
130
+ const { text, toolCalls, toolResults } = await generateText(...);
131
+ return {
132
+ result: text,
133
+ toolCalls: toolCalls?.map((call, i) => ({
134
+ id: call.toolCallId,
135
+ name: call.toolName,
136
+ arguments: call.args,
137
+ result: toolResults?.[i]?.result,
138
+ status: toolResults?.[i]?.error ? 'failed' : 'completed'
139
+ }))
140
+ };
141
+ ```
142
+
143
+ ## Advanced Usage
144
+
145
+ ### Advanced Scorers
146
+
147
+ #### Using autoevals
148
+
149
+ For sophisticated evaluation, use autoevals scorers:
150
+
151
+ ```javascript
152
+ import { Factuality, ClosedQA } from "autoevals";
153
+
154
+ scorers: [
155
+ Factuality, // LLM-based factuality checking
156
+ ClosedQA.partial({
157
+ criteria: "Does the answer mention Paris?"
158
+ })
159
+ ]
83
160
  ```
84
161
 
85
- ### Scoring
162
+ #### Custom LLM-based Factuality Scorer
86
163
 
87
- Scorers are compatible with the `autoevals` interface, but are also simple to implement on your own:
164
+ Here's an example of implementing your own LLM-based factuality scorer using the Vercel AI SDK:
88
165
 
89
166
  ```javascript
90
- export const Contains = async (opts: {
91
- input: string,
92
- expected: string,
93
- output: string,
94
- }) => {
167
+ import { generateObject } from 'ai';
168
+ import { openai } from '@ai-sdk/openai';
169
+ import { z } from 'zod';
170
+
171
+ const Factuality = (model = openai('gpt-4o')) => async ({ input, output, expected }) => {
172
+ if (!expected) {
173
+ return { score: 1.0, metadata: { rationale: "No expected answer" } };
174
+ }
175
+
176
+ const { object } = await generateObject({
177
+ model,
178
+ prompt: `
179
+ Compare the factual content of the submitted answer with the expert answer.
180
+
181
+ Question: ${input}
182
+ Expert: ${expected}
183
+ Submission: ${output}
184
+
185
+ Options:
186
+ (A) Subset of expert answer
187
+ (B) Superset of expert answer
188
+ (C) Same content as expert
189
+ (D) Contradicts expert answer
190
+ (E) Different but factually equivalent
191
+ `,
192
+ schema: z.object({
193
+ answer: z.enum(['A', 'B', 'C', 'D', 'E']),
194
+ rationale: z.string()
195
+ })
196
+ });
197
+
198
+ const scores = { A: 0.4, B: 0.6, C: 1, D: 0, E: 1 };
95
199
  return {
96
- score: output.indexOf(expected) !== -1 ? 1.0 : 0.0,
200
+ score: scores[object.answer],
201
+ metadata: { rationale: object.rationale, answer: object.answer }
97
202
  };
98
203
  };
204
+
205
+ // Usage
206
+ scorers: [Factuality()]
99
207
  ```
100
208
 
101
- For something more realistic, here's a reimplementation of the Factuality scorer from `autoevals`, with some flexibility
102
- on the model, enabling you to evaluate against multiple models:
103
-
104
- ````javascript
105
- import { generateObject, type LanguageModel } from "ai";
106
- import { z } from "zod";
107
-
108
- /**
109
- * A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`.
110
- *
111
- * @param model - The language model to utilize (via `ai`).
112
- *
113
- * @example
114
- * ```javascript
115
- * import { openai } from "@ai-sdk/openai";
116
- *
117
- * scorers: [Factuality(openai("gpt-4o"))]
118
- * ```
119
- */
120
- export function Factuality(model: LanguageModel) {
121
- return async Factuality(opts: {
122
- input: string;
123
- output: string;
124
- expected?: string;
125
- }) => {
126
- const { object } = await generateObject({
127
- model,
128
- /**
129
- * Prompt implementation from `autoevals`:
130
- *
131
- * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
132
- */
133
- prompt: `
134
- You are comparing a submitted answer to an expert answer on a given question. Here is the data:
135
- [BEGIN DATA]
136
- ************
137
- [Question]: ${opts.input}
138
- ************
139
- [Expert]: ${opts.expected}
140
- ************
141
- [Submission]: ${opts.output}
142
- ************
143
- [END DATA]
144
-
145
- Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
146
- The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
147
- (A) The submitted answer is a subset of the expert answer and is fully consistent with it.
148
- (B) The submitted answer is a superset of the expert answer and is fully consistent with it.
149
- (C) The submitted answer contains all the same details as the expert answer.
150
- (D) There is a disagreement between the submitted answer and the expert answer.
151
- (E) The answers differ, but these differences don't matter from the perspective of factuality.
152
- `,
153
- schema: z.object({
154
- answer: z.enum(["A", "B", "C", "D", "E"]).describe("Your selection."),
155
- rationale: z
156
- .string()
157
- .describe("Why you chose this answer. Be very detailed."),
158
- }),
159
- });
160
-
161
- const scores = {
162
- A: 0.4,
163
- B: 0.6,
164
- C: 1,
165
- D: 0,
166
- E: 1,
167
- };
168
-
169
- return {
170
- score: scores[object.answer],
171
- metadata: {
172
- rationale: object.rationale,
173
- },
174
- };
175
- };
176
- }
177
- ````
209
+ ### Skip Tests Conditionally
178
210
 
179
- #### Compatibility with `autoevals`
211
+ ```javascript
212
+ describeEval("gpt-4 tests", {
213
+ skipIf: () => !process.env.OPENAI_API_KEY,
214
+ // ...
215
+ });
216
+ ```
180
217
 
181
- We maintain compatibility with the [autoevals package](https://github.com/braintrustdata/autoevals) from Braintrust. To use it you'll typically need to use te `partial` helper provided on the scorers. For example, with the `ClosedQA` scorer:
218
+ ### Existing Test Suites
182
219
 
183
220
  ```javascript
184
- import { describeEval } from "vitest-evals";
185
- import { ClosedQA } from "autoevals";
186
-
187
- describeEval("my evals", {
188
- data: async () => {
189
- // The scenarios you wish to evaluate
190
- return [
191
- {
192
- input: "What is the capital of France?",
193
- expected: "Paris",
194
- }
195
- ];
196
- },
197
- task: answerQuestion,
198
- scorers: [ClosedQA.partial({
199
- criteria: "Does the submission indicate that the question is out of scope?",
200
- })],
201
- threshold: 0.6,
202
- })
221
+ import "vitest-evals";
222
+
223
+ test("capital check", () => {
224
+ const simpleFactuality = async ({ output, expected }) => ({
225
+ score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0
226
+ });
227
+
228
+ expect("What is the capital of France?").toEval(
229
+ "Paris",
230
+ answerQuestion,
231
+ simpleFactuality,
232
+ 0.8
233
+ );
234
+ });
203
235
  ```
204
236
 
205
- ### Separating Evals
237
+ ## Configuration
206
238
 
207
- An alternative to `skipIf` for controlling if evals run is creating an separate `vitest` configuration for them. This gives a lot of advantages, particularly allowing you to maintain two completely separate test suites. A good pattern you can enable with this is a filename-based-test selector:
239
+ ### Separate Eval Configuration
240
+
241
+ Create `vitest.evals.config.ts`:
208
242
 
209
243
  ```javascript
210
- // vitest.evals.config.ts
211
- /// <reference types="vitest" />
212
244
  import { defineConfig } from "vitest/config";
213
245
  import defaultConfig from "./vitest.config";
214
246
 
@@ -216,41 +248,20 @@ export default defineConfig({
216
248
  ...defaultConfig,
217
249
  test: {
218
250
  ...defaultConfig.test,
219
- // run `eval` files rather than typical `test` files
220
- include: ["src/**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}"],
251
+ include: ["src/**/*.eval.{js,ts}"],
221
252
  },
222
253
  });
223
254
  ```
224
255
 
225
- In the above, we're telling it to only match only `*.eval.*` files (vs the typical `*.test.*` or `*.spec.*`). We're also inheriting from our default `vitest.config.ts`. This gives us a clean way to run only tests, or run only evals:
256
+ Run evals separately:
226
257
 
227
258
  ```shell
228
259
  vitest --config=vitest.evals.config.ts
229
260
  ```
230
261
 
231
- Its recommended to add this to your `package.json`, such as under an `eval` helper:
232
-
233
- ```javascript
234
- // package.json
235
- {
236
- // ...
237
- "scripts": {
238
- // ...
239
- "eval": "vitest --config=vitest.evals.config.ts",
240
- }
241
- }
242
- ```
243
-
244
- You can then run your evals using `npm run eval`.
245
-
246
262
  ## Development
247
263
 
248
- Nothing fancy here.
249
-
250
- ```javascript
251
- pnpm install
252
- ```
253
-
254
- ```javascript
255
- pnpm test
256
- ```
264
+ ```shell
265
+ npm install
266
+ npm test
267
+ ```
package/dist/index.d.mts CHANGED
@@ -1,97 +1,2 @@
1
- import * as vitest from 'vitest';
2
-
3
- type TaskFn = (input: string) => Promise<string>;
4
- type Score = {
5
- score: number | null;
6
- metadata?: {
7
- rationale?: string;
8
- output?: string;
9
- };
10
- };
11
- type ScoreFn = (opts: {
12
- input: string;
13
- output: string;
14
- } & Record<string, unknown>) => Promise<Score> | Score;
15
- type ToEval<R = unknown> = (expected: string, taskFn: TaskFn, scoreFn: ScoreFn, threshold?: number) => Promise<R>;
16
- interface EvalMatchers<R = unknown> {
17
- toEval: ToEval<R>;
18
- }
19
- declare module "vitest" {
20
- interface Assertion<T = any> extends EvalMatchers<T> {
21
- }
22
- interface AsymmetricMatchersContaining extends EvalMatchers {
23
- }
24
- interface TaskMeta {
25
- eval?: {
26
- scores: (Score & {
27
- name: string;
28
- })[];
29
- avgScore: number;
30
- };
31
- }
32
- }
33
- /**
34
- * Creates a test suite for evaluating language model outputs.
35
- *
36
- * @param name - The name of the test suite
37
- * @param options - Configuration options
38
- * @param options.data - Async function that returns an array of test cases with input and expected values
39
- * @param options.task - Function that processes the input and returns the model output
40
- * @param options.skipIf - Optional function that determines if tests should be skipped
41
- * @param options.scorers - Array of scoring functions that evaluate model outputs
42
- * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0
43
- * @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)
44
- *
45
- * @example
46
- * ```javascript
47
- * describeEval("capital cities test", {
48
- * data: async () => [{
49
- * input: "What is the capital of France?",
50
- * expected: "Paris"
51
- * }],
52
- * task: async (input) => {
53
- * // Query LLM here
54
- * return "Paris";
55
- * },
56
- * scorers: [checkFactuality],
57
- * threshold: 0.8
58
- * });
59
- * ```
60
- */
61
- declare function describeEval(name: string, { data, task, skipIf, scorers, threshold, timeout, }: {
62
- data: () => Promise<{
63
- input: string;
64
- expected: string;
65
- }[]>;
66
- task: TaskFn;
67
- skipIf?: () => boolean;
68
- scorers: ScoreFn[];
69
- threshold?: number | null;
70
- timeout?: number;
71
- }): vitest.SuiteCollector<object>;
72
- declare function formatScores(scores: (Score & {
73
- name: string;
74
- })[]): string;
75
- /**
76
- * Wraps text to fit within a specified width, breaking at word boundaries.
77
- *
78
- * @param text - The text to wrap
79
- * @param width - The maximum width in characters (default: 80)
80
- * @returns The wrapped text with line breaks
81
- *
82
- * @example
83
- * ```javascript
84
- * const wrapped = wrapText("This is a very long text that needs to be wrapped to fit within an 80 character width.", 20);
85
- * console.log(wrapped);
86
- * // Output:
87
- * // This is a very
88
- * // long text that
89
- * // needs to be
90
- * // wrapped to fit
91
- * // within an 80
92
- * // character width.
93
- * ```
94
- */
95
- declare function wrapText(text: string, width?: number): string;
96
-
97
- export { type EvalMatchers, type Score, type ScoreFn, type TaskFn, type ToEval, describeEval, formatScores, wrapText };
1
+ import 'vitest';
2
+ export { B as BaseScorerOptions, E as EvalMatchers, S as Score, c as ScoreFn, b as TaskFn, a as TaskResult, d as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, e as describeEval, f as formatScores, w as wrapText } from './scorers/toolCallScorer.mjs';
package/dist/index.d.ts CHANGED
@@ -1,97 +1,2 @@
1
- import * as vitest from 'vitest';
2
-
3
- type TaskFn = (input: string) => Promise<string>;
4
- type Score = {
5
- score: number | null;
6
- metadata?: {
7
- rationale?: string;
8
- output?: string;
9
- };
10
- };
11
- type ScoreFn = (opts: {
12
- input: string;
13
- output: string;
14
- } & Record<string, unknown>) => Promise<Score> | Score;
15
- type ToEval<R = unknown> = (expected: string, taskFn: TaskFn, scoreFn: ScoreFn, threshold?: number) => Promise<R>;
16
- interface EvalMatchers<R = unknown> {
17
- toEval: ToEval<R>;
18
- }
19
- declare module "vitest" {
20
- interface Assertion<T = any> extends EvalMatchers<T> {
21
- }
22
- interface AsymmetricMatchersContaining extends EvalMatchers {
23
- }
24
- interface TaskMeta {
25
- eval?: {
26
- scores: (Score & {
27
- name: string;
28
- })[];
29
- avgScore: number;
30
- };
31
- }
32
- }
33
- /**
34
- * Creates a test suite for evaluating language model outputs.
35
- *
36
- * @param name - The name of the test suite
37
- * @param options - Configuration options
38
- * @param options.data - Async function that returns an array of test cases with input and expected values
39
- * @param options.task - Function that processes the input and returns the model output
40
- * @param options.skipIf - Optional function that determines if tests should be skipped
41
- * @param options.scorers - Array of scoring functions that evaluate model outputs
42
- * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0
43
- * @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)
44
- *
45
- * @example
46
- * ```javascript
47
- * describeEval("capital cities test", {
48
- * data: async () => [{
49
- * input: "What is the capital of France?",
50
- * expected: "Paris"
51
- * }],
52
- * task: async (input) => {
53
- * // Query LLM here
54
- * return "Paris";
55
- * },
56
- * scorers: [checkFactuality],
57
- * threshold: 0.8
58
- * });
59
- * ```
60
- */
61
- declare function describeEval(name: string, { data, task, skipIf, scorers, threshold, timeout, }: {
62
- data: () => Promise<{
63
- input: string;
64
- expected: string;
65
- }[]>;
66
- task: TaskFn;
67
- skipIf?: () => boolean;
68
- scorers: ScoreFn[];
69
- threshold?: number | null;
70
- timeout?: number;
71
- }): vitest.SuiteCollector<object>;
72
- declare function formatScores(scores: (Score & {
73
- name: string;
74
- })[]): string;
75
- /**
76
- * Wraps text to fit within a specified width, breaking at word boundaries.
77
- *
78
- * @param text - The text to wrap
79
- * @param width - The maximum width in characters (default: 80)
80
- * @returns The wrapped text with line breaks
81
- *
82
- * @example
83
- * ```javascript
84
- * const wrapped = wrapText("This is a very long text that needs to be wrapped to fit within an 80 character width.", 20);
85
- * console.log(wrapped);
86
- * // Output:
87
- * // This is a very
88
- * // long text that
89
- * // needs to be
90
- * // wrapped to fit
91
- * // within an 80
92
- * // character width.
93
- * ```
94
- */
95
- declare function wrapText(text: string, width?: number): string;
96
-
97
- export { type EvalMatchers, type Score, type ScoreFn, type TaskFn, type ToEval, describeEval, formatScores, wrapText };
1
+ import 'vitest';
2
+ export { B as BaseScorerOptions, E as EvalMatchers, S as Score, c as ScoreFn, b as TaskFn, a as TaskResult, d as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, e as describeEval, f as formatScores, w as wrapText } from './scorers/toolCallScorer.js';