vitest-evals 0.2.0 → 0.4.0

This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (38)
  1. package/README.md +211 -172
  2. package/dist/index.d.mts +2 -98
  3. package/dist/index.d.ts +2 -98
  4. package/dist/index.js +270 -11
  5. package/dist/index.js.map +1 -1
  6. package/dist/index.mjs +269 -11
  7. package/dist/index.mjs.map +1 -1
  8. package/dist/scorers/index.d.mts +2 -0
  9. package/dist/scorers/index.d.ts +2 -0
  10. package/dist/scorers/index.js +282 -0
  11. package/dist/scorers/index.js.map +1 -0
  12. package/dist/scorers/index.mjs +256 -0
  13. package/dist/scorers/index.mjs.map +1 -0
  14. package/dist/scorers/toolCallScorer.d.mts +240 -0
  15. package/dist/scorers/toolCallScorer.d.ts +240 -0
  16. package/dist/scorers/toolCallScorer.js +280 -0
  17. package/dist/scorers/toolCallScorer.js.map +1 -0
  18. package/dist/scorers/toolCallScorer.mjs +256 -0
  19. package/dist/scorers/toolCallScorer.mjs.map +1 -0
  20. package/package.json +16 -4
  21. package/dist/compatibility.test.d.mts +0 -2
  22. package/dist/compatibility.test.d.ts +0 -2
  23. package/dist/compatibility.test.js +0 -45009
  24. package/dist/compatibility.test.js.map +0 -1
  25. package/dist/compatibility.test.mjs +0 -45864
  26. package/dist/compatibility.test.mjs.map +0 -1
  27. package/dist/formatScores.test.d.mts +0 -2
  28. package/dist/formatScores.test.d.ts +0 -2
  29. package/dist/formatScores.test.js +0 -195
  30. package/dist/formatScores.test.js.map +0 -1
  31. package/dist/formatScores.test.mjs +0 -194
  32. package/dist/formatScores.test.mjs.map +0 -1
  33. package/dist/wrapText.test.d.mts +0 -2
  34. package/dist/wrapText.test.d.ts +0 -2
  35. package/dist/wrapText.test.js +0 -162
  36. package/dist/wrapText.test.js.map +0 -1
  37. package/dist/wrapText.test.mjs +0 -161
  38. package/dist/wrapText.test.mjs.map +0 -1
package/dist/scorers/toolCallScorer.d.mts
@@ -0,0 +1,240 @@
+ import * as vitest from 'vitest';
+
+ interface ToolCallScorerOptions extends BaseScorerOptions {
+   expectedTools?: Array<{
+     name: string;
+     arguments?: any;
+   }>;
+ }
+ interface ToolCallScorerConfig {
+   /**
+    * Whether tools must be called in the exact order specified
+    * @default false
+    */
+   ordered?: boolean;
+   /**
+    * Whether all expected tools must be called for a passing score
+    * When false: gives partial credit based on tools matched
+    * @default true
+    */
+   requireAll?: boolean;
+   /**
+    * Whether to allow additional tool calls beyond those expected
+    * @default true
+    */
+   allowExtras?: boolean;
+   /**
+    * How to match tool arguments/parameters
+    * - "strict": Exact equality required (default)
+    * - "fuzzy": Case-insensitive, subset matching, numeric tolerance
+    * - Custom function: Your own comparison logic
+    * @default "strict"
+    */
+   params?: "strict" | "fuzzy" | ((expected: any, actual: any) => boolean);
+ }
+ /**
+  * A configurable scorer for evaluating tool usage in LLM responses.
+  *
+  * The test data defines WHAT tools/arguments are expected,
+  * while this scorer defines HOW to evaluate them.
+  *
+  * @param config - Configuration options for the scorer
+  * @param config.ordered - Require exact order of tool calls
+  * @param config.requireAll - Require all expected tools (vs partial credit)
+  * @param config.allowExtras - Allow additional tool calls
+  * @param config.params - How to match parameters: "strict", "fuzzy", or custom function
+  *
+  * @example
+  * // Default: strict params, any order
+  * describeEval("search test", {
+  *   data: async () => [{
+  *     input: "Find restaurants",
+  *     expectedTools: [
+  *       { name: "search", arguments: { type: "restaurant" } },
+  *       { name: "filter" }
+  *     ]
+  *   }],
+  *   task: myTask,
+  *   scorers: [ToolCallScorer()]
+  * });
+  *
+  * @example
+  * // Strict order and parameters
+  * describeEval("payment flow", {
+  *   data: async () => [{
+  *     input: "Process payment",
+  *     expectedTools: [
+  *       { name: "validate", arguments: { amount: 100 } },
+  *       { name: "charge", arguments: { amount: 100, method: "card" } }
+  *     ]
+  *   }],
+  *   task: myTask,
+  *   scorers: [ToolCallScorer({ ordered: true, params: "strict" })]
+  * });
+  */
+ declare function ToolCallScorer(config?: ToolCallScorerConfig): ScoreFn<ToolCallScorerOptions>;
+
+ /**
+  * Represents a tool/function call made during task execution.
+  * Supports various LLM provider formats and use cases.
+  */
+ type ToolCall = {
+   name: string;
+   arguments: Record<string, any>;
+   result?: any;
+   error?: {
+     code?: string;
+     message: string;
+     details?: any;
+   };
+   timestamp?: number;
+   duration_ms?: number;
+   id?: string;
+   parent_id?: string;
+   status?: "pending" | "executing" | "completed" | "failed" | "cancelled";
+   type?: "function" | "retrieval" | "code_interpreter" | "web_search" | string;
+   [key: string]: any;
+ };
+ type TaskResult = {
+   result: string;
+   toolCalls?: ToolCall[];
+ };
+ /**
+  * Task function that processes an input and returns either a string result
+  * or a TaskResult object containing the result and any tool calls made.
+  *
+  * @param input - The input string to process
+  * @returns Promise resolving to either a string or TaskResult object
+  *
+  * @example
+  * // Simple tasks can just return a string
+  * const simpleTask: TaskFn = async (input) => "The answer is 42";
+  *
+  * // Tasks that use tools should return TaskResult
+  * const taskWithTools: TaskFn = async (input) => ({
+  *   result: "The answer is 42",
+  *   toolCalls: [{ name: "calculate", arguments: { expr: "6*7" }, result: 42 }]
+  * });
+  */
+ type TaskFn = (input: string) => Promise<string | TaskResult>;
+ type Score = {
+   score: number | null;
+   metadata?: {
+     rationale?: string;
+     output?: string;
+   };
+ };
+ interface BaseScorerOptions {
+   input: string;
+   output: string;
+   toolCalls?: ToolCall[];
+ }
+ type ScoreFn<TOptions extends BaseScorerOptions = BaseScorerOptions> = (opts: TOptions) => Promise<Score> | Score;
+ type ToEval<R = unknown> = (expected: string, taskFn: TaskFn, scoreFn: ScoreFn<any>, threshold?: number) => Promise<R>;
+ interface EvalMatchers<R = unknown> {
+   toEval: ToEval<R>;
+ }
+ declare module "vitest" {
+   interface Assertion<T = any> extends EvalMatchers<T> {
+   }
+   interface AsymmetricMatchersContaining extends EvalMatchers {
+   }
+   interface TaskMeta {
+     eval?: {
+       scores: (Score & {
+         name: string;
+       })[];
+       avgScore: number;
+       toolCalls?: ToolCall[];
+     };
+   }
+ }
+ /**
+  * Creates a test suite for evaluating language model outputs.
+  *
+  * @param name - The name of the test suite
+  * @param options - Configuration options
+  * @param options.data - Async function that returns an array of test cases with input and any additional fields
+  * @param options.task - Function that processes the input and returns the model output
+  *                       Can return either a string or TaskResult object with result and optional toolCalls
+  * @param options.skipIf - Optional function that determines if tests should be skipped
+  * @param options.scorers - Array of scoring functions that evaluate model outputs
+  * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0
+  * @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)
+  *
+  * @example
+  * ```javascript
+  * // Recommended: TaskResult format with tool tracking
+  * describeEval("capital cities test", {
+  *   data: async () => [{
+  *     input: "What is the capital of France?",
+  *     expected: "Paris"
+  *   }],
+  *   task: async (input) => {
+  *     const response = await queryLLM(input);
+  *     return {
+  *       result: response.text,
+  *       toolCalls: response.toolCalls || []
+  *     };
+  *   },
+  *   scorers: [checkFactuality],
+  *   threshold: 0.8
+  * });
+  *
+  * // Example with tool usage evaluation
+  * describeEval("tool usage test", {
+  *   data: async () => [{
+  *     input: "Search for weather in Seattle",
+  *     expectedTools: [{ name: "weather_api", arguments: { location: "Seattle" } }]
+  *   }],
+  *   task: async (input) => {
+  *     return {
+  *       result: "The weather in Seattle is 65°F",
+  *       toolCalls: [{
+  *         name: "weather_api",
+  *         arguments: { location: "Seattle" },
+  *         result: { temp: 65, condition: "partly cloudy" }
+  *       }]
+  *     };
+  *   },
+  *   scorers: [ToolCallScorer()],
+  *   threshold: 1.0
+  * });
+  * ```
+  */
+ declare function describeEval(name: string, { data, task, skipIf, scorers, threshold, timeout, }: {
+   data: () => Promise<Array<{
+     input: string;
+   } & Record<string, any>>>;
+   task: TaskFn;
+   skipIf?: () => boolean;
+   scorers: ScoreFn<any>[];
+   threshold?: number | null;
+   timeout?: number;
+ }): vitest.SuiteCollector<object>;
+ declare function formatScores(scores: (Score & {
+   name: string;
+ })[]): string;
+ /**
+  * Wraps text to fit within a specified width, breaking at word boundaries.
+  *
+  * @param text - The text to wrap
+  * @param width - The maximum width in characters (default: 80)
+  * @returns The wrapped text with line breaks
+  *
+  * @example
+  * ```javascript
+  * const wrapped = wrapText("This is a very long text that needs to be wrapped to fit within an 80 character width.", 20);
+  * console.log(wrapped);
+  * // Output:
+  * // This is a very
+  * // long text that
+  * // needs to be
+  * // wrapped to fit
+  * // within an 80
+  * // character width.
+  * ```
+  */
+ declare function wrapText(text: string, width?: number): string;
+
+ export { type BaseScorerOptions as B, type EvalMatchers as E, type Score as S, type ToolCall as T, ToolCallScorer, type ToolCallScorerConfig, type ToolCallScorerOptions, type TaskResult as a, type TaskFn as b, type ScoreFn as c, type ToEval as d, describeEval as e, formatScores as f, wrapText as w };
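For orientation, here is a minimal sketch of how the new `ToolCallScorer` could be wired into an eval, based only on the declarations above. It assumes the package's root entry re-exports `describeEval`, `ToolCallScorer`, and `TaskFn` under these names; `weatherTask` is a hypothetical task.

```typescript
// Minimal sketch based on the declarations above; assumes the root entry
// of vitest-evals re-exports these names. The task below is hypothetical.
import { describeEval, ToolCallScorer, type TaskFn } from "vitest-evals";

// Hypothetical task that records the tool calls it made alongside its answer.
const weatherTask: TaskFn = async (input) => ({
  result: "The weather in Seattle is 65°F",
  toolCalls: [{ name: "weather_api", arguments: { location: "Seattle" } }],
});

describeEval("weather tool usage", {
  data: async () => [{
    input: "Search for weather in Seattle",
    // The test data declares WHAT is expected...
    expectedTools: [{ name: "weather_api", arguments: { location: "seattle" } }],
  }],
  task: weatherTask,
  // ...and the scorer config declares HOW to evaluate it: per the docs,
  // "fuzzy" matching is case-insensitive, so "seattle" matches "Seattle".
  scorers: [ToolCallScorer({ params: "fuzzy" })],
  threshold: 1.0,
});
```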
package/dist/scorers/toolCallScorer.d.ts
@@ -0,0 +1,240 @@
(identical to package/dist/scorers/toolCallScorer.d.mts above: the same 240 added lines)
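The `params` option also accepts an arbitrary comparison function of type `(expected: any, actual: any) => boolean`. A sketch under the same re-export assumption; the 5% numeric tolerance below is illustrative, not part of the library.

```typescript
// Sketch of a custom argument matcher; same re-export assumption as above.
import { describeEval, ToolCallScorer, type TaskFn } from "vitest-evals";

// Illustrative matcher: numeric fields may deviate by up to 5%;
// everything else must match exactly.
const approxParams = (expected: any, actual: any): boolean =>
  Object.entries(expected ?? {}).every(([key, want]) => {
    const got = actual?.[key];
    if (typeof want === "number" && typeof got === "number") {
      return Math.abs(got - want) <= Math.abs(want) * 0.05;
    }
    return JSON.stringify(got) === JSON.stringify(want);
  });

// Hypothetical task whose charged amount drifts slightly from the request.
const paymentTask: TaskFn = async () => ({
  result: "Charged $101.50",
  toolCalls: [{ name: "charge", arguments: { amount: 101.5, method: "card" } }],
});

describeEval("payment flow (approximate amounts)", {
  data: async () => [{
    input: "Process payment",
    expectedTools: [{ name: "charge", arguments: { amount: 100, method: "card" } }],
  }],
  task: paymentTask,
  // ordered + custom matcher: 101.5 is within 5% of 100, so this passes;
  // threshold is omitted and defaults to 1.0 per the describeEval docs.
  scorers: [ToolCallScorer({ ordered: true, params: approxParams })],
});
```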