vitest-evals 0.2.0 → 0.4.0

This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (38)
  1. package/README.md +211 -172
  2. package/dist/index.d.mts +2 -98
  3. package/dist/index.d.ts +2 -98
  4. package/dist/index.js +270 -11
  5. package/dist/index.js.map +1 -1
  6. package/dist/index.mjs +269 -11
  7. package/dist/index.mjs.map +1 -1
  8. package/dist/scorers/index.d.mts +2 -0
  9. package/dist/scorers/index.d.ts +2 -0
  10. package/dist/scorers/index.js +282 -0
  11. package/dist/scorers/index.js.map +1 -0
  12. package/dist/scorers/index.mjs +256 -0
  13. package/dist/scorers/index.mjs.map +1 -0
  14. package/dist/scorers/toolCallScorer.d.mts +240 -0
  15. package/dist/scorers/toolCallScorer.d.ts +240 -0
  16. package/dist/scorers/toolCallScorer.js +280 -0
  17. package/dist/scorers/toolCallScorer.js.map +1 -0
  18. package/dist/scorers/toolCallScorer.mjs +256 -0
  19. package/dist/scorers/toolCallScorer.mjs.map +1 -0
  20. package/package.json +16 -4
  21. package/dist/compatibility.test.d.mts +0 -2
  22. package/dist/compatibility.test.d.ts +0 -2
  23. package/dist/compatibility.test.js +0 -45009
  24. package/dist/compatibility.test.js.map +0 -1
  25. package/dist/compatibility.test.mjs +0 -45864
  26. package/dist/compatibility.test.mjs.map +0 -1
  27. package/dist/formatScores.test.d.mts +0 -2
  28. package/dist/formatScores.test.d.ts +0 -2
  29. package/dist/formatScores.test.js +0 -195
  30. package/dist/formatScores.test.js.map +0 -1
  31. package/dist/formatScores.test.mjs +0 -194
  32. package/dist/formatScores.test.mjs.map +0 -1
  33. package/dist/wrapText.test.d.mts +0 -2
  34. package/dist/wrapText.test.d.ts +0 -2
  35. package/dist/wrapText.test.js +0 -162
  36. package/dist/wrapText.test.js.map +0 -1
  37. package/dist/wrapText.test.mjs +0 -161
  38. package/dist/wrapText.test.mjs.map +0 -1
package/dist/scorers/toolCallScorer.d.mts
@@ -0,0 +1,240 @@
+ import * as vitest from 'vitest';
+
+ interface ToolCallScorerOptions extends BaseScorerOptions {
+   expectedTools?: Array<{
+     name: string;
+     arguments?: any;
+   }>;
+ }
+ interface ToolCallScorerConfig {
+   /**
+    * Whether tools must be called in the exact order specified
+    * @default false
+    */
+   ordered?: boolean;
+   /**
+    * Whether all expected tools must be called for a passing score
+    * When false: gives partial credit based on tools matched
+    * @default true
+    */
+   requireAll?: boolean;
+   /**
+    * Whether to allow additional tool calls beyond those expected
+    * @default true
+    */
+   allowExtras?: boolean;
+   /**
+    * How to match tool arguments/parameters
+    * - "strict": Exact equality required (default)
+    * - "fuzzy": Case-insensitive, subset matching, numeric tolerance
+    * - Custom function: Your own comparison logic
+    * @default "strict"
+    */
+   params?: "strict" | "fuzzy" | ((expected: any, actual: any) => boolean);
+ }
+ /**
+  * A configurable scorer for evaluating tool usage in LLM responses.
+  *
+  * The test data defines WHAT tools/arguments are expected,
+  * while this scorer defines HOW to evaluate them.
+  *
+  * @param config - Configuration options for the scorer
+  * @param config.ordered - Require exact order of tool calls
+  * @param config.requireAll - Require all expected tools (vs partial credit)
+  * @param config.allowExtras - Allow additional tool calls
+  * @param config.params - How to match parameters: "strict", "fuzzy", or custom function
+  *
+  * @example
+  * // Default: strict params, any order
+  * describeEval("search test", {
+  *   data: async () => [{
+  *     input: "Find restaurants",
+  *     expectedTools: [
+  *       { name: "search", arguments: { type: "restaurant" } },
+  *       { name: "filter" }
+  *     ]
+  *   }],
+  *   task: myTask,
+  *   scorers: [ToolCallScorer()]
+  * });
+  *
+  * @example
+  * // Strict order and parameters
+  * describeEval("payment flow", {
+  *   data: async () => [{
+  *     input: "Process payment",
+  *     expectedTools: [
+  *       { name: "validate", arguments: { amount: 100 } },
+  *       { name: "charge", arguments: { amount: 100, method: "card" } }
+  *     ]
+  *   }],
+  *   task: myTask,
+  *   scorers: [ToolCallScorer({ ordered: true, params: "strict" })]
+  * });
+  */
+ declare function ToolCallScorer(config?: ToolCallScorerConfig): ScoreFn<ToolCallScorerOptions>;
+
+ /**
+  * Represents a tool/function call made during task execution.
+  * Supports various LLM provider formats and use cases.
+  */
+ type ToolCall = {
+   name: string;
+   arguments: Record<string, any>;
+   result?: any;
+   error?: {
+     code?: string;
+     message: string;
+     details?: any;
+   };
+   timestamp?: number;
+   duration_ms?: number;
+   id?: string;
+   parent_id?: string;
+   status?: "pending" | "executing" | "completed" | "failed" | "cancelled";
+   type?: "function" | "retrieval" | "code_interpreter" | "web_search" | string;
+   [key: string]: any;
+ };
+ type TaskResult = {
+   result: string;
+   toolCalls?: ToolCall[];
+ };
+ /**
+  * Task function that processes an input and returns either a string result
+  * or a TaskResult object containing the result and any tool calls made.
+  *
+  * @param input - The input string to process
+  * @returns Promise resolving to either a string or TaskResult object
+  *
+  * @example
+  * // Simple tasks can just return a string
+  * const simpleTask: TaskFn = async (input) => "The answer is 42";
+  *
+  * // Tasks that use tools should return TaskResult
+  * const taskWithTools: TaskFn = async (input) => ({
+  *   result: "The answer is 42",
+  *   toolCalls: [{ name: "calculate", arguments: { expr: "6*7" }, result: 42 }]
+  * });
+  */
+ type TaskFn = (input: string) => Promise<string | TaskResult>;
+ type Score = {
+   score: number | null;
+   metadata?: {
+     rationale?: string;
+     output?: string;
+   };
+ };
+ interface BaseScorerOptions {
+   input: string;
+   output: string;
+   toolCalls?: ToolCall[];
+ }
+ type ScoreFn<TOptions extends BaseScorerOptions = BaseScorerOptions> = (opts: TOptions) => Promise<Score> | Score;
+ type ToEval<R = unknown> = (expected: string, taskFn: TaskFn, scoreFn: ScoreFn<any>, threshold?: number) => Promise<R>;
+ interface EvalMatchers<R = unknown> {
+   toEval: ToEval<R>;
+ }
+ declare module "vitest" {
+   interface Assertion<T = any> extends EvalMatchers<T> {
+   }
+   interface AsymmetricMatchersContaining extends EvalMatchers {
+   }
+   interface TaskMeta {
+     eval?: {
+       scores: (Score & {
+         name: string;
+       })[];
+       avgScore: number;
+       toolCalls?: ToolCall[];
+     };
+   }
+ }
+ /**
+  * Creates a test suite for evaluating language model outputs.
+  *
+  * @param name - The name of the test suite
+  * @param options - Configuration options
+  * @param options.data - Async function that returns an array of test cases with input and any additional fields
+  * @param options.task - Function that processes the input and returns the model output
+  *                       Can return either a string or TaskResult object with result and optional toolCalls
+  * @param options.skipIf - Optional function that determines if tests should be skipped
+  * @param options.scorers - Array of scoring functions that evaluate model outputs
+  * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0
+  * @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)
+  *
+  * @example
+  * ```javascript
+  * // Recommended: TaskResult format with tool tracking
+  * describeEval("capital cities test", {
+  *   data: async () => [{
+  *     input: "What is the capital of France?",
+  *     expected: "Paris"
+  *   }],
+  *   task: async (input) => {
+  *     const response = await queryLLM(input);
+  *     return {
+  *       result: response.text,
+  *       toolCalls: response.toolCalls || []
+  *     };
+  *   },
+  *   scorers: [checkFactuality],
+  *   threshold: 0.8
+  * });
+  *
+  * // Example with tool usage evaluation
+  * describeEval("tool usage test", {
+  *   data: async () => [{
+  *     input: "Search for weather in Seattle",
+  *     expectedTools: [{ name: "weather_api", arguments: { location: "Seattle" } }]
+  *   }],
+  *   task: async (input) => {
+  *     return {
+  *       result: "The weather in Seattle is 65°F",
+  *       toolCalls: [{
+  *         name: "weather_api",
+  *         arguments: { location: "Seattle" },
+  *         result: { temp: 65, condition: "partly cloudy" }
+  *       }]
+  *     };
+  *   },
+  *   scorers: [ToolCallScorer()],
+  *   threshold: 1.0
+  * });
+  * ```
+  */
+ declare function describeEval(name: string, { data, task, skipIf, scorers, threshold, timeout, }: {
+   data: () => Promise<Array<{
+     input: string;
+   } & Record<string, any>>>;
+   task: TaskFn;
+   skipIf?: () => boolean;
+   scorers: ScoreFn<any>[];
+   threshold?: number | null;
+   timeout?: number;
+ }): vitest.SuiteCollector<object>;
+ declare function formatScores(scores: (Score & {
+   name: string;
+ })[]): string;
+ /**
+  * Wraps text to fit within a specified width, breaking at word boundaries.
+  *
+  * @param text - The text to wrap
+  * @param width - The maximum width in characters (default: 80)
+  * @returns The wrapped text with line breaks
+  *
+  * @example
+  * ```javascript
+  * const wrapped = wrapText("This is a very long text that needs to be wrapped to fit within an 80 character width.", 20);
+  * console.log(wrapped);
+  * // Output:
+  * // This is a very
+  * // long text that
+  * // needs to be
+  * // wrapped to fit
+  * // within an 80
+  * // character width.
+  * ```
+  */
+ declare function wrapText(text: string, width?: number): string;
+
+ export { type BaseScorerOptions as B, type EvalMatchers as E, type Score as S, type ToolCall as T, ToolCallScorer, type ToolCallScorerConfig, type ToolCallScorerOptions, type TaskResult as a, type TaskFn as b, type ScoreFn as c, type ToEval as d, describeEval as e, formatScores as f, wrapText as w };
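For orientation, here is a minimal sketch of how the new `ToolCallScorer` could be wired into an eval, based only on the declarations above. It assumes the package's root entry re-exports `describeEval`, `ToolCallScorer`, and `TaskFn` under these names; `weatherTask` is a hypothetical task.

```typescript
// Minimal sketch based on the declarations above; assumes the root entry
// of vitest-evals re-exports these names. The task below is hypothetical.
import { describeEval, ToolCallScorer, type TaskFn } from "vitest-evals";

// Hypothetical task that records the tool calls it made alongside its answer.
const weatherTask: TaskFn = async (input) => ({
  result: "The weather in Seattle is 65°F",
  toolCalls: [{ name: "weather_api", arguments: { location: "Seattle" } }],
});

describeEval("weather tool usage", {
  data: async () => [{
    input: "Search for weather in Seattle",
    // The test data declares WHAT is expected...
    expectedTools: [{ name: "weather_api", arguments: { location: "seattle" } }],
  }],
  task: weatherTask,
  // ...and the scorer config declares HOW to evaluate it: per the docs,
  // "fuzzy" matching is case-insensitive, so "seattle" matches "Seattle".
  scorers: [ToolCallScorer({ params: "fuzzy" })],
  threshold: 1.0,
});
```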
package/dist/scorers/toolCallScorer.d.ts
@@ -0,0 +1,240 @@
(identical to package/dist/scorers/toolCallScorer.d.mts above: the same 240 added lines)
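The `params` option also accepts an arbitrary comparison function of type `(expected: any, actual: any) => boolean`. A sketch under the same re-export assumption; the 5% numeric tolerance below is illustrative, not part of the library.

```typescript
// Sketch of a custom argument matcher; same re-export assumption as above.
import { describeEval, ToolCallScorer, type TaskFn } from "vitest-evals";

// Illustrative matcher: numeric fields may deviate by up to 5%;
// everything else must match exactly.
const approxParams = (expected: any, actual: any): boolean =>
  Object.entries(expected ?? {}).every(([key, want]) => {
    const got = actual?.[key];
    if (typeof want === "number" && typeof got === "number") {
      return Math.abs(got - want) <= Math.abs(want) * 0.05;
    }
    return JSON.stringify(got) === JSON.stringify(want);
  });

// Hypothetical task whose charged amount drifts slightly from the request.
const paymentTask: TaskFn = async () => ({
  result: "Charged $101.50",
  toolCalls: [{ name: "charge", arguments: { amount: 101.5, method: "card" } }],
});

describeEval("payment flow (approximate amounts)", {
  data: async () => [{
    input: "Process payment",
    expectedTools: [{ name: "charge", arguments: { amount: 100, method: "card" } }],
  }],
  task: paymentTask,
  // ordered + custom matcher: 101.5 is within 5% of 100, so this passes;
  // threshold is omitted and defaults to 1.0 per the describeEval docs.
  scorers: [ToolCallScorer({ ordered: true, params: approxParams })],
});
```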