vitest-evals 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/README.md +206 -195
  2. package/dist/index.d.mts +2 -97
  3. package/dist/index.d.ts +2 -97
  4. package/dist/index.js +253 -8
  5. package/dist/index.js.map +1 -1
  6. package/dist/index.mjs +252 -8
  7. package/dist/index.mjs.map +1 -1
  8. package/dist/scorers/index.d.mts +2 -0
  9. package/dist/scorers/index.d.ts +2 -0
  10. package/dist/scorers/index.js +282 -0
  11. package/dist/scorers/index.js.map +1 -0
  12. package/dist/scorers/index.mjs +256 -0
  13. package/dist/scorers/index.mjs.map +1 -0
  14. package/dist/scorers/toolCallScorer.d.mts +240 -0
  15. package/dist/scorers/toolCallScorer.d.ts +240 -0
  16. package/dist/scorers/toolCallScorer.js +280 -0
  17. package/dist/scorers/toolCallScorer.js.map +1 -0
  18. package/dist/scorers/toolCallScorer.mjs +256 -0
  19. package/dist/scorers/toolCallScorer.mjs.map +1 -0
  20. package/package.json +11 -1
  21. package/dist/autoevals-compatibility.test.d.mts +0 -2
  22. package/dist/autoevals-compatibility.test.d.ts +0 -2
  23. package/dist/autoevals-compatibility.test.js +0 -45122
  24. package/dist/autoevals-compatibility.test.js.map +0 -1
  25. package/dist/autoevals-compatibility.test.mjs +0 -45977
  26. package/dist/autoevals-compatibility.test.mjs.map +0 -1
  27. package/dist/formatScores.test.d.mts +0 -2
  28. package/dist/formatScores.test.d.ts +0 -2
  29. package/dist/formatScores.test.js +0 -196
  30. package/dist/formatScores.test.js.map +0 -1
  31. package/dist/formatScores.test.mjs +0 -195
  32. package/dist/formatScores.test.mjs.map +0 -1
  33. package/dist/wrapText.test.d.mts +0 -2
  34. package/dist/wrapText.test.d.ts +0 -2
  35. package/dist/wrapText.test.js +0 -163
  36. package/dist/wrapText.test.js.map +0 -1
  37. package/dist/wrapText.test.mjs +0 -162
  38. package/dist/wrapText.test.mjs.map +0 -1
package/README.md CHANGED
@@ -1,214 +1,246 @@
1
1
  # vitest-evals
2
2
 
3
- This project is a prototype of extending vitest to support basic _Evals_ functionality. Evals are a type of testing that is most commonly deployed to _evaluate_ the results of calls to language models. This allows you to utilize them with a pattern of testing you're familiar with, working well with your existing continuous integration toolchain.
3
+ Evaluate LLM outputs using the familiar Vitest testing framework.
4
4
 
5
- This is heavily inspired by [Evalite](https://www.evalite.dev/), but opts for a vitest-native approach to maximize the compatibility of the existing ecosystem. This means you can use it with your existing toolchain, including reporting such as code coverage and xunit.
6
-
7
- ## Use
5
+ ## Installation
8
6
 
9
7
  ```shell
10
8
  npm install -D vitest-evals
11
9
  ```
12
10
 
13
- You've likely already got a mechanism for passing the user input into your model, for example:
11
+ ## Quick Start
14
12
 
15
13
  ```javascript
16
- async function answerQuestion(prompt: string) {
17
- const model = openai("gpt-4o");
18
- const { text } = await generateText({
19
- model,
20
- prompt,
21
- });
22
- return text;
23
- }
14
+ import { describeEval } from "vitest-evals";
15
+
16
+ describeEval("capital cities", {
17
+ data: async () => [
18
+ { input: "What is the capital of France?", expected: "Paris" },
19
+ { input: "What is the capital of Japan?", expected: "Tokyo" }
20
+ ],
21
+ task: async (input) => {
22
+ const response = await queryLLM(input);
23
+ return response; // Simple string return
24
+ },
25
+ scorers: [async ({ output, expected }) => ({
26
+ score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0
27
+ })],
28
+ threshold: 0.8
29
+ });
24
30
  ```
25
31
 
26
- You'll use this as the `task` within your evals, and then you simply need to define a set of scenarios
27
- and a way to validate if the LLM is responding as you desire:
32
+ ## Tasks
33
+
34
+ Tasks process inputs and return outputs. Two formats are supported:
28
35
 
29
36
  ```javascript
30
- import { describeEval } from "vitest-evals";
31
- import { Factuality } from "autoevals";
32
-
33
- describeEval("my evals", {
34
- data: async () => {
35
- // The scenarios you wish to evaluate
36
- return [
37
- {
38
- input: "What is the capital of France?",
39
- expected: "Paris",
40
- }
41
- ];
42
- },
37
+ // Simple: just return a string
38
+ const task = async (input) => "response";
39
+
40
+ // With tool tracking: return a TaskResult
41
+ const task = async (input) => ({
42
+ result: "response",
43
+ toolCalls: [
44
+ { name: "search", arguments: { query: "..." }, result: {...} }
45
+ ]
46
+ });
47
+ ```
48
+
49
+ ## Scorers
43
50
 
44
- task: answerQuestion,
51
+ Scorers evaluate outputs and return a score (0-1). Use built-in scorers or create your own:
45
52
 
46
- // Scorers determine if the response was acceptable - in this case we're using
47
- // a secondary LLM prompt to judge the response of the first.
48
- scorers: [Factuality],
53
+ ```javascript
54
+ // Built-in scorer
55
+ import { ToolCallScorer } from "vitest-evals";
56
+ // Or import individually
57
+ import { ToolCallScorer } from "vitest-evals/scorers/toolCallScorer";
58
+
59
+ describeEval("tool usage", {
60
+ data: async () => [
61
+ { input: "Search weather", expectedTools: [{ name: "weather_api" }] }
62
+ ],
63
+ task: weatherTask,
64
+ scorers: [ToolCallScorer()]
65
+ });
49
66
 
50
- // The threshold required for the average score for this eval to pass. This will be
51
- // based on the scorers you've provided, and in the case of Factuality, we might be
52
- // ok with a 60% score (see the implementation for why).
53
- threshold: 0.6,
67
+ // Custom scorer
68
+ const LengthScorer = async ({ output }) => ({
69
+ score: output.length > 50 ? 1.0 : 0.0
70
+ });
54
71
 
55
- // The timeout for each test. Defaults to 10s. You may need to increase this if your model
56
- // provider has high latency or you're using a large number of scorers.
57
- // timeout: 60000,
72
+ // TypeScript scorer with custom options
73
+ import { type ScoreFn, type BaseScorerOptions } from "vitest-evals";
74
+
75
+ interface CustomOptions extends BaseScorerOptions {
76
+ minLength: number;
77
+ }
58
78
 
59
- // A check to determine if these tests should run. This is helpful to control tests so they only
60
- // in certain situations, for example if a model providers API key is defined.
61
- // skipIf: () => !process.env.OPENAI_API_KEY
62
- })
79
+ const TypedScorer: ScoreFn<CustomOptions> = async (opts) => ({
80
+ score: opts.output.length >= opts.minLength ? 1.0 : 0.0
81
+ });
63
82
  ```
64
83
 
65
- ### Existing Test Suites
84
+ ### Built-in Scorers
85
+
86
+ #### ToolCallScorer
87
+ Evaluates if the expected tools were called with correct arguments.
66
88
 
67
89
  ```javascript
68
- // import `vitest-evals` to expose `expect().toEval()`
69
- // This can also be done via `setupFiles` pattern in `vitest`.
70
- import "vitest-evals";
71
- import { Factuality } from "autoevals";
72
-
73
- describe("my test suite", () => {
74
- it("kind of works", () => {
75
- expect("What is the capital of France?").toEval(
76
- "Paris",
77
- answerQuestion,
78
- Factuality,
79
- 0.8
80
- );
81
- });
90
+ // Basic usage - strict matching, any order
91
+ describeEval("search test", {
92
+ data: async () => [{
93
+ input: "Find Italian restaurants",
94
+ expectedTools: [
95
+ { name: "search", arguments: { type: "restaurant" } },
96
+ { name: "filter", arguments: { cuisine: "italian" } }
97
+ ]
98
+ }],
99
+ task: myTask,
100
+ scorers: [ToolCallScorer()]
82
101
  });
102
+
103
+ // Strict evaluation - exact order and parameters
104
+ scorers: [ToolCallScorer({
105
+ ordered: true, // Tools must be in exact order
106
+ params: "strict" // Parameters must match exactly
107
+ })]
108
+
109
+ // Flexible evaluation
110
+ scorers: [ToolCallScorer({
111
+ requireAll: false, // Partial matches give partial credit
112
+ allowExtras: false // No additional tools allowed
113
+ })]
114
+ ```
115
+
116
+ **Default behavior:**
117
+ - Strict parameter matching (exact equality required)
118
+ - Any order allowed
119
+ - Extra tools allowed
120
+ - All expected tools required
121
+
122
+ ## AI SDK Integration
123
+
124
+ See [`src/ai-sdk-integration.test.ts`](src/ai-sdk-integration.test.ts) for a complete example with the Vercel AI SDK.
125
+
126
+ Transform provider responses to our format:
127
+
128
+ ```javascript
129
+ // Vercel AI SDK
130
+ const { text, toolCalls, toolResults } = await generateText(...);
131
+ return {
132
+ result: text,
133
+ toolCalls: toolCalls?.map((call, i) => ({
134
+ id: call.toolCallId,
135
+ name: call.toolName,
136
+ arguments: call.args,
137
+ result: toolResults?.[i]?.result,
138
+ status: toolResults?.[i]?.error ? 'failed' : 'completed'
139
+ }))
140
+ };
141
+ ```
142
+
143
+ ## Advanced Usage
144
+
145
+ ### Advanced Scorers
146
+
147
+ #### Using autoevals
148
+
149
+ For sophisticated evaluation, use autoevals scorers:
150
+
151
+ ```javascript
152
+ import { Factuality, ClosedQA } from "autoevals";
153
+
154
+ scorers: [
155
+ Factuality, // LLM-based factuality checking
156
+ ClosedQA.partial({
157
+ criteria: "Does the answer mention Paris?"
158
+ })
159
+ ]
83
160
  ```
84
161
 
85
- ### Scoring
162
+ #### Custom LLM-based Factuality Scorer
86
163
 
87
- Scorers are compatible with the `autoevals` interface, but are also simple to implement on your own:
164
+ Here's an example of implementing your own LLM-based factuality scorer using the Vercel AI SDK:
88
165
 
89
166
  ```javascript
90
- export const Contains = async (opts: {
91
- input: string,
92
- expected: string,
93
- output: string,
94
- }) => {
167
+ import { generateObject } from 'ai';
168
+ import { openai } from '@ai-sdk/openai';
169
+ import { z } from 'zod';
170
+
171
+ const Factuality = (model = openai('gpt-4o')) => async ({ input, output, expected }) => {
172
+ if (!expected) {
173
+ return { score: 1.0, metadata: { rationale: "No expected answer" } };
174
+ }
175
+
176
+ const { object } = await generateObject({
177
+ model,
178
+ prompt: `
179
+ Compare the factual content of the submitted answer with the expert answer.
180
+
181
+ Question: ${input}
182
+ Expert: ${expected}
183
+ Submission: ${output}
184
+
185
+ Options:
186
+ (A) Subset of expert answer
187
+ (B) Superset of expert answer
188
+ (C) Same content as expert
189
+ (D) Contradicts expert answer
190
+ (E) Different but factually equivalent
191
+ `,
192
+ schema: z.object({
193
+ answer: z.enum(['A', 'B', 'C', 'D', 'E']),
194
+ rationale: z.string()
195
+ })
196
+ });
197
+
198
+ const scores = { A: 0.4, B: 0.6, C: 1, D: 0, E: 1 };
95
199
  return {
96
- score: output.indexOf(expected) !== -1 ? 1.0 : 0.0,
200
+ score: scores[object.answer],
201
+ metadata: { rationale: object.rationale, answer: object.answer }
97
202
  };
98
203
  };
204
+
205
+ // Usage
206
+ scorers: [Factuality()]
99
207
  ```
100
208
 
101
- For something more realistic, here's a reimplementation of the Factuality scorer from `autoevals`, with some flexibility
102
- on the model, enabling you to evaluate against multiple models:
103
-
104
- ````javascript
105
- import { generateObject, type LanguageModel } from "ai";
106
- import { z } from "zod";
107
-
108
- /**
109
- * A Factuality checker utilizing the `ai` SDK based on the implementation in `autoevals`.
110
- *
111
- * @param model - The language model to utilize (via `ai`).
112
- *
113
- * @example
114
- * ```javascript
115
- * import { openai } from "@ai-sdk/openai";
116
- *
117
- * scorers: [Factuality(openai("gpt-4o"))]
118
- * ```
119
- */
120
- export function Factuality(model: LanguageModel) {
121
- return async Factuality(opts: {
122
- input: string;
123
- output: string;
124
- expected?: string;
125
- }) => {
126
- const { object } = await generateObject({
127
- model,
128
- /**
129
- * Prompt implementation from `autoevals`:
130
- *
131
- * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
132
- */
133
- prompt: `
134
- You are comparing a submitted answer to an expert answer on a given question. Here is the data:
135
- [BEGIN DATA]
136
- ************
137
- [Question]: ${opts.input}
138
- ************
139
- [Expert]: ${opts.expected}
140
- ************
141
- [Submission]: ${opts.output}
142
- ************
143
- [END DATA]
144
-
145
- Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.
146
- The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
147
- (A) The submitted answer is a subset of the expert answer and is fully consistent with it.
148
- (B) The submitted answer is a superset of the expert answer and is fully consistent with it.
149
- (C) The submitted answer contains all the same details as the expert answer.
150
- (D) There is a disagreement between the submitted answer and the expert answer.
151
- (E) The answers differ, but these differences don't matter from the perspective of factuality.
152
- `,
153
- schema: z.object({
154
- answer: z.enum(["A", "B", "C", "D", "E"]).describe("Your selection."),
155
- rationale: z
156
- .string()
157
- .describe("Why you chose this answer. Be very detailed."),
158
- }),
159
- });
160
-
161
- const scores = {
162
- A: 0.4,
163
- B: 0.6,
164
- C: 1,
165
- D: 0,
166
- E: 1,
167
- };
168
-
169
- return {
170
- score: scores[object.answer],
171
- metadata: {
172
- rationale: object.rationale,
173
- },
174
- };
175
- };
176
- }
177
- ````
209
+ ### Skip Tests Conditionally
178
210
 
179
- #### Compatibility with `autoevals`
211
+ ```javascript
212
+ describeEval("gpt-4 tests", {
213
+ skipIf: () => !process.env.OPENAI_API_KEY,
214
+ // ...
215
+ });
216
+ ```
180
217
 
181
- We maintain compatibility with the [autoevals package](https://github.com/braintrustdata/autoevals) from Braintrust. To use it you'll typically need to use te `partial` helper provided on the scorers. For example, with the `ClosedQA` scorer:
218
+ ### Existing Test Suites
182
219
 
183
220
  ```javascript
184
- import { describeEval } from "vitest-evals";
185
- import { ClosedQA } from "autoevals";
186
-
187
- describeEval("my evals", {
188
- data: async () => {
189
- // The scenarios you wish to evaluate
190
- return [
191
- {
192
- input: "What is the capital of France?",
193
- expected: "Paris",
194
- }
195
- ];
196
- },
197
- task: answerQuestion,
198
- scorers: [ClosedQA.partial({
199
- criteria: "Does the submission indicate that the question is out of scope?",
200
- })],
201
- threshold: 0.6,
202
- })
221
+ import "vitest-evals";
222
+
223
+ test("capital check", () => {
224
+ const simpleFactuality = async ({ output, expected }) => ({
225
+ score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0
226
+ });
227
+
228
+ expect("What is the capital of France?").toEval(
229
+ "Paris",
230
+ answerQuestion,
231
+ simpleFactuality,
232
+ 0.8
233
+ );
234
+ });
203
235
  ```
204
236
 
205
- ### Separating Evals
237
+ ## Configuration
206
238
 
207
- An alternative to `skipIf` for controlling if evals run is creating an separate `vitest` configuration for them. This gives a lot of advantages, particularly allowing you to maintain two completely separate test suites. A good pattern you can enable with this is a filename-based-test selector:
239
+ ### Separate Eval Configuration
240
+
241
+ Create `vitest.evals.config.ts`:
208
242
 
209
243
  ```javascript
210
- // vitest.evals.config.ts
211
- /// <reference types="vitest" />
212
244
  import { defineConfig } from "vitest/config";
213
245
  import defaultConfig from "./vitest.config";
214
246
 
@@ -216,41 +248,20 @@ export default defineConfig({
216
248
  ...defaultConfig,
217
249
  test: {
218
250
  ...defaultConfig.test,
219
- // run `eval` files rather than typical `test` files
220
- include: ["src/**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}"],
251
+ include: ["src/**/*.eval.{js,ts}"],
221
252
  },
222
253
  });
223
254
  ```
224
255
 
225
- In the above, we're telling it to only match only `*.eval.*` files (vs the typical `*.test.*` or `*.spec.*`). We're also inheriting from our default `vitest.config.ts`. This gives us a clean way to run only tests, or run only evals:
256
+ Run evals separately:
226
257
 
227
258
  ```shell
228
259
  vitest --config=vitest.evals.config.ts
229
260
  ```
230
261
 
231
- Its recommended to add this to your `package.json`, such as under an `eval` helper:
232
-
233
- ```javascript
234
- // package.json
235
- {
236
- // ...
237
- "scripts": {
238
- // ...
239
- "eval": "vitest --config=vitest.evals.config.ts",
240
- }
241
- }
242
- ```
243
-
244
- You can then run your evals using `npm run eval`.
245
-
246
262
  ## Development
247
263
 
248
- Nothing fancy here.
249
-
250
- ```javascript
251
- pnpm install
252
- ```
253
-
254
- ```javascript
255
- pnpm test
256
- ```
264
+ ```shell
265
+ npm install
266
+ npm test
267
+ ```
package/dist/index.d.mts CHANGED
@@ -1,97 +1,2 @@
1
- import * as vitest from 'vitest';
2
-
3
- type TaskFn = (input: string) => Promise<string>;
4
- type Score = {
5
- score: number | null;
6
- metadata?: {
7
- rationale?: string;
8
- output?: string;
9
- };
10
- };
11
- type ScoreFn = (opts: {
12
- input: string;
13
- output: string;
14
- } & Record<string, unknown>) => Promise<Score> | Score;
15
- type ToEval<R = unknown> = (expected: string, taskFn: TaskFn, scoreFn: ScoreFn, threshold?: number) => Promise<R>;
16
- interface EvalMatchers<R = unknown> {
17
- toEval: ToEval<R>;
18
- }
19
- declare module "vitest" {
20
- interface Assertion<T = any> extends EvalMatchers<T> {
21
- }
22
- interface AsymmetricMatchersContaining extends EvalMatchers {
23
- }
24
- interface TaskMeta {
25
- eval?: {
26
- scores: (Score & {
27
- name: string;
28
- })[];
29
- avgScore: number;
30
- };
31
- }
32
- }
33
- /**
34
- * Creates a test suite for evaluating language model outputs.
35
- *
36
- * @param name - The name of the test suite
37
- * @param options - Configuration options
38
- * @param options.data - Async function that returns an array of test cases with input and expected values
39
- * @param options.task - Function that processes the input and returns the model output
40
- * @param options.skipIf - Optional function that determines if tests should be skipped
41
- * @param options.scorers - Array of scoring functions that evaluate model outputs
42
- * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0
43
- * @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)
44
- *
45
- * @example
46
- * ```javascript
47
- * describeEval("capital cities test", {
48
- * data: async () => [{
49
- * input: "What is the capital of France?",
50
- * expected: "Paris"
51
- * }],
52
- * task: async (input) => {
53
- * // Query LLM here
54
- * return "Paris";
55
- * },
56
- * scorers: [checkFactuality],
57
- * threshold: 0.8
58
- * });
59
- * ```
60
- */
61
- declare function describeEval(name: string, { data, task, skipIf, scorers, threshold, timeout, }: {
62
- data: () => Promise<{
63
- input: string;
64
- expected: string;
65
- }[]>;
66
- task: TaskFn;
67
- skipIf?: () => boolean;
68
- scorers: ScoreFn[];
69
- threshold?: number | null;
70
- timeout?: number;
71
- }): vitest.SuiteCollector<object>;
72
- declare function formatScores(scores: (Score & {
73
- name: string;
74
- })[]): string;
75
- /**
76
- * Wraps text to fit within a specified width, breaking at word boundaries.
77
- *
78
- * @param text - The text to wrap
79
- * @param width - The maximum width in characters (default: 80)
80
- * @returns The wrapped text with line breaks
81
- *
82
- * @example
83
- * ```javascript
84
- * const wrapped = wrapText("This is a very long text that needs to be wrapped to fit within an 80 character width.", 20);
85
- * console.log(wrapped);
86
- * // Output:
87
- * // This is a very
88
- * // long text that
89
- * // needs to be
90
- * // wrapped to fit
91
- * // within an 80
92
- * // character width.
93
- * ```
94
- */
95
- declare function wrapText(text: string, width?: number): string;
96
-
97
- export { type EvalMatchers, type Score, type ScoreFn, type TaskFn, type ToEval, describeEval, formatScores, wrapText };
1
+ import 'vitest';
2
+ export { B as BaseScorerOptions, E as EvalMatchers, S as Score, c as ScoreFn, b as TaskFn, a as TaskResult, d as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, e as describeEval, f as formatScores, w as wrapText } from './scorers/toolCallScorer.mjs';
package/dist/index.d.ts CHANGED
@@ -1,97 +1,2 @@
1
- import * as vitest from 'vitest';
2
-
3
- type TaskFn = (input: string) => Promise<string>;
4
- type Score = {
5
- score: number | null;
6
- metadata?: {
7
- rationale?: string;
8
- output?: string;
9
- };
10
- };
11
- type ScoreFn = (opts: {
12
- input: string;
13
- output: string;
14
- } & Record<string, unknown>) => Promise<Score> | Score;
15
- type ToEval<R = unknown> = (expected: string, taskFn: TaskFn, scoreFn: ScoreFn, threshold?: number) => Promise<R>;
16
- interface EvalMatchers<R = unknown> {
17
- toEval: ToEval<R>;
18
- }
19
- declare module "vitest" {
20
- interface Assertion<T = any> extends EvalMatchers<T> {
21
- }
22
- interface AsymmetricMatchersContaining extends EvalMatchers {
23
- }
24
- interface TaskMeta {
25
- eval?: {
26
- scores: (Score & {
27
- name: string;
28
- })[];
29
- avgScore: number;
30
- };
31
- }
32
- }
33
- /**
34
- * Creates a test suite for evaluating language model outputs.
35
- *
36
- * @param name - The name of the test suite
37
- * @param options - Configuration options
38
- * @param options.data - Async function that returns an array of test cases with input and expected values
39
- * @param options.task - Function that processes the input and returns the model output
40
- * @param options.skipIf - Optional function that determines if tests should be skipped
41
- * @param options.scorers - Array of scoring functions that evaluate model outputs
42
- * @param options.threshold - Minimum acceptable average score (0-1), defaults to 1.0
43
- * @param options.timeout - Test timeout in milliseconds, defaults to 60000 (60s)
44
- *
45
- * @example
46
- * ```javascript
47
- * describeEval("capital cities test", {
48
- * data: async () => [{
49
- * input: "What is the capital of France?",
50
- * expected: "Paris"
51
- * }],
52
- * task: async (input) => {
53
- * // Query LLM here
54
- * return "Paris";
55
- * },
56
- * scorers: [checkFactuality],
57
- * threshold: 0.8
58
- * });
59
- * ```
60
- */
61
- declare function describeEval(name: string, { data, task, skipIf, scorers, threshold, timeout, }: {
62
- data: () => Promise<{
63
- input: string;
64
- expected: string;
65
- }[]>;
66
- task: TaskFn;
67
- skipIf?: () => boolean;
68
- scorers: ScoreFn[];
69
- threshold?: number | null;
70
- timeout?: number;
71
- }): vitest.SuiteCollector<object>;
72
- declare function formatScores(scores: (Score & {
73
- name: string;
74
- })[]): string;
75
- /**
76
- * Wraps text to fit within a specified width, breaking at word boundaries.
77
- *
78
- * @param text - The text to wrap
79
- * @param width - The maximum width in characters (default: 80)
80
- * @returns The wrapped text with line breaks
81
- *
82
- * @example
83
- * ```javascript
84
- * const wrapped = wrapText("This is a very long text that needs to be wrapped to fit within an 80 character width.", 20);
85
- * console.log(wrapped);
86
- * // Output:
87
- * // This is a very
88
- * // long text that
89
- * // needs to be
90
- * // wrapped to fit
91
- * // within an 80
92
- * // character width.
93
- * ```
94
- */
95
- declare function wrapText(text: string, width?: number): string;
96
-
97
- export { type EvalMatchers, type Score, type ScoreFn, type TaskFn, type ToEval, describeEval, formatScores, wrapText };
1
+ import 'vitest';
2
+ export { B as BaseScorerOptions, E as EvalMatchers, S as Score, c as ScoreFn, b as TaskFn, a as TaskResult, d as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, e as describeEval, f as formatScores, w as wrapText } from './scorers/toolCallScorer.js';