vitest-evals 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -65
- package/dist/index.d.mts +2 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +307 -76
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +306 -76
- package/dist/index.mjs.map +1 -1
- package/dist/scorers/index.d.mts +2 -1
- package/dist/scorers/index.d.ts +2 -1
- package/dist/scorers/index.js +322 -73
- package/dist/scorers/index.js.map +1 -1
- package/dist/scorers/index.mjs +316 -72
- package/dist/scorers/index.mjs.map +1 -1
- package/dist/scorers/structuredOutputScorer.d.mts +3 -0
- package/dist/scorers/structuredOutputScorer.d.ts +3 -0
- package/dist/scorers/structuredOutputScorer.js +299 -0
- package/dist/scorers/structuredOutputScorer.js.map +1 -0
- package/dist/scorers/structuredOutputScorer.mjs +273 -0
- package/dist/scorers/structuredOutputScorer.mjs.map +1 -0
- package/dist/scorers/toolCallScorer.d.mts +123 -31
- package/dist/scorers/toolCallScorer.d.ts +123 -31
- package/dist/scorers/toolCallScorer.js +161 -70
- package/dist/scorers/toolCallScorer.js.map +1 -1
- package/dist/scorers/toolCallScorer.mjs +162 -71
- package/dist/scorers/toolCallScorer.mjs.map +1 -1
- package/dist/scorers/utils.d.mts +103 -0
- package/dist/scorers/utils.d.ts +103 -0
- package/dist/scorers/utils.js +176 -0
- package/dist/scorers/utils.js.map +1 -0
- package/dist/scorers/utils.mjs +146 -0
- package/dist/scorers/utils.mjs.map +1 -0
- package/package.json +13 -10
package/README.md
CHANGED
|
@@ -16,16 +16,18 @@ import { describeEval } from "vitest-evals";
|
|
|
16
16
|
describeEval("capital cities", {
|
|
17
17
|
data: async () => [
|
|
18
18
|
{ input: "What is the capital of France?", expected: "Paris" },
|
|
19
|
-
{ input: "What is the capital of Japan?", expected: "Tokyo" }
|
|
19
|
+
{ input: "What is the capital of Japan?", expected: "Tokyo" },
|
|
20
20
|
],
|
|
21
21
|
task: async (input) => {
|
|
22
22
|
const response = await queryLLM(input);
|
|
23
23
|
return response; // Simple string return
|
|
24
24
|
},
|
|
25
|
-
scorers: [
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
25
|
+
scorers: [
|
|
26
|
+
async ({ output, expected }) => ({
|
|
27
|
+
score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
|
|
28
|
+
}),
|
|
29
|
+
],
|
|
30
|
+
threshold: 0.8,
|
|
29
31
|
});
|
|
30
32
|
```
|
|
31
33
|
|
|
@@ -58,15 +60,15 @@ import { ToolCallScorer } from "vitest-evals/scorers/toolCallScorer";
|
|
|
58
60
|
|
|
59
61
|
describeEval("tool usage", {
|
|
60
62
|
data: async () => [
|
|
61
|
-
{ input: "Search weather", expectedTools: [{ name: "weather_api" }] }
|
|
63
|
+
{ input: "Search weather", expectedTools: [{ name: "weather_api" }] },
|
|
62
64
|
],
|
|
63
65
|
task: weatherTask,
|
|
64
|
-
scorers: [ToolCallScorer()]
|
|
66
|
+
scorers: [ToolCallScorer()],
|
|
65
67
|
});
|
|
66
68
|
|
|
67
69
|
// Custom scorer
|
|
68
70
|
const LengthScorer = async ({ output }) => ({
|
|
69
|
-
score: output.length > 50 ? 1.0 : 0.0
|
|
71
|
+
score: output.length > 50 ? 1.0 : 0.0,
|
|
70
72
|
});
|
|
71
73
|
|
|
72
74
|
// TypeScript scorer with custom options
|
|
@@ -77,46 +79,54 @@ interface CustomOptions extends BaseScorerOptions {
|
|
|
77
79
|
}
|
|
78
80
|
|
|
79
81
|
const TypedScorer: ScoreFn<CustomOptions> = async (opts) => ({
|
|
80
|
-
score: opts.output.length >= opts.minLength ? 1.0 : 0.0
|
|
82
|
+
score: opts.output.length >= opts.minLength ? 1.0 : 0.0,
|
|
81
83
|
});
|
|
82
84
|
```
|
|
83
85
|
|
|
84
86
|
### Built-in Scorers
|
|
85
87
|
|
|
86
88
|
#### ToolCallScorer
|
|
89
|
+
|
|
87
90
|
Evaluates if the expected tools were called with correct arguments.
|
|
88
91
|
|
|
89
92
|
```javascript
|
|
90
93
|
// Basic usage - strict matching, any order
|
|
91
94
|
describeEval("search test", {
|
|
92
|
-
data: async () => [
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
95
|
+
data: async () => [
|
|
96
|
+
{
|
|
97
|
+
input: "Find Italian restaurants",
|
|
98
|
+
expectedTools: [
|
|
99
|
+
{ name: "search", arguments: { type: "restaurant" } },
|
|
100
|
+
{ name: "filter", arguments: { cuisine: "italian" } },
|
|
101
|
+
],
|
|
102
|
+
},
|
|
103
|
+
],
|
|
99
104
|
task: myTask,
|
|
100
|
-
scorers: [ToolCallScorer()]
|
|
105
|
+
scorers: [ToolCallScorer()],
|
|
101
106
|
});
|
|
102
107
|
|
|
103
108
|
// Strict evaluation - exact order and parameters
|
|
104
|
-
scorers: [
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
109
|
+
scorers: [
|
|
110
|
+
ToolCallScorer({
|
|
111
|
+
ordered: true, // Tools must be in exact order
|
|
112
|
+
params: "strict", // Parameters must match exactly
|
|
113
|
+
}),
|
|
114
|
+
];
|
|
108
115
|
|
|
109
116
|
// Flexible evaluation
|
|
110
|
-
scorers: [
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
117
|
+
scorers: [
|
|
118
|
+
ToolCallScorer({
|
|
119
|
+
requireAll: false, // Partial matches give partial credit
|
|
120
|
+
allowExtras: false, // No additional tools allowed
|
|
121
|
+
}),
|
|
122
|
+
];
|
|
114
123
|
```
|
|
115
124
|
|
|
116
125
|
**Default behavior:**
|
|
126
|
+
|
|
117
127
|
- Strict parameter matching (exact equality required)
|
|
118
128
|
- Any order allowed
|
|
119
|
-
- Extra tools allowed
|
|
129
|
+
- Extra tools allowed
|
|
120
130
|
- All expected tools required
|
|
121
131
|
|
|
122
132
|
## AI SDK Integration
|
|
@@ -126,17 +136,20 @@ See [`src/ai-sdk-integration.test.ts`](src/ai-sdk-integration.test.ts) for a com
|
|
|
126
136
|
Transform provider responses to our format:
|
|
127
137
|
|
|
128
138
|
```javascript
|
|
129
|
-
|
|
130
|
-
|
|
139
|
+
const { text, steps } = await generateText({
|
|
140
|
+
model: openai("gpt-4o"),
|
|
141
|
+
prompt: input,
|
|
142
|
+
tools: { myTool: myToolDefinition },
|
|
143
|
+
});
|
|
144
|
+
|
|
131
145
|
return {
|
|
132
146
|
result: text,
|
|
133
|
-
toolCalls:
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
}))
|
|
147
|
+
toolCalls: steps
|
|
148
|
+
.flatMap((step) => step.toolCalls)
|
|
149
|
+
.map((call) => ({
|
|
150
|
+
name: call.toolName,
|
|
151
|
+
arguments: call.args,
|
|
152
|
+
})),
|
|
140
153
|
};
|
|
141
154
|
```
|
|
142
155
|
|
|
@@ -154,9 +167,9 @@ import { Factuality, ClosedQA } from "autoevals";
|
|
|
154
167
|
scorers: [
|
|
155
168
|
Factuality, // LLM-based factuality checking
|
|
156
169
|
ClosedQA.partial({
|
|
157
|
-
criteria: "Does the answer mention Paris?"
|
|
158
|
-
})
|
|
159
|
-
]
|
|
170
|
+
criteria: "Does the answer mention Paris?",
|
|
171
|
+
}),
|
|
172
|
+
];
|
|
160
173
|
```
|
|
161
174
|
|
|
162
175
|
#### Custom LLM-based Factuality Scorer
|
|
@@ -164,18 +177,18 @@ scorers: [
|
|
|
164
177
|
Here's an example of implementing your own LLM-based factuality scorer using the Vercel AI SDK:
|
|
165
178
|
|
|
166
179
|
```javascript
|
|
167
|
-
import { generateObject } from
|
|
168
|
-
import { openai } from
|
|
169
|
-
import { z } from
|
|
170
|
-
|
|
171
|
-
const Factuality = (model = openai(
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
180
|
+
import { generateObject } from "ai";
|
|
181
|
+
import { openai } from "@ai-sdk/openai";
|
|
182
|
+
import { z } from "zod";
|
|
183
|
+
|
|
184
|
+
const Factuality = (model = openai("gpt-4o")) => async ({ input, output, expected }) => {
|
|
185
|
+
if (!expected) {
|
|
186
|
+
return { score: 1.0, metadata: { rationale: "No expected answer" } };
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
const { object } = await generateObject({
|
|
190
|
+
model,
|
|
191
|
+
prompt: `
|
|
179
192
|
Compare the factual content of the submitted answer with the expert answer.
|
|
180
193
|
|
|
181
194
|
Question: ${input}
|
|
@@ -189,21 +202,21 @@ const Factuality = (model = openai('gpt-4o')) => async ({ input, output, expecte
|
|
|
189
202
|
(D) Contradicts expert answer
|
|
190
203
|
(E) Different but factually equivalent
|
|
191
204
|
`,
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
205
|
+
schema: z.object({
|
|
206
|
+
answer: z.enum(["A", "B", "C", "D", "E"]),
|
|
207
|
+
rationale: z.string(),
|
|
208
|
+
}),
|
|
209
|
+
});
|
|
210
|
+
|
|
211
|
+
const scores = { A: 0.4, B: 0.6, C: 1, D: 0, E: 1 };
|
|
212
|
+
return {
|
|
213
|
+
score: scores[object.answer],
|
|
214
|
+
metadata: { rationale: object.rationale, answer: object.answer },
|
|
215
|
+
};
|
|
202
216
|
};
|
|
203
|
-
};
|
|
204
217
|
|
|
205
218
|
// Usage
|
|
206
|
-
scorers: [Factuality()]
|
|
219
|
+
scorers: [Factuality()];
|
|
207
220
|
```
|
|
208
221
|
|
|
209
222
|
### Skip Tests Conditionally
|
|
@@ -217,14 +230,18 @@ describeEval("gpt-4 tests", {
|
|
|
217
230
|
|
|
218
231
|
### Existing Test Suites
|
|
219
232
|
|
|
233
|
+
For integration with existing Vitest test suites, you can use the `.toEval()` matcher:
|
|
234
|
+
|
|
235
|
+
> **⚠️ Deprecated**: The `.toEval()` helper is deprecated. Use `describeEval()` instead for better test organization and multiple scorers support. We may consider bringing back a similar check, but it's currently too limited for many scorer implementations.
|
|
236
|
+
|
|
220
237
|
```javascript
|
|
221
238
|
import "vitest-evals";
|
|
222
239
|
|
|
223
240
|
test("capital check", () => {
|
|
224
241
|
const simpleFactuality = async ({ output, expected }) => ({
|
|
225
|
-
score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0
|
|
242
|
+
score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
|
|
226
243
|
});
|
|
227
|
-
|
|
244
|
+
|
|
228
245
|
expect("What is the capital of France?").toEval(
|
|
229
246
|
"Paris",
|
|
230
247
|
answerQuestion,
|
|
@@ -234,6 +251,25 @@ test("capital check", () => {
|
|
|
234
251
|
});
|
|
235
252
|
```
|
|
236
253
|
|
|
254
|
+
**Recommended migration** to `describeEval()`:
|
|
255
|
+
|
|
256
|
+
```javascript
|
|
257
|
+
import { describeEval } from "vitest-evals";
|
|
258
|
+
|
|
259
|
+
describeEval("capital check", {
|
|
260
|
+
data: async () => [
|
|
261
|
+
{ input: "What is the capital of France?", expected: "Paris" },
|
|
262
|
+
],
|
|
263
|
+
task: answerQuestion,
|
|
264
|
+
scorers: [
|
|
265
|
+
async ({ output, expected }) => ({
|
|
266
|
+
score: output.toLowerCase().includes(expected.toLowerCase()) ? 1.0 : 0.0,
|
|
267
|
+
}),
|
|
268
|
+
],
|
|
269
|
+
threshold: 0.8,
|
|
270
|
+
});
|
|
271
|
+
```
|
|
272
|
+
|
|
237
273
|
## Configuration
|
|
238
274
|
|
|
239
275
|
### Separate Eval Configuration
|
|
@@ -264,4 +300,4 @@ vitest --config=vitest.evals.config.ts
|
|
|
264
300
|
```shell
|
|
265
301
|
npm install
|
|
266
302
|
npm test
|
|
267
|
-
```
|
|
303
|
+
```
|
package/dist/index.d.mts
CHANGED
|
@@ -1,2 +1,3 @@
|
|
|
1
1
|
import 'vitest';
|
|
2
|
-
export { B as BaseScorerOptions, E as EvalMatchers,
|
|
2
|
+
export { B as BaseScorerOptions, E as EvalMatchers, e as Score, f as ScoreFn, b as StructuredOutputScorer, S as StructuredOutputScorerOptions, d as TaskFn, c as TaskResult, g as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, h as describeEval, i as formatScores, w as wrapText } from './scorers/toolCallScorer.mjs';
|
|
3
|
+
import './scorers/utils.mjs';
|
package/dist/index.d.ts
CHANGED
|
@@ -1,2 +1,3 @@
|
|
|
1
1
|
import 'vitest';
|
|
2
|
-
export { B as BaseScorerOptions, E as EvalMatchers,
|
|
2
|
+
export { B as BaseScorerOptions, E as EvalMatchers, e as Score, f as ScoreFn, b as StructuredOutputScorer, S as StructuredOutputScorerOptions, d as TaskFn, c as TaskResult, g as ToEval, T as ToolCall, ToolCallScorer, ToolCallScorerOptions, h as describeEval, i as formatScores, w as wrapText } from './scorers/toolCallScorer.js';
|
|
3
|
+
import './scorers/utils.js';
|