@ai-sdk-tool/eval 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +111 -0
- package/data/BFCL_v3_multiple.json +200 -0
- package/data/BFCL_v3_multiple_possible_answer.json +200 -0
- package/data/BFCL_v3_parallel.json +200 -0
- package/data/BFCL_v3_parallel_multiple.json +200 -0
- package/data/BFCL_v3_parallel_multiple_possible_answer.json +200 -0
- package/data/BFCL_v3_parallel_possible_answer.json +200 -0
- package/data/BFCL_v3_simple.json +400 -0
- package/data/BFCL_v3_simple_possible_answer.json +400 -0
- package/data/json_generation_expected.jsonl +6 -0
- package/data/json_generation_tests.jsonl +6 -0
- package/dist/index.cjs +877 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +97 -0
- package/dist/index.d.ts +97 -0
- package/dist/index.js +834 -0
- package/dist/index.js.map +1 -0
- package/package.json +41 -0
package/README.md
ADDED
@@ -0,0 +1,111 @@
# Vercel AI SDK - Evaluation Tool (`@ai-sdk-tool/eval`)

This package provides a standardized, extensible, and reproducible way to benchmark and evaluate the performance of Language Models (`LanguageModel` instances) within the Vercel AI SDK ecosystem.

It allows developers to:

- Compare different models (e.g., Gemma, Llama, GPT) under the same conditions.
- Quantify the impact of model updates or configuration changes.
- Create custom benchmarks tailored to specific use cases (e.g., 'Korean proficiency', 'code generation').
- Automate the evaluation process across a matrix of models and configurations.

## Core Concepts

- **Benchmark (`LanguageModelV2Benchmark`)**: A standardized interface for creating an evaluation task. It has a `run` method that takes a `LanguageModel` and returns a `BenchmarkResult`.
- **`evaluate` function**: The core function that runs a set of benchmarks against one or more models and provides a report on the results.
- **Reporter**: Formats the evaluation results into different outputs, such as a human-readable console report or a machine-readable JSON object.
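
In code, these concepts map roughly to the shapes below. This is an illustrative sketch inferred from the examples later in this README, not the package's actual declarations; the authoritative definitions are the types exported from `@ai-sdk-tool/eval`.

```typescript
import { LanguageModel } from "ai";

// Illustrative sketch only -- consult the exported types for the real definitions.
interface BenchmarkResult {
  score: number; // normalized score for the benchmark run
  success: boolean; // whether the run is considered a pass
  metrics?: Record<string, number>; // optional auxiliary measurements
  logs?: string[]; // free-form log lines surfaced in the report
}

interface LanguageModelV2Benchmark {
  name: string;
  version: string;
  description: string;
  run(model: LanguageModel): Promise<BenchmarkResult>;
}

// evaluate(options) runs every benchmark against every model and reports the results.
interface EvaluateOptions {
  models: LanguageModel | LanguageModel[];
  benchmarks: LanguageModelV2Benchmark[];
  reporter?: "console" | "json";
}
```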
## Installation

```bash
# This package is currently local within the monorepo.
# In the future, it will be published to npm:
# npm install @ai-sdk-tool/eval
```

## Quick Start

Here's how to evaluate two different models against the built-in Berkeley Function-Calling Leaderboard (BFCL) benchmark for simple function calls.

```typescript
import { evaluate, bfclSimpleBenchmark } from "@ai-sdk-tool/eval";
// NOTE: OpenRouter models are not part of the core `ai` package; this assumes
// the community OpenRouter provider package.
import { openrouter } from "@openrouter/ai-sdk-provider";

// 1. Define the models you want to evaluate
const gemma9b = openrouter("google/gemma-3-9b-it");
const gemma27b = openrouter("google/gemma-3-27b-it");

// 2. Run the evaluation
async function runMyEvaluation() {
  console.log("Starting model evaluation...");

  const results = await evaluate({
    models: [gemma9b, gemma27b],
    benchmarks: [bfclSimpleBenchmark], // Use a built-in benchmark
    reporter: "console", // 'console' or 'json'
  });

  console.log("Evaluation complete!");
  // The console reporter will have already printed a detailed report.
}

runMyEvaluation();
```
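
The example above uses the console reporter. For machine-readable output (for example, to archive results in CI), the `reporter` option also accepts `"json"`. A minimal sketch, assuming `evaluate` resolves to a serializable results object (the exact shape is defined by the package):

```typescript
import { evaluate, bfclSimpleBenchmark } from "@ai-sdk-tool/eval";
// Assumes the community OpenRouter provider, as in the Quick Start above.
import { openrouter } from "@openrouter/ai-sdk-provider";

const results = await evaluate({
  models: [openrouter("google/gemma-3-27b-it")],
  benchmarks: [bfclSimpleBenchmark],
  reporter: "json", // machine-readable output instead of the console report
});

// Serialize whatever the JSON reporter returns; consult the package types for the shape.
console.log(JSON.stringify(results, null, 2));
```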

## Built-in Benchmarks

This package includes several pre-built benchmarks:

- `bfclSimpleBenchmark`: Evaluates simple, single function calls.
- `bfclParallelBenchmark`: Evaluates parallel function calls (several tool calls produced for a single query).
- `bfclMultipleBenchmark`: Evaluates selecting the correct function when multiple functions are available.
- `bfclParallelMultipleBenchmark`: A combination of the parallel and multiple scenarios.
- `jsonGenerationBenchmark`: Evaluates the model's ability to generate schema-compliant JSON. _(Note: this benchmark is temporarily disabled due to a TypeScript compilation issue.)_
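
The built-in benchmarks can be combined in a single `evaluate` call. A minimal sketch, reusing the OpenRouter provider from the Quick Start (any `LanguageModel` instance works):

```typescript
import {
  evaluate,
  bfclSimpleBenchmark,
  bfclParallelBenchmark,
  bfclMultipleBenchmark,
} from "@ai-sdk-tool/eval";
// Assumes the community OpenRouter provider; substitute any provider you use.
import { openrouter } from "@openrouter/ai-sdk-provider";

const model = openrouter("google/gemma-3-27b-it");

// Run several function-calling benchmarks against one model in a single pass.
await evaluate({
  models: [model],
  benchmarks: [bfclSimpleBenchmark, bfclParallelBenchmark, bfclMultipleBenchmark],
  reporter: "console",
});
```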
## Creating a Custom Benchmark

You can easily create your own benchmark by implementing the `LanguageModelV2Benchmark` interface. This is useful for testing model performance on tasks specific to your application.

**Example: A custom benchmark to test politeness.**

```typescript
import {
  LanguageModelV2Benchmark,
  BenchmarkResult,
  EvaluateOptions,
} from "@ai-sdk-tool/eval";
import { LanguageModel, generateText } from "ai";

// Define the benchmark object
export const politenessBenchmark: LanguageModelV2Benchmark = {
  name: "politeness-check",
  version: "1.0.0",
  description: "Checks if the model's response is polite.",

  async run(model: LanguageModel): Promise<BenchmarkResult> {
    const { text } = await generateText({
      model,
      prompt:
        "A customer is angry because their order is late. Write a response.",
    });

    // A very rough heuristic: treat a dismissive "sorry, but ..." as impolite.
    const isPolite = !text.toLowerCase().includes("sorry, but");
    const score = isPolite ? 1 : 0;

    return {
      score,
      success: isPolite,
      metrics: {
        length: text.length,
      },
      logs: [`Response: "${text}"`],
    };
  },
};

// You can then use it in the evaluate function:
// await evaluate({
//   models: myModel,
//   benchmarks: [politenessBenchmark],
// });
```