@ai-sdk-tool/eval 1.0.0 → 1.1.0
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- package/README.md +9 -3
- package/data/BFCL_v4_multi_turn_base.jsonl +200 -0
- package/data/BFCL_v4_multi_turn_long_context.jsonl +200 -0
- package/data/BFCL_v4_multi_turn_miss_func.jsonl +200 -0
- package/data/BFCL_v4_multi_turn_miss_param.jsonl +200 -0
- package/data/multi_turn_func_doc/gorilla_file_system.jsonl +18 -0
- package/data/multi_turn_func_doc/math_api.jsonl +17 -0
- package/data/multi_turn_func_doc/memory_kv.jsonl +15 -0
- package/data/multi_turn_func_doc/memory_rec_sum.jsonl +5 -0
- package/data/multi_turn_func_doc/memory_vector.jsonl +12 -0
- package/data/multi_turn_func_doc/message_api.jsonl +10 -0
- package/data/multi_turn_func_doc/posting_api.jsonl +14 -0
- package/data/multi_turn_func_doc/ticket_api.jsonl +9 -0
- package/data/multi_turn_func_doc/trading_bot.jsonl +20 -0
- package/data/multi_turn_func_doc/travel_booking.jsonl +18 -0
- package/data/multi_turn_func_doc/vehicle_control.jsonl +22 -0
- package/data/multi_turn_func_doc/web_search.jsonl +2 -0
- package/data/possible_answer/BFCL_v4_multi_turn_base.jsonl +200 -0
- package/data/possible_answer/BFCL_v4_multi_turn_long_context.jsonl +200 -0
- package/data/possible_answer/BFCL_v4_multi_turn_miss_func.jsonl +200 -0
- package/data/possible_answer/BFCL_v4_multi_turn_miss_param.jsonl +200 -0
- package/dist/index.cjs +4526 -62
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +17 -1
- package/dist/index.d.ts +17 -1
- package/dist/index.js +4525 -62
- package/dist/index.js.map +1 -1
- package/package.json +5 -4
package/README.md
CHANGED
|
@@ -33,15 +33,13 @@ import { evaluate, bfclSimpleBenchmark } from "@ai-sdk-tool/eval";
|
|
|
33
33
|
import { openrouter } from "ai/providers/openrouter";
|
|
34
34
|
|
|
35
35
|
// 1. Define the models you want to evaluate
|
|
36
|
-
const gemma9b = openrouter("google/gemma-3-9b-it");
|
|
37
|
-
const gemma27b = openrouter("google/gemma-3-27b-it");
|
|
38
36
|
|
|
39
37
|
// 2. Run the evaluation
|
|
40
38
|
async function runMyEvaluation() {
|
|
41
39
|
console.log("Starting model evaluation...");
|
|
42
40
|
|
|
43
41
|
const results = await evaluate({
|
|
44
|
-
models: [
|
|
42
|
+
models: [/* your models here */],
|
|
45
43
|
benchmarks: [bfclSimpleBenchmark], // Use a built-in benchmark
|
|
46
44
|
reporter: "console", // 'console' or 'json'
|
|
47
45
|
});
|
|
@@ -67,8 +65,16 @@ This package includes several pre-built benchmarks.
|
|
|
67
65
|
- `bfclParallelBenchmark`: Evaluates parallel (multi-tool) function calls.
|
|
68
66
|
- `bfclMultipleBenchmark`: Evaluates multiple calls to the same function.
|
|
69
67
|
- `bfclParallelMultipleBenchmark`: A combination of parallel and multiple function calls.
|
|
68
|
+
- `bfclMultiTurnBaseBenchmark`: Evaluates BFCL v4 multi-turn base cases.
|
|
69
|
+
- `bfclMultiTurnLongContextBenchmark`: Evaluates BFCL v4 multi-turn long-context cases.
|
|
70
|
+
- `bfclMultiTurnMissFuncBenchmark`: Evaluates BFCL v4 multi-turn missing-function cases.
|
|
71
|
+
- `bfclMultiTurnMissParamBenchmark`: Evaluates BFCL v4 multi-turn missing-parameter cases.
|
|
70
72
|
- `jsonGenerationBenchmark`: Evaluates the model's ability to generate schema-compliant JSON.
|
|
71
73
|
|
|
74
|
+
Note: Multi-turn benchmarks now execute tool calls with a native TypeScript implementation and do not require Python at runtime.
|
|
75
|
+
|
|
76
|
+
BFCL evaluation data will be downloaded automatically on first run. For manual download, visit the [BFCL repository](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard).
|
|
77
|
+
|
|
72
78
|
To try a JSON generation run locally:
|
|
73
79
|
|
|
74
80
|
```bash
|