@ai-sdk-tool/eval 0.1.8 → 1.0.0-canary.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/data/{BFCL_v3_multiple.json → BFCL_v3_multiple.jsonl} +1 -1
- package/data/{BFCL_v3_multiple_possible_answer.json → BFCL_v3_multiple_possible_answer.jsonl} +1 -1
- package/data/{BFCL_v3_parallel.json → BFCL_v3_parallel.jsonl} +1 -1
- package/data/{BFCL_v3_parallel_multiple.json → BFCL_v3_parallel_multiple.jsonl} +1 -1
- package/data/{BFCL_v3_parallel_multiple_possible_answer.json → BFCL_v3_parallel_multiple_possible_answer.jsonl} +1 -1
- package/data/{BFCL_v3_parallel_possible_answer.json → BFCL_v3_parallel_possible_answer.jsonl} +1 -1
- package/data/{BFCL_v3_simple.json → BFCL_v3_simple.jsonl} +1 -1
- package/data/{BFCL_v3_simple_possible_answer.json → BFCL_v3_simple_possible_answer.jsonl} +1 -1
- package/data/ComplexFuncBench.jsonl +1000 -0
- package/data/ComplexFuncBench_possible_answer.jsonl +1000 -0
- package/dist/index.cjs +2122 -1119
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +30 -10
- package/dist/index.d.ts +30 -10
- package/dist/index.js +2153 -1143
- package/dist/index.js.map +1 -1
- package/package.json +19 -16
package/dist/index.d.cts
CHANGED
|
@@ -31,7 +31,7 @@ interface BenchmarkResult {
|
|
|
31
31
|
/**
|
|
32
32
|
* The interface for defining a language model benchmark.
|
|
33
33
|
*/
|
|
34
|
-
interface
|
|
34
|
+
interface LanguageModelV3Benchmark {
|
|
35
35
|
/**
|
|
36
36
|
* A unique name for the benchmark.
|
|
37
37
|
*/
|
|
@@ -78,7 +78,7 @@ interface EvaluateOptions {
|
|
|
78
78
|
/**
|
|
79
79
|
* An array of benchmarks to run against the models.
|
|
80
80
|
*/
|
|
81
|
-
benchmarks:
|
|
81
|
+
benchmarks: LanguageModelV3Benchmark[];
|
|
82
82
|
/**
|
|
83
83
|
* The reporter to use for displaying results.
|
|
84
84
|
* Defaults to 'console'.
|
|
@@ -94,14 +94,34 @@ interface EvaluateOptions {
|
|
|
94
94
|
maxTokens?: number;
|
|
95
95
|
}
|
|
96
96
|
|
|
97
|
-
declare
|
|
97
|
+
declare const bfclSimpleBenchmark: LanguageModelV3Benchmark;
|
|
98
|
+
declare const bfclParallelBenchmark: LanguageModelV3Benchmark;
|
|
99
|
+
declare const bfclMultipleBenchmark: LanguageModelV3Benchmark;
|
|
100
|
+
declare const bfclParallelMultipleBenchmark: LanguageModelV3Benchmark;
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* ComplexFuncBench - Complex Function Calling Benchmark
|
|
104
|
+
*
|
|
105
|
+
* This benchmark evaluates models on complex function calling scenarios including:
|
|
106
|
+
* - Multi-step function calls in a single turn
|
|
107
|
+
* - Function calling with constraints
|
|
108
|
+
* - Parameter value reasoning from implicit information
|
|
109
|
+
* - Long parameter values (500+ tokens)
|
|
110
|
+
* - Parallel function calls
|
|
111
|
+
*
|
|
112
|
+
* Dataset: https://huggingface.co/datasets/THUDM/ComplexFuncBench
|
|
113
|
+
* Paper: https://arxiv.org/abs/2501.10132
|
|
114
|
+
*/
|
|
98
115
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
116
|
+
/**
|
|
117
|
+
* ComplexFuncBench benchmark - tests complex function calling scenarios
|
|
118
|
+
* including multi-step calls, constraints, parameter reasoning, and long parameters.
|
|
119
|
+
*/
|
|
120
|
+
declare const complexFuncBenchBenchmark: LanguageModelV3Benchmark;
|
|
103
121
|
|
|
104
|
-
declare const jsonGenerationBenchmark:
|
|
105
|
-
declare const jsonGenerationSchemaOnlyBenchmark:
|
|
122
|
+
declare const jsonGenerationBenchmark: LanguageModelV3Benchmark;
|
|
123
|
+
declare const jsonGenerationSchemaOnlyBenchmark: LanguageModelV3Benchmark;
|
|
124
|
+
|
|
125
|
+
declare function evaluate(options: EvaluateOptions): Promise<EvaluationResult[]>;
|
|
106
126
|
|
|
107
|
-
export { type BenchmarkResult, type EvaluateOptions, type
|
|
127
|
+
export { type BenchmarkResult, type EvaluateOptions, type LanguageModelV3Benchmark, type ReporterType, bfclMultipleBenchmark, bfclParallelBenchmark, bfclParallelMultipleBenchmark, bfclSimpleBenchmark, complexFuncBenchBenchmark, evaluate, jsonGenerationBenchmark, jsonGenerationSchemaOnlyBenchmark };
|
package/dist/index.d.ts
CHANGED
|
@@ -31,7 +31,7 @@ interface BenchmarkResult {
|
|
|
31
31
|
/**
|
|
32
32
|
* The interface for defining a language model benchmark.
|
|
33
33
|
*/
|
|
34
|
-
interface
|
|
34
|
+
interface LanguageModelV3Benchmark {
|
|
35
35
|
/**
|
|
36
36
|
* A unique name for the benchmark.
|
|
37
37
|
*/
|
|
@@ -78,7 +78,7 @@ interface EvaluateOptions {
|
|
|
78
78
|
/**
|
|
79
79
|
* An array of benchmarks to run against the models.
|
|
80
80
|
*/
|
|
81
|
-
benchmarks:
|
|
81
|
+
benchmarks: LanguageModelV3Benchmark[];
|
|
82
82
|
/**
|
|
83
83
|
* The reporter to use for displaying results.
|
|
84
84
|
* Defaults to 'console'.
|
|
@@ -94,14 +94,34 @@ interface EvaluateOptions {
|
|
|
94
94
|
maxTokens?: number;
|
|
95
95
|
}
|
|
96
96
|
|
|
97
|
-
declare
|
|
97
|
+
declare const bfclSimpleBenchmark: LanguageModelV3Benchmark;
|
|
98
|
+
declare const bfclParallelBenchmark: LanguageModelV3Benchmark;
|
|
99
|
+
declare const bfclMultipleBenchmark: LanguageModelV3Benchmark;
|
|
100
|
+
declare const bfclParallelMultipleBenchmark: LanguageModelV3Benchmark;
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* ComplexFuncBench - Complex Function Calling Benchmark
|
|
104
|
+
*
|
|
105
|
+
* This benchmark evaluates models on complex function calling scenarios including:
|
|
106
|
+
* - Multi-step function calls in a single turn
|
|
107
|
+
* - Function calling with constraints
|
|
108
|
+
* - Parameter value reasoning from implicit information
|
|
109
|
+
* - Long parameter values (500+ tokens)
|
|
110
|
+
* - Parallel function calls
|
|
111
|
+
*
|
|
112
|
+
* Dataset: https://huggingface.co/datasets/THUDM/ComplexFuncBench
|
|
113
|
+
* Paper: https://arxiv.org/abs/2501.10132
|
|
114
|
+
*/
|
|
98
115
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
116
|
+
/**
|
|
117
|
+
* ComplexFuncBench benchmark - tests complex function calling scenarios
|
|
118
|
+
* including multi-step calls, constraints, parameter reasoning, and long parameters.
|
|
119
|
+
*/
|
|
120
|
+
declare const complexFuncBenchBenchmark: LanguageModelV3Benchmark;
|
|
103
121
|
|
|
104
|
-
declare const jsonGenerationBenchmark:
|
|
105
|
-
declare const jsonGenerationSchemaOnlyBenchmark:
|
|
122
|
+
declare const jsonGenerationBenchmark: LanguageModelV3Benchmark;
|
|
123
|
+
declare const jsonGenerationSchemaOnlyBenchmark: LanguageModelV3Benchmark;
|
|
124
|
+
|
|
125
|
+
declare function evaluate(options: EvaluateOptions): Promise<EvaluationResult[]>;
|
|
106
126
|
|
|
107
|
-
export { type BenchmarkResult, type EvaluateOptions, type
|
|
127
|
+
export { type BenchmarkResult, type EvaluateOptions, type LanguageModelV3Benchmark, type ReporterType, bfclMultipleBenchmark, bfclParallelBenchmark, bfclParallelMultipleBenchmark, bfclSimpleBenchmark, complexFuncBenchBenchmark, evaluate, jsonGenerationBenchmark, jsonGenerationSchemaOnlyBenchmark };
|