@ai-sdk-tool/eval 1.0.0-canary.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/data/{BFCL_v3_parallel.jsonl → BFCL_v4_parallel.jsonl} +2 -2
- package/data/{BFCL_v3_parallel_possible_answer.jsonl → BFCL_v4_parallel_possible_answer.jsonl} +2 -2
- package/data/BFCL_v4_simple.jsonl +400 -0
- package/data/BFCL_v4_simple_possible_answer.jsonl +400 -0
- package/data/ComplexFuncBench.jsonl +1000 -0
- package/data/ComplexFuncBench_possible_answer.jsonl +1000 -0
- package/dist/index.cjs +1264 -263
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +77 -11
- package/dist/index.d.ts +77 -11
- package/dist/index.js +1268 -264
- package/dist/index.js.map +1 -1
- package/package.json +18 -11
- package/data/BFCL_v3_simple.jsonl +0 -400
- package/data/BFCL_v3_simple_possible_answer.jsonl +0 -400
- package/data/{BFCL_v3_multiple.jsonl → BFCL_v4_multiple.jsonl} +0 -0
- package/data/{BFCL_v3_multiple_possible_answer.jsonl → BFCL_v4_multiple_possible_answer.jsonl} +0 -0
- package/data/{BFCL_v3_parallel_multiple.jsonl → BFCL_v4_parallel_multiple.jsonl} +0 -0
- package/data/{BFCL_v3_parallel_multiple_possible_answer.jsonl → BFCL_v4_parallel_multiple_possible_answer.jsonl} +0 -0
package/dist/index.d.cts
CHANGED
@@ -1,9 +1,26 @@
+import { LanguageModelV3Middleware } from '@ai-sdk/provider';
 import { LanguageModel } from 'ai';
 
+/**
+ * Model configuration for evaluation.
+ * Allows specifying a base model with optional middleware for proper cache ordering.
+ */
+interface ModelConfig {
+    /**
+     * The base language model (before any middleware is applied).
+     */
+    model: LanguageModel;
+    /**
+     * Optional middleware to apply to the model.
+     * When cache is enabled, the cache middleware will be applied BEFORE this middleware,
+     * ensuring that cache keys are generated from the final transformed params.
+     */
+    middleware?: LanguageModelV3Middleware | LanguageModelV3Middleware[];
+}
 /**
  * The result of a single benchmark run.
  */
-type BenchmarkResult = {
+interface BenchmarkResult {
     /**
      * A numeric score for the benchmark.
      * The interpretation of this score is up to the benchmark author.
@@ -27,11 +44,11 @@ type BenchmarkResult = {
      * An error object if the benchmark failed unexpectedly.
      */
     error?: Error;
-};
+}
 /**
  * The interface for defining a language model benchmark.
  */
-type LanguageModelV3Benchmark = {
+interface LanguageModelV3Benchmark {
     /**
      * A unique name for the benchmark.
     */
@@ -51,30 +68,38 @@ type LanguageModelV3Benchmark = {
      * @returns A promise that resolves to a BenchmarkResult.
      */
     run(model: LanguageModel, config?: Record<string, unknown>): Promise<BenchmarkResult>;
-};
+}
 /**
  * The supported reporter types.
  */
-type ReporterType = "console" | "json" | "console.debug";
+type ReporterType = "console" | "json" | "console.debug" | "console.summary";
 /**
  * The full result object for an evaluation run,
  * containing results for all model-benchmark combinations.
  */
-type EvaluationResult = {
+interface EvaluationResult {
     model: string;
     /** Optional user-provided key when models are passed as a keyed map */
     modelKey?: string;
     benchmark: string;
     result: BenchmarkResult;
-};
+}
 /**
  * Options for the `evaluate` function.
  */
-type EvaluateOptions = {
+interface EvaluateOptions {
     /**
      * The language model or models to evaluate.
+     * Can be:
+     * - A single LanguageModel or ModelConfig
+     * - An array of LanguageModel or ModelConfig
+     * - A keyed record of LanguageModel or ModelConfig
+     *
+     * When using ModelConfig with middleware and cache enabled,
+     * the cache middleware is applied innermost (closest to the model),
+     * ensuring cache keys reflect the final transformed params.
     */
-    models: LanguageModel | LanguageModel[] | Record<string, LanguageModel>;
+    models: LanguageModel | ModelConfig | (LanguageModel | ModelConfig)[] | Record<string, LanguageModel | ModelConfig>;
     /**
      * An array of benchmarks to run against the models.
     */
@@ -92,16 +117,57 @@ type EvaluateOptions = {
      * Optional maximum number of tokens to generate during evaluation.
      */
     maxTokens?: number;
-};
+    /**
+     * Options for disk-based response caching.
+     * When enabled, LLM responses are cached to disk to avoid redundant API calls.
+     */
+    cache?: {
+        /**
+         * Whether to enable disk caching.
+         * @default false
+         */
+        enabled?: boolean;
+        /**
+         * Directory to store cache files.
+         * @default '.ai-cache'
+         */
+        cacheDir?: string;
+        /**
+         * Whether to log cache hits/misses for debugging.
+         * @default false
+         */
+        debug?: boolean;
+    };
+}
 
 declare const bfclSimpleBenchmark: LanguageModelV3Benchmark;
 declare const bfclParallelBenchmark: LanguageModelV3Benchmark;
 declare const bfclMultipleBenchmark: LanguageModelV3Benchmark;
 declare const bfclParallelMultipleBenchmark: LanguageModelV3Benchmark;
 
+/**
+ * ComplexFuncBench - Complex Function Calling Benchmark
+ *
+ * This benchmark evaluates models on complex function calling scenarios including:
+ * - Multi-step function calls in a single turn
+ * - Function calling with constraints
+ * - Parameter value reasoning from implicit information
+ * - Long parameter values (500+ tokens)
+ * - Parallel function calls
+ *
+ * Dataset: https://huggingface.co/datasets/THUDM/ComplexFuncBench
+ * Paper: https://arxiv.org/abs/2501.10132
+ */
+
+/**
+ * ComplexFuncBench benchmark - tests complex function calling scenarios
+ * including multi-step calls, constraints, parameter reasoning, and long parameters.
+ */
+declare const complexFuncBenchBenchmark: LanguageModelV3Benchmark;
+
 declare const jsonGenerationBenchmark: LanguageModelV3Benchmark;
 declare const jsonGenerationSchemaOnlyBenchmark: LanguageModelV3Benchmark;
 
 declare function evaluate(options: EvaluateOptions): Promise<EvaluationResult[]>;
 
-export { type BenchmarkResult, type EvaluateOptions, type LanguageModelV3Benchmark, type ReporterType, bfclMultipleBenchmark, bfclParallelBenchmark, bfclParallelMultipleBenchmark, bfclSimpleBenchmark, evaluate, jsonGenerationBenchmark, jsonGenerationSchemaOnlyBenchmark };
+export { type BenchmarkResult, type EvaluateOptions, type LanguageModelV3Benchmark, type ModelConfig, type ReporterType, bfclMultipleBenchmark, bfclParallelBenchmark, bfclParallelMultipleBenchmark, bfclSimpleBenchmark, complexFuncBenchBenchmark, evaluate, jsonGenerationBenchmark, jsonGenerationSchemaOnlyBenchmark };
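Read together, the new typings describe the evaluate call surface: models now accepts plain LanguageModel values, ModelConfig objects, arrays of either, or a keyed record, and the cache block opts into disk-backed response caching. Below is a minimal usage sketch, not a definitive example from the package: the @ai-sdk/openai provider and model id are illustrative assumptions, and the benchmarks field name is inferred from its doc comment, since the field declaration itself falls outside the shown hunks.

import { evaluate, bfclSimpleBenchmark, complexFuncBenchBenchmark } from '@ai-sdk-tool/eval';
import { openai } from '@ai-sdk/openai'; // assumed provider; any LanguageModel works

const results = await evaluate({
  // Keyed record mixing a plain LanguageModel with a ModelConfig entry.
  models: {
    baseline: openai('gpt-4o-mini'),
    cached: {
      model: openai('gpt-4o-mini'),
      // With cache enabled, the cache middleware is applied innermost
      // (closest to the model), so cache keys see the final transformed params.
      middleware: [],
    },
  },
  benchmarks: [bfclSimpleBenchmark, complexFuncBenchBenchmark], // field name assumed from its doc comment
  maxTokens: 1024,
  cache: {
    enabled: true,         // default: false
    cacheDir: '.ai-cache', // default: '.ai-cache'
    debug: true,           // log cache hits/misses
  },
});

// Each EvaluationResult carries the model id, the optional modelKey
// ('baseline' / 'cached'), the benchmark name, and its BenchmarkResult.
console.log(results);

The new "console.summary" member of ReporterType implies a reporter option on EvaluateOptions, but that field sits in a collapsed hunk, so it is left out of the sketch.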
package/dist/index.d.ts
CHANGED
(The diff body is identical to the package/dist/index.d.cts diff above; only the file extension differs.)
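Because BenchmarkResult and LanguageModelV3Benchmark are exported from both entry points, custom benchmarks can target the same interface the built-ins implement. The sketch below rests on stated assumptions: the name and score member names are inferred from their doc comments (their declarations sit in collapsed hunks, and the interface may declare further members there), while generateText comes from the ai package, not from this one.

import { generateText, type LanguageModel } from 'ai';
import type { BenchmarkResult, LanguageModelV3Benchmark } from '@ai-sdk-tool/eval';

// Hypothetical benchmark: score 1 if the model returns parseable JSON, else 0.
const jsonEchoBenchmark: LanguageModelV3Benchmark = {
  name: 'json-echo', // assumed member, per "A unique name for the benchmark."
  async run(model: LanguageModel, config?: Record<string, unknown>): Promise<BenchmarkResult> {
    try {
      const { text } = await generateText({
        model,
        prompt: 'Return the JSON object {"ok": true} and nothing else.',
      });
      JSON.parse(text);
      return { score: 1 }; // assumed member, per "A numeric score for the benchmark."
    } catch (error) {
      return { score: 0, error: error as Error }; // error?: Error is confirmed by the diff
    }
  },
};

Such a benchmark could then be passed to evaluate alongside the built-ins, e.g. benchmarks: [jsonEchoBenchmark, complexFuncBenchBenchmark].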