@ai-sdk-tool/eval 1.0.0-canary.0 → 1.0.0-canary.1

package/dist/index.d.cts CHANGED
@@ -3,7 +3,7 @@ import { LanguageModel } from 'ai';
 /**
  * The result of a single benchmark run.
  */
-type BenchmarkResult = {
+interface BenchmarkResult {
    /**
     * A numeric score for the benchmark.
     * The interpretation of this score is up to the benchmark author.
@@ -27,11 +27,11 @@ type BenchmarkResult = {
     * An error object if the benchmark failed unexpectedly.
     */
    error?: Error;
-};
+}
 /**
  * The interface for defining a language model benchmark.
  */
-type LanguageModelV3Benchmark = {
+interface LanguageModelV3Benchmark {
    /**
     * A unique name for the benchmark.
     */
@@ -51,7 +51,7 @@ type LanguageModelV3Benchmark = {
     * @returns A promise that resolves to a BenchmarkResult.
     */
    run(model: LanguageModel, config?: Record<string, unknown>): Promise<BenchmarkResult>;
-};
+}
 /**
  * The supported reporter types.
  */
@@ -60,17 +60,17 @@ type ReporterType = "console" | "json" | "console.debug";
  * The full result object for an evaluation run,
  * containing results for all model-benchmark combinations.
  */
-type EvaluationResult = {
+interface EvaluationResult {
    model: string;
    /** Optional user-provided key when models are passed as a keyed map */
    modelKey?: string;
    benchmark: string;
    result: BenchmarkResult;
-};
+}
 /**
  * Options for the `evaluate` function.
  */
-type EvaluateOptions = {
+interface EvaluateOptions {
    /**
     * The language model or models to evaluate.
     */
@@ -92,16 +92,36 @@ type EvaluateOptions = {
     * Optional maximum number of tokens to generate during evaluation.
     */
    maxTokens?: number;
-};
+}

 declare const bfclSimpleBenchmark: LanguageModelV3Benchmark;
 declare const bfclParallelBenchmark: LanguageModelV3Benchmark;
 declare const bfclMultipleBenchmark: LanguageModelV3Benchmark;
 declare const bfclParallelMultipleBenchmark: LanguageModelV3Benchmark;

+/**
+ * ComplexFuncBench - Complex Function Calling Benchmark
+ *
+ * This benchmark evaluates models on complex function calling scenarios including:
+ * - Multi-step function calls in a single turn
+ * - Function calling with constraints
+ * - Parameter value reasoning from implicit information
+ * - Long parameter values (500+ tokens)
+ * - Parallel function calls
+ *
+ * Dataset: https://huggingface.co/datasets/THUDM/ComplexFuncBench
+ * Paper: https://arxiv.org/abs/2501.10132
+ */
+
+/**
+ * ComplexFuncBench benchmark - tests complex function calling scenarios
+ * including multi-step calls, constraints, parameter reasoning, and long parameters.
+ */
+declare const complexFuncBenchBenchmark: LanguageModelV3Benchmark;
+
 declare const jsonGenerationBenchmark: LanguageModelV3Benchmark;
 declare const jsonGenerationSchemaOnlyBenchmark: LanguageModelV3Benchmark;

 declare function evaluate(options: EvaluateOptions): Promise<EvaluationResult[]>;

-export { type BenchmarkResult, type EvaluateOptions, type LanguageModelV3Benchmark, type ReporterType, bfclMultipleBenchmark, bfclParallelBenchmark, bfclParallelMultipleBenchmark, bfclSimpleBenchmark, evaluate, jsonGenerationBenchmark, jsonGenerationSchemaOnlyBenchmark };
+export { type BenchmarkResult, type EvaluateOptions, type LanguageModelV3Benchmark, type ReporterType, bfclMultipleBenchmark, bfclParallelBenchmark, bfclParallelMultipleBenchmark, bfclSimpleBenchmark, complexFuncBenchBenchmark, evaluate, jsonGenerationBenchmark, jsonGenerationSchemaOnlyBenchmark };
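A practical effect of the type → interface conversions above is that consumer code can annotate its own benchmark objects against LanguageModelV3Benchmark (and, unlike with type aliases, augment these shapes via declaration merging). Below is a minimal, hypothetical sketch of a custom benchmark. It uses only the members visible in the hunks above (name, run, and the optional error field of BenchmarkResult); the score field name is inferred from the "A numeric score for the benchmark" doc comment, and the interfaces may declare additional required members outside the hunks shown, which would also need to be filled in.

import { type BenchmarkResult, type LanguageModelV3Benchmark } from '@ai-sdk-tool/eval';
import { generateText, type LanguageModel } from 'ai';

// Hypothetical custom benchmark: asks for a fixed reply and scores the match.
// `score` is assumed to be BenchmarkResult's numeric field; any members of
// LanguageModelV3Benchmark not visible in the diff hunks are not filled in.
const pingBenchmark: LanguageModelV3Benchmark = {
  name: 'ping-smoke-test',
  async run(model: LanguageModel, _config?: Record<string, unknown>): Promise<BenchmarkResult> {
    try {
      const { text } = await generateText({ model, prompt: 'Reply with exactly: pong' });
      return { score: /\bpong\b/i.test(text) ? 1 : 0 } as BenchmarkResult;
    } catch (error) {
      // Unexpected failures surface through the optional `error` field.
      return { score: 0, error: error as Error } as BenchmarkResult;
    }
  },
};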
package/dist/index.d.ts CHANGED
The changes to this file are identical to the package/dist/index.d.cts diff above: the same four type → interface conversions (BenchmarkResult, LanguageModelV3Benchmark, EvaluationResult, EvaluateOptions) and the same addition and re-export of complexFuncBenchBenchmark.
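With complexFuncBenchBenchmark now exported, a run over the new benchmark could look like the sketch below. The models and benchmarks field names of EvaluateOptions are assumptions (only their doc comments and maxTokens are visible in the hunks above), as is the keyed-map form suggested by the modelKey doc comment; @ai-sdk/openai stands in for any AI SDK provider.

import { bfclSimpleBenchmark, complexFuncBenchBenchmark, evaluate } from '@ai-sdk-tool/eval';
import { openai } from '@ai-sdk/openai';

// Field names `models` and `benchmarks` are assumed; the hunks above show
// only the "language model or models to evaluate" doc comment and `maxTokens`.
const results = await evaluate({
  models: { 'gpt-4o': openai('gpt-4o') }, // keyed map should populate `modelKey`
  benchmarks: [bfclSimpleBenchmark, complexFuncBenchBenchmark],
  maxTokens: 4096,
});

for (const { model, modelKey, benchmark, result } of results) {
  // `score` is assumed to be BenchmarkResult's numeric field.
  console.log(modelKey ?? model, benchmark, result.score, result.error?.message);
}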