@ai-sdk-tool/eval 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +503 -32
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -2
- package/dist/index.d.ts +4 -2
- package/dist/index.js +504 -33
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -5,14 +5,15 @@ var colors = {
|
|
|
5
5
|
red: "\x1B[31m",
|
|
6
6
|
yellow: "\x1B[33m",
|
|
7
7
|
cyan: "\x1B[36m",
|
|
8
|
-
magenta: "\x1B[35m"
|
|
8
|
+
magenta: "\x1B[35m",
|
|
9
|
+
gray: "\x1B[90m"
|
|
9
10
|
};
|
|
10
11
|
function printResult(result) {
|
|
11
|
-
const { model, benchmark, result: benchmarkResult } = result;
|
|
12
|
+
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
12
13
|
const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
|
|
13
14
|
console.log(
|
|
14
15
|
`
|
|
15
|
-
${colors.cyan}[${model}]${colors.reset} - ${colors.magenta}${benchmark}${colors.reset}`
|
|
16
|
+
${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
|
|
16
17
|
);
|
|
17
18
|
console.log(
|
|
18
19
|
` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
|
|
@@ -38,6 +39,186 @@ function consoleReporter(results) {
|
|
|
38
39
|
console.log("\n---------------------------\n");
|
|
39
40
|
}
|
|
40
41
|
|
|
42
|
+
// src/reporters/console.debug.ts
|
|
43
|
+
// ANSI SGR escape sequences used by the debug console reporter, keyed by
// style name. Each value is ESC + "[" + <code> + "m"; `reset` (code 0)
// restores the terminal's default rendering.
// Declared with `var` to keep the original binding's hoisting behaviour.
var colors2 = Object.fromEntries(
  [
    ["reset", 0],
    ["green", 32],
    ["red", 31],
    ["yellow", 33],
    ["cyan", 36],
    ["magenta", 35],
    ["gray", 90],
    ["bold", 1],
    ["underline", 4],
  ].map(([styleName, sgrCode]) => [styleName, `\x1B[${sgrCode}m`]),
);
|
|
54
|
+
/**
 * Wraps a single diff line in ANSI colour codes based on its first character.
 *
 * - lines starting with "+" are rendered green,
 * - lines starting with "-" are rendered red,
 * - lines starting with "@" are rendered bold cyan,
 * - anything else is returned unchanged.
 *
 * @param {string} line - One line of a diff.
 * @returns {string} The line, colourised when it carries a known marker.
 */
function colorizeDiffLine(line) {
  // Checked in order; the first matching marker wins.
  const markerStyles = [
    ["+", colors2.green],
    ["-", colors2.red],
    ["@", `${colors2.cyan}${colors2.bold}`],
  ];
  for (const [marker, style] of markerStyles) {
    if (line.startsWith(marker)) {
      return `${style}${line}${colors2.reset}`;
    }
  }
  return line;
}
|
|
61
|
+
/**
 * Returns the entries of `lines` with duplicates removed, keeping the first
 * occurrence of each entry and preserving the original order.
 *
 * @param {Iterable<string>} lines - Entries to deduplicate.
 * @returns {string[]} A new array containing each distinct entry once.
 */
function uniqueLines(lines) {
  const known = /* @__PURE__ */ new Set();
  const firstOccurrences = [];
  for (const entry of lines) {
    // The set only grows when the entry has not been recorded before.
    const sizeBefore = known.size;
    known.add(entry);
    if (known.size !== sizeBefore) {
      firstOccurrences.push(entry);
    }
  }
  return firstOccurrences;
}
|
|
71
|
+
/**
 * Derives human-readable fix suggestions from a parsed `[DEBUG-FAIL]` payload.
 *
 * The payload's `diff` is a list of marker lines; the ones recognised here are
 * `@@ function name`, `- missing function: X`, `- missing required param: X`,
 * `+ unexpected param: X`, and `@@ param X` followed by
 * `- expected one of: [...]`. When no suggestion can be derived from the diff,
 * a generic hint is chosen from `error_type`.
 *
 * @param {object | null | undefined} parsed - Payload with optional
 *   `error_type`, `expected`, `actual` and `diff` fields.
 * @returns {string[]} Distinct suggestions in the order they were produced;
 *   empty when nothing applies.
 */
function suggestFixFromDiff(parsed) {
  // A Set drops repeated suggestions while keeping first-insertion order.
  const suggestions = /* @__PURE__ */ new Set();
  const { error_type, expected, actual, diff } = parsed ?? {};
  // Only string entries of an array diff can be matched; anything else
  // (a missing diff, a string, stray non-string entries) is ignored rather
  // than allowed to throw.
  const lines = Array.isArray(diff) ? diff.filter((d) => typeof d === "string") : [];
  // Collects the payload of every line that starts with `prefix`.
  const collect = (prefix) =>
    lines.filter((d) => d.startsWith(prefix)).map((d) => d.replace(`${prefix} `, ""));

  if (lines.some((d) => d.includes("function name") || d.includes("missing function:"))) {
    const expectedName = expected?.function;
    const actualName = actual?.function;
    if (expectedName && actualName && expectedName !== actualName) {
      suggestions.add(`Call the function '${expectedName}' instead of '${actualName}'.`);
    }
    if (Array.isArray(expected?.functions)) {
      suggestions.add(`Ensure tool calls include: ${expected.functions.join(", ")}.`);
    }
  }

  const missing = collect("- missing required param:");
  if (missing.length > 0) {
    suggestions.add(`Add required parameter(s): ${missing.join(", ")}.`);
  }

  const extras = collect("+ unexpected param:");
  if (extras.length > 0) {
    suggestions.add(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
  }

  lines.forEach((line, index) => {
    if (!line.startsWith("@@ param ")) return;
    const param = line.replace("@@ param ", "");
    // The allowed values belong to THIS parameter only, so search just the
    // lines between this header and the next "@@" header. Searching the whole
    // diff would hand every parameter the first parameter's allowed values.
    let allowedLine;
    for (let i = index + 1; i < lines.length && !lines[i].startsWith("@@"); i++) {
      if (lines[i].startsWith("- expected one of:")) {
        allowedLine = lines[i];
        break;
      }
    }
    if (allowedLine) {
      const allowed = allowedLine.replace("- expected one of: ", "");
      suggestions.add(`Set '${param}' to one of: ${allowed}.`);
    } else {
      suggestions.add(`Adjust '${param}' to an allowed value.`);
    }
  });

  // Fall back to a generic hint only when the diff yielded nothing specific.
  if (suggestions.size === 0 && typeof error_type === "string") {
    if (error_type.includes("missing_required")) {
      suggestions.add("Add all required parameters defined by the tool schema.");
    } else if (error_type.includes("unexpected_param")) {
      suggestions.add("Remove parameters not present in the tool schema.");
    } else if (error_type.includes("wrong_count")) {
      suggestions.add("Adjust the number of tool calls to match expected count.");
    } else if (error_type.includes("wrong_func_name")) {
      suggestions.add("Use the exact expected function name from the schema.");
    } else if (error_type.includes("value_error")) {
      suggestions.add("Choose a value from the allowed options.");
    }
  }
  return [...suggestions];
}
|
|
133
|
+
/**
 * Prints a verbose evaluation report to the console.
 *
 * For every result it prints the model / benchmark header, the score and the
 * metrics. When the result's logs contain failure lines (`[FAIL]`, `[ERROR]`,
 * `[FATAL]`, `[STACK]`, `[DEBUG-FAIL]`) those are printed with colours, and
 * each `[DEBUG-FAIL]` JSON payload is expanded into its diff (or its
 * expected/actual values) plus suggested fixes. A `[FAIL]` line is skipped
 * when a `[DEBUG-FAIL]` entry with the same id exists, so each failure is
 * shown once. Without failure lines, only `[INFO]` and `[PASS]` logs are shown.
 *
 * @param {Array<{model: string, modelKey?: string, benchmark: string, result: object}>} results
 *   Evaluation results; `result` is read for `success`, `score`, `metrics`
 *   and `logs`.
 * @returns {void}
 */
function consoleDebugReporter(results) {
  const failPrefixes = ["[FAIL]", "[ERROR]", "[FATAL]", "[STACK]", "[DEBUG-FAIL]"];

  // Parses the JSON payload of a `[DEBUG-FAIL] {...}` line. Returns undefined
  // when the payload is not valid JSON or is not an object, so callers can
  // fall back to printing the raw line.
  const parseDebugFail = (line) => {
    try {
      const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
      return typeof parsed === "object" && parsed !== null ? parsed : undefined;
    } catch {
      return undefined;
    }
  };

  // JSON.stringify returns undefined for undefined input; String() keeps the
  // following split from throwing in that case.
  const formatJson = (value) => String(JSON.stringify(value, null, 2)).split("\n").join("\n ");

  // Prints one expanded `[DEBUG-FAIL]` entry.
  const printDebugFail = (parsed) => {
    const { id, expected, actual, message, diff } = parsed;
    console.log(
      ` ${colors2.underline}${id}${colors2.reset} ${message ? `- ${message}` : ""}`
    );
    // Coerce entries to strings so a malformed diff cannot break rendering.
    const diffLines = Array.isArray(diff) ? diff.map(String) : undefined;
    if (diffLines) {
      for (const dLine of diffLines) {
        console.log(` ${colorizeDiffLine(dLine)}`);
      }
    } else {
      console.log(" expected:");
      console.log(`${colors2.green} ${formatJson(expected)}${colors2.reset}`);
      console.log(" actual:");
      console.log(`${colors2.red} ${formatJson(actual)}${colors2.reset}`);
    }
    // Hand over the sanitised diff so suggestion building never sees a
    // non-array diff or non-string entries.
    const suggestions = suggestFixFromDiff({ ...parsed, diff: diffLines });
    if (suggestions.length > 0) {
      console.log(` ${colors2.bold}Suggested fix:${colors2.reset}`);
      for (const s of suggestions) console.log(` \u2022 ${s}`);
    }
  };

  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
  for (const { model, modelKey, benchmark, result } of results) {
    const status = result.success
      ? `${colors2.green}\u2714 SUCCESS${colors2.reset}`
      : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
    const keySuffix = modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : "";
    console.log(
      `\n${colors2.cyan}[${model}]${colors2.reset}${keySuffix} - ${colors2.magenta}${benchmark}${colors2.reset}`
    );
    console.log(
      ` \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
    );

    // A result without a metrics object is reported as having no metrics.
    const metrics = Object.entries(result.metrics ?? {});
    if (metrics.length > 0) {
      console.log(" Metrics:");
      for (const [k, v] of metrics) console.log(` - ${k}: ${v}`);
    }

    const logs = result.logs ?? [];
    const failLogs = logs.filter((l) => failPrefixes.some((p) => l.startsWith(p)));
    if (failLogs.length === 0) {
      const info = logs.filter((l) => l.startsWith("[INFO]") || l.startsWith("[PASS]"));
      for (const line of info) {
        console.log(` ${colors2.gray}${line}${colors2.reset}`);
      }
      continue;
    }

    console.log(` ${colors2.bold}Failure details:${colors2.reset}`);
    // Parse every [DEBUG-FAIL] payload once; it is needed both for the id
    // lookup below and for rendering.
    const entries = failLogs.map((line) => ({
      line,
      parsed: line.startsWith("[DEBUG-FAIL]") ? parseDebugFail(line) : undefined,
    }));
    const debugIds = /* @__PURE__ */ new Set();
    for (const { parsed } of entries) {
      if (parsed?.id) debugIds.add(String(parsed.id));
    }

    for (const { line, parsed } of entries) {
      if (line.startsWith("[FAIL]")) {
        const failId = line.match(/^\[FAIL\]\s+([^:]+):/)?.[1];
        // The matching [DEBUG-FAIL] entry already covers this failure.
        if (failId && debugIds.has(failId)) continue;
        console.log(` ${colors2.red}${line}${colors2.reset}`);
      } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
        console.log(` ${colors2.yellow}${line}${colors2.reset}`);
      } else if (line.startsWith("[STACK]")) {
        console.log(` ${colors2.gray}${line}${colors2.reset}`);
      } else if (parsed) {
        try {
          printDebugFail(parsed);
        } catch {
          // Best effort: never let one odd payload abort the whole report.
          console.log(` ${line}`);
        }
      } else {
        // [DEBUG-FAIL] line whose payload could not be parsed.
        console.log(` ${line}`);
      }
    }
  }
  console.log("\n------------------------------------\n");
}
|
|
221
|
+
|
|
41
222
|
// src/reporters/json.ts
|
|
42
223
|
function jsonReporter(results) {
|
|
43
224
|
const serializableResults = results.map((r) => ({
|
|
@@ -53,30 +234,35 @@ function jsonReporter(results) {
|
|
|
53
234
|
// src/reporters/index.ts
|
|
54
235
|
var reporters = {
|
|
55
236
|
console: consoleReporter,
|
|
56
|
-
json: jsonReporter
|
|
237
|
+
json: jsonReporter,
|
|
238
|
+
"console.debug": consoleDebugReporter
|
|
57
239
|
};
|
|
58
240
|
|
|
59
241
|
// src/evaluate.ts
|
|
60
|
-
async function runSingleBenchmark(model, benchmark) {
|
|
242
|
+
async function runSingleBenchmark(model, benchmark, modelKey) {
|
|
61
243
|
const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
|
|
62
244
|
try {
|
|
63
|
-
console.log(
|
|
245
|
+
console.log(
|
|
246
|
+
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
|
|
247
|
+
);
|
|
64
248
|
const result = await benchmark.run(model);
|
|
65
249
|
console.log(
|
|
66
|
-
`[${modelId}] Finished benchmark: ${benchmark.name}. Score: ${result.score}`
|
|
250
|
+
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
|
|
67
251
|
);
|
|
68
252
|
return {
|
|
69
253
|
model: modelId,
|
|
254
|
+
modelKey,
|
|
70
255
|
benchmark: benchmark.name,
|
|
71
256
|
result
|
|
72
257
|
};
|
|
73
258
|
} catch (error) {
|
|
74
259
|
console.error(
|
|
75
|
-
`[${modelId}] Error running benchmark: ${benchmark.name}`,
|
|
260
|
+
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
|
|
76
261
|
error
|
|
77
262
|
);
|
|
78
263
|
return {
|
|
79
264
|
model: modelId,
|
|
265
|
+
modelKey,
|
|
80
266
|
benchmark: benchmark.name,
|
|
81
267
|
result: {
|
|
82
268
|
score: 0,
|
|
@@ -89,11 +275,26 @@ async function runSingleBenchmark(model, benchmark) {
|
|
|
89
275
|
}
|
|
90
276
|
async function evaluate(options) {
|
|
91
277
|
const { models, benchmarks, reporter = "console" } = options;
|
|
92
|
-
const
|
|
278
|
+
const modelEntries = [];
|
|
279
|
+
if (Array.isArray(models)) {
|
|
280
|
+
for (const m of models) modelEntries.push([void 0, m]);
|
|
281
|
+
} else if (typeof models === "object" && models !== null && "modelId" in models) {
|
|
282
|
+
modelEntries.push([void 0, models]);
|
|
283
|
+
} else {
|
|
284
|
+
for (const [key, m] of Object.entries(
|
|
285
|
+
models
|
|
286
|
+
)) {
|
|
287
|
+
modelEntries.push([key, m]);
|
|
288
|
+
}
|
|
289
|
+
}
|
|
93
290
|
const allResults = [];
|
|
94
|
-
for (const model of
|
|
291
|
+
for (const [modelKey, model] of modelEntries) {
|
|
95
292
|
for (const benchmark of benchmarks) {
|
|
96
|
-
const evaluationResult = await runSingleBenchmark(
|
|
293
|
+
const evaluationResult = await runSingleBenchmark(
|
|
294
|
+
model,
|
|
295
|
+
benchmark,
|
|
296
|
+
modelKey
|
|
297
|
+
);
|
|
97
298
|
allResults.push(evaluationResult);
|
|
98
299
|
}
|
|
99
300
|
}
|
|
@@ -418,7 +619,7 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
418
619
|
};
|
|
419
620
|
|
|
420
621
|
// src/benchmarks/bfcl.ts
|
|
421
|
-
import { generateText as generateText2, jsonSchema } from "ai";
|
|
622
|
+
import { generateText as generateText2, jsonSchema, tool } from "ai";
|
|
422
623
|
import { promises as fs3 } from "fs";
|
|
423
624
|
import path3 from "path";
|
|
424
625
|
|
|
@@ -436,7 +637,9 @@ function checkStringValue(param, modelValue, possibleAnswers) {
|
|
|
436
637
|
if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
|
|
437
638
|
return {
|
|
438
639
|
valid: false,
|
|
439
|
-
error: `Invalid value for parameter '${param}':
|
|
640
|
+
error: `Invalid value for parameter '${param}': ${JSON.stringify(
|
|
641
|
+
modelValue
|
|
642
|
+
)}. Expected one of ${JSON.stringify(possibleAnswers)}.`,
|
|
440
643
|
error_type: "value_error:string"
|
|
441
644
|
};
|
|
442
645
|
}
|
|
@@ -490,15 +693,55 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
|
|
|
490
693
|
if (!hasMatch) {
|
|
491
694
|
return {
|
|
492
695
|
valid: false,
|
|
493
|
-
error: `Invalid value for list parameter '${paramName}'
|
|
696
|
+
error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
|
|
697
|
+
modelValue
|
|
698
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
494
699
|
error_type: "value_error:list"
|
|
495
700
|
};
|
|
496
701
|
}
|
|
497
702
|
} else {
|
|
498
|
-
|
|
703
|
+
const hasMatch = possibleValues.some((possibleValue) => {
|
|
704
|
+
if (modelValue === possibleValue) return true;
|
|
705
|
+
if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
|
|
706
|
+
try {
|
|
707
|
+
const normalizeObject = (obj) => {
|
|
708
|
+
if (Array.isArray(obj)) {
|
|
709
|
+
return obj.map(normalizeObject);
|
|
710
|
+
}
|
|
711
|
+
if (obj && typeof obj === "object") {
|
|
712
|
+
const normalized = {};
|
|
713
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
714
|
+
if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
|
|
715
|
+
normalized[key] = value[0];
|
|
716
|
+
} else {
|
|
717
|
+
normalized[key] = normalizeObject(value);
|
|
718
|
+
}
|
|
719
|
+
}
|
|
720
|
+
return normalized;
|
|
721
|
+
}
|
|
722
|
+
return obj;
|
|
723
|
+
};
|
|
724
|
+
const normalizedModel = normalizeObject(modelValue);
|
|
725
|
+
const normalizedPossible = normalizeObject(possibleValue);
|
|
726
|
+
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
727
|
+
} catch {
|
|
728
|
+
return false;
|
|
729
|
+
}
|
|
730
|
+
}
|
|
731
|
+
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
732
|
+
return modelValue.toString() === possibleValue;
|
|
733
|
+
}
|
|
734
|
+
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
735
|
+
return modelValue === possibleValue.toString();
|
|
736
|
+
}
|
|
737
|
+
return false;
|
|
738
|
+
});
|
|
739
|
+
if (!hasMatch) {
|
|
499
740
|
return {
|
|
500
741
|
valid: false,
|
|
501
|
-
error: `Invalid value for parameter '${paramName}'
|
|
742
|
+
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
743
|
+
modelValue
|
|
744
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
502
745
|
error_type: "value_error:other"
|
|
503
746
|
};
|
|
504
747
|
}
|
|
@@ -594,7 +837,8 @@ function check(testCase, modelOutput, possibleAnswer) {
|
|
|
594
837
|
if (!modelOutput || modelOutput.length !== 1) {
|
|
595
838
|
return {
|
|
596
839
|
valid: false,
|
|
597
|
-
error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}
|
|
840
|
+
error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`,
|
|
841
|
+
error_type: "simple:wrong_count"
|
|
598
842
|
};
|
|
599
843
|
}
|
|
600
844
|
return simpleFunctionChecker(
|
|
@@ -623,7 +867,11 @@ function check(testCase, modelOutput, possibleAnswer) {
|
|
|
623
867
|
}
|
|
624
868
|
return { valid: true };
|
|
625
869
|
} catch (e) {
|
|
626
|
-
return {
|
|
870
|
+
return {
|
|
871
|
+
valid: false,
|
|
872
|
+
error: `Checker Error: ${e.message}`,
|
|
873
|
+
error_type: "checker_error"
|
|
874
|
+
};
|
|
627
875
|
}
|
|
628
876
|
}
|
|
629
877
|
function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
@@ -675,7 +923,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
675
923
|
if (copy.items) copy.items = fixSchema(copy.items);
|
|
676
924
|
return copy;
|
|
677
925
|
};
|
|
678
|
-
|
|
926
|
+
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
|
|
927
|
+
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
|
|
928
|
+
logs.push(
|
|
929
|
+
`[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
|
|
930
|
+
);
|
|
931
|
+
const runSingleCase = async (testCase) => {
|
|
932
|
+
const caseLogs = [];
|
|
679
933
|
const { function: tools, question: messages } = testCase;
|
|
680
934
|
try {
|
|
681
935
|
const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
|
|
@@ -693,33 +947,49 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
693
947
|
type: "function",
|
|
694
948
|
name: sanitized,
|
|
695
949
|
description: t.description,
|
|
696
|
-
|
|
697
|
-
inputSchema: jsonSchema(inputSchema)
|
|
950
|
+
inputSchema
|
|
698
951
|
};
|
|
699
952
|
});
|
|
953
|
+
const toolsMap = Object.fromEntries(
|
|
954
|
+
transformedTools.map((t) => [
|
|
955
|
+
t.name,
|
|
956
|
+
tool({
|
|
957
|
+
description: typeof t.description === "string" ? t.description : void 0,
|
|
958
|
+
inputSchema: jsonSchema(t.inputSchema)
|
|
959
|
+
})
|
|
960
|
+
])
|
|
961
|
+
);
|
|
700
962
|
try {
|
|
701
963
|
const firstTool = transformedTools[0];
|
|
702
964
|
const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
|
|
703
|
-
|
|
965
|
+
caseLogs.push(
|
|
704
966
|
`[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
|
|
705
967
|
);
|
|
706
968
|
} catch (e) {
|
|
707
|
-
|
|
969
|
+
caseLogs.push(
|
|
708
970
|
`[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
|
|
709
971
|
);
|
|
710
972
|
}
|
|
711
973
|
const { toolCalls, text, finishReason } = await generateText2({
|
|
712
974
|
model,
|
|
713
975
|
messages: flatMessages,
|
|
714
|
-
tools:
|
|
715
|
-
toolChoice: "
|
|
976
|
+
tools: toolsMap,
|
|
977
|
+
toolChoice: "auto",
|
|
978
|
+
// Pass original schema information to middleware
|
|
979
|
+
providerOptions: {
|
|
980
|
+
toolCallMiddleware: {
|
|
981
|
+
originalToolSchemas: Object.fromEntries(
|
|
982
|
+
transformedTools.map((t) => [t.name, t.inputSchema])
|
|
983
|
+
)
|
|
984
|
+
}
|
|
985
|
+
}
|
|
716
986
|
});
|
|
717
987
|
try {
|
|
718
|
-
|
|
988
|
+
caseLogs.push(
|
|
719
989
|
`[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
|
|
720
990
|
);
|
|
721
991
|
} catch {
|
|
722
|
-
|
|
992
|
+
caseLogs.push(
|
|
723
993
|
`[DEBUG] ${testCase.id}: failed to serialize toolCalls`
|
|
724
994
|
);
|
|
725
995
|
}
|
|
@@ -752,20 +1022,221 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
752
1022
|
possibleAnswer
|
|
753
1023
|
);
|
|
754
1024
|
if (checkerResult.valid) {
|
|
755
|
-
|
|
756
|
-
logs
|
|
1025
|
+
caseLogs.push(`[PASS] ${testCase.id}`);
|
|
1026
|
+
return { valid: true, logs: caseLogs };
|
|
757
1027
|
} else {
|
|
758
|
-
|
|
1028
|
+
caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
|
|
1029
|
+
try {
|
|
1030
|
+
const category = testCase.id.split("_")[0];
|
|
1031
|
+
const diff = [];
|
|
1032
|
+
const summarizeArgs = (args) => {
|
|
1033
|
+
if (args == null) return args;
|
|
1034
|
+
if (typeof args !== "object") return args;
|
|
1035
|
+
return Object.keys(args).sort().reduce((acc, k) => {
|
|
1036
|
+
acc[k] = args[k];
|
|
1037
|
+
return acc;
|
|
1038
|
+
}, {});
|
|
1039
|
+
};
|
|
1040
|
+
const expected = {};
|
|
1041
|
+
const actual = {};
|
|
1042
|
+
if (category === "simple") {
|
|
1043
|
+
const funcDesc = tools[0];
|
|
1044
|
+
const gt = possibleAnswer.ground_truth?.[0];
|
|
1045
|
+
const expectedFuncName = funcDesc?.name;
|
|
1046
|
+
const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
|
|
1047
|
+
const received = restoredCalls[0];
|
|
1048
|
+
const receivedName = received?.toolName ?? received?.name;
|
|
1049
|
+
const receivedArgs = summarizeArgs(received?.args);
|
|
1050
|
+
expected.function = expectedFuncName;
|
|
1051
|
+
expected.params = expectedParams;
|
|
1052
|
+
actual.function = receivedName;
|
|
1053
|
+
actual.args = receivedArgs;
|
|
1054
|
+
if (expectedFuncName !== receivedName) {
|
|
1055
|
+
diff.push(`@@ function name`);
|
|
1056
|
+
diff.push(`- ${expectedFuncName}`);
|
|
1057
|
+
diff.push(`+ ${receivedName}`);
|
|
1058
|
+
}
|
|
1059
|
+
if (expectedParams && receivedArgs) {
|
|
1060
|
+
const required = funcDesc?.parameters?.required ?? [];
|
|
1061
|
+
for (const req of required) {
|
|
1062
|
+
if (!(req in receivedArgs)) {
|
|
1063
|
+
diff.push(`- missing required param: ${req}`);
|
|
1064
|
+
}
|
|
1065
|
+
}
|
|
1066
|
+
for (const k of Object.keys(receivedArgs)) {
|
|
1067
|
+
if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1068
|
+
diff.push(`+ unexpected param: ${k}`);
|
|
1069
|
+
}
|
|
1070
|
+
}
|
|
1071
|
+
for (const k of Object.keys(receivedArgs)) {
|
|
1072
|
+
if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1073
|
+
const allowed = expectedParams[k];
|
|
1074
|
+
const got = receivedArgs[k];
|
|
1075
|
+
const includes = Array.isArray(allowed) && allowed.some((v) => {
|
|
1076
|
+
try {
|
|
1077
|
+
if (Array.isArray(got)) {
|
|
1078
|
+
return JSON.stringify(
|
|
1079
|
+
got.map((x) => String(x)).sort()
|
|
1080
|
+
) === JSON.stringify(
|
|
1081
|
+
v.map((x) => String(x)).sort()
|
|
1082
|
+
);
|
|
1083
|
+
}
|
|
1084
|
+
} catch {
|
|
1085
|
+
}
|
|
1086
|
+
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
1087
|
+
});
|
|
1088
|
+
if (!includes) {
|
|
1089
|
+
diff.push(`@@ param ${k}`);
|
|
1090
|
+
diff.push(
|
|
1091
|
+
`- expected one of: ${JSON.stringify(allowed)}`
|
|
1092
|
+
);
|
|
1093
|
+
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
1094
|
+
}
|
|
1095
|
+
}
|
|
1096
|
+
}
|
|
1097
|
+
}
|
|
1098
|
+
} else {
|
|
1099
|
+
const gtArr = possibleAnswer.ground_truth ?? [];
|
|
1100
|
+
const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
|
|
1101
|
+
const actualNames = restoredCalls.map(
|
|
1102
|
+
(c) => c.toolName ?? c.name
|
|
1103
|
+
);
|
|
1104
|
+
expected.functions = expectedNames;
|
|
1105
|
+
actual.functions = actualNames;
|
|
1106
|
+
if (expectedNames.length !== actualNames.length) {
|
|
1107
|
+
diff.push(`@@ call count`);
|
|
1108
|
+
diff.push(`- expected ${expectedNames.length}`);
|
|
1109
|
+
diff.push(`+ got ${actualNames.length}`);
|
|
1110
|
+
}
|
|
1111
|
+
const missing = expectedNames.filter(
|
|
1112
|
+
(n) => !actualNames.includes(n)
|
|
1113
|
+
);
|
|
1114
|
+
const extra = actualNames.filter(
|
|
1115
|
+
(n) => !expectedNames.includes(n)
|
|
1116
|
+
);
|
|
1117
|
+
for (const m of missing)
|
|
1118
|
+
diff.push(`- missing function: ${m}`);
|
|
1119
|
+
for (const e of extra)
|
|
1120
|
+
diff.push(`+ unexpected function: ${e}`);
|
|
1121
|
+
const usedActual = /* @__PURE__ */ new Set();
|
|
1122
|
+
for (const expectedObj of gtArr) {
|
|
1123
|
+
const fname = Object.keys(expectedObj)[0];
|
|
1124
|
+
let matchedIndex = -1;
|
|
1125
|
+
for (let i = 0; i < restoredCalls.length; i++) {
|
|
1126
|
+
if (usedActual.has(i)) continue;
|
|
1127
|
+
const rc = restoredCalls[i];
|
|
1128
|
+
const rcName = rc?.toolName ?? rc?.name;
|
|
1129
|
+
if (rcName === fname) {
|
|
1130
|
+
matchedIndex = i;
|
|
1131
|
+
break;
|
|
1132
|
+
}
|
|
1133
|
+
}
|
|
1134
|
+
if (matchedIndex === -1) continue;
|
|
1135
|
+
usedActual.add(matchedIndex);
|
|
1136
|
+
const received = restoredCalls[matchedIndex];
|
|
1137
|
+
const receivedArgs = summarizeArgs(received?.args);
|
|
1138
|
+
const expectedParamsAllowed = expectedObj[fname];
|
|
1139
|
+
const funcDesc = tools.find(
|
|
1140
|
+
(t) => t.name === fname
|
|
1141
|
+
);
|
|
1142
|
+
const requiredParams = funcDesc?.parameters?.required ?? [];
|
|
1143
|
+
diff.push(`@@ function ${fname}`);
|
|
1144
|
+
if (expectedParamsAllowed && receivedArgs) {
|
|
1145
|
+
for (const req of requiredParams) {
|
|
1146
|
+
if (!(req in receivedArgs)) {
|
|
1147
|
+
diff.push(`- missing required param: ${req}`);
|
|
1148
|
+
}
|
|
1149
|
+
}
|
|
1150
|
+
for (const k of Object.keys(receivedArgs)) {
|
|
1151
|
+
if (!Object.prototype.hasOwnProperty.call(
|
|
1152
|
+
expectedParamsAllowed,
|
|
1153
|
+
k
|
|
1154
|
+
)) {
|
|
1155
|
+
diff.push(`+ unexpected param: ${k}`);
|
|
1156
|
+
}
|
|
1157
|
+
}
|
|
1158
|
+
for (const k of Object.keys(receivedArgs)) {
|
|
1159
|
+
if (Object.prototype.hasOwnProperty.call(
|
|
1160
|
+
expectedParamsAllowed,
|
|
1161
|
+
k
|
|
1162
|
+
)) {
|
|
1163
|
+
const allowed = expectedParamsAllowed[k];
|
|
1164
|
+
const got = receivedArgs[k];
|
|
1165
|
+
const includes = Array.isArray(allowed) && allowed.some((v) => {
|
|
1166
|
+
try {
|
|
1167
|
+
if (Array.isArray(got)) {
|
|
1168
|
+
return JSON.stringify(
|
|
1169
|
+
got.map((x) => String(x)).sort()
|
|
1170
|
+
) === JSON.stringify(
|
|
1171
|
+
v.map((x) => String(x)).sort()
|
|
1172
|
+
);
|
|
1173
|
+
}
|
|
1174
|
+
} catch {
|
|
1175
|
+
}
|
|
1176
|
+
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
1177
|
+
});
|
|
1178
|
+
if (!includes) {
|
|
1179
|
+
diff.push(`@@ param ${k}`);
|
|
1180
|
+
diff.push(
|
|
1181
|
+
`- expected one of: ${JSON.stringify(allowed)}`
|
|
1182
|
+
);
|
|
1183
|
+
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
1184
|
+
}
|
|
1185
|
+
}
|
|
1186
|
+
}
|
|
1187
|
+
}
|
|
1188
|
+
}
|
|
1189
|
+
}
|
|
1190
|
+
caseLogs.push(
|
|
1191
|
+
`[DEBUG-FAIL] ${JSON.stringify({
|
|
1192
|
+
id: testCase.id,
|
|
1193
|
+
message: checkerResult.error,
|
|
1194
|
+
error_type: checkerResult.error_type,
|
|
1195
|
+
expected,
|
|
1196
|
+
actual,
|
|
1197
|
+
diff
|
|
1198
|
+
})}`
|
|
1199
|
+
);
|
|
1200
|
+
} catch {
|
|
1201
|
+
caseLogs.push(
|
|
1202
|
+
`[DEBUG] ${testCase.id}: failed to build debug diff`
|
|
1203
|
+
);
|
|
1204
|
+
}
|
|
1205
|
+
return { valid: false, logs: caseLogs };
|
|
759
1206
|
}
|
|
760
1207
|
} catch (e) {
|
|
761
|
-
|
|
1208
|
+
caseLogs.push(
|
|
762
1209
|
`[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
|
|
763
1210
|
);
|
|
764
1211
|
if (e?.stack) {
|
|
765
|
-
|
|
1212
|
+
caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
|
|
766
1213
|
}
|
|
1214
|
+
return { valid: false, logs: caseLogs };
|
|
767
1215
|
}
|
|
768
|
-
}
|
|
1216
|
+
};
|
|
1217
|
+
/**
 * Maps `items` through an async `mapper`, running at most `limit2` mapper
 * calls at the same time, and returns the results in input order.
 *
 * @param {Array} items - Inputs to process.
 * @param {number} limit2 - Maximum number of concurrent mapper calls. It is
 *   floored to an integer; NaN, zero and negative values are treated as 1.
 * @param {(item: any, index: number) => Promise<any>} mapper - Async
 *   transform, called with the item and its index.
 * @returns {Promise<Array>} Results, where `results[i]` belongs to `items[i]`.
 *   Rejects with the first mapper rejection.
 */
const mapWithConcurrency = async (items, limit2, mapper) => {
  const results = new Array(items.length);
  // Array lengths must be non-negative integers, so a fractional limit
  // (e.g. 2.5) would otherwise raise RangeError. `|| 1` covers NaN and 0,
  // Math.max covers negatives.
  const requested = Math.max(1, Math.floor(Number(limit2)) || 1);
  const workerCount = Math.min(requested, items.length);
  let idx = 0;
  // Each worker repeatedly claims the next unprocessed index. Claiming happens
  // synchronously (no await between the check and the increment), so no index
  // is handed out twice.
  const worker = async () => {
    while (idx < items.length) {
      const current = idx++;
      results[current] = await mapper(items[current], current);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
};
|
|
1230
|
+
const resultsPerCase = await mapWithConcurrency(
|
|
1231
|
+
testCases,
|
|
1232
|
+
concurrency,
|
|
1233
|
+
async (tc) => runSingleCase(tc)
|
|
1234
|
+
);
|
|
1235
|
+
correctCount = resultsPerCase.reduce(
|
|
1236
|
+
(acc, r) => acc + (r.valid ? 1 : 0),
|
|
1237
|
+
0
|
|
1238
|
+
);
|
|
1239
|
+
for (const r of resultsPerCase) logs.push(...r.logs);
|
|
769
1240
|
if (testCases.length === 0) {
|
|
770
1241
|
return {
|
|
771
1242
|
score: 0,
|