@ai-sdk-tool/eval 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +503 -32
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -2
- package/dist/index.d.ts +4 -2
- package/dist/index.js +504 -33
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.cjs
CHANGED
|
@@ -47,14 +47,15 @@ var colors = {
|
|
|
47
47
|
red: "\x1B[31m",
|
|
48
48
|
yellow: "\x1B[33m",
|
|
49
49
|
cyan: "\x1B[36m",
|
|
50
|
-
magenta: "\x1B[35m"
|
|
50
|
+
magenta: "\x1B[35m",
|
|
51
|
+
gray: "\x1B[90m"
|
|
51
52
|
};
|
|
52
53
|
function printResult(result) {
|
|
53
|
-
const { model, benchmark, result: benchmarkResult } = result;
|
|
54
|
+
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
54
55
|
const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
|
|
55
56
|
console.log(
|
|
56
57
|
`
|
|
57
|
-
${colors.cyan}[${model}]${colors.reset} - ${colors.magenta}${benchmark}${colors.reset}`
|
|
58
|
+
${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
|
|
58
59
|
);
|
|
59
60
|
console.log(
|
|
60
61
|
` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
|
|
@@ -80,6 +81,186 @@ function consoleReporter(results) {
|
|
|
80
81
|
console.log("\n---------------------------\n");
|
|
81
82
|
}
|
|
82
83
|
|
|
84
|
+
// src/reporters/console.debug.ts
|
|
85
|
+
/**
 * ANSI SGR escape sequences used by the debug console reporter.
 *
 * Every styled fragment is expected to be terminated with `reset` so the
 * styling does not leak into subsequent terminal output. The object is frozen
 * because it is a shared constant that is only ever read.
 */
const colors2 = Object.freeze({
  reset: "\x1B[0m",
  green: "\x1B[32m",
  red: "\x1B[31m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
  magenta: "\x1B[35m",
  gray: "\x1B[90m",
  bold: "\x1B[1m",
  underline: "\x1B[4m",
});
|
|
96
|
+
/**
 * Wraps a single diff line in ANSI colors based on its first character.
 *
 * - `+…` lines are rendered green,
 * - `-…` lines are rendered red,
 * - `@…` (section header) lines are rendered bold cyan,
 * - anything else is returned without styling.
 *
 * @param {unknown} line - The diff line. Non-string values are converted with
 *   `String()` first, because diff entries come from a JSON-parsed log payload
 *   and are not guaranteed to be strings.
 * @returns {string} The line, colorized when it matches one of the markers.
 */
function colorizeDiffLine(line) {
  const text = typeof line === "string" ? line : String(line);
  if (text.startsWith("+")) {
    return `${colors2.green}${text}${colors2.reset}`;
  }
  if (text.startsWith("-")) {
    return `${colors2.red}${text}${colors2.reset}`;
  }
  if (text.startsWith("@")) {
    return `${colors2.cyan}${colors2.bold}${text}${colors2.reset}`;
  }
  return text;
}
|
|
103
|
+
/**
 * Removes duplicate entries while keeping the first occurrence of each one in
 * its original position.
 *
 * @param {Iterable<string>} lines - The entries to de-duplicate.
 * @returns {string[]} A new array with duplicates removed.
 */
function uniqueLines(lines) {
  // A Set iterates in insertion order, so spreading it preserves first-seen order.
  return [...new Set(lines)];
}
|
|
113
|
+
/**
 * Derives human-readable fix suggestions from a parsed `[DEBUG-FAIL]` payload.
 *
 * The diff is scanned for the marker lines below; when none of them produces a
 * suggestion, a generic hint is chosen from `error_type` instead.
 *
 * - lines containing `function name` / `missing function:`
 * - `- missing required param: <name>`
 * - `+ unexpected param: <name>`
 * - `@@ param <name>`, optionally followed by `- expected one of: <values>`
 *
 * @param {{error_type?: unknown, expected?: any, actual?: any, diff?: unknown} | null | undefined} parsed
 *   The decoded failure payload. Missing or malformed fields are tolerated.
 * @returns {string[]} De-duplicated suggestions in the order they were found.
 */
function suggestFixFromDiff(parsed) {
  const { error_type, expected, actual, diff } = parsed ?? {};
  // The payload comes from JSON.parse, so `diff` may be absent, a non-array, or
  // contain non-string entries; only string lines can carry markers.
  const lines = Array.isArray(diff)
    ? diff.filter((entry) => typeof entry === "string")
    : [];
  const suggestions = [];

  // Returns the text following `marker` for every line that starts with it.
  const valuesAfter = (marker) =>
    lines
      .filter((entry) => entry.startsWith(marker))
      .map((entry) => entry.replace(`${marker} `, ""));

  const mentionsFunction = lines.some(
    (entry) => entry.includes("function name") || entry.includes("missing function:")
  );
  if (mentionsFunction) {
    const expectedName = expected?.function;
    const actualName = actual?.function;
    if (expectedName && actualName && expectedName !== actualName) {
      suggestions.push(
        `Call the function '${expectedName}' instead of '${actualName}'.`
      );
    }
    if (Array.isArray(expected?.functions)) {
      suggestions.push(
        `Ensure tool calls include: ${expected.functions.join(", ")}.`
      );
    }
  }

  const missing = valuesAfter("- missing required param:");
  if (missing.length) {
    suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
  }

  const extras = valuesAfter("+ unexpected param:");
  if (extras.length) {
    suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
  }

  for (let i = 0; i < lines.length; i++) {
    if (!lines[i].startsWith("@@ param ")) continue;
    const param = lines[i].replace("@@ param ", "");
    // Only look inside this parameter's own section (up to the next "@@"
    // header) so each parameter is paired with its own allowed values rather
    // than with the first "expected one of" line of the whole diff.
    let allowedLine;
    for (let j = i + 1; j < lines.length && !lines[j].startsWith("@@"); j++) {
      if (lines[j].startsWith("- expected one of:")) {
        allowedLine = lines[j];
        break;
      }
    }
    if (allowedLine) {
      const allowed = allowedLine.replace("- expected one of: ", "");
      suggestions.push(`Set '${param}' to one of: ${allowed}.`);
    } else {
      suggestions.push(`Adjust '${param}' to an allowed value.`);
    }
  }

  if (suggestions.length === 0 && typeof error_type === "string") {
    // First matching entry wins, mirroring an if / else-if chain.
    const fallbacks = [
      ["missing_required", "Add all required parameters defined by the tool schema."],
      ["unexpected_param", "Remove parameters not present in the tool schema."],
      ["wrong_count", "Adjust the number of tool calls to match expected count."],
      ["wrong_func_name", "Use the exact expected function name from the schema."],
      ["value_error", "Choose a value from the allowed options."],
    ];
    const hit = fallbacks.find(([needle]) => error_type.includes(needle));
    if (hit) suggestions.push(hit[1]);
  }

  return uniqueLines(suggestions);
}
|
|
175
|
+
/**
 * Prints a verbose evaluation report to the console.
 *
 * For every result it prints the model / benchmark header, the status and
 * score, and any metrics. Log handling then depends on the log prefixes:
 *
 * - If any `[FAIL]`, `[ERROR]`, `[FATAL]`, `[STACK]` or `[DEBUG-FAIL]` line is
 *   present, only those lines are shown. `[DEBUG-FAIL]` lines carry a JSON
 *   payload (`id`, `message`, `expected`, `actual`, `diff`) that is rendered as
 *   a colored diff (or an expected/actual dump when there is no diff) followed
 *   by suggested fixes. A `[FAIL]` line is skipped when a `[DEBUG-FAIL]`
 *   payload with the same id exists, since the payload repeats its message.
 * - Otherwise only `[INFO]` and `[PASS]` lines are shown.
 *
 * @param {Array<{model: string, modelKey?: string, benchmark: string, result: object}>} results
 *   Evaluation results; `result` is read for `success`, `score`, `metrics`
 *   and `logs`.
 * @returns {void}
 */
function consoleDebugReporter(results) {
  const failureTags = ["[FAIL]", "[ERROR]", "[FATAL]", "[STACK]", "[DEBUG-FAIL]"];
  const infoTags = ["[INFO]", "[PASS]"];

  // True when `entry` is a string starting with one of `tags`.
  const hasTag = (entry, tags) =>
    typeof entry === "string" && tags.some((tag) => entry.startsWith(tag));

  // Decodes a "[DEBUG-FAIL] {json}" line; returns undefined when the payload
  // is not valid JSON or does not decode to an object.
  const parseDebugPayload = (entry) => {
    try {
      const decoded = JSON.parse(entry.replace(/^\[DEBUG-FAIL\] /, ""));
      return decoded !== null && typeof decoded === "object" ? decoded : undefined;
    } catch {
      return undefined;
    }
  };

  // JSON.stringify returns undefined for undefined input, so fall back to a
  // printable placeholder before re-indenting continuation lines.
  const indentJson = (value) =>
    (JSON.stringify(value, null, 2) ?? "undefined").split("\n").join("\n ");

  // Renders one decoded [DEBUG-FAIL] payload.
  const printDebugFailure = (payload) => {
    const { id, expected, actual, message, diff } = payload;
    console.log(
      ` ${colors2.underline}${id}${colors2.reset} ${message ? `- ${message}` : ""}`
    );
    if (Array.isArray(diff)) {
      for (const dLine of diff) {
        console.log(` ${colorizeDiffLine(String(dLine))}`);
      }
    } else {
      console.log(" expected:");
      console.log(`${colors2.green} ${indentJson(expected)}${colors2.reset}`);
      console.log(" actual:");
      console.log(`${colors2.red} ${indentJson(actual)}${colors2.reset}`);
    }
    let suggestions = [];
    try {
      suggestions = suggestFixFromDiff(payload);
    } catch {
      // Suggestions are best-effort hints; a malformed payload must not
      // prevent the rest of the report from being printed.
    }
    if (suggestions.length) {
      console.log(` ${colors2.bold}Suggested fix:${colors2.reset}`);
      for (const s of suggestions) console.log(` \u2022 ${s}`);
    }
  };

  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
  for (const r of results) {
    const { model, modelKey, benchmark, result } = r;
    const status = result.success
      ? `${colors2.green}\u2714 SUCCESS${colors2.reset}`
      : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
    const keySuffix = modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : "";
    console.log(
      `\n${colors2.cyan}[${model}]${colors2.reset}${keySuffix} - ${colors2.magenta}${benchmark}${colors2.reset}`
    );
    console.log(
      ` \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
    );

    // `metrics` may be absent; treat that like "no metrics" (same as `logs`).
    const metrics = Object.entries(result.metrics ?? {});
    if (metrics.length > 0) {
      console.log(" Metrics:");
      for (const [k, v] of metrics) console.log(` - ${k}: ${v}`);
    }

    const logs = result.logs ?? [];
    if (logs.length === 0) continue;

    const failLogs = logs.filter((entry) => hasTag(entry, failureTags));
    if (failLogs.length === 0) {
      for (const entry of logs) {
        if (hasTag(entry, infoTags)) {
          console.log(` ${colors2.gray}${entry}${colors2.reset}`);
        }
      }
      continue;
    }

    console.log(` ${colors2.bold}Failure details:${colors2.reset}`);

    // Decode every [DEBUG-FAIL] payload once and remember which ids have one.
    const debugIds = new Set();
    const entries = failLogs.map((line) => {
      const payload = line.startsWith("[DEBUG-FAIL]")
        ? parseDebugPayload(line)
        : undefined;
      if (payload?.id) debugIds.add(String(payload.id));
      return { line, payload };
    });

    for (const { line, payload } of entries) {
      if (line.startsWith("[FAIL]")) {
        const failId = line.match(/^\[FAIL\]\s+([^:]+):/)?.[1];
        if (failId && debugIds.has(failId)) continue;
        console.log(` ${colors2.red}${line}${colors2.reset}`);
      } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
        console.log(` ${colors2.yellow}${line}${colors2.reset}`);
      } else if (line.startsWith("[STACK]")) {
        console.log(` ${colors2.gray}${line}${colors2.reset}`);
      } else if (payload) {
        printDebugFailure(payload);
      } else {
        // Undecodable [DEBUG-FAIL] payload: show the raw line instead.
        console.log(` ${line}`);
      }
    }
  }
  console.log("\n------------------------------------\n");
}
|
|
263
|
+
|
|
83
264
|
// src/reporters/json.ts
|
|
84
265
|
function jsonReporter(results) {
|
|
85
266
|
const serializableResults = results.map((r) => ({
|
|
@@ -95,30 +276,35 @@ function jsonReporter(results) {
|
|
|
95
276
|
// src/reporters/index.ts
|
|
96
277
|
var reporters = {
|
|
97
278
|
console: consoleReporter,
|
|
98
|
-
json: jsonReporter
|
|
279
|
+
json: jsonReporter,
|
|
280
|
+
"console.debug": consoleDebugReporter
|
|
99
281
|
};
|
|
100
282
|
|
|
101
283
|
// src/evaluate.ts
|
|
102
|
-
async function runSingleBenchmark(model, benchmark) {
|
|
284
|
+
async function runSingleBenchmark(model, benchmark, modelKey) {
|
|
103
285
|
const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
|
|
104
286
|
try {
|
|
105
|
-
console.log(
|
|
287
|
+
console.log(
|
|
288
|
+
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
|
|
289
|
+
);
|
|
106
290
|
const result = await benchmark.run(model);
|
|
107
291
|
console.log(
|
|
108
|
-
`[${modelId}] Finished benchmark: ${benchmark.name}. Score: ${result.score}`
|
|
292
|
+
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
|
|
109
293
|
);
|
|
110
294
|
return {
|
|
111
295
|
model: modelId,
|
|
296
|
+
modelKey,
|
|
112
297
|
benchmark: benchmark.name,
|
|
113
298
|
result
|
|
114
299
|
};
|
|
115
300
|
} catch (error) {
|
|
116
301
|
console.error(
|
|
117
|
-
`[${modelId}] Error running benchmark: ${benchmark.name}`,
|
|
302
|
+
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
|
|
118
303
|
error
|
|
119
304
|
);
|
|
120
305
|
return {
|
|
121
306
|
model: modelId,
|
|
307
|
+
modelKey,
|
|
122
308
|
benchmark: benchmark.name,
|
|
123
309
|
result: {
|
|
124
310
|
score: 0,
|
|
@@ -131,11 +317,26 @@ async function runSingleBenchmark(model, benchmark) {
|
|
|
131
317
|
}
|
|
132
318
|
async function evaluate(options) {
|
|
133
319
|
const { models, benchmarks, reporter = "console" } = options;
|
|
134
|
-
const
|
|
320
|
+
const modelEntries = [];
|
|
321
|
+
if (Array.isArray(models)) {
|
|
322
|
+
for (const m of models) modelEntries.push([void 0, m]);
|
|
323
|
+
} else if (typeof models === "object" && models !== null && "modelId" in models) {
|
|
324
|
+
modelEntries.push([void 0, models]);
|
|
325
|
+
} else {
|
|
326
|
+
for (const [key, m] of Object.entries(
|
|
327
|
+
models
|
|
328
|
+
)) {
|
|
329
|
+
modelEntries.push([key, m]);
|
|
330
|
+
}
|
|
331
|
+
}
|
|
135
332
|
const allResults = [];
|
|
136
|
-
for (const model of
|
|
333
|
+
for (const [modelKey, model] of modelEntries) {
|
|
137
334
|
for (const benchmark of benchmarks) {
|
|
138
|
-
const evaluationResult = await runSingleBenchmark(
|
|
335
|
+
const evaluationResult = await runSingleBenchmark(
|
|
336
|
+
model,
|
|
337
|
+
benchmark,
|
|
338
|
+
modelKey
|
|
339
|
+
);
|
|
139
340
|
allResults.push(evaluationResult);
|
|
140
341
|
}
|
|
141
342
|
}
|
|
@@ -478,7 +679,9 @@ function checkStringValue(param, modelValue, possibleAnswers) {
|
|
|
478
679
|
if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
|
|
479
680
|
return {
|
|
480
681
|
valid: false,
|
|
481
|
-
error: `Invalid value for parameter '${param}':
|
|
682
|
+
error: `Invalid value for parameter '${param}': ${JSON.stringify(
|
|
683
|
+
modelValue
|
|
684
|
+
)}. Expected one of ${JSON.stringify(possibleAnswers)}.`,
|
|
482
685
|
error_type: "value_error:string"
|
|
483
686
|
};
|
|
484
687
|
}
|
|
@@ -532,15 +735,55 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
|
|
|
532
735
|
if (!hasMatch) {
|
|
533
736
|
return {
|
|
534
737
|
valid: false,
|
|
535
|
-
error: `Invalid value for list parameter '${paramName}'
|
|
738
|
+
error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
|
|
739
|
+
modelValue
|
|
740
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
536
741
|
error_type: "value_error:list"
|
|
537
742
|
};
|
|
538
743
|
}
|
|
539
744
|
} else {
|
|
540
|
-
|
|
745
|
+
const hasMatch = possibleValues.some((possibleValue) => {
|
|
746
|
+
if (modelValue === possibleValue) return true;
|
|
747
|
+
if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
|
|
748
|
+
try {
|
|
749
|
+
const normalizeObject = (obj) => {
|
|
750
|
+
if (Array.isArray(obj)) {
|
|
751
|
+
return obj.map(normalizeObject);
|
|
752
|
+
}
|
|
753
|
+
if (obj && typeof obj === "object") {
|
|
754
|
+
const normalized = {};
|
|
755
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
756
|
+
if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
|
|
757
|
+
normalized[key] = value[0];
|
|
758
|
+
} else {
|
|
759
|
+
normalized[key] = normalizeObject(value);
|
|
760
|
+
}
|
|
761
|
+
}
|
|
762
|
+
return normalized;
|
|
763
|
+
}
|
|
764
|
+
return obj;
|
|
765
|
+
};
|
|
766
|
+
const normalizedModel = normalizeObject(modelValue);
|
|
767
|
+
const normalizedPossible = normalizeObject(possibleValue);
|
|
768
|
+
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
769
|
+
} catch {
|
|
770
|
+
return false;
|
|
771
|
+
}
|
|
772
|
+
}
|
|
773
|
+
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
774
|
+
return modelValue.toString() === possibleValue;
|
|
775
|
+
}
|
|
776
|
+
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
777
|
+
return modelValue === possibleValue.toString();
|
|
778
|
+
}
|
|
779
|
+
return false;
|
|
780
|
+
});
|
|
781
|
+
if (!hasMatch) {
|
|
541
782
|
return {
|
|
542
783
|
valid: false,
|
|
543
|
-
error: `Invalid value for parameter '${paramName}'
|
|
784
|
+
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
785
|
+
modelValue
|
|
786
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
544
787
|
error_type: "value_error:other"
|
|
545
788
|
};
|
|
546
789
|
}
|
|
@@ -636,7 +879,8 @@ function check(testCase, modelOutput, possibleAnswer) {
|
|
|
636
879
|
if (!modelOutput || modelOutput.length !== 1) {
|
|
637
880
|
return {
|
|
638
881
|
valid: false,
|
|
639
|
-
error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}
|
|
882
|
+
error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`,
|
|
883
|
+
error_type: "simple:wrong_count"
|
|
640
884
|
};
|
|
641
885
|
}
|
|
642
886
|
return simpleFunctionChecker(
|
|
@@ -665,7 +909,11 @@ function check(testCase, modelOutput, possibleAnswer) {
|
|
|
665
909
|
}
|
|
666
910
|
return { valid: true };
|
|
667
911
|
} catch (e) {
|
|
668
|
-
return {
|
|
912
|
+
return {
|
|
913
|
+
valid: false,
|
|
914
|
+
error: `Checker Error: ${e.message}`,
|
|
915
|
+
error_type: "checker_error"
|
|
916
|
+
};
|
|
669
917
|
}
|
|
670
918
|
}
|
|
671
919
|
function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
@@ -717,7 +965,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
717
965
|
if (copy.items) copy.items = fixSchema(copy.items);
|
|
718
966
|
return copy;
|
|
719
967
|
};
|
|
720
|
-
|
|
968
|
+
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
|
|
969
|
+
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
|
|
970
|
+
logs.push(
|
|
971
|
+
`[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
|
|
972
|
+
);
|
|
973
|
+
const runSingleCase = async (testCase) => {
|
|
974
|
+
const caseLogs = [];
|
|
721
975
|
const { function: tools, question: messages } = testCase;
|
|
722
976
|
try {
|
|
723
977
|
const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
|
|
@@ -735,33 +989,49 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
735
989
|
type: "function",
|
|
736
990
|
name: sanitized,
|
|
737
991
|
description: t.description,
|
|
738
|
-
|
|
739
|
-
inputSchema: (0, import_ai2.jsonSchema)(inputSchema)
|
|
992
|
+
inputSchema
|
|
740
993
|
};
|
|
741
994
|
});
|
|
995
|
+
const toolsMap = Object.fromEntries(
|
|
996
|
+
transformedTools.map((t) => [
|
|
997
|
+
t.name,
|
|
998
|
+
(0, import_ai2.tool)({
|
|
999
|
+
description: typeof t.description === "string" ? t.description : void 0,
|
|
1000
|
+
inputSchema: (0, import_ai2.jsonSchema)(t.inputSchema)
|
|
1001
|
+
})
|
|
1002
|
+
])
|
|
1003
|
+
);
|
|
742
1004
|
try {
|
|
743
1005
|
const firstTool = transformedTools[0];
|
|
744
1006
|
const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
|
|
745
|
-
|
|
1007
|
+
caseLogs.push(
|
|
746
1008
|
`[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
|
|
747
1009
|
);
|
|
748
1010
|
} catch (e) {
|
|
749
|
-
|
|
1011
|
+
caseLogs.push(
|
|
750
1012
|
`[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
|
|
751
1013
|
);
|
|
752
1014
|
}
|
|
753
1015
|
const { toolCalls, text, finishReason } = await (0, import_ai2.generateText)({
|
|
754
1016
|
model,
|
|
755
1017
|
messages: flatMessages,
|
|
756
|
-
tools:
|
|
757
|
-
toolChoice: "
|
|
1018
|
+
tools: toolsMap,
|
|
1019
|
+
toolChoice: "auto",
|
|
1020
|
+
// Pass original schema information to middleware
|
|
1021
|
+
providerOptions: {
|
|
1022
|
+
toolCallMiddleware: {
|
|
1023
|
+
originalToolSchemas: Object.fromEntries(
|
|
1024
|
+
transformedTools.map((t) => [t.name, t.inputSchema])
|
|
1025
|
+
)
|
|
1026
|
+
}
|
|
1027
|
+
}
|
|
758
1028
|
});
|
|
759
1029
|
try {
|
|
760
|
-
|
|
1030
|
+
caseLogs.push(
|
|
761
1031
|
`[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
|
|
762
1032
|
);
|
|
763
1033
|
} catch {
|
|
764
|
-
|
|
1034
|
+
caseLogs.push(
|
|
765
1035
|
`[DEBUG] ${testCase.id}: failed to serialize toolCalls`
|
|
766
1036
|
);
|
|
767
1037
|
}
|
|
@@ -794,20 +1064,221 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
794
1064
|
possibleAnswer
|
|
795
1065
|
);
|
|
796
1066
|
if (checkerResult.valid) {
|
|
797
|
-
|
|
798
|
-
logs
|
|
1067
|
+
caseLogs.push(`[PASS] ${testCase.id}`);
|
|
1068
|
+
return { valid: true, logs: caseLogs };
|
|
799
1069
|
} else {
|
|
800
|
-
|
|
1070
|
+
caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
|
|
1071
|
+
try {
|
|
1072
|
+
const category = testCase.id.split("_")[0];
|
|
1073
|
+
const diff = [];
|
|
1074
|
+
const summarizeArgs = (args) => {
|
|
1075
|
+
if (args == null) return args;
|
|
1076
|
+
if (typeof args !== "object") return args;
|
|
1077
|
+
return Object.keys(args).sort().reduce((acc, k) => {
|
|
1078
|
+
acc[k] = args[k];
|
|
1079
|
+
return acc;
|
|
1080
|
+
}, {});
|
|
1081
|
+
};
|
|
1082
|
+
const expected = {};
|
|
1083
|
+
const actual = {};
|
|
1084
|
+
if (category === "simple") {
|
|
1085
|
+
const funcDesc = tools[0];
|
|
1086
|
+
const gt = possibleAnswer.ground_truth?.[0];
|
|
1087
|
+
const expectedFuncName = funcDesc?.name;
|
|
1088
|
+
const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
|
|
1089
|
+
const received = restoredCalls[0];
|
|
1090
|
+
const receivedName = received?.toolName ?? received?.name;
|
|
1091
|
+
const receivedArgs = summarizeArgs(received?.args);
|
|
1092
|
+
expected.function = expectedFuncName;
|
|
1093
|
+
expected.params = expectedParams;
|
|
1094
|
+
actual.function = receivedName;
|
|
1095
|
+
actual.args = receivedArgs;
|
|
1096
|
+
if (expectedFuncName !== receivedName) {
|
|
1097
|
+
diff.push(`@@ function name`);
|
|
1098
|
+
diff.push(`- ${expectedFuncName}`);
|
|
1099
|
+
diff.push(`+ ${receivedName}`);
|
|
1100
|
+
}
|
|
1101
|
+
if (expectedParams && receivedArgs) {
|
|
1102
|
+
const required = funcDesc?.parameters?.required ?? [];
|
|
1103
|
+
for (const req of required) {
|
|
1104
|
+
if (!(req in receivedArgs)) {
|
|
1105
|
+
diff.push(`- missing required param: ${req}`);
|
|
1106
|
+
}
|
|
1107
|
+
}
|
|
1108
|
+
for (const k of Object.keys(receivedArgs)) {
|
|
1109
|
+
if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1110
|
+
diff.push(`+ unexpected param: ${k}`);
|
|
1111
|
+
}
|
|
1112
|
+
}
|
|
1113
|
+
for (const k of Object.keys(receivedArgs)) {
|
|
1114
|
+
if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1115
|
+
const allowed = expectedParams[k];
|
|
1116
|
+
const got = receivedArgs[k];
|
|
1117
|
+
const includes = Array.isArray(allowed) && allowed.some((v) => {
|
|
1118
|
+
try {
|
|
1119
|
+
if (Array.isArray(got)) {
|
|
1120
|
+
return JSON.stringify(
|
|
1121
|
+
got.map((x) => String(x)).sort()
|
|
1122
|
+
) === JSON.stringify(
|
|
1123
|
+
v.map((x) => String(x)).sort()
|
|
1124
|
+
);
|
|
1125
|
+
}
|
|
1126
|
+
} catch {
|
|
1127
|
+
}
|
|
1128
|
+
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
1129
|
+
});
|
|
1130
|
+
if (!includes) {
|
|
1131
|
+
diff.push(`@@ param ${k}`);
|
|
1132
|
+
diff.push(
|
|
1133
|
+
`- expected one of: ${JSON.stringify(allowed)}`
|
|
1134
|
+
);
|
|
1135
|
+
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
1136
|
+
}
|
|
1137
|
+
}
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1140
|
+
} else {
|
|
1141
|
+
const gtArr = possibleAnswer.ground_truth ?? [];
|
|
1142
|
+
const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
|
|
1143
|
+
const actualNames = restoredCalls.map(
|
|
1144
|
+
(c) => c.toolName ?? c.name
|
|
1145
|
+
);
|
|
1146
|
+
expected.functions = expectedNames;
|
|
1147
|
+
actual.functions = actualNames;
|
|
1148
|
+
if (expectedNames.length !== actualNames.length) {
|
|
1149
|
+
diff.push(`@@ call count`);
|
|
1150
|
+
diff.push(`- expected ${expectedNames.length}`);
|
|
1151
|
+
diff.push(`+ got ${actualNames.length}`);
|
|
1152
|
+
}
|
|
1153
|
+
const missing = expectedNames.filter(
|
|
1154
|
+
(n) => !actualNames.includes(n)
|
|
1155
|
+
);
|
|
1156
|
+
const extra = actualNames.filter(
|
|
1157
|
+
(n) => !expectedNames.includes(n)
|
|
1158
|
+
);
|
|
1159
|
+
for (const m of missing)
|
|
1160
|
+
diff.push(`- missing function: ${m}`);
|
|
1161
|
+
for (const e of extra)
|
|
1162
|
+
diff.push(`+ unexpected function: ${e}`);
|
|
1163
|
+
const usedActual = /* @__PURE__ */ new Set();
|
|
1164
|
+
for (const expectedObj of gtArr) {
|
|
1165
|
+
const fname = Object.keys(expectedObj)[0];
|
|
1166
|
+
let matchedIndex = -1;
|
|
1167
|
+
for (let i = 0; i < restoredCalls.length; i++) {
|
|
1168
|
+
if (usedActual.has(i)) continue;
|
|
1169
|
+
const rc = restoredCalls[i];
|
|
1170
|
+
const rcName = rc?.toolName ?? rc?.name;
|
|
1171
|
+
if (rcName === fname) {
|
|
1172
|
+
matchedIndex = i;
|
|
1173
|
+
break;
|
|
1174
|
+
}
|
|
1175
|
+
}
|
|
1176
|
+
if (matchedIndex === -1) continue;
|
|
1177
|
+
usedActual.add(matchedIndex);
|
|
1178
|
+
const received = restoredCalls[matchedIndex];
|
|
1179
|
+
const receivedArgs = summarizeArgs(received?.args);
|
|
1180
|
+
const expectedParamsAllowed = expectedObj[fname];
|
|
1181
|
+
const funcDesc = tools.find(
|
|
1182
|
+
(t) => t.name === fname
|
|
1183
|
+
);
|
|
1184
|
+
const requiredParams = funcDesc?.parameters?.required ?? [];
|
|
1185
|
+
diff.push(`@@ function ${fname}`);
|
|
1186
|
+
if (expectedParamsAllowed && receivedArgs) {
|
|
1187
|
+
for (const req of requiredParams) {
|
|
1188
|
+
if (!(req in receivedArgs)) {
|
|
1189
|
+
diff.push(`- missing required param: ${req}`);
|
|
1190
|
+
}
|
|
1191
|
+
}
|
|
1192
|
+
for (const k of Object.keys(receivedArgs)) {
|
|
1193
|
+
if (!Object.prototype.hasOwnProperty.call(
|
|
1194
|
+
expectedParamsAllowed,
|
|
1195
|
+
k
|
|
1196
|
+
)) {
|
|
1197
|
+
diff.push(`+ unexpected param: ${k}`);
|
|
1198
|
+
}
|
|
1199
|
+
}
|
|
1200
|
+
for (const k of Object.keys(receivedArgs)) {
|
|
1201
|
+
if (Object.prototype.hasOwnProperty.call(
|
|
1202
|
+
expectedParamsAllowed,
|
|
1203
|
+
k
|
|
1204
|
+
)) {
|
|
1205
|
+
const allowed = expectedParamsAllowed[k];
|
|
1206
|
+
const got = receivedArgs[k];
|
|
1207
|
+
const includes = Array.isArray(allowed) && allowed.some((v) => {
|
|
1208
|
+
try {
|
|
1209
|
+
if (Array.isArray(got)) {
|
|
1210
|
+
return JSON.stringify(
|
|
1211
|
+
got.map((x) => String(x)).sort()
|
|
1212
|
+
) === JSON.stringify(
|
|
1213
|
+
v.map((x) => String(x)).sort()
|
|
1214
|
+
);
|
|
1215
|
+
}
|
|
1216
|
+
} catch {
|
|
1217
|
+
}
|
|
1218
|
+
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
1219
|
+
});
|
|
1220
|
+
if (!includes) {
|
|
1221
|
+
diff.push(`@@ param ${k}`);
|
|
1222
|
+
diff.push(
|
|
1223
|
+
`- expected one of: ${JSON.stringify(allowed)}`
|
|
1224
|
+
);
|
|
1225
|
+
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
1226
|
+
}
|
|
1227
|
+
}
|
|
1228
|
+
}
|
|
1229
|
+
}
|
|
1230
|
+
}
|
|
1231
|
+
}
|
|
1232
|
+
caseLogs.push(
|
|
1233
|
+
`[DEBUG-FAIL] ${JSON.stringify({
|
|
1234
|
+
id: testCase.id,
|
|
1235
|
+
message: checkerResult.error,
|
|
1236
|
+
error_type: checkerResult.error_type,
|
|
1237
|
+
expected,
|
|
1238
|
+
actual,
|
|
1239
|
+
diff
|
|
1240
|
+
})}`
|
|
1241
|
+
);
|
|
1242
|
+
} catch {
|
|
1243
|
+
caseLogs.push(
|
|
1244
|
+
`[DEBUG] ${testCase.id}: failed to build debug diff`
|
|
1245
|
+
);
|
|
1246
|
+
}
|
|
1247
|
+
return { valid: false, logs: caseLogs };
|
|
801
1248
|
}
|
|
802
1249
|
} catch (e) {
|
|
803
|
-
|
|
1250
|
+
caseLogs.push(
|
|
804
1251
|
`[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
|
|
805
1252
|
);
|
|
806
1253
|
if (e?.stack) {
|
|
807
|
-
|
|
1254
|
+
caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
|
|
808
1255
|
}
|
|
1256
|
+
return { valid: false, logs: caseLogs };
|
|
809
1257
|
}
|
|
810
|
-
}
|
|
1258
|
+
};
|
|
1259
|
+
const mapWithConcurrency = async (items, limit2, mapper) => {
|
|
1260
|
+
const results = new Array(items.length);
|
|
1261
|
+
let idx = 0;
|
|
1262
|
+
const workers = new Array(Math.min(limit2, items.length)).fill(0).map(async () => {
|
|
1263
|
+
while (true) {
|
|
1264
|
+
const current = idx++;
|
|
1265
|
+
if (current >= items.length) break;
|
|
1266
|
+
results[current] = await mapper(items[current], current);
|
|
1267
|
+
}
|
|
1268
|
+
});
|
|
1269
|
+
await Promise.all(workers);
|
|
1270
|
+
return results;
|
|
1271
|
+
};
|
|
1272
|
+
const resultsPerCase = await mapWithConcurrency(
|
|
1273
|
+
testCases,
|
|
1274
|
+
concurrency,
|
|
1275
|
+
async (tc) => runSingleCase(tc)
|
|
1276
|
+
);
|
|
1277
|
+
correctCount = resultsPerCase.reduce(
|
|
1278
|
+
(acc, r) => acc + (r.valid ? 1 : 0),
|
|
1279
|
+
0
|
|
1280
|
+
);
|
|
1281
|
+
for (const r of resultsPerCase) logs.push(...r.logs);
|
|
811
1282
|
if (testCases.length === 0) {
|
|
812
1283
|
return {
|
|
813
1284
|
score: 0,
|