@ai-sdk-tool/eval 0.1.8 → 1.0.0-canary.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/data/{BFCL_v3_multiple.json → BFCL_v3_multiple.jsonl} +1 -1
- package/data/{BFCL_v3_multiple_possible_answer.json → BFCL_v3_multiple_possible_answer.jsonl} +1 -1
- package/data/{BFCL_v3_parallel.json → BFCL_v3_parallel.jsonl} +1 -1
- package/data/{BFCL_v3_parallel_multiple.json → BFCL_v3_parallel_multiple.jsonl} +1 -1
- package/data/{BFCL_v3_parallel_multiple_possible_answer.json → BFCL_v3_parallel_multiple_possible_answer.jsonl} +1 -1
- package/data/{BFCL_v3_parallel_possible_answer.json → BFCL_v3_parallel_possible_answer.jsonl} +1 -1
- package/data/{BFCL_v3_simple.json → BFCL_v3_simple.jsonl} +1 -1
- package/data/{BFCL_v3_simple_possible_answer.json → BFCL_v3_simple_possible_answer.jsonl} +1 -1
- package/data/ComplexFuncBench.jsonl +1000 -0
- package/data/ComplexFuncBench_possible_answer.jsonl +1000 -0
- package/dist/index.cjs +2122 -1119
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +30 -10
- package/dist/index.d.ts +30 -10
- package/dist/index.js +2153 -1143
- package/dist/index.js.map +1 -1
- package/package.json +19 -16
package/dist/index.js
CHANGED
|
@@ -1,424 +1,18 @@
|
|
|
1
|
-
// src/reporters/console.ts
|
|
2
|
-
var colors = {
|
|
3
|
-
reset: "\x1B[0m",
|
|
4
|
-
green: "\x1B[32m",
|
|
5
|
-
red: "\x1B[31m",
|
|
6
|
-
yellow: "\x1B[33m",
|
|
7
|
-
cyan: "\x1B[36m",
|
|
8
|
-
magenta: "\x1B[35m",
|
|
9
|
-
gray: "\x1B[90m"
|
|
10
|
-
};
|
|
11
|
-
function printResult(result) {
|
|
12
|
-
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
13
|
-
const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
|
|
14
|
-
console.log(
|
|
15
|
-
`
|
|
16
|
-
${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
|
|
17
|
-
);
|
|
18
|
-
console.log(
|
|
19
|
-
` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
|
|
20
|
-
);
|
|
21
|
-
const metrics = Object.entries(benchmarkResult.metrics);
|
|
22
|
-
if (metrics.length > 0) {
|
|
23
|
-
console.log(" Metrics:");
|
|
24
|
-
for (const [key, value] of metrics) {
|
|
25
|
-
console.log(` - ${key}: ${value}`);
|
|
26
|
-
}
|
|
27
|
-
}
|
|
28
|
-
if (benchmarkResult.error) {
|
|
29
|
-
console.log(
|
|
30
|
-
` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
|
|
31
|
-
);
|
|
32
|
-
}
|
|
33
|
-
}
|
|
34
|
-
function consoleReporter(results) {
|
|
35
|
-
console.log("\n--- \u{1F4CA} Evaluation Report ---");
|
|
36
|
-
for (const result of results) {
|
|
37
|
-
printResult(result);
|
|
38
|
-
}
|
|
39
|
-
console.log("\n---------------------------\n");
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
// src/reporters/console.debug.ts
|
|
43
|
-
var colors2 = {
|
|
44
|
-
reset: "\x1B[0m",
|
|
45
|
-
green: "\x1B[32m",
|
|
46
|
-
red: "\x1B[31m",
|
|
47
|
-
yellow: "\x1B[33m",
|
|
48
|
-
cyan: "\x1B[36m",
|
|
49
|
-
magenta: "\x1B[35m",
|
|
50
|
-
gray: "\x1B[90m",
|
|
51
|
-
bold: "\x1B[1m",
|
|
52
|
-
underline: "\x1B[4m"
|
|
53
|
-
};
|
|
54
|
-
function colorizeDiffLine(line) {
|
|
55
|
-
if (line.startsWith("+")) return `${colors2.green}${line}${colors2.reset}`;
|
|
56
|
-
if (line.startsWith("-")) return `${colors2.red}${line}${colors2.reset}`;
|
|
57
|
-
if (line.startsWith("@"))
|
|
58
|
-
return `${colors2.cyan}${colors2.bold}${line}${colors2.reset}`;
|
|
59
|
-
return line;
|
|
60
|
-
}
|
|
61
|
-
function uniqueLines(lines) {
|
|
62
|
-
const seen = /* @__PURE__ */ new Set();
|
|
63
|
-
const out = [];
|
|
64
|
-
for (const l of lines) {
|
|
65
|
-
if (seen.has(l)) continue;
|
|
66
|
-
seen.add(l);
|
|
67
|
-
out.push(l);
|
|
68
|
-
}
|
|
69
|
-
return out;
|
|
70
|
-
}
|
|
71
|
-
function suggestFixFromDiff(parsed) {
|
|
72
|
-
const suggestions = [];
|
|
73
|
-
const { error_type, expected, actual, diff } = parsed ?? {};
|
|
74
|
-
if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
|
|
75
|
-
const expectedName = expected?.function;
|
|
76
|
-
const actualName = actual?.function;
|
|
77
|
-
if (expectedName && actualName && expectedName !== actualName) {
|
|
78
|
-
suggestions.push(
|
|
79
|
-
`Call the function '${expectedName}' instead of '${actualName}'.`
|
|
80
|
-
);
|
|
81
|
-
}
|
|
82
|
-
if (Array.isArray(expected?.functions)) {
|
|
83
|
-
suggestions.push(
|
|
84
|
-
`Ensure tool calls include: ${expected.functions.join(", ")}.`
|
|
85
|
-
);
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
|
|
89
|
-
const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
|
|
90
|
-
if (missing.length) {
|
|
91
|
-
suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
|
|
95
|
-
const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
|
|
96
|
-
if (extras.length) {
|
|
97
|
-
suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
|
|
98
|
-
}
|
|
99
|
-
}
|
|
100
|
-
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
|
|
101
|
-
const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
|
|
102
|
-
for (const param of targets) {
|
|
103
|
-
const allowedOneOfLine = diff.find(
|
|
104
|
-
(d) => String(d).startsWith("- expected one of:")
|
|
105
|
-
);
|
|
106
|
-
const allowedSingleLine = diff.find(
|
|
107
|
-
(d) => String(d).startsWith("- expected:")
|
|
108
|
-
);
|
|
109
|
-
if (allowedSingleLine) {
|
|
110
|
-
const value = allowedSingleLine.replace("- expected: ", "");
|
|
111
|
-
suggestions.push(`Set '${param}' to: ${value}.`);
|
|
112
|
-
} else if (allowedOneOfLine) {
|
|
113
|
-
const allowed = allowedOneOfLine.replace("- expected one of: ", "");
|
|
114
|
-
suggestions.push(`Set '${param}' to one of: ${allowed}.`);
|
|
115
|
-
} else {
|
|
116
|
-
suggestions.push(`Adjust '${param}' to an allowed value.`);
|
|
117
|
-
}
|
|
118
|
-
}
|
|
119
|
-
}
|
|
120
|
-
if (suggestions.length === 0 && typeof error_type === "string") {
|
|
121
|
-
if (error_type.includes("missing_required")) {
|
|
122
|
-
suggestions.push(
|
|
123
|
-
"Add all required parameters defined by the tool schema."
|
|
124
|
-
);
|
|
125
|
-
} else if (error_type.includes("unexpected_param")) {
|
|
126
|
-
suggestions.push("Remove parameters not present in the tool schema.");
|
|
127
|
-
} else if (error_type.includes("wrong_count")) {
|
|
128
|
-
suggestions.push(
|
|
129
|
-
"Adjust the number of tool calls to match expected count."
|
|
130
|
-
);
|
|
131
|
-
} else if (error_type.includes("wrong_func_name")) {
|
|
132
|
-
suggestions.push("Use the exact expected function name from the schema.");
|
|
133
|
-
} else if (error_type.includes("value_error")) {
|
|
134
|
-
suggestions.push("Choose a value from the allowed options.");
|
|
135
|
-
}
|
|
136
|
-
}
|
|
137
|
-
return uniqueLines(suggestions);
|
|
138
|
-
}
|
|
139
|
-
function consoleDebugReporter(results) {
|
|
140
|
-
console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
|
|
141
|
-
for (const r of results) {
|
|
142
|
-
const { model, modelKey, benchmark, result } = r;
|
|
143
|
-
const status = result.success ? `${colors2.green}\u2714 SUCCESS${colors2.reset}` : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
|
|
144
|
-
console.log(
|
|
145
|
-
`
|
|
146
|
-
${colors2.cyan}[${model}]${colors2.reset}${modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : ""} - ${colors2.magenta}${benchmark}${colors2.reset}`
|
|
147
|
-
);
|
|
148
|
-
console.log(
|
|
149
|
-
` \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
|
|
150
|
-
);
|
|
151
|
-
const metrics = Object.entries(result.metrics);
|
|
152
|
-
if (metrics.length > 0) {
|
|
153
|
-
console.log(" Metrics:");
|
|
154
|
-
for (const [k, v] of metrics) console.log(` - ${k}: ${v}`);
|
|
155
|
-
}
|
|
156
|
-
if (result.logs && result.logs.length) {
|
|
157
|
-
const failLogs = result.logs.filter(
|
|
158
|
-
(l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
|
|
159
|
-
);
|
|
160
|
-
const hasFails = failLogs.length > 0;
|
|
161
|
-
if (hasFails) {
|
|
162
|
-
let getTestIdFromLogLine2 = function(line) {
|
|
163
|
-
if (line.startsWith("[FAIL]")) {
|
|
164
|
-
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
165
|
-
return m?.[1];
|
|
166
|
-
}
|
|
167
|
-
if (line.startsWith("[DEBUG-FAIL]")) {
|
|
168
|
-
try {
|
|
169
|
-
const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
170
|
-
return String(parsed?.id ?? "");
|
|
171
|
-
} catch {
|
|
172
|
-
}
|
|
173
|
-
}
|
|
174
|
-
if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
175
|
-
try {
|
|
176
|
-
const parsed = JSON.parse(
|
|
177
|
-
line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
|
|
178
|
-
);
|
|
179
|
-
return String(parsed?.id ?? "");
|
|
180
|
-
} catch {
|
|
181
|
-
}
|
|
182
|
-
}
|
|
183
|
-
return void 0;
|
|
184
|
-
};
|
|
185
|
-
var getTestIdFromLogLine = getTestIdFromLogLine2;
|
|
186
|
-
const byId = /* @__PURE__ */ new Map();
|
|
187
|
-
for (const line of failLogs) {
|
|
188
|
-
const id = getTestIdFromLogLine2(line);
|
|
189
|
-
const key = id ?? "__general__";
|
|
190
|
-
const arr = byId.get(key) ?? [];
|
|
191
|
-
arr.push(line);
|
|
192
|
-
byId.set(key, arr);
|
|
193
|
-
}
|
|
194
|
-
console.log(
|
|
195
|
-
` ${colors2.bold}Failure details (grouped):${colors2.reset}`
|
|
196
|
-
);
|
|
197
|
-
for (const [groupId, lines] of byId) {
|
|
198
|
-
if (groupId !== "__general__") {
|
|
199
|
-
console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
|
|
200
|
-
}
|
|
201
|
-
const debugIds = /* @__PURE__ */ new Set();
|
|
202
|
-
for (const l of lines) {
|
|
203
|
-
if (l.startsWith("[DEBUG-FAIL]")) {
|
|
204
|
-
try {
|
|
205
|
-
const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
206
|
-
if (parsed?.id) debugIds.add(String(parsed.id));
|
|
207
|
-
} catch {
|
|
208
|
-
}
|
|
209
|
-
}
|
|
210
|
-
}
|
|
211
|
-
for (const line of lines) {
|
|
212
|
-
if (line.startsWith("[FAIL]")) {
|
|
213
|
-
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
214
|
-
const failId = m?.[1];
|
|
215
|
-
if (failId && debugIds.has(failId)) continue;
|
|
216
|
-
console.log(` ${colors2.red}${line}${colors2.reset}`);
|
|
217
|
-
} else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
|
|
218
|
-
console.log(` ${colors2.yellow}${line}${colors2.reset}`);
|
|
219
|
-
} else if (line.startsWith("[STACK]")) {
|
|
220
|
-
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
221
|
-
} else if (line.startsWith("[DEBUG-FAIL]")) {
|
|
222
|
-
const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
|
|
223
|
-
try {
|
|
224
|
-
const parsed = JSON.parse(payload);
|
|
225
|
-
const { message, diff, expected, actual } = parsed;
|
|
226
|
-
if (message)
|
|
227
|
-
console.log(
|
|
228
|
-
` ${colors2.bold}${message}${colors2.reset}`
|
|
229
|
-
);
|
|
230
|
-
if (diff && Array.isArray(diff)) {
|
|
231
|
-
for (const dLine of diff)
|
|
232
|
-
console.log(" " + colorizeDiffLine(dLine));
|
|
233
|
-
} else {
|
|
234
|
-
console.log(" expected:");
|
|
235
|
-
console.log(
|
|
236
|
-
colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
|
|
237
|
-
);
|
|
238
|
-
console.log(" actual:");
|
|
239
|
-
console.log(
|
|
240
|
-
colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
|
|
241
|
-
);
|
|
242
|
-
}
|
|
243
|
-
const suggestions = suggestFixFromDiff(parsed);
|
|
244
|
-
if (suggestions.length) {
|
|
245
|
-
console.log(
|
|
246
|
-
` ${colors2.bold}Suggested fix:${colors2.reset}`
|
|
247
|
-
);
|
|
248
|
-
for (const s of suggestions)
|
|
249
|
-
console.log(` \u2022 ${s}`);
|
|
250
|
-
}
|
|
251
|
-
} catch {
|
|
252
|
-
console.log(` ${line}`);
|
|
253
|
-
}
|
|
254
|
-
} else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
255
|
-
const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
|
|
256
|
-
try {
|
|
257
|
-
const ctx = JSON.parse(payload);
|
|
258
|
-
console.log(` ${colors2.gray}context:${colors2.reset}`);
|
|
259
|
-
if (ctx.tool_schema) {
|
|
260
|
-
console.log(
|
|
261
|
-
colors2.gray + " tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n ") + colors2.reset
|
|
262
|
-
);
|
|
263
|
-
}
|
|
264
|
-
if (ctx.last_user_query) {
|
|
265
|
-
console.log(
|
|
266
|
-
colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
|
|
267
|
-
);
|
|
268
|
-
}
|
|
269
|
-
if (ctx.raw_model_text) {
|
|
270
|
-
console.log(
|
|
271
|
-
colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
|
|
272
|
-
);
|
|
273
|
-
}
|
|
274
|
-
if (ctx.parsed_tool_calls) {
|
|
275
|
-
console.log(
|
|
276
|
-
colors2.gray + " parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n ") + colors2.reset
|
|
277
|
-
);
|
|
278
|
-
}
|
|
279
|
-
if (ctx.ground_truth) {
|
|
280
|
-
console.log(
|
|
281
|
-
colors2.gray + " ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n ") + colors2.reset
|
|
282
|
-
);
|
|
283
|
-
}
|
|
284
|
-
if (ctx.finish_reason) {
|
|
285
|
-
console.log(
|
|
286
|
-
colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
|
|
287
|
-
);
|
|
288
|
-
}
|
|
289
|
-
} catch {
|
|
290
|
-
console.log(` ${line}`);
|
|
291
|
-
}
|
|
292
|
-
}
|
|
293
|
-
}
|
|
294
|
-
}
|
|
295
|
-
} else {
|
|
296
|
-
const info = result.logs.filter(
|
|
297
|
-
(l) => l.startsWith("[INFO]") || l.startsWith("[PASS]")
|
|
298
|
-
);
|
|
299
|
-
for (const line of info)
|
|
300
|
-
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
301
|
-
}
|
|
302
|
-
}
|
|
303
|
-
}
|
|
304
|
-
console.log("\n------------------------------------\n");
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
// src/reporters/json.ts
|
|
308
|
-
function jsonReporter(results) {
|
|
309
|
-
const serializableResults = results.map((r) => ({
|
|
310
|
-
...r,
|
|
311
|
-
result: {
|
|
312
|
-
...r.result,
|
|
313
|
-
error: r.result.error?.message
|
|
314
|
-
}
|
|
315
|
-
}));
|
|
316
|
-
console.log(JSON.stringify(serializableResults, null, 2));
|
|
317
|
-
}
|
|
318
|
-
|
|
319
|
-
// src/reporters/index.ts
|
|
320
|
-
var reporters = {
|
|
321
|
-
console: consoleReporter,
|
|
322
|
-
json: jsonReporter,
|
|
323
|
-
"console.debug": consoleDebugReporter
|
|
324
|
-
};
|
|
325
|
-
|
|
326
|
-
// src/evaluate.ts
|
|
327
|
-
async function runSingleBenchmark(model, benchmark, modelKey, config) {
|
|
328
|
-
const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
|
|
329
|
-
try {
|
|
330
|
-
console.log(
|
|
331
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
|
|
332
|
-
);
|
|
333
|
-
const result = await benchmark.run(model, config);
|
|
334
|
-
console.log(
|
|
335
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
|
|
336
|
-
);
|
|
337
|
-
return {
|
|
338
|
-
model: modelId,
|
|
339
|
-
modelKey,
|
|
340
|
-
benchmark: benchmark.name,
|
|
341
|
-
result
|
|
342
|
-
};
|
|
343
|
-
} catch (error) {
|
|
344
|
-
console.error(
|
|
345
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
|
|
346
|
-
error
|
|
347
|
-
);
|
|
348
|
-
return {
|
|
349
|
-
model: modelId,
|
|
350
|
-
modelKey,
|
|
351
|
-
benchmark: benchmark.name,
|
|
352
|
-
result: {
|
|
353
|
-
score: 0,
|
|
354
|
-
success: false,
|
|
355
|
-
metrics: {},
|
|
356
|
-
error: error instanceof Error ? error : new Error(String(error))
|
|
357
|
-
}
|
|
358
|
-
};
|
|
359
|
-
}
|
|
360
|
-
}
|
|
361
|
-
async function evaluate(options) {
|
|
362
|
-
const {
|
|
363
|
-
models,
|
|
364
|
-
benchmarks,
|
|
365
|
-
reporter = "console",
|
|
366
|
-
temperature,
|
|
367
|
-
maxTokens
|
|
368
|
-
} = options;
|
|
369
|
-
const modelEntries = [];
|
|
370
|
-
if (Array.isArray(models)) {
|
|
371
|
-
for (const m of models) modelEntries.push([void 0, m]);
|
|
372
|
-
} else if (typeof models === "object" && models !== null && "modelId" in models) {
|
|
373
|
-
modelEntries.push([void 0, models]);
|
|
374
|
-
} else {
|
|
375
|
-
for (const [key, m] of Object.entries(
|
|
376
|
-
models
|
|
377
|
-
)) {
|
|
378
|
-
modelEntries.push([key, m]);
|
|
379
|
-
}
|
|
380
|
-
}
|
|
381
|
-
const allResults = [];
|
|
382
|
-
for (const [modelKey, model] of modelEntries) {
|
|
383
|
-
for (const benchmark of benchmarks) {
|
|
384
|
-
const config = {};
|
|
385
|
-
if (temperature !== void 0) config.temperature = temperature;
|
|
386
|
-
if (maxTokens !== void 0) config.maxTokens = maxTokens;
|
|
387
|
-
const evaluationResult = await runSingleBenchmark(
|
|
388
|
-
model,
|
|
389
|
-
benchmark,
|
|
390
|
-
modelKey,
|
|
391
|
-
Object.keys(config).length > 0 ? config : void 0
|
|
392
|
-
);
|
|
393
|
-
allResults.push(evaluationResult);
|
|
394
|
-
}
|
|
395
|
-
}
|
|
396
|
-
const report = reporters[reporter];
|
|
397
|
-
if (report) {
|
|
398
|
-
report(allResults);
|
|
399
|
-
} else {
|
|
400
|
-
console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
|
|
401
|
-
reporters.console(allResults);
|
|
402
|
-
}
|
|
403
|
-
return allResults;
|
|
404
|
-
}
|
|
405
|
-
|
|
406
1
|
// src/benchmarks/bfcl.ts
|
|
407
|
-
import { generateText, jsonSchema, tool } from "ai";
|
|
408
2
|
import { promises as fs2 } from "fs";
|
|
409
3
|
import path2 from "path";
|
|
4
|
+
import {
|
|
5
|
+
generateText,
|
|
6
|
+
jsonSchema,
|
|
7
|
+
tool
|
|
8
|
+
} from "ai";
|
|
410
9
|
|
|
411
10
|
// src/utils/paths.ts
|
|
412
11
|
import fs from "fs";
|
|
413
12
|
import { createRequire } from "module";
|
|
414
13
|
import path from "path";
|
|
415
14
|
import { fileURLToPath } from "url";
|
|
416
|
-
function
|
|
417
|
-
const moduleUrl = fromModuleUrl;
|
|
418
|
-
const override = process.env.BFCL_DATA_DIR;
|
|
419
|
-
if (override && override.trim().length > 0) {
|
|
420
|
-
return override;
|
|
421
|
-
}
|
|
15
|
+
function tryResolveViaPackageEntry(moduleUrl) {
|
|
422
16
|
try {
|
|
423
17
|
const baseForRequireEntry = typeof moduleUrl === "string" && moduleUrl || path.join(process.cwd(), "package.json");
|
|
424
18
|
const requireFromEntry = createRequire(baseForRequireEntry);
|
|
@@ -426,43 +20,80 @@ function resolveDataDir(fromModuleUrl) {
|
|
|
426
20
|
const entryDir = path.dirname(entryPath);
|
|
427
21
|
const guessPkgRoot = fs.existsSync(path.join(entryDir, "..")) ? path.resolve(entryDir, "..") : entryDir;
|
|
428
22
|
const dataAtRoot = path.join(guessPkgRoot, "data");
|
|
429
|
-
if (fs.existsSync(dataAtRoot))
|
|
430
|
-
|
|
23
|
+
if (fs.existsSync(dataAtRoot)) {
|
|
24
|
+
return dataAtRoot;
|
|
25
|
+
}
|
|
26
|
+
} catch (e) {
|
|
431
27
|
}
|
|
28
|
+
return null;
|
|
29
|
+
}
|
|
30
|
+
function tryResolveViaPackageJson(moduleUrl) {
|
|
432
31
|
try {
|
|
433
32
|
const baseForRequire = typeof moduleUrl === "string" && moduleUrl || path.join(process.cwd(), "package.json");
|
|
434
33
|
const require2 = createRequire(baseForRequire);
|
|
435
34
|
const pkgJsonPath = require2.resolve("@ai-sdk-tool/eval/package.json");
|
|
436
35
|
const pkgDir = path.dirname(pkgJsonPath);
|
|
437
36
|
const dataAtPkg = path.join(pkgDir, "data");
|
|
438
|
-
if (fs.existsSync(dataAtPkg))
|
|
439
|
-
|
|
37
|
+
if (fs.existsSync(dataAtPkg)) {
|
|
38
|
+
return dataAtPkg;
|
|
39
|
+
}
|
|
40
|
+
} catch (e) {
|
|
440
41
|
}
|
|
441
|
-
|
|
42
|
+
return null;
|
|
43
|
+
}
|
|
44
|
+
function getStartDir(moduleUrl) {
|
|
442
45
|
if (moduleUrl) {
|
|
443
46
|
try {
|
|
444
|
-
|
|
445
|
-
} catch {
|
|
446
|
-
|
|
47
|
+
return path.dirname(fileURLToPath(moduleUrl));
|
|
48
|
+
} catch (e) {
|
|
49
|
+
return process.cwd();
|
|
447
50
|
}
|
|
448
|
-
} else {
|
|
449
|
-
startDir = process.cwd();
|
|
450
51
|
}
|
|
52
|
+
return process.cwd();
|
|
53
|
+
}
|
|
54
|
+
function findDataDirByTraversal(startDir) {
|
|
451
55
|
let dir = startDir;
|
|
452
|
-
|
|
56
|
+
const MAX_PARENT_TRAVERSAL_DEPTH = 6;
|
|
57
|
+
for (let i = 0; i < MAX_PARENT_TRAVERSAL_DEPTH; i += 1) {
|
|
453
58
|
const dataCandidate = path.join(dir, "data");
|
|
454
|
-
if (fs.existsSync(dataCandidate))
|
|
59
|
+
if (fs.existsSync(dataCandidate)) {
|
|
60
|
+
return dataCandidate;
|
|
61
|
+
}
|
|
455
62
|
const parent = path.resolve(dir, "..");
|
|
456
|
-
if (parent === dir)
|
|
63
|
+
if (parent === dir) {
|
|
64
|
+
break;
|
|
65
|
+
}
|
|
457
66
|
dir = parent;
|
|
458
67
|
}
|
|
68
|
+
return null;
|
|
69
|
+
}
|
|
70
|
+
function resolveDataDir(fromModuleUrl) {
|
|
71
|
+
const override = process.env.BFCL_DATA_DIR;
|
|
72
|
+
if (override && override.trim().length > 0) {
|
|
73
|
+
return override;
|
|
74
|
+
}
|
|
75
|
+
const viaEntry = tryResolveViaPackageEntry(fromModuleUrl);
|
|
76
|
+
if (viaEntry) {
|
|
77
|
+
return viaEntry;
|
|
78
|
+
}
|
|
79
|
+
const viaPackageJson = tryResolveViaPackageJson(fromModuleUrl);
|
|
80
|
+
if (viaPackageJson) {
|
|
81
|
+
return viaPackageJson;
|
|
82
|
+
}
|
|
83
|
+
const startDir = getStartDir(fromModuleUrl);
|
|
84
|
+
const viaTraversal = findDataDirByTraversal(startDir);
|
|
85
|
+
if (viaTraversal) {
|
|
86
|
+
return viaTraversal;
|
|
87
|
+
}
|
|
459
88
|
const pkgRoot = path.resolve(startDir, "..", "..");
|
|
460
89
|
return path.join(pkgRoot, "data");
|
|
461
90
|
}
|
|
462
91
|
|
|
463
92
|
// src/benchmarks/bfcl/ast-checker.ts
|
|
464
93
|
function standardizeString(input) {
|
|
465
|
-
if (typeof input !== "string")
|
|
94
|
+
if (typeof input !== "string") {
|
|
95
|
+
return input;
|
|
96
|
+
}
|
|
466
97
|
const regex = /[ ,./\\-_*^]/g;
|
|
467
98
|
return input.replace(regex, "").toLowerCase().replace(/'/g, '"');
|
|
468
99
|
}
|
|
@@ -482,131 +113,185 @@ function checkStringValue(param, modelValue, possibleAnswers) {
|
|
|
482
113
|
}
|
|
483
114
|
return { valid: true };
|
|
484
115
|
}
|
|
485
|
-
function
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
const expectedFuncName = funcDescription.name;
|
|
489
|
-
const expectedParams = funcDescription.parameters.properties;
|
|
490
|
-
const requiredParams = funcDescription.parameters.required;
|
|
491
|
-
if (modelFuncName !== expectedFuncName) {
|
|
492
|
-
return {
|
|
493
|
-
valid: false,
|
|
494
|
-
error: `Function name '${modelFuncName}' does not match expected '${expectedFuncName}'.`,
|
|
495
|
-
error_type: "simple_function_checker:wrong_func_name"
|
|
496
|
-
};
|
|
116
|
+
function normalizeObject(obj) {
|
|
117
|
+
if (Array.isArray(obj)) {
|
|
118
|
+
return obj.map(normalizeObject);
|
|
497
119
|
}
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
};
|
|
120
|
+
if (obj && typeof obj === "object") {
|
|
121
|
+
const normalized = {};
|
|
122
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
123
|
+
if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
|
|
124
|
+
normalized[key] = value[0];
|
|
125
|
+
} else {
|
|
126
|
+
normalized[key] = normalizeObject(value);
|
|
127
|
+
}
|
|
507
128
|
}
|
|
129
|
+
return normalized;
|
|
508
130
|
}
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
paramName,
|
|
523
|
-
modelValue,
|
|
524
|
-
possibleValues ?? []
|
|
525
|
-
);
|
|
526
|
-
if (!result.valid) return result;
|
|
527
|
-
} else if (Array.isArray(modelValue)) {
|
|
528
|
-
const modelValueStr = JSON.stringify(
|
|
529
|
-
modelValue.map((v) => standardizeString(String(v))).sort()
|
|
530
|
-
);
|
|
531
|
-
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
|
|
532
|
-
if (!Array.isArray(p)) return false;
|
|
533
|
-
return JSON.stringify(
|
|
534
|
-
p.map((v) => standardizeString(String(v))).sort()
|
|
535
|
-
) === modelValueStr;
|
|
536
|
-
}) : false;
|
|
537
|
-
if (!hasMatch) {
|
|
538
|
-
return {
|
|
539
|
-
valid: false,
|
|
540
|
-
error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
|
|
541
|
-
modelValue
|
|
542
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
543
|
-
error_type: "value_error:list"
|
|
544
|
-
};
|
|
545
|
-
}
|
|
546
|
-
} else {
|
|
547
|
-
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
|
|
548
|
-
if (modelValue === possibleValue) return true;
|
|
549
|
-
if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
|
|
550
|
-
try {
|
|
551
|
-
const normalizeObject = (obj) => {
|
|
552
|
-
if (Array.isArray(obj)) {
|
|
553
|
-
return obj.map(normalizeObject);
|
|
554
|
-
}
|
|
555
|
-
if (obj && typeof obj === "object") {
|
|
556
|
-
const normalized = {};
|
|
557
|
-
for (const [key, value] of Object.entries(
|
|
558
|
-
obj
|
|
559
|
-
)) {
|
|
560
|
-
if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
|
|
561
|
-
normalized[key] = value[0];
|
|
562
|
-
} else {
|
|
563
|
-
normalized[key] = normalizeObject(value);
|
|
564
|
-
}
|
|
565
|
-
}
|
|
566
|
-
return normalized;
|
|
567
|
-
}
|
|
568
|
-
return obj;
|
|
569
|
-
};
|
|
570
|
-
const normalizedModel = normalizeObject(modelValue);
|
|
571
|
-
const normalizedPossible = normalizeObject(possibleValue);
|
|
572
|
-
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
573
|
-
} catch {
|
|
574
|
-
return false;
|
|
575
|
-
}
|
|
576
|
-
}
|
|
577
|
-
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
578
|
-
return modelValue.toString() === possibleValue;
|
|
579
|
-
}
|
|
580
|
-
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
581
|
-
return modelValue === possibleValue.toString();
|
|
582
|
-
}
|
|
583
|
-
return false;
|
|
584
|
-
}) : false;
|
|
585
|
-
if (!hasMatch) {
|
|
586
|
-
return {
|
|
587
|
-
valid: false,
|
|
588
|
-
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
589
|
-
modelValue
|
|
590
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
591
|
-
error_type: "value_error:other"
|
|
592
|
-
};
|
|
593
|
-
}
|
|
594
|
-
}
|
|
131
|
+
return obj;
|
|
132
|
+
}
|
|
133
|
+
function valuesMatch(modelValue, possibleValue) {
|
|
134
|
+
if (modelValue === possibleValue) {
|
|
135
|
+
return true;
|
|
136
|
+
}
|
|
137
|
+
if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
|
|
138
|
+
try {
|
|
139
|
+
const normalizedModel = normalizeObject(modelValue);
|
|
140
|
+
const normalizedPossible = normalizeObject(possibleValue);
|
|
141
|
+
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
142
|
+
} catch (e) {
|
|
143
|
+
return false;
|
|
595
144
|
}
|
|
596
145
|
}
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
146
|
+
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
147
|
+
return modelValue.toString() === possibleValue;
|
|
148
|
+
}
|
|
149
|
+
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
150
|
+
return modelValue === possibleValue.toString();
|
|
151
|
+
}
|
|
152
|
+
return false;
|
|
153
|
+
}
|
|
154
|
+
function checkArrayValue(paramName, modelValue, possibleValues) {
|
|
155
|
+
const modelValueStr = JSON.stringify(
|
|
156
|
+
modelValue.map((v) => standardizeString(String(v))).sort()
|
|
157
|
+
);
|
|
158
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
|
|
159
|
+
if (!Array.isArray(p)) {
|
|
160
|
+
return false;
|
|
161
|
+
}
|
|
162
|
+
return JSON.stringify(p.map((v) => standardizeString(String(v))).sort()) === modelValueStr;
|
|
163
|
+
}) : false;
|
|
164
|
+
if (!hasMatch) {
|
|
165
|
+
return {
|
|
166
|
+
valid: false,
|
|
167
|
+
error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
|
|
168
|
+
modelValue
|
|
169
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
170
|
+
error_type: "value_error:list"
|
|
171
|
+
};
|
|
172
|
+
}
|
|
173
|
+
return { valid: true };
|
|
174
|
+
}
|
|
175
|
+
function checkObjectValue(paramName, modelValue, possibleValues) {
|
|
176
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some(
|
|
177
|
+
(possibleValue) => valuesMatch(modelValue, possibleValue)
|
|
178
|
+
) : false;
|
|
179
|
+
if (!hasMatch) {
|
|
180
|
+
return {
|
|
181
|
+
valid: false,
|
|
182
|
+
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
183
|
+
modelValue
|
|
184
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
185
|
+
error_type: "value_error:other"
|
|
186
|
+
};
|
|
187
|
+
}
|
|
188
|
+
return { valid: true };
|
|
189
|
+
}
|
|
190
|
+
function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
|
|
191
|
+
const funcNameCheck = checkFunctionName(
|
|
192
|
+
funcDescription.name,
|
|
193
|
+
modelToolCall.toolName
|
|
194
|
+
);
|
|
195
|
+
if (!funcNameCheck.valid) {
|
|
196
|
+
return funcNameCheck;
|
|
197
|
+
}
|
|
198
|
+
const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
|
|
199
|
+
const argsObj = modelToolCall.args && typeof modelToolCall.args === "object" ? modelToolCall.args : {};
|
|
200
|
+
const context = {
|
|
201
|
+
funcDescription,
|
|
202
|
+
modelToolCall,
|
|
203
|
+
possibleAnswerParams,
|
|
204
|
+
expectedParams: funcDescription.parameters.properties
|
|
205
|
+
};
|
|
206
|
+
const requiredCheck = checkRequiredParams(
|
|
207
|
+
funcDescription.parameters.required,
|
|
208
|
+
argsObj
|
|
209
|
+
);
|
|
210
|
+
if (!requiredCheck.valid) {
|
|
211
|
+
return requiredCheck;
|
|
212
|
+
}
|
|
213
|
+
const paramsCheck = checkAllParameters(argsObj, context);
|
|
214
|
+
if (!paramsCheck.valid) {
|
|
215
|
+
return paramsCheck;
|
|
216
|
+
}
|
|
217
|
+
const optionalCheck = checkOptionalParams(argsObj, possibleAnswerParams);
|
|
218
|
+
if (!optionalCheck.valid) {
|
|
219
|
+
return optionalCheck;
|
|
220
|
+
}
|
|
221
|
+
return { valid: true };
|
|
222
|
+
}
|
|
223
|
+
function checkFunctionName(expected, actual) {
|
|
224
|
+
if (actual !== expected) {
|
|
225
|
+
return {
|
|
226
|
+
valid: false,
|
|
227
|
+
error: `Function name '${actual}' does not match expected '${expected}'.`,
|
|
228
|
+
error_type: "simple_function_checker:wrong_func_name"
|
|
229
|
+
};
|
|
230
|
+
}
|
|
231
|
+
return { valid: true };
|
|
232
|
+
}
|
|
233
|
+
function checkRequiredParams(requiredParams, argsObj) {
|
|
234
|
+
for (const param of requiredParams) {
|
|
235
|
+
if (!(param in argsObj)) {
|
|
601
236
|
return {
|
|
602
237
|
valid: false,
|
|
603
|
-
error: `Missing
|
|
604
|
-
error_type: "simple_function_checker:
|
|
238
|
+
error: `Missing required parameter: '${param}'.`,
|
|
239
|
+
error_type: "simple_function_checker:missing_required"
|
|
605
240
|
};
|
|
606
241
|
}
|
|
607
242
|
}
|
|
608
243
|
return { valid: true };
|
|
609
244
|
}
|
|
245
|
+
function checkAllParameters(argsObj, context) {
|
|
246
|
+
for (const paramName of Object.keys(argsObj)) {
|
|
247
|
+
const paramCheck = checkSingleParameter(
|
|
248
|
+
paramName,
|
|
249
|
+
argsObj[paramName],
|
|
250
|
+
context
|
|
251
|
+
);
|
|
252
|
+
if (!paramCheck.valid) {
|
|
253
|
+
return paramCheck;
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
return { valid: true };
|
|
257
|
+
}
|
|
258
|
+
function checkSingleParameter(paramName, modelValue, context) {
|
|
259
|
+
if (!(paramName in context.expectedParams && paramName in context.possibleAnswerParams)) {
|
|
260
|
+
return {
|
|
261
|
+
valid: false,
|
|
262
|
+
error: `Unexpected parameter: '${paramName}'.`,
|
|
263
|
+
error_type: "simple_function_checker:unexpected_param"
|
|
264
|
+
};
|
|
265
|
+
}
|
|
266
|
+
const possibleValues = context.possibleAnswerParams[paramName];
|
|
267
|
+
if (typeof modelValue === "string") {
|
|
268
|
+
return checkStringValue(
|
|
269
|
+
paramName,
|
|
270
|
+
modelValue,
|
|
271
|
+
possibleValues != null ? possibleValues : []
|
|
272
|
+
);
|
|
273
|
+
}
|
|
274
|
+
if (Array.isArray(modelValue)) {
|
|
275
|
+
return checkArrayValue(paramName, modelValue, possibleValues);
|
|
276
|
+
}
|
|
277
|
+
return checkObjectValue(paramName, modelValue, possibleValues);
|
|
278
|
+
}
|
|
279
|
+
function checkOptionalParams(argsObj, possibleAnswerParams) {
|
|
280
|
+
for (const paramName in possibleAnswerParams) {
|
|
281
|
+
if (Object.hasOwn(possibleAnswerParams, paramName)) {
|
|
282
|
+
const val = possibleAnswerParams[paramName];
|
|
283
|
+
const isOptional = Array.isArray(val) && val.includes("");
|
|
284
|
+
if (!(paramName in argsObj || isOptional)) {
|
|
285
|
+
return {
|
|
286
|
+
valid: false,
|
|
287
|
+
error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
|
|
288
|
+
error_type: "simple_function_checker:missing_optional"
|
|
289
|
+
};
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
return { valid: true };
|
|
294
|
+
}
|
|
610
295
|
function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possibleAnswers) {
|
|
611
296
|
if (modelToolCalls.length !== possibleAnswers.length) {
|
|
612
297
|
return {
|
|
@@ -629,8 +314,10 @@ function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possib
|
|
|
629
314
|
};
|
|
630
315
|
}
|
|
631
316
|
let foundMatch = false;
|
|
632
|
-
for (let i = 0; i < modelToolCalls.length; i
|
|
633
|
-
if (matchedModelCallIndices.has(i))
|
|
317
|
+
for (let i = 0; i < modelToolCalls.length; i += 1) {
|
|
318
|
+
if (matchedModelCallIndices.has(i)) {
|
|
319
|
+
continue;
|
|
320
|
+
}
|
|
634
321
|
const checkerResult = simpleFunctionChecker(
|
|
635
322
|
funcDescription,
|
|
636
323
|
modelToolCalls[i],
|
|
@@ -679,6 +366,39 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
|
|
|
679
366
|
}
|
|
680
367
|
|
|
681
368
|
// src/benchmarks/bfcl.ts
|
|
369
|
+
var LINE_SPLIT_REGEX = /\r?\n/;
|
|
370
|
+
var NUMERIC_STRING_REGEX = /^\d+$/;
|
|
371
|
+
function convertGroundTruthToXML(call) {
|
|
372
|
+
const keys = Object.keys(call);
|
|
373
|
+
if (keys.length === 0) {
|
|
374
|
+
return "<empty_call />";
|
|
375
|
+
}
|
|
376
|
+
const funcName = keys[0];
|
|
377
|
+
if (!funcName) {
|
|
378
|
+
return "<undefined_function />";
|
|
379
|
+
}
|
|
380
|
+
const params = call[funcName];
|
|
381
|
+
if (!params || typeof params !== "object") {
|
|
382
|
+
return `<${funcName} />`;
|
|
383
|
+
}
|
|
384
|
+
let xml = `<${funcName}>
|
|
385
|
+
`;
|
|
386
|
+
for (const [key, value] of Object.entries(params)) {
|
|
387
|
+
const displayValue = Array.isArray(value) ? value[0] : value;
|
|
388
|
+
let valueStr;
|
|
389
|
+
if (typeof displayValue === "string") {
|
|
390
|
+
valueStr = displayValue;
|
|
391
|
+
} else if (displayValue === null || displayValue === void 0) {
|
|
392
|
+
valueStr = "";
|
|
393
|
+
} else {
|
|
394
|
+
valueStr = JSON.stringify(displayValue);
|
|
395
|
+
}
|
|
396
|
+
xml += ` <${key}>${valueStr}</${key}>
|
|
397
|
+
`;
|
|
398
|
+
}
|
|
399
|
+
xml += `</${funcName}>`;
|
|
400
|
+
return xml;
|
|
401
|
+
}
|
|
682
402
|
function check(testCase, modelOutput, possibleAnswer) {
|
|
683
403
|
const category = testCase.id.split("_")[0];
|
|
684
404
|
try {
|
|
@@ -695,19 +415,22 @@ function check(testCase, modelOutput, possibleAnswer) {
|
|
|
695
415
|
modelOutput[0],
|
|
696
416
|
possibleAnswer.ground_truth[0]
|
|
697
417
|
);
|
|
698
|
-
}
|
|
418
|
+
}
|
|
419
|
+
if (category === "parallel") {
|
|
699
420
|
return parallelFunctionCheckerNoOrder(
|
|
700
421
|
testCase.function,
|
|
701
422
|
modelOutput,
|
|
702
423
|
possibleAnswer.ground_truth
|
|
703
424
|
);
|
|
704
|
-
}
|
|
425
|
+
}
|
|
426
|
+
if (category === "multiple") {
|
|
705
427
|
return multipleFunctionChecker(
|
|
706
428
|
testCase.function,
|
|
707
429
|
modelOutput,
|
|
708
430
|
possibleAnswer.ground_truth
|
|
709
431
|
);
|
|
710
|
-
}
|
|
432
|
+
}
|
|
433
|
+
if (category.includes("parallel-multiple")) {
|
|
711
434
|
return parallelFunctionCheckerNoOrder(
|
|
712
435
|
testCase.function,
|
|
713
436
|
modelOutput,
|
|
@@ -743,8 +466,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
743
466
|
path2.join(dataPath, answerDataFile),
|
|
744
467
|
"utf-8"
|
|
745
468
|
);
|
|
746
|
-
testCases = testCasesJson.split(
|
|
747
|
-
const possibleAnswers = possibleAnswersJson.split(
|
|
469
|
+
testCases = testCasesJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
470
|
+
const possibleAnswers = possibleAnswersJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
748
471
|
const possibleAnswersMap = new Map(
|
|
749
472
|
possibleAnswers.map((ans) => [ans.id, ans])
|
|
750
473
|
);
|
|
@@ -756,425 +479,684 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
756
479
|
`[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
|
|
757
480
|
);
|
|
758
481
|
}
|
|
759
|
-
const
|
|
760
|
-
if (!
|
|
482
|
+
const fixSchemaType2 = (copy) => {
|
|
483
|
+
if (!copy.type) {
|
|
484
|
+
return;
|
|
485
|
+
}
|
|
486
|
+
if (copy.type === "dict") {
|
|
487
|
+
copy.type = "object";
|
|
488
|
+
}
|
|
489
|
+
if (copy.type === "tuple") {
|
|
490
|
+
copy.type = "array";
|
|
491
|
+
}
|
|
492
|
+
if (copy.type === "integer" || copy.type === "float") {
|
|
493
|
+
copy.type = "number";
|
|
494
|
+
}
|
|
495
|
+
};
|
|
496
|
+
const fixSchemaProperties = (copy, fixSchemaFn) => {
|
|
497
|
+
if (!copy.properties || typeof copy.properties !== "object") {
|
|
498
|
+
return;
|
|
499
|
+
}
|
|
500
|
+
for (const k of Object.keys(copy.properties)) {
|
|
501
|
+
copy.properties[k] = fixSchemaFn(
|
|
502
|
+
copy.properties[k]
|
|
503
|
+
);
|
|
504
|
+
}
|
|
505
|
+
};
|
|
506
|
+
const fixSchema2 = (schema) => {
|
|
507
|
+
if (!schema || typeof schema !== "object") {
|
|
761
508
|
return { type: "object", properties: {} };
|
|
762
|
-
|
|
509
|
+
}
|
|
510
|
+
const copy = Array.isArray(schema) ? schema.map((v) => fixSchema2(v)) : { ...schema };
|
|
763
511
|
if (!Array.isArray(copy)) {
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
512
|
+
fixSchemaType2(copy);
|
|
513
|
+
fixSchemaProperties(copy, fixSchema2);
|
|
514
|
+
if (copy.items) {
|
|
515
|
+
copy.items = fixSchema2(copy.items);
|
|
768
516
|
}
|
|
769
|
-
if (copy.properties && typeof copy.properties === "object") {
|
|
770
|
-
for (const k of Object.keys(copy.properties)) {
|
|
771
|
-
copy.properties[k] = fixSchema(
|
|
772
|
-
copy.properties[k]
|
|
773
|
-
);
|
|
774
|
-
}
|
|
775
|
-
}
|
|
776
|
-
if (copy.items) copy.items = fixSchema(copy.items);
|
|
777
517
|
return copy;
|
|
778
518
|
}
|
|
779
519
|
return copy;
|
|
780
520
|
};
|
|
781
|
-
const
|
|
782
|
-
const
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
const
|
|
787
|
-
const
|
|
788
|
-
const
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
521
|
+
const flattenMessages = (messages) => Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
|
|
522
|
+
const sanitizeName = (toolName) => {
|
|
523
|
+
const s = toolName.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64);
|
|
524
|
+
return s.length > 0 ? s : "tool";
|
|
525
|
+
};
|
|
526
|
+
const buildTransformedTools = (tools, fixSchemaFn) => {
|
|
527
|
+
const nameMap = /* @__PURE__ */ new Map();
|
|
528
|
+
const transformedTools = tools.map((t) => {
|
|
529
|
+
const fixed = fixSchemaFn(t.parameters);
|
|
530
|
+
const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
|
|
531
|
+
const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
|
|
532
|
+
const sanitized = sanitizeName(t.name);
|
|
533
|
+
nameMap.set(sanitized, t.name);
|
|
534
|
+
return {
|
|
535
|
+
type: "function",
|
|
536
|
+
name: sanitized,
|
|
537
|
+
description: t.description,
|
|
538
|
+
inputSchema
|
|
799
539
|
};
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
540
|
+
});
|
|
541
|
+
return { transformedTools, nameMap };
|
|
542
|
+
};
|
|
543
|
+
const parseDebugToolCalls = (raw) => {
|
|
544
|
+
if (!raw) {
|
|
545
|
+
return [];
|
|
546
|
+
}
|
|
547
|
+
try {
|
|
548
|
+
const arr = JSON.parse(raw);
|
|
549
|
+
return Array.isArray(arr) ? arr : [];
|
|
550
|
+
} catch (e) {
|
|
551
|
+
return [];
|
|
552
|
+
}
|
|
553
|
+
};
|
|
554
|
+
const getSanitizedName = (rawName, transformedTools) => {
|
|
555
|
+
var _a, _b;
|
|
556
|
+
if (typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName)) {
|
|
557
|
+
return (_b = (_a = transformedTools[Number(rawName)]) == null ? void 0 : _a.name) != null ? _b : rawName;
|
|
558
|
+
}
|
|
559
|
+
return rawName;
|
|
560
|
+
};
|
|
561
|
+
const parseToolArgs = (extractedArgs) => {
|
|
562
|
+
if (typeof extractedArgs !== "string") {
|
|
563
|
+
return extractedArgs;
|
|
564
|
+
}
|
|
565
|
+
try {
|
|
566
|
+
return JSON.parse(extractedArgs);
|
|
567
|
+
} catch (e) {
|
|
568
|
+
return extractedArgs;
|
|
569
|
+
}
|
|
570
|
+
};
|
|
571
|
+
const restoreToolCalls = (toolCalls, nameMap, transformedTools) => (toolCalls || []).map((c) => {
|
|
572
|
+
var _a, _b, _c, _d, _e, _f;
|
|
573
|
+
const call = c;
|
|
574
|
+
const rawName = (_a = call.toolName) != null ? _a : call.name;
|
|
575
|
+
const sanitizedFromIndex = getSanitizedName(
|
|
576
|
+
rawName,
|
|
577
|
+
transformedTools
|
|
578
|
+
);
|
|
579
|
+
const originalName = (_b = nameMap.get(sanitizedFromIndex)) != null ? _b : sanitizedFromIndex;
|
|
580
|
+
const extractedArgs = (_f = (_e = (_d = (_c = call.args) != null ? _c : call.arguments) != null ? _d : call.input) != null ? _e : call.params) != null ? _f : call.parameters;
|
|
581
|
+
const parsedArgs = parseToolArgs(extractedArgs);
|
|
582
|
+
return {
|
|
583
|
+
...call,
|
|
584
|
+
toolName: originalName,
|
|
585
|
+
name: originalName,
|
|
586
|
+
args: parsedArgs != null ? parsedArgs : {}
|
|
587
|
+
};
|
|
588
|
+
});
|
|
589
|
+
const summarizeArgs = (args) => {
|
|
590
|
+
if (args == null) {
|
|
591
|
+
return args;
|
|
592
|
+
}
|
|
593
|
+
if (typeof args !== "object") {
|
|
594
|
+
return args;
|
|
595
|
+
}
|
|
596
|
+
return Object.keys(args).sort().reduce(
|
|
597
|
+
(acc, k) => {
|
|
598
|
+
acc[k] = args[k];
|
|
599
|
+
return acc;
|
|
600
|
+
},
|
|
601
|
+
{}
|
|
602
|
+
);
|
|
603
|
+
};
|
|
604
|
+
const generateParamMismatchDiff = (paramName, allowed, got) => {
|
|
605
|
+
const diffLines = [];
|
|
606
|
+
diffLines.push(`@@ param ${paramName}`);
|
|
607
|
+
const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
|
|
608
|
+
const expectedLine = (() => {
|
|
609
|
+
if (allowedArray.length === 1) {
|
|
610
|
+
return `- expected: ${JSON.stringify(allowedArray[0])}`;
|
|
611
|
+
}
|
|
612
|
+
const formatted = allowedArray.map(
|
|
613
|
+
(v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
|
|
614
|
+
).join(", ");
|
|
615
|
+
return `- expected one of: ${formatted}`;
|
|
616
|
+
})();
|
|
617
|
+
diffLines.push(expectedLine);
|
|
618
|
+
diffLines.push(`+ got: ${JSON.stringify(got)}`);
|
|
619
|
+
return diffLines;
|
|
620
|
+
};
|
|
621
|
+
const paramValueMatches = (allowed, got) => {
|
|
622
|
+
if (!Array.isArray(allowed)) {
|
|
623
|
+
return false;
|
|
624
|
+
}
|
|
625
|
+
return allowed.some((v) => {
|
|
822
626
|
try {
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
`[DEBUG] ${testCase.id}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
|
|
827
|
-
);
|
|
627
|
+
if (Array.isArray(got)) {
|
|
628
|
+
return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
|
|
629
|
+
}
|
|
828
630
|
} catch (e) {
|
|
829
|
-
caseLogs.push(
|
|
830
|
-
`[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
|
|
831
|
-
);
|
|
832
631
|
}
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
});
|
|
848
|
-
const mwOriginalText = debugSummaryRef.originalText;
|
|
849
|
-
const mwParsedToolCalls = (() => {
|
|
850
|
-
const raw = debugSummaryRef.toolCalls;
|
|
851
|
-
if (!raw) return [];
|
|
852
|
-
try {
|
|
853
|
-
const arr = JSON.parse(raw);
|
|
854
|
-
return Array.isArray(arr) ? arr : [];
|
|
855
|
-
} catch {
|
|
856
|
-
return [];
|
|
857
|
-
}
|
|
858
|
-
})();
|
|
859
|
-
try {
|
|
860
|
-
caseLogs.push(
|
|
861
|
-
`[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
|
|
862
|
-
);
|
|
863
|
-
} catch {
|
|
864
|
-
caseLogs.push(
|
|
865
|
-
`[DEBUG] ${testCase.id}: failed to serialize toolCalls`
|
|
866
|
-
);
|
|
632
|
+
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
633
|
+
});
|
|
634
|
+
};
|
|
635
|
+
const checkFunctionNameMismatch = (expectedName, receivedName, diff) => {
|
|
636
|
+
if (expectedName !== receivedName) {
|
|
637
|
+
diff.push("@@ function name");
|
|
638
|
+
diff.push(`- ${expectedName}`);
|
|
639
|
+
diff.push(`+ ${receivedName}`);
|
|
640
|
+
}
|
|
641
|
+
};
|
|
642
|
+
const checkMissingParams = (required, receivedArgs, diff) => {
|
|
643
|
+
for (const req of required) {
|
|
644
|
+
if (!(req in receivedArgs)) {
|
|
645
|
+
diff.push(`- missing required param: ${req}`);
|
|
867
646
|
}
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
647
|
+
}
|
|
648
|
+
};
|
|
649
|
+
const checkUnexpectedParams = (expectedParams, receivedArgs, diff) => {
|
|
650
|
+
for (const k of Object.keys(receivedArgs)) {
|
|
651
|
+
if (!(k in expectedParams)) {
|
|
652
|
+
diff.push(`+ unexpected param: ${k}`);
|
|
871
653
|
}
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
} catch {
|
|
882
|
-
}
|
|
654
|
+
}
|
|
655
|
+
};
|
|
656
|
+
const checkParamValueMismatches = (expectedParams, receivedArgs, diff) => {
|
|
657
|
+
for (const k of Object.keys(receivedArgs)) {
|
|
658
|
+
if (k in expectedParams) {
|
|
659
|
+
const allowed = expectedParams[k];
|
|
660
|
+
const got = receivedArgs[k];
|
|
661
|
+
if (!paramValueMatches(allowed, got)) {
|
|
662
|
+
diff.push(...generateParamMismatchDiff(k, allowed, got));
|
|
883
663
|
}
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
664
|
+
}
|
|
665
|
+
}
|
|
666
|
+
};
|
|
667
|
+
const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
|
|
668
|
+
var _a, _b, _c, _d;
|
|
669
|
+
const funcDesc = tools[0];
|
|
670
|
+
const gt = (_a = possibleAnswer.ground_truth) == null ? void 0 : _a[0];
|
|
671
|
+
const expectedFuncName = funcDesc == null ? void 0 : funcDesc.name;
|
|
672
|
+
const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
|
|
673
|
+
const received = restoredCalls[0];
|
|
674
|
+
const receivedName = (_b = received == null ? void 0 : received.toolName) != null ? _b : received == null ? void 0 : received.name;
|
|
675
|
+
const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
|
|
676
|
+
const expected = {
|
|
677
|
+
function: expectedFuncName,
|
|
678
|
+
params: expectedParams
|
|
679
|
+
};
|
|
680
|
+
const actual = {
|
|
681
|
+
function: receivedName,
|
|
682
|
+
args: receivedArgs
|
|
683
|
+
};
|
|
684
|
+
const diff = [];
|
|
685
|
+
checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
|
|
686
|
+
if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
687
|
+
const required = (_d = (_c = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _c.required) != null ? _d : [];
|
|
688
|
+
checkMissingParams(
|
|
689
|
+
required,
|
|
690
|
+
receivedArgs,
|
|
691
|
+
diff
|
|
692
|
+
);
|
|
693
|
+
checkUnexpectedParams(
|
|
694
|
+
expectedParams,
|
|
695
|
+
receivedArgs,
|
|
696
|
+
diff
|
|
697
|
+
);
|
|
698
|
+
checkParamValueMismatches(
|
|
699
|
+
expectedParams,
|
|
700
|
+
receivedArgs,
|
|
701
|
+
diff
|
|
702
|
+
);
|
|
703
|
+
}
|
|
704
|
+
return { expected, actual, diff };
|
|
705
|
+
};
|
|
706
|
+
const checkCallCountMismatch = (expectedCount, actualCount, diff) => {
|
|
707
|
+
if (expectedCount !== actualCount) {
|
|
708
|
+
diff.push("@@ call count");
|
|
709
|
+
diff.push(`- expected ${expectedCount}`);
|
|
710
|
+
diff.push(`+ got ${actualCount}`);
|
|
711
|
+
}
|
|
712
|
+
};
|
|
713
|
+
const addMissingAndExtraFunctions = (expectedNames, actualNames, diff) => {
|
|
714
|
+
const missing = expectedNames.filter((n) => !actualNames.includes(n));
|
|
715
|
+
const extra = actualNames.filter((n) => !expectedNames.includes(n));
|
|
716
|
+
for (const m of missing) {
|
|
717
|
+
diff.push(`- missing function: ${m}`);
|
|
718
|
+
}
|
|
719
|
+
for (const e of extra) {
|
|
720
|
+
diff.push(`+ unexpected function: ${e}`);
|
|
721
|
+
}
|
|
722
|
+
};
|
|
723
|
+
const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
|
|
724
|
+
var _a;
|
|
725
|
+
for (let i = 0; i < restoredCalls.length; i += 1) {
|
|
726
|
+
if (usedActual.has(i)) {
|
|
727
|
+
continue;
|
|
728
|
+
}
|
|
729
|
+
const rc = restoredCalls[i];
|
|
730
|
+
const rcName = (_a = rc == null ? void 0 : rc.toolName) != null ? _a : rc == null ? void 0 : rc.name;
|
|
731
|
+
if (rcName === fname) {
|
|
732
|
+
return i;
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
return -1;
|
|
736
|
+
};
|
|
737
|
+
const validateFunctionParams = (options) => {
|
|
738
|
+
const { receivedArgs, expectedParamsAllowed, requiredParams, diff } = options;
|
|
739
|
+
checkMissingParams(requiredParams, receivedArgs, diff);
|
|
740
|
+
checkUnexpectedParams(expectedParamsAllowed, receivedArgs, diff);
|
|
741
|
+
checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
|
|
742
|
+
};
|
|
743
|
+
const processExpectedCall = (options) => {
|
|
744
|
+
var _a, _b;
|
|
745
|
+
const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
|
|
746
|
+
const fname = Object.keys(expectedObj)[0];
|
|
747
|
+
const matchedIndex = findMatchingCallIndex(
|
|
748
|
+
fname,
|
|
749
|
+
restoredCalls,
|
|
750
|
+
usedActual
|
|
751
|
+
);
|
|
752
|
+
if (matchedIndex === -1) {
|
|
753
|
+
return;
|
|
754
|
+
}
|
|
755
|
+
usedActual.add(matchedIndex);
|
|
756
|
+
const received = restoredCalls[matchedIndex];
|
|
757
|
+
const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
|
|
758
|
+
const expectedParamsAllowed = expectedObj[fname];
|
|
759
|
+
const funcDesc = tools.find((t) => t.name === fname);
|
|
760
|
+
const requiredParams = (_b = (_a = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _a.required) != null ? _b : [];
|
|
761
|
+
diff.push(`@@ function ${fname}`);
|
|
762
|
+
if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
763
|
+
validateFunctionParams({
|
|
764
|
+
receivedArgs,
|
|
765
|
+
expectedParamsAllowed,
|
|
766
|
+
requiredParams,
|
|
767
|
+
diff
|
|
890
768
|
});
|
|
891
|
-
|
|
892
|
-
|
|
769
|
+
}
|
|
770
|
+
};
|
|
771
|
+
const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
|
|
772
|
+
var _a;
|
|
773
|
+
const gtArr = (_a = possibleAnswer.ground_truth) != null ? _a : [];
|
|
774
|
+
const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
|
|
775
|
+
const actualNames = restoredCalls.map(
|
|
776
|
+
(c) => {
|
|
777
|
+
var _a2;
|
|
778
|
+
return (_a2 = c.toolName) != null ? _a2 : c.name;
|
|
779
|
+
}
|
|
780
|
+
);
|
|
781
|
+
const expected = {
|
|
782
|
+
functions: expectedNames
|
|
783
|
+
};
|
|
784
|
+
const actual = { functions: actualNames };
|
|
785
|
+
const diff = [];
|
|
786
|
+
checkCallCountMismatch(
|
|
787
|
+
expectedNames.length,
|
|
788
|
+
actualNames.length,
|
|
789
|
+
diff
|
|
790
|
+
);
|
|
791
|
+
addMissingAndExtraFunctions(expectedNames, actualNames, diff);
|
|
792
|
+
const usedActual = /* @__PURE__ */ new Set();
|
|
793
|
+
for (const expectedObj of gtArr) {
|
|
794
|
+
processExpectedCall({
|
|
795
|
+
expectedObj,
|
|
893
796
|
restoredCalls,
|
|
894
|
-
|
|
797
|
+
tools,
|
|
798
|
+
usedActual,
|
|
799
|
+
diff
|
|
800
|
+
});
|
|
801
|
+
}
|
|
802
|
+
return { expected, actual, diff };
|
|
803
|
+
};
|
|
804
|
+
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
|
|
805
|
+
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 16;
|
|
806
|
+
logs.push(
|
|
807
|
+
`[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
|
|
808
|
+
);
|
|
809
|
+
const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
|
|
810
|
+
var _a, _b, _c, _d;
|
|
811
|
+
try {
|
|
812
|
+
const firstTool = transformedTools[0];
|
|
813
|
+
const schemaType = (_d = (_a = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _a.type) != null ? _d : (_c = (_b = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _b.jsonSchema) == null ? void 0 : _c.type;
|
|
814
|
+
caseLogs.push(
|
|
815
|
+
`[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
|
|
895
816
|
);
|
|
896
|
-
if (checkerResult.valid) {
|
|
897
|
-
caseLogs.push(`[PASS] ${testCase.id}`);
|
|
898
|
-
return { valid: true, logs: caseLogs };
|
|
899
|
-
} else {
|
|
900
|
-
caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
|
|
901
|
-
try {
|
|
902
|
-
let generateParamMismatchDiff2 = function(paramName, allowed, got) {
|
|
903
|
-
const diffLines = [];
|
|
904
|
-
diffLines.push(`@@ param ${paramName}`);
|
|
905
|
-
const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
|
|
906
|
-
const expectedLine = (() => {
|
|
907
|
-
if (allowedArray.length === 1) {
|
|
908
|
-
return `- expected: ${JSON.stringify(allowedArray[0])}`;
|
|
909
|
-
}
|
|
910
|
-
const formatted = allowedArray.map(
|
|
911
|
-
(v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
|
|
912
|
-
).join(", ");
|
|
913
|
-
return `- expected one of: ${formatted}`;
|
|
914
|
-
})();
|
|
915
|
-
diffLines.push(expectedLine);
|
|
916
|
-
diffLines.push(`+ got: ${JSON.stringify(got)}`);
|
|
917
|
-
return diffLines;
|
|
918
|
-
};
|
|
919
|
-
var generateParamMismatchDiff = generateParamMismatchDiff2;
|
|
920
|
-
const category = testCase.id.split("_")[0];
|
|
921
|
-
const diff = [];
|
|
922
|
-
const summarizeArgs = (args) => {
|
|
923
|
-
if (args == null) return args;
|
|
924
|
-
if (typeof args !== "object") return args;
|
|
925
|
-
return Object.keys(args).sort().reduce(
|
|
926
|
-
(acc, k) => {
|
|
927
|
-
acc[k] = args[k];
|
|
928
|
-
return acc;
|
|
929
|
-
},
|
|
930
|
-
{}
|
|
931
|
-
);
|
|
932
|
-
};
|
|
933
|
-
const expected = {};
|
|
934
|
-
const actual = {};
|
|
935
|
-
if (category === "simple") {
|
|
936
|
-
const funcDesc = tools[0];
|
|
937
|
-
const gt = possibleAnswer.ground_truth?.[0];
|
|
938
|
-
const expectedFuncName = funcDesc?.name;
|
|
939
|
-
const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
|
|
940
|
-
const received = restoredCalls[0];
|
|
941
|
-
const receivedName = received?.toolName ?? received?.name;
|
|
942
|
-
const receivedArgs = summarizeArgs(received?.args);
|
|
943
|
-
expected.function = expectedFuncName;
|
|
944
|
-
expected.params = expectedParams;
|
|
945
|
-
actual.function = receivedName;
|
|
946
|
-
actual.args = receivedArgs;
|
|
947
|
-
if (expectedFuncName !== receivedName) {
|
|
948
|
-
diff.push(`@@ function name`);
|
|
949
|
-
diff.push(`- ${expectedFuncName}`);
|
|
950
|
-
diff.push(`+ ${receivedName}`);
|
|
951
|
-
}
|
|
952
|
-
if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
953
|
-
const required = funcDesc?.parameters?.required ?? [];
|
|
954
|
-
for (const req of required) {
|
|
955
|
-
if (!(req in receivedArgs)) {
|
|
956
|
-
diff.push(`- missing required param: ${req}`);
|
|
957
|
-
}
|
|
958
|
-
}
|
|
959
|
-
for (const k of Object.keys(
|
|
960
|
-
receivedArgs
|
|
961
|
-
)) {
|
|
962
|
-
if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
963
|
-
diff.push(`+ unexpected param: ${k}`);
|
|
964
|
-
}
|
|
965
|
-
}
|
|
966
|
-
for (const k of Object.keys(
|
|
967
|
-
receivedArgs
|
|
968
|
-
)) {
|
|
969
|
-
if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
970
|
-
const allowed = expectedParams[k];
|
|
971
|
-
const got = receivedArgs[k];
|
|
972
|
-
const includes = Array.isArray(allowed) && allowed.some((v) => {
|
|
973
|
-
try {
|
|
974
|
-
if (Array.isArray(got)) {
|
|
975
|
-
return JSON.stringify(
|
|
976
|
-
got.map((x) => String(x)).sort()
|
|
977
|
-
) === JSON.stringify(
|
|
978
|
-
v.map((x) => String(x)).sort()
|
|
979
|
-
);
|
|
980
|
-
}
|
|
981
|
-
} catch {
|
|
982
|
-
}
|
|
983
|
-
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
984
|
-
});
|
|
985
|
-
if (!includes) {
|
|
986
|
-
diff.push(
|
|
987
|
-
...generateParamMismatchDiff2(k, allowed, got)
|
|
988
|
-
);
|
|
989
|
-
}
|
|
990
|
-
}
|
|
991
|
-
}
|
|
992
|
-
}
|
|
993
|
-
} else {
|
|
994
|
-
const gtArr = possibleAnswer.ground_truth ?? [];
|
|
995
|
-
const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
|
|
996
|
-
const actualNames = restoredCalls.map(
|
|
997
|
-
(c) => c.toolName ?? c.name
|
|
998
|
-
);
|
|
999
|
-
expected.functions = expectedNames;
|
|
1000
|
-
actual.functions = actualNames;
|
|
1001
|
-
if (expectedNames.length !== actualNames.length) {
|
|
1002
|
-
diff.push(`@@ call count`);
|
|
1003
|
-
diff.push(`- expected ${expectedNames.length}`);
|
|
1004
|
-
diff.push(`+ got ${actualNames.length}`);
|
|
1005
|
-
}
|
|
1006
|
-
const missing = expectedNames.filter(
|
|
1007
|
-
(n) => !actualNames.includes(n)
|
|
1008
|
-
);
|
|
1009
|
-
const extra = actualNames.filter(
|
|
1010
|
-
(n) => !expectedNames.includes(n)
|
|
1011
|
-
);
|
|
1012
|
-
for (const m of missing)
|
|
1013
|
-
diff.push(`- missing function: ${m}`);
|
|
1014
|
-
for (const e of extra)
|
|
1015
|
-
diff.push(`+ unexpected function: ${e}`);
|
|
1016
|
-
const usedActual = /* @__PURE__ */ new Set();
|
|
1017
|
-
for (const expectedObj of gtArr) {
|
|
1018
|
-
const fname = Object.keys(expectedObj)[0];
|
|
1019
|
-
let matchedIndex = -1;
|
|
1020
|
-
for (let i = 0; i < restoredCalls.length; i++) {
|
|
1021
|
-
if (usedActual.has(i)) continue;
|
|
1022
|
-
const rc = restoredCalls[i];
|
|
1023
|
-
const rcName = rc?.toolName ?? rc?.name;
|
|
1024
|
-
if (rcName === fname) {
|
|
1025
|
-
matchedIndex = i;
|
|
1026
|
-
break;
|
|
1027
|
-
}
|
|
1028
|
-
}
|
|
1029
|
-
if (matchedIndex === -1) continue;
|
|
1030
|
-
usedActual.add(matchedIndex);
|
|
1031
|
-
const received = restoredCalls[matchedIndex];
|
|
1032
|
-
const receivedArgs = summarizeArgs(received?.args);
|
|
1033
|
-
const expectedParamsAllowed = expectedObj[fname];
|
|
1034
|
-
const funcDesc = tools.find(
|
|
1035
|
-
(t) => t.name === fname
|
|
1036
|
-
);
|
|
1037
|
-
const requiredParams = funcDesc?.parameters?.required ?? [];
|
|
1038
|
-
diff.push(`@@ function ${fname}`);
|
|
1039
|
-
if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
1040
|
-
for (const req of requiredParams) {
|
|
1041
|
-
if (!(req in receivedArgs)) {
|
|
1042
|
-
diff.push(`- missing required param: ${req}`);
|
|
1043
|
-
}
|
|
1044
|
-
}
|
|
1045
|
-
for (const k of Object.keys(
|
|
1046
|
-
receivedArgs
|
|
1047
|
-
)) {
|
|
1048
|
-
if (!Object.prototype.hasOwnProperty.call(
|
|
1049
|
-
expectedParamsAllowed,
|
|
1050
|
-
k
|
|
1051
|
-
)) {
|
|
1052
|
-
diff.push(`+ unexpected param: ${k}`);
|
|
1053
|
-
}
|
|
1054
|
-
}
|
|
1055
|
-
for (const k of Object.keys(
|
|
1056
|
-
receivedArgs
|
|
1057
|
-
)) {
|
|
1058
|
-
if (Object.prototype.hasOwnProperty.call(
|
|
1059
|
-
expectedParamsAllowed,
|
|
1060
|
-
k
|
|
1061
|
-
)) {
|
|
1062
|
-
const allowed = expectedParamsAllowed[k];
|
|
1063
|
-
const got = receivedArgs[k];
|
|
1064
|
-
const includes = Array.isArray(allowed) && allowed.some((v) => {
|
|
1065
|
-
try {
|
|
1066
|
-
if (Array.isArray(got)) {
|
|
1067
|
-
return JSON.stringify(
|
|
1068
|
-
got.map((x) => String(x)).sort()
|
|
1069
|
-
) === JSON.stringify(
|
|
1070
|
-
v.map((x) => String(x)).sort()
|
|
1071
|
-
);
|
|
1072
|
-
}
|
|
1073
|
-
} catch {
|
|
1074
|
-
}
|
|
1075
|
-
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
1076
|
-
});
|
|
1077
|
-
if (!includes) {
|
|
1078
|
-
diff.push(
|
|
1079
|
-
...generateParamMismatchDiff2(k, allowed, got)
|
|
1080
|
-
);
|
|
1081
|
-
}
|
|
1082
|
-
}
|
|
1083
|
-
}
|
|
1084
|
-
}
|
|
1085
|
-
}
|
|
1086
|
-
}
|
|
1087
|
-
caseLogs.push(
|
|
1088
|
-
`[DEBUG-FAIL] ${JSON.stringify({
|
|
1089
|
-
id: testCase.id,
|
|
1090
|
-
message: checkerResult.error,
|
|
1091
|
-
error_type: checkerResult.error_type,
|
|
1092
|
-
expected,
|
|
1093
|
-
actual,
|
|
1094
|
-
diff
|
|
1095
|
-
})}`
|
|
1096
|
-
);
|
|
1097
|
-
try {
|
|
1098
|
-
const lastUser = (() => {
|
|
1099
|
-
const reversed = [...flatMessages].reverse();
|
|
1100
|
-
const found = reversed.find(
|
|
1101
|
-
(m) => m.role === "user"
|
|
1102
|
-
);
|
|
1103
|
-
return found?.content ?? void 0;
|
|
1104
|
-
})();
|
|
1105
|
-
const contextPayload = {
|
|
1106
|
-
id: testCase.id,
|
|
1107
|
-
tool_schema: tools,
|
|
1108
|
-
last_user_query: lastUser,
|
|
1109
|
-
raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
|
|
1110
|
-
finish_reason: finishReason,
|
|
1111
|
-
parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
|
|
1112
|
-
ground_truth: possibleAnswer.ground_truth
|
|
1113
|
-
};
|
|
1114
|
-
caseLogs.push(
|
|
1115
|
-
`[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
|
|
1116
|
-
);
|
|
1117
|
-
} catch {
|
|
1118
|
-
}
|
|
1119
|
-
} catch {
|
|
1120
|
-
caseLogs.push(
|
|
1121
|
-
`[DEBUG] ${testCase.id}: failed to build debug diff`
|
|
1122
|
-
);
|
|
1123
|
-
}
|
|
1124
|
-
return { valid: false, logs: caseLogs };
|
|
1125
|
-
}
|
|
1126
817
|
} catch (e) {
|
|
1127
818
|
caseLogs.push(
|
|
1128
|
-
`[
|
|
819
|
+
`[DEBUG] ${testCaseId}: failed to introspect tools: ${e.message}`
|
|
1129
820
|
);
|
|
1130
|
-
if (e?.stack) {
|
|
1131
|
-
caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
|
|
1132
|
-
}
|
|
1133
|
-
return { valid: false, logs: caseLogs };
|
|
1134
821
|
}
|
|
1135
822
|
};
|
|
1136
|
-
const
|
|
1137
|
-
const
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
return results;
|
|
823
|
+
const logRawToolCalls = (options) => {
|
|
824
|
+
const { toolCalls, finishReason, text, testCaseId, caseLogs } = options;
|
|
825
|
+
try {
|
|
826
|
+
caseLogs.push(
|
|
827
|
+
`[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
|
|
828
|
+
);
|
|
829
|
+
} catch (e) {
|
|
830
|
+
caseLogs.push(
|
|
831
|
+
`[DEBUG] ${testCaseId}: failed to serialize toolCalls`
|
|
832
|
+
);
|
|
833
|
+
}
|
|
1148
834
|
};
|
|
1149
|
-
const
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
835
|
+
const buildFailureContext = (options) => {
|
|
836
|
+
const {
|
|
837
|
+
testCase,
|
|
838
|
+
tools,
|
|
839
|
+
flatMessages,
|
|
840
|
+
mwOriginalText,
|
|
841
|
+
text,
|
|
842
|
+
finishReason,
|
|
843
|
+
mwParsedToolCalls,
|
|
844
|
+
restoredCalls,
|
|
845
|
+
possibleAnswer
|
|
846
|
+
} = options;
|
|
847
|
+
const lastUser = (() => {
|
|
848
|
+
var _a;
|
|
849
|
+
const reversed = [...flatMessages].reverse();
|
|
850
|
+
const found = reversed.find(
|
|
851
|
+
(m) => m.role === "user"
|
|
852
|
+
);
|
|
853
|
+
return (_a = found == null ? void 0 : found.content) != null ? _a : void 0;
|
|
854
|
+
})();
|
|
855
|
+
const rawModelText = (() => {
|
|
856
|
+
if (mwOriginalText && mwOriginalText.length > 0) {
|
|
857
|
+
return mwOriginalText;
|
|
858
|
+
}
|
|
859
|
+
if (typeof text === "string") {
|
|
860
|
+
return text;
|
|
861
|
+
}
|
|
862
|
+
return "";
|
|
863
|
+
})();
|
|
1160
864
|
return {
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
865
|
+
id: testCase.id,
|
|
866
|
+
tool_schema: tools,
|
|
867
|
+
last_user_query: lastUser,
|
|
868
|
+
raw_model_text: rawModelText,
|
|
869
|
+
finish_reason: finishReason,
|
|
870
|
+
parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
|
|
871
|
+
ground_truth: possibleAnswer.ground_truth
|
|
1165
872
|
};
|
|
1166
|
-
}
|
|
1167
|
-
const
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
873
|
+
};
|
|
874
|
+
const logFailureDetails = (options) => {
|
|
875
|
+
const {
|
|
876
|
+
testCase,
|
|
877
|
+
tools,
|
|
878
|
+
possibleAnswer,
|
|
879
|
+
restoredCalls,
|
|
880
|
+
checkerResult,
|
|
881
|
+
flatMessages,
|
|
882
|
+
mwOriginalText,
|
|
883
|
+
text,
|
|
884
|
+
finishReason,
|
|
885
|
+
mwParsedToolCalls,
|
|
886
|
+
caseLogs
|
|
887
|
+
} = options;
|
|
888
|
+
try {
|
|
889
|
+
const category = testCase.id.split("_")[0];
|
|
890
|
+
const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
|
|
891
|
+
tools,
|
|
892
|
+
possibleAnswer,
|
|
893
|
+
restoredCalls
|
|
894
|
+
) : buildParallelDiff(
|
|
895
|
+
tools,
|
|
896
|
+
possibleAnswer,
|
|
897
|
+
restoredCalls
|
|
898
|
+
);
|
|
899
|
+
caseLogs.push(
|
|
900
|
+
`[DEBUG-FAIL] ${JSON.stringify({
|
|
901
|
+
id: testCase.id,
|
|
902
|
+
message: checkerResult.error,
|
|
903
|
+
error_type: checkerResult.error_type,
|
|
904
|
+
expected,
|
|
905
|
+
actual,
|
|
906
|
+
diff
|
|
907
|
+
})}`
|
|
908
|
+
);
|
|
909
|
+
try {
|
|
910
|
+
const contextPayload = buildFailureContext({
|
|
911
|
+
testCase,
|
|
912
|
+
tools,
|
|
913
|
+
flatMessages,
|
|
914
|
+
mwOriginalText,
|
|
915
|
+
text,
|
|
916
|
+
finishReason,
|
|
917
|
+
mwParsedToolCalls,
|
|
918
|
+
restoredCalls,
|
|
919
|
+
possibleAnswer
|
|
920
|
+
});
|
|
921
|
+
caseLogs.push(
|
|
922
|
+
`[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
|
|
923
|
+
);
|
|
924
|
+
} catch (e) {
|
|
925
|
+
}
|
|
926
|
+
} catch (e) {
|
|
927
|
+
caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
|
|
928
|
+
}
|
|
929
|
+
};
|
|
930
|
+
const buildToolsMap = (transformedTools) => Object.fromEntries(
|
|
931
|
+
transformedTools.map((t) => [
|
|
932
|
+
t.name,
|
|
933
|
+
tool({
|
|
934
|
+
description: typeof t.description === "string" ? t.description : void 0,
|
|
935
|
+
inputSchema: jsonSchema(
|
|
936
|
+
t.inputSchema
|
|
937
|
+
)
|
|
938
|
+
})
|
|
939
|
+
])
|
|
940
|
+
);
|
|
941
|
+
const executeModelGeneration = async (options) => {
|
|
942
|
+
const {
|
|
943
|
+
model: modelInstance,
|
|
944
|
+
flatMessages,
|
|
945
|
+
toolsMap,
|
|
946
|
+
temperature,
|
|
947
|
+
maxTokens
|
|
948
|
+
} = options;
|
|
949
|
+
const debugSummaryRef = {};
|
|
950
|
+
const providerOptions = {
|
|
951
|
+
toolCallMiddleware: {
|
|
952
|
+
debugSummary: debugSummaryRef
|
|
953
|
+
}
|
|
954
|
+
};
|
|
955
|
+
const { toolCalls, text, finishReason } = await generateText({
|
|
956
|
+
model: modelInstance,
|
|
957
|
+
messages: flatMessages,
|
|
958
|
+
tools: toolsMap,
|
|
959
|
+
toolChoice: "auto",
|
|
960
|
+
providerOptions,
|
|
961
|
+
...temperature !== void 0 ? { temperature } : {},
|
|
962
|
+
...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
|
|
963
|
+
});
|
|
964
|
+
return { toolCalls, text, finishReason, debugSummaryRef };
|
|
965
|
+
};
|
|
966
|
+
const processValidationResult = (options) => {
|
|
967
|
+
const {
|
|
968
|
+
checkerResult,
|
|
969
|
+
testCase,
|
|
970
|
+
tools,
|
|
971
|
+
possibleAnswer,
|
|
972
|
+
restoredCalls,
|
|
973
|
+
flatMessages,
|
|
974
|
+
mwOriginalText,
|
|
975
|
+
text,
|
|
976
|
+
finishReason,
|
|
977
|
+
mwParsedToolCalls,
|
|
978
|
+
caseLogs
|
|
979
|
+
} = options;
|
|
980
|
+
if (checkerResult.valid) {
|
|
981
|
+
caseLogs.push(`[PASS] ${testCase.id}`);
|
|
982
|
+
return { valid: true, logs: caseLogs };
|
|
983
|
+
}
|
|
984
|
+
caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
|
|
985
|
+
logFailureDetails({
|
|
986
|
+
testCase,
|
|
987
|
+
tools,
|
|
988
|
+
possibleAnswer,
|
|
989
|
+
restoredCalls,
|
|
990
|
+
checkerResult,
|
|
991
|
+
flatMessages,
|
|
992
|
+
mwOriginalText,
|
|
993
|
+
text,
|
|
994
|
+
finishReason,
|
|
995
|
+
mwParsedToolCalls,
|
|
996
|
+
caseLogs
|
|
997
|
+
});
|
|
998
|
+
return { valid: false, logs: caseLogs };
|
|
999
|
+
};
|
|
1000
|
+
const prepareTestCaseData = (testCase) => {
|
|
1001
|
+
const { function: tools, question: messages } = testCase;
|
|
1002
|
+
const flatMessages = flattenMessages(messages);
|
|
1003
|
+
const { transformedTools, nameMap } = buildTransformedTools(
|
|
1004
|
+
tools,
|
|
1005
|
+
fixSchema2
|
|
1006
|
+
);
|
|
1007
|
+
const toolsMap = buildToolsMap(transformedTools);
|
|
1008
|
+
return { flatMessages, transformedTools, nameMap, toolsMap };
|
|
1009
|
+
};
|
|
1010
|
+
const processModelResponse = (options) => {
|
|
1011
|
+
const {
|
|
1012
|
+
testCase,
|
|
1013
|
+
toolCalls,
|
|
1014
|
+
text,
|
|
1015
|
+
finishReason,
|
|
1016
|
+
debugSummaryRef,
|
|
1017
|
+
nameMap,
|
|
1018
|
+
transformedTools,
|
|
1019
|
+
flatMessages,
|
|
1020
|
+
tools,
|
|
1021
|
+
caseLogs
|
|
1022
|
+
} = options;
|
|
1023
|
+
const mwOriginalText = debugSummaryRef.originalText;
|
|
1024
|
+
const mwParsedToolCalls = parseDebugToolCalls(
|
|
1025
|
+
debugSummaryRef.toolCalls
|
|
1026
|
+
);
|
|
1027
|
+
const possibleAnswer = possibleAnswersMap.get(testCase.id);
|
|
1028
|
+
if (!possibleAnswer) {
|
|
1029
|
+
throw new Error(`No possible answer for id: ${testCase.id}`);
|
|
1030
|
+
}
|
|
1031
|
+
if (process.env.DEBUG_PARSER_OUTPUT === "true") {
|
|
1032
|
+
const groundTruth = possibleAnswer.ground_truth;
|
|
1033
|
+
const expectedXML = groundTruth.map((call) => convertGroundTruthToXML(call)).join("\n\n");
|
|
1034
|
+
console.log("\n========== BFCL CASE DEBUG ==========");
|
|
1035
|
+
console.log(`Test Case: ${testCase.id}`);
|
|
1036
|
+
console.log(`Expected count: ${groundTruth.length} call(s)`);
|
|
1037
|
+
console.log("\n--- EXPECTED OUTPUT (morphXML format) ---");
|
|
1038
|
+
console.log(expectedXML);
|
|
1039
|
+
console.log("\n--- ACTUAL MODEL OUTPUT (raw, with whitespace) ---");
|
|
1040
|
+
console.log(mwOriginalText || text || "(empty)");
|
|
1041
|
+
console.log(
|
|
1042
|
+
"\n--- PARSED TOOL CALLS (count: " + (Array.isArray(toolCalls) ? toolCalls.length : 0) + ") ---"
|
|
1043
|
+
);
|
|
1044
|
+
console.log(JSON.stringify(toolCalls, null, 2));
|
|
1045
|
+
console.log("======================================\n");
|
|
1046
|
+
}
|
|
1047
|
+
logRawToolCalls({
|
|
1048
|
+
toolCalls,
|
|
1049
|
+
finishReason,
|
|
1050
|
+
text,
|
|
1051
|
+
testCaseId: testCase.id,
|
|
1052
|
+
caseLogs
|
|
1053
|
+
});
|
|
1054
|
+
const restoredCalls = restoreToolCalls(
|
|
1055
|
+
toolCalls || [],
|
|
1056
|
+
nameMap,
|
|
1057
|
+
transformedTools
|
|
1058
|
+
);
|
|
1059
|
+
const checkerResult = check(testCase, restoredCalls, possibleAnswer);
|
|
1060
|
+
return processValidationResult({
|
|
1061
|
+
checkerResult,
|
|
1062
|
+
testCase,
|
|
1063
|
+
tools,
|
|
1064
|
+
possibleAnswer,
|
|
1065
|
+
restoredCalls,
|
|
1066
|
+
flatMessages,
|
|
1067
|
+
mwOriginalText,
|
|
1068
|
+
text,
|
|
1069
|
+
finishReason,
|
|
1070
|
+
mwParsedToolCalls,
|
|
1071
|
+
caseLogs
|
|
1072
|
+
});
|
|
1073
|
+
};
|
|
1074
|
+
const runSingleCase2 = async (testCase) => {
|
|
1075
|
+
const caseLogs = [];
|
|
1076
|
+
const { function: tools } = testCase;
|
|
1077
|
+
const temp = config == null ? void 0 : config.temperature;
|
|
1078
|
+
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1079
|
+
const maxTok = config == null ? void 0 : config.maxTokens;
|
|
1080
|
+
const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
|
|
1081
|
+
try {
|
|
1082
|
+
const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
|
|
1083
|
+
logFirstToolDebug(transformedTools, testCase.id, caseLogs);
|
|
1084
|
+
const { toolCalls, text, finishReason, debugSummaryRef } = await executeModelGeneration({
|
|
1085
|
+
model,
|
|
1086
|
+
flatMessages,
|
|
1087
|
+
toolsMap,
|
|
1088
|
+
temperature,
|
|
1089
|
+
maxTokens
|
|
1090
|
+
});
|
|
1091
|
+
return processModelResponse({
|
|
1092
|
+
testCase,
|
|
1093
|
+
toolCalls,
|
|
1094
|
+
text,
|
|
1095
|
+
finishReason,
|
|
1096
|
+
debugSummaryRef,
|
|
1097
|
+
nameMap,
|
|
1098
|
+
transformedTools,
|
|
1099
|
+
flatMessages,
|
|
1100
|
+
tools,
|
|
1101
|
+
caseLogs
|
|
1102
|
+
});
|
|
1103
|
+
} catch (e) {
|
|
1104
|
+
caseLogs.push(
|
|
1105
|
+
`[ERROR] ${testCase.id}: Model generation failed: ${e == null ? void 0 : e.message}`
|
|
1106
|
+
);
|
|
1107
|
+
if (e == null ? void 0 : e.stack) {
|
|
1108
|
+
caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
|
|
1109
|
+
}
|
|
1110
|
+
return { valid: false, logs: caseLogs };
|
|
1111
|
+
}
|
|
1112
|
+
};
|
|
1113
|
+
const mapWithConcurrency2 = async (items, concurrencyLimit, mapper) => {
|
|
1114
|
+
const results = new Array(items.length);
|
|
1115
|
+
let idx = 0;
|
|
1116
|
+
const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
|
|
1117
|
+
while (true) {
|
|
1118
|
+
const current = idx;
|
|
1119
|
+
idx += 1;
|
|
1120
|
+
if (current >= items.length) {
|
|
1121
|
+
break;
|
|
1122
|
+
}
|
|
1123
|
+
results[current] = await mapper(items[current], current);
|
|
1124
|
+
}
|
|
1125
|
+
});
|
|
1126
|
+
await Promise.all(workers);
|
|
1127
|
+
return results;
|
|
1128
|
+
};
|
|
1129
|
+
const resultsPerCase = await mapWithConcurrency2(
|
|
1130
|
+
testCases,
|
|
1131
|
+
concurrency,
|
|
1132
|
+
async (tc) => runSingleCase2(tc)
|
|
1133
|
+
);
|
|
1134
|
+
correctCount = resultsPerCase.reduce(
|
|
1135
|
+
(acc, r) => acc + (r.valid ? 1 : 0),
|
|
1136
|
+
0
|
|
1137
|
+
);
|
|
1138
|
+
for (const r of resultsPerCase) {
|
|
1139
|
+
logs.push(...r.logs);
|
|
1140
|
+
}
|
|
1141
|
+
if (testCases.length === 0) {
|
|
1142
|
+
return {
|
|
1143
|
+
score: 0,
|
|
1144
|
+
success: false,
|
|
1145
|
+
metrics: {},
|
|
1146
|
+
logs: ["No test cases found."]
|
|
1147
|
+
};
|
|
1148
|
+
}
|
|
1149
|
+
const score = correctCount / testCases.length;
|
|
1150
|
+
return {
|
|
1151
|
+
score,
|
|
1152
|
+
success: score > 0.95,
|
|
1153
|
+
// High success threshold as requested
|
|
1154
|
+
metrics: {
|
|
1155
|
+
correct_count: correctCount,
|
|
1156
|
+
total_cases: testCases.length,
|
|
1157
|
+
accuracy: score
|
|
1158
|
+
},
|
|
1159
|
+
logs
|
|
1178
1160
|
};
|
|
1179
1161
|
} catch (e) {
|
|
1180
1162
|
return {
|
|
@@ -1182,7 +1164,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1182
1164
|
success: false,
|
|
1183
1165
|
metrics: {},
|
|
1184
1166
|
error: e,
|
|
1185
|
-
logs: [
|
|
1167
|
+
logs: [
|
|
1168
|
+
`[FATAL] Failed to run benchmark ${name}: ${e.message}`
|
|
1169
|
+
]
|
|
1186
1170
|
};
|
|
1187
1171
|
}
|
|
1188
1172
|
}
|
|
@@ -1191,204 +1175,721 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1191
1175
|
var bfclSimpleBenchmark = createBfclBenchmark(
|
|
1192
1176
|
"bfcl-simple",
|
|
1193
1177
|
"BFCL Simple Function Calling",
|
|
1194
|
-
"BFCL_v3_simple.
|
|
1195
|
-
"BFCL_v3_simple_possible_answer.
|
|
1178
|
+
"BFCL_v3_simple.jsonl",
|
|
1179
|
+
"BFCL_v3_simple_possible_answer.jsonl"
|
|
1196
1180
|
);
|
|
1197
1181
|
var bfclParallelBenchmark = createBfclBenchmark(
|
|
1198
1182
|
"bfcl-parallel",
|
|
1199
1183
|
"BFCL Parallel Function Calling",
|
|
1200
|
-
"BFCL_v3_parallel.
|
|
1201
|
-
"BFCL_v3_parallel_possible_answer.
|
|
1184
|
+
"BFCL_v3_parallel.jsonl",
|
|
1185
|
+
"BFCL_v3_parallel_possible_answer.jsonl"
|
|
1202
1186
|
);
|
|
1203
1187
|
var bfclMultipleBenchmark = createBfclBenchmark(
|
|
1204
1188
|
"bfcl-multiple",
|
|
1205
1189
|
"BFCL Multiple Function Calling",
|
|
1206
|
-
"BFCL_v3_multiple.
|
|
1207
|
-
"BFCL_v3_multiple_possible_answer.
|
|
1190
|
+
"BFCL_v3_multiple.jsonl",
|
|
1191
|
+
"BFCL_v3_multiple_possible_answer.jsonl"
|
|
1208
1192
|
);
|
|
1209
1193
|
var bfclParallelMultipleBenchmark = createBfclBenchmark(
|
|
1210
1194
|
"bfcl-parallel-multiple",
|
|
1211
1195
|
"BFCL Parallel & Multiple Function Calling",
|
|
1212
|
-
"BFCL_v3_parallel_multiple.
|
|
1213
|
-
"BFCL_v3_parallel_multiple_possible_answer.
|
|
1196
|
+
"BFCL_v3_parallel_multiple.jsonl",
|
|
1197
|
+
"BFCL_v3_parallel_multiple_possible_answer.jsonl"
|
|
1214
1198
|
);
|
|
1215
1199
|
|
|
1216
|
-
// src/benchmarks/
|
|
1217
|
-
import { generateText as generateText2 } from "ai";
|
|
1218
|
-
import Ajv from "ajv";
|
|
1200
|
+
// src/benchmarks/complex-func-bench.ts
|
|
1219
1201
|
import { promises as fs3 } from "fs";
|
|
1220
1202
|
import path3 from "path";
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1203
|
+
import {
|
|
1204
|
+
generateText as generateText2,
|
|
1205
|
+
jsonSchema as jsonSchema2,
|
|
1206
|
+
tool as tool2
|
|
1207
|
+
} from "ai";
|
|
1208
|
+
var LINE_SPLIT_REGEX2 = /\r?\n/;
|
|
1209
|
+
function standardizeString2(input) {
|
|
1210
|
+
if (typeof input !== "string") {
|
|
1211
|
+
return input;
|
|
1212
|
+
}
|
|
1213
|
+
return input.toLowerCase().trim();
|
|
1214
|
+
}
|
|
1215
|
+
function valuesMatch2(modelValue, expectedValue) {
|
|
1216
|
+
if (modelValue === expectedValue) {
|
|
1217
|
+
return true;
|
|
1218
|
+
}
|
|
1219
|
+
if (typeof modelValue === "string" && typeof expectedValue === "string") {
|
|
1220
|
+
return standardizeString2(modelValue) === standardizeString2(expectedValue);
|
|
1221
|
+
}
|
|
1222
|
+
if (typeof modelValue === "number" && typeof expectedValue === "string") {
|
|
1223
|
+
return modelValue.toString() === expectedValue || modelValue === Number(expectedValue);
|
|
1224
|
+
}
|
|
1225
|
+
if (typeof modelValue === "string" && typeof expectedValue === "number") {
|
|
1226
|
+
return modelValue === expectedValue.toString() || Number(modelValue) === expectedValue;
|
|
1225
1227
|
}
|
|
1226
|
-
|
|
1227
|
-
if (fenceMatch) {
|
|
1228
|
-
const inner = fenceMatch[1].trim();
|
|
1228
|
+
if (typeof modelValue === "object" && modelValue !== null && typeof expectedValue === "object" && expectedValue !== null) {
|
|
1229
1229
|
try {
|
|
1230
|
-
return JSON.
|
|
1231
|
-
} catch {
|
|
1230
|
+
return JSON.stringify(modelValue) === JSON.stringify(expectedValue);
|
|
1231
|
+
} catch (e) {
|
|
1232
|
+
return false;
|
|
1233
|
+
}
|
|
1234
|
+
}
|
|
1235
|
+
return false;
|
|
1236
|
+
}
|
|
1237
|
+
function validateFunctionName(modelFuncName, expectedFuncName) {
|
|
1238
|
+
if (modelFuncName !== expectedFuncName) {
|
|
1239
|
+
return {
|
|
1240
|
+
valid: false,
|
|
1241
|
+
error: `Function name mismatch: expected '${expectedFuncName}', got '${modelFuncName}'`,
|
|
1242
|
+
error_type: "function_name_mismatch"
|
|
1243
|
+
};
|
|
1244
|
+
}
|
|
1245
|
+
return { valid: true };
|
|
1246
|
+
}
|
|
1247
|
+
function validateRequiredParams(requiredParams, modelArgs, expectedArgs) {
|
|
1248
|
+
for (const param of requiredParams) {
|
|
1249
|
+
if (!(param in modelArgs) && param in expectedArgs) {
|
|
1250
|
+
return {
|
|
1251
|
+
valid: false,
|
|
1252
|
+
error: `Missing required parameter: '${param}'`,
|
|
1253
|
+
error_type: "missing_required_param"
|
|
1254
|
+
};
|
|
1255
|
+
}
|
|
1256
|
+
}
|
|
1257
|
+
return { valid: true };
|
|
1258
|
+
}
|
|
1259
|
+
function validateParamValues(expectedArgs, modelArgs, requiredParams) {
|
|
1260
|
+
for (const [paramName, expectedValue] of Object.entries(expectedArgs)) {
|
|
1261
|
+
if (!(paramName in modelArgs)) {
|
|
1262
|
+
if (!requiredParams.includes(paramName)) {
|
|
1263
|
+
continue;
|
|
1264
|
+
}
|
|
1265
|
+
return {
|
|
1266
|
+
valid: false,
|
|
1267
|
+
error: `Missing parameter: '${paramName}'`,
|
|
1268
|
+
error_type: "missing_param"
|
|
1269
|
+
};
|
|
1270
|
+
}
|
|
1271
|
+
const modelValue = modelArgs[paramName];
|
|
1272
|
+
if (!valuesMatch2(modelValue, expectedValue)) {
|
|
1273
|
+
return {
|
|
1274
|
+
valid: false,
|
|
1275
|
+
error: `Parameter '${paramName}' value mismatch: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(modelValue)}`,
|
|
1276
|
+
error_type: "value_mismatch"
|
|
1277
|
+
};
|
|
1278
|
+
}
|
|
1279
|
+
}
|
|
1280
|
+
return { valid: true };
|
|
1281
|
+
}
|
|
1282
|
+
function checkFunctionCall(modelCall, expected, toolSpecs) {
|
|
1283
|
+
var _a, _b, _c, _d;
|
|
1284
|
+
const expectedFuncName = Object.keys(expected)[0];
|
|
1285
|
+
const expectedArgs = expected[expectedFuncName];
|
|
1286
|
+
const modelFuncName = (_a = modelCall.toolName) != null ? _a : modelCall.name;
|
|
1287
|
+
const modelArgs = (_b = modelCall.args) != null ? _b : {};
|
|
1288
|
+
const nameResult = validateFunctionName(modelFuncName, expectedFuncName);
|
|
1289
|
+
if (!nameResult.valid) {
|
|
1290
|
+
return nameResult;
|
|
1291
|
+
}
|
|
1292
|
+
const toolSpec = toolSpecs.find((t) => t.name === expectedFuncName);
|
|
1293
|
+
const requiredParams = (_d = (_c = toolSpec == null ? void 0 : toolSpec.parameters) == null ? void 0 : _c.required) != null ? _d : [];
|
|
1294
|
+
const requiredResult = validateRequiredParams(
|
|
1295
|
+
requiredParams,
|
|
1296
|
+
modelArgs,
|
|
1297
|
+
expectedArgs
|
|
1298
|
+
);
|
|
1299
|
+
if (!requiredResult.valid) {
|
|
1300
|
+
return requiredResult;
|
|
1301
|
+
}
|
|
1302
|
+
return validateParamValues(expectedArgs, modelArgs, requiredParams);
|
|
1303
|
+
}
|
|
1304
|
+
function checkAllFunctionCalls(modelCalls, expectedCalls, toolSpecs) {
|
|
1305
|
+
if (modelCalls.length !== expectedCalls.length) {
|
|
1306
|
+
return {
|
|
1307
|
+
valid: false,
|
|
1308
|
+
error: `Wrong number of function calls: expected ${expectedCalls.length}, got ${modelCalls.length}`,
|
|
1309
|
+
error_type: "wrong_call_count"
|
|
1310
|
+
};
|
|
1311
|
+
}
|
|
1312
|
+
if (expectedCalls.length === 1) {
|
|
1313
|
+
return checkFunctionCall(modelCalls[0], expectedCalls[0], toolSpecs);
|
|
1314
|
+
}
|
|
1315
|
+
const matchedIndices = /* @__PURE__ */ new Set();
|
|
1316
|
+
for (const expected of expectedCalls) {
|
|
1317
|
+
let foundMatch = false;
|
|
1318
|
+
for (let i = 0; i < modelCalls.length; i++) {
|
|
1319
|
+
if (matchedIndices.has(i)) {
|
|
1320
|
+
continue;
|
|
1321
|
+
}
|
|
1322
|
+
const result = checkFunctionCall(modelCalls[i], expected, toolSpecs);
|
|
1323
|
+
if (result.valid) {
|
|
1324
|
+
matchedIndices.add(i);
|
|
1325
|
+
foundMatch = true;
|
|
1326
|
+
break;
|
|
1327
|
+
}
|
|
1328
|
+
}
|
|
1329
|
+
if (!foundMatch) {
|
|
1330
|
+
const expectedFuncName = Object.keys(expected)[0];
|
|
1331
|
+
return {
|
|
1332
|
+
valid: false,
|
|
1333
|
+
error: `Could not find matching call for function '${expectedFuncName}'`,
|
|
1334
|
+
error_type: "no_matching_call"
|
|
1335
|
+
};
|
|
1336
|
+
}
|
|
1337
|
+
}
|
|
1338
|
+
return { valid: true };
|
|
1339
|
+
}
|
|
1340
|
+
var fixSchemaType = (copy) => {
|
|
1341
|
+
if (!copy.type) {
|
|
1342
|
+
return;
|
|
1343
|
+
}
|
|
1344
|
+
if (copy.type === "dict") {
|
|
1345
|
+
copy.type = "object";
|
|
1346
|
+
}
|
|
1347
|
+
if (copy.type === "tuple") {
|
|
1348
|
+
copy.type = "array";
|
|
1349
|
+
}
|
|
1350
|
+
if (copy.type === "integer" || copy.type === "float") {
|
|
1351
|
+
copy.type = "number";
|
|
1352
|
+
}
|
|
1353
|
+
};
|
|
1354
|
+
var fixSchema = (schema) => {
|
|
1355
|
+
if (!schema || typeof schema !== "object") {
|
|
1356
|
+
return { type: "object", properties: {} };
|
|
1357
|
+
}
|
|
1358
|
+
const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
|
|
1359
|
+
if (!Array.isArray(copy)) {
|
|
1360
|
+
fixSchemaType(copy);
|
|
1361
|
+
if (copy.properties && typeof copy.properties === "object") {
|
|
1362
|
+
for (const k of Object.keys(copy.properties)) {
|
|
1363
|
+
copy.properties[k] = fixSchema(
|
|
1364
|
+
copy.properties[k]
|
|
1365
|
+
);
|
|
1366
|
+
}
|
|
1367
|
+
}
|
|
1368
|
+
if (copy.items) {
|
|
1369
|
+
copy.items = fixSchema(copy.items);
|
|
1370
|
+
}
|
|
1371
|
+
}
|
|
1372
|
+
return copy;
|
|
1373
|
+
};
|
|
1374
|
+
function buildTools(tools) {
|
|
1375
|
+
const nameMap = /* @__PURE__ */ new Map();
|
|
1376
|
+
const transformedTools = tools.map((t) => {
|
|
1377
|
+
const fixed = fixSchema(t.parameters);
|
|
1378
|
+
const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
|
|
1379
|
+
const sanitized = t.name.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64) || "tool";
|
|
1380
|
+
nameMap.set(sanitized, t.name);
|
|
1381
|
+
return {
|
|
1382
|
+
type: "function",
|
|
1383
|
+
name: sanitized,
|
|
1384
|
+
description: t.description,
|
|
1385
|
+
inputSchema
|
|
1386
|
+
};
|
|
1387
|
+
});
|
|
1388
|
+
const toolsMap = Object.fromEntries(
|
|
1389
|
+
transformedTools.map((t) => [
|
|
1390
|
+
t.name,
|
|
1391
|
+
tool2({
|
|
1392
|
+
description: typeof t.description === "string" ? t.description : void 0,
|
|
1393
|
+
inputSchema: jsonSchema2(t.inputSchema)
|
|
1394
|
+
})
|
|
1395
|
+
])
|
|
1396
|
+
);
|
|
1397
|
+
return { nameMap, toolsMap };
|
|
1398
|
+
}
|
|
1399
|
+
async function mapWithConcurrency(items, concurrencyLimit, mapper) {
|
|
1400
|
+
const results = new Array(items.length);
|
|
1401
|
+
let idx = 0;
|
|
1402
|
+
const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
|
|
1403
|
+
while (true) {
|
|
1404
|
+
const current = idx;
|
|
1405
|
+
idx += 1;
|
|
1406
|
+
if (current >= items.length) {
|
|
1407
|
+
break;
|
|
1408
|
+
}
|
|
1409
|
+
results[current] = await mapper(items[current]);
|
|
1410
|
+
}
|
|
1411
|
+
});
|
|
1412
|
+
await Promise.all(workers);
|
|
1413
|
+
return results;
|
|
1414
|
+
}
|
|
1415
|
+
async function runSingleCase(testCase, model, possibleAnswersMap, temperature, maxTokens) {
|
|
1416
|
+
const caseLogs = [];
|
|
1417
|
+
const { function: tools, question: messages } = testCase;
|
|
1418
|
+
try {
|
|
1419
|
+
const { nameMap, toolsMap } = buildTools(tools);
|
|
1420
|
+
const debugSummaryRef = {};
|
|
1421
|
+
const providerOptions = {
|
|
1422
|
+
toolCallMiddleware: { debugSummary: debugSummaryRef }
|
|
1423
|
+
};
|
|
1424
|
+
const { toolCalls, finishReason } = await generateText2({
|
|
1425
|
+
model,
|
|
1426
|
+
messages,
|
|
1427
|
+
tools: toolsMap,
|
|
1428
|
+
toolChoice: "auto",
|
|
1429
|
+
providerOptions,
|
|
1430
|
+
...temperature !== void 0 ? { temperature } : {},
|
|
1431
|
+
...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
|
|
1432
|
+
});
|
|
1433
|
+
const restoredCalls = (toolCalls != null ? toolCalls : []).map((c) => {
|
|
1434
|
+
var _a, _b, _c, _d;
|
|
1435
|
+
const rawName = (_a = c.toolName) != null ? _a : c.name;
|
|
1436
|
+
const originalName = (_b = nameMap.get(rawName)) != null ? _b : rawName;
|
|
1437
|
+
return {
|
|
1438
|
+
toolName: originalName,
|
|
1439
|
+
name: originalName,
|
|
1440
|
+
args: (_d = (_c = c.input) != null ? _c : c.args) != null ? _d : {}
|
|
1441
|
+
};
|
|
1442
|
+
});
|
|
1443
|
+
caseLogs.push(
|
|
1444
|
+
`[DEBUG] ${testCase.id}: toolCalls=${JSON.stringify(restoredCalls)}, finishReason=${finishReason}`
|
|
1445
|
+
);
|
|
1446
|
+
const possibleAnswer = possibleAnswersMap.get(testCase.id);
|
|
1447
|
+
if (!possibleAnswer) {
|
|
1448
|
+
throw new Error(`No possible answer for id: ${testCase.id}`);
|
|
1449
|
+
}
|
|
1450
|
+
const checkerResult = checkAllFunctionCalls(
|
|
1451
|
+
restoredCalls,
|
|
1452
|
+
possibleAnswer.ground_truth,
|
|
1453
|
+
tools
|
|
1454
|
+
);
|
|
1455
|
+
if (checkerResult.valid) {
|
|
1456
|
+
caseLogs.push(`[PASS] ${testCase.id}`);
|
|
1457
|
+
return { valid: true, logs: caseLogs };
|
|
1458
|
+
}
|
|
1459
|
+
caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
|
|
1460
|
+
return { valid: false, logs: caseLogs };
|
|
1461
|
+
} catch (e) {
|
|
1462
|
+
caseLogs.push(`[ERROR] ${testCase.id}: ${e == null ? void 0 : e.message}`);
|
|
1463
|
+
return { valid: false, logs: caseLogs };
|
|
1464
|
+
}
|
|
1465
|
+
}
|
|
1466
|
+
async function loadTestData(dataPath, testDataFile) {
|
|
1467
|
+
const testCasesJson = await fs3.readFile(
|
|
1468
|
+
path3.join(dataPath, testDataFile),
|
|
1469
|
+
"utf-8"
|
|
1470
|
+
);
|
|
1471
|
+
return testCasesJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1472
|
+
}
|
|
1473
|
+
async function loadAnswerData(dataPath, answerDataFile) {
|
|
1474
|
+
const answersJson = await fs3.readFile(
|
|
1475
|
+
path3.join(dataPath, answerDataFile),
|
|
1476
|
+
"utf-8"
|
|
1477
|
+
);
|
|
1478
|
+
const answers = answersJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1479
|
+
return new Map(answers.map((ans) => [ans.id, ans]));
|
|
1480
|
+
}
|
|
1481
|
+
function getConfigValues(config) {
|
|
1482
|
+
const limitEnv = process.env.COMPLEXFUNCBENCH_LIMIT;
|
|
1483
|
+
const limit = limitEnv ? Number(limitEnv) : void 0;
|
|
1484
|
+
const concurrencyEnv = process.env.COMPLEXFUNCBENCH_CONCURRENCY;
|
|
1485
|
+
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
|
|
1486
|
+
const temperature = typeof (config == null ? void 0 : config.temperature) === "number" ? config.temperature : void 0;
|
|
1487
|
+
const maxTokens = typeof (config == null ? void 0 : config.maxTokens) === "number" ? config.maxTokens : void 0;
|
|
1488
|
+
return { limit, concurrency, temperature, maxTokens };
|
|
1489
|
+
}
|
|
1490
|
+
function aggregateResults(resultsPerCase, testCases) {
|
|
1491
|
+
const logs = [];
|
|
1492
|
+
const correctCount = resultsPerCase.reduce(
|
|
1493
|
+
(acc, r) => acc + (r.valid ? 1 : 0),
|
|
1494
|
+
0
|
|
1495
|
+
);
|
|
1496
|
+
for (const r of resultsPerCase) {
|
|
1497
|
+
logs.push(...r.logs);
|
|
1498
|
+
}
|
|
1499
|
+
if (testCases.length === 0) {
|
|
1500
|
+
return {
|
|
1501
|
+
score: 0,
|
|
1502
|
+
success: false,
|
|
1503
|
+
metrics: {},
|
|
1504
|
+
logs: ["No test cases found."]
|
|
1505
|
+
};
|
|
1506
|
+
}
|
|
1507
|
+
const score = correctCount / testCases.length;
|
|
1508
|
+
return {
|
|
1509
|
+
score,
|
|
1510
|
+
success: score > 0.5,
|
|
1511
|
+
metrics: {
|
|
1512
|
+
correct_count: correctCount,
|
|
1513
|
+
total_cases: testCases.length,
|
|
1514
|
+
accuracy: score
|
|
1515
|
+
},
|
|
1516
|
+
logs
|
|
1517
|
+
};
|
|
1518
|
+
}
|
|
1519
|
+
function createComplexFuncBenchBenchmark(name, description, testDataFile, answerDataFile) {
|
|
1520
|
+
return {
|
|
1521
|
+
name,
|
|
1522
|
+
version: "1.0.0",
|
|
1523
|
+
description,
|
|
1524
|
+
async run(model, config) {
|
|
1525
|
+
var _a;
|
|
1526
|
+
const logs = [];
|
|
1527
|
+
try {
|
|
1528
|
+
const dataPath = resolveDataDir();
|
|
1529
|
+
logs.push(`[INFO] Using data dir: ${dataPath}`);
|
|
1530
|
+
let testCases = await loadTestData(dataPath, testDataFile);
|
|
1531
|
+
const possibleAnswersMap = await loadAnswerData(
|
|
1532
|
+
dataPath,
|
|
1533
|
+
answerDataFile
|
|
1534
|
+
);
|
|
1535
|
+
const { limit, concurrency, temperature, maxTokens } = getConfigValues(config);
|
|
1536
|
+
if (limit && Number.isFinite(limit) && limit > 0) {
|
|
1537
|
+
testCases = testCases.slice(0, limit);
|
|
1538
|
+
logs.push(`[INFO] Limiting test cases to ${limit}`);
|
|
1539
|
+
}
|
|
1540
|
+
logs.push(
|
|
1541
|
+
`[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
|
|
1542
|
+
);
|
|
1543
|
+
const resultsPerCase = await mapWithConcurrency(
|
|
1544
|
+
testCases,
|
|
1545
|
+
concurrency,
|
|
1546
|
+
(tc) => runSingleCase(tc, model, possibleAnswersMap, temperature, maxTokens)
|
|
1547
|
+
);
|
|
1548
|
+
const result = aggregateResults(resultsPerCase, testCases);
|
|
1549
|
+
result.logs = [...logs, ...(_a = result.logs) != null ? _a : []];
|
|
1550
|
+
return result;
|
|
1551
|
+
} catch (e) {
|
|
1552
|
+
return {
|
|
1553
|
+
score: 0,
|
|
1554
|
+
success: false,
|
|
1555
|
+
metrics: {},
|
|
1556
|
+
error: e,
|
|
1557
|
+
logs: [
|
|
1558
|
+
`[FATAL] Failed to run benchmark ${name}: ${e.message}`
|
|
1559
|
+
]
|
|
1560
|
+
};
|
|
1561
|
+
}
|
|
1232
1562
|
}
|
|
1563
|
+
};
|
|
1564
|
+
}
|
|
1565
|
+
var complexFuncBenchBenchmark = createComplexFuncBenchBenchmark(
|
|
1566
|
+
"complex-func-bench",
|
|
1567
|
+
"ComplexFuncBench - Complex Function Calling (multi-step, constraints, long params)",
|
|
1568
|
+
"ComplexFuncBench.jsonl",
|
|
1569
|
+
"ComplexFuncBench_possible_answer.jsonl"
|
|
1570
|
+
);
|
|
1571
|
+
|
|
1572
|
+
// src/benchmarks/json-generation.ts
|
|
1573
|
+
import { promises as fs4 } from "fs";
|
|
1574
|
+
import path4 from "path";
|
|
1575
|
+
import { generateText as generateText3 } from "ai";
|
|
1576
|
+
import Ajv from "ajv";
|
|
1577
|
+
var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
|
|
1578
|
+
var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
|
|
1579
|
+
var NEWLINE_REGEX = /\r?\n/;
|
|
1580
|
+
var LINE_SPLIT_REGEX3 = /\r?\n/;
|
|
1581
|
+
function tryDirectParse(text) {
|
|
1582
|
+
try {
|
|
1583
|
+
return JSON.parse(text);
|
|
1584
|
+
} catch (e) {
|
|
1585
|
+
return;
|
|
1586
|
+
}
|
|
1587
|
+
}
|
|
1588
|
+
function tryCodeFenceParse(text) {
|
|
1589
|
+
const fenceMatch = text.match(JSON_FENCE_REGEX) || text.match(CODE_FENCE_REGEX);
|
|
1590
|
+
if (!fenceMatch) {
|
|
1591
|
+
return;
|
|
1592
|
+
}
|
|
1593
|
+
const inner = fenceMatch[1].trim();
|
|
1594
|
+
try {
|
|
1595
|
+
return JSON.parse(inner);
|
|
1596
|
+
} catch (e) {
|
|
1597
|
+
return;
|
|
1233
1598
|
}
|
|
1599
|
+
}
|
|
1600
|
+
/**
 * Finds the first "{" or "[" in the text, scans forward to its matching
 * closing bracket and parses that slice as JSON.
 *
 * Brackets that occur inside JSON string literals (including strings that
 * contain escaped quotes) are ignored while counting depth, so a value such
 * as {"a": "}"} is extracted in full instead of being cut at the quoted "}".
 *
 * @param {string} text - Free-form text that may embed a JSON document.
 * @returns {*} The parsed value, or undefined when no bracket is found, the
 *   bracket never closes, or the balanced slice is not valid JSON.
 */
function tryBracketScan(text) {
  const candidates = [text.indexOf("{"), text.indexOf("[")].filter(
    (idx) => idx >= 0
  );
  if (candidates.length === 0) {
    return;
  }
  const start = Math.min(...candidates);
  const open = text[start];
  const close = open === "{" ? "}" : "]";
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i += 1) {
    const ch = text[i];
    if (inString) {
      // Inside a string only an unescaped quote is significant.
      if (escaped) {
        escaped = false;
      } else if (ch === "\\") {
        escaped = true;
      } else if (ch === '"') {
        inString = false;
      }
      continue;
    }
    if (ch === '"') {
      inString = true;
      continue;
    }
    if (ch === open) {
      depth += 1;
    } else if (ch === close) {
      depth -= 1;
    } else {
      continue;
    }
    if (depth === 0) {
      // Only the first balanced candidate is attempted.
      try {
        return JSON.parse(text.slice(start, i + 1));
      } catch (_parseError) {
        return;
      }
    }
  }
  return;
}
|
|
1628
|
+
/**
 * Extracts a JSON value from model output by trying, in order: parsing the
 * whole text, parsing a fenced code block, and scanning for a bracketed span.
 *
 * @param {string} text - Raw model output.
 * @returns {*} The first value a strategy yields, or undefined when none does.
 */
function extractFirstJsonBlock(text) {
  for (const strategy of [tryDirectParse, tryCodeFenceParse]) {
    const value = strategy(text);
    if (value !== void 0) {
      return value;
    }
  }
  return tryBracketScan(text);
}
|
|
1256
1639
|
/**
 * Recursively checks that `actual` contains everything in `expected`.
 *
 * - Primitives (and null) must be strictly equal.
 * - Arrays are compared position by position for the length of `expected`;
 *   extra trailing items in `actual` are tolerated.
 * - Objects are compared on the keys of `expected` only.
 *
 * @param {*} expected - Reference value.
 * @param {*} actual - Value under test.
 * @returns {boolean} True when `expected` is a structural subset of `actual`.
 */
function subsetMatch(expected, actual) {
  const isContainer = (value) => value !== null && typeof value === "object";
  if (!isContainer(expected)) {
    return expected === actual;
  }
  if (Array.isArray(expected)) {
    if (!Array.isArray(actual)) {
      return false;
    }
    for (const [position, item] of expected.entries()) {
      if (!subsetMatch(item, actual[position])) {
        return false;
      }
    }
    return true;
  }
  if (!isContainer(actual)) {
    return false;
  }
  return Object.keys(expected).every((key) =>
    subsetMatch(expected[key], actual[key])
  );
}
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1666
|
+
/**
 * Reads the json-generation test cases and their expected records from the
 * data directory and indexes the expected records by id.
 *
 * @returns {Promise<{tests: Array, expectedMap: Map, error?: *}>} On success
 *   the parsed tests and an id-to-record map; on any failure empty
 *   collections plus the caught value under `error`.
 */
async function loadDatasets() {
  // One JSON document per non-blank line.
  const parseJsonl = (raw) =>
    raw
      .split(NEWLINE_REGEX)
      .filter((line) => line.trim().length > 0)
      .map((line) => JSON.parse(line));
  try {
    const dataDir = resolveDataDir();
    const testsJsonl = await fs4.readFile(
      path4.join(dataDir, "json_generation_tests.jsonl"),
      "utf-8"
    );
    const expectedJsonl = await fs4.readFile(
      path4.join(dataDir, "json_generation_expected.jsonl"),
      "utf-8"
    );
    const tests = parseJsonl(testsJsonl);
    const expectedMap = /* @__PURE__ */ new Map();
    for (const record of parseJsonl(expectedJsonl)) {
      expectedMap.set(record.id, record);
    }
    return { tests, expectedMap };
  } catch (loadError) {
    return {
      tests: [],
      expectedMap: /* @__PURE__ */ new Map(),
      error: loadError
    };
  }
}
|
|
1692
|
+
/**
 * Builds the two-message prompt (system + user) for one test case.
 *
 * @param {{schema: *, promptFacts: *}} tc - Test case; `schema` is pretty
 *   printed into the prompt and `promptFacts` is inserted as-is.
 * @returns {Array<{role: string, content: string}>} Chat messages.
 */
function buildMessages(tc) {
  const systemMessage = {
    role: "system",
    content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
  };
  const userSections = [
    "Generate a JSON object that reflects the following facts.",
    "JSON Schema:",
    JSON.stringify(tc.schema, null, 2),
    "Facts:",
    tc.promptFacts,
    "Output must be a single JSON only, with no additional text."
  ];
  const userMessage = { role: "user", content: userSections.join("\n\n") };
  return [systemMessage, userMessage];
}
|
|
1712
|
+
/**
 * Validates a parsed model answer against the test case schema and against
 * the expected record registered for the test id.
 *
 * @param {{id: *, schema: *}} tc - Test case being evaluated.
 * @param {*} parsed - JSON value extracted from the model output.
 * @param {{ajv: *, expectedMap: Map, logs: string[]}} context - Validator,
 *   expected records and the log sink (schema errors and a missing expected
 *   record are appended to `logs`).
 * @returns {{valid: boolean, valuesOk: boolean, parsed: *}} Schema verdict,
 *   value-match verdict (false when no expected record exists) and the input.
 */
function validateTestCase(tc, parsed, context) {
  const { ajv, expectedMap, logs } = context;
  const validator = ajv.compile(tc.schema);
  const valid = validator(parsed);
  if (!valid) {
    const details = (validator.errors || [])
      .map((err) => `${err.instancePath} ${err.message}`)
      .join(", ") || "unknown";
    logs.push(`[INFO] ${tc.id}: Schema validation errors: ${details}`);
  }
  const expectedRec = expectedMap.get(tc.id);
  let valuesOk = false;
  if (expectedRec) {
    valuesOk = subsetMatch(expectedRec.expected, parsed);
  } else {
    logs.push(
      `[WARN] ${tc.id}: No expected record found. Skipping value match.`
    );
  }
  return { valid, valuesOk, parsed };
}
|
|
1729
|
+
/**
 * Runs one json-generation test case: prompts the model, extracts JSON from
 * the reply, validates it and records a PASS/FAIL line.
 *
 * @param {{id: *}} tc - Test case.
 * @param {{model: *, config: *, validation: {logs: string[]}}} context -
 *   Model, optional config (a numeric `temperature` is forwarded) and the
 *   validation context handed to validateTestCase.
 * @returns {Promise<{schemaValid: boolean, valueMatch: boolean, correct: boolean}>}
 */
async function processTestCase(tc, context) {
  const { validation } = context;
  const request = { model: context.model, messages: buildMessages(tc) };
  const configured = context.config == null ? void 0 : context.config.temperature;
  // Temperature is forwarded only when it was given as a number.
  if (typeof configured === "number") {
    request.temperature = configured;
  }
  const { text } = await generateText3(request);
  let parsed;
  try {
    parsed = extractFirstJsonBlock(text);
  } catch (_extractError) {
    parsed = void 0;
  }
  if (parsed === void 0) {
    validation.logs.push(
      `[FAIL] ${tc.id}: Unable to parse JSON from model output.`
    );
    return { schemaValid: false, valueMatch: false, correct: false };
  }
  const outcome = validateTestCase(tc, parsed, validation);
  const correct = outcome.valid && outcome.valuesOk;
  const verdictLine = correct
    ? `[PASS] ${tc.id}`
    : `[FAIL] ${tc.id}: schemaValid=${outcome.valid}, valuesOk=${outcome.valuesOk}. Output=${JSON.stringify(
        outcome.parsed
      )}`;
  validation.logs.push(verdictLine);
  return { schemaValid: outcome.valid, valueMatch: outcome.valuesOk, correct };
}
|
|
1767
|
+
// Benchmark definition: schema-compliant JSON generation with value matching.
var jsonGenerationBenchmark = {
  name: "json-generation",
  version: "2.1.0",
  description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
  /**
   * Loads the datasets, evaluates every test case against the model and
   * summarises the counts into a benchmark result.
   *
   * @param {*} model - Model passed through to the text generation call.
   * @param {*} config - Optional run configuration.
   * @returns {Promise<object>} Benchmark result; a zero-score result carrying
   *   the load error when the datasets cannot be read.
   */
  async run(model, config) {
    const logs = [];
    const ajv = new Ajv({ allErrors: true, strict: false });
    const loaded = await loadDatasets();
    const { tests, expectedMap, error } = loaded;
    if (error) {
      const fatalLine = `[FATAL] Failed to load json-generation datasets: ${error.message}`;
      return {
        score: 0,
        success: false,
        metrics: {},
        logs: [fatalLine],
        error
      };
    }
    const validation = { expectedMap, ajv, logs };
    const counts = await processAllTests(tests, { model, config, validation });
    return buildBenchmarkResult(tests.length, counts, logs);
  }
};
|
|
1795
|
+
/**
 * Evaluates every test case sequentially and tallies the outcomes. A test
 * case that throws is logged as [ERROR] and contributes to no counter.
 *
 * @param {Iterable} tests - Test cases.
 * @param {{validation: {logs: string[]}}} context - Shared run context.
 * @returns {Promise<{schemaValidCount: number, valueMatchCount: number, correctCount: number}>}
 */
async function processAllTests(tests, context) {
  const tally = { schemaValidCount: 0, valueMatchCount: 0, correctCount: 0 };
  for (const tc of tests) {
    try {
      const { schemaValid, valueMatch, correct } = await processTestCase(
        tc,
        context
      );
      tally.schemaValidCount += schemaValid ? 1 : 0;
      tally.valueMatchCount += valueMatch ? 1 : 0;
      tally.correctCount += correct ? 1 : 0;
    } catch (caught) {
      const reason = caught instanceof Error ? caught.message : String(caught);
      context.validation.logs.push(`[ERROR] ${tc.id}: ${reason}`);
    }
  }
  return tally;
}
|
|
1818
|
+
/**
 * Turns raw counters into the benchmark result object.
 *
 * The score is the fraction of correct cases. With zero cases the score is 0
 * rather than NaN (0 / 0), which keeps `score` and `accuracy` numeric and
 * reports the run as unsuccessful.
 *
 * @param {number} total - Number of test cases that were attempted.
 * @param {{correctCount: number, schemaValidCount: number, valueMatchCount: number}} counts
 * @param {string[]} logs - Log lines collected during the run.
 * @returns {{score: number, success: boolean, metrics: object, logs: string[]}}
 *   `success` is true when the score is at least 0.8.
 */
function buildBenchmarkResult(total, counts, logs) {
  const score = total > 0 ? counts.correctCount / total : 0;
  return {
    score,
    success: score >= 0.8,
    metrics: {
      total_cases: total,
      correct_count: counts.correctCount,
      schema_valid_count: counts.schemaValidCount,
      value_match_count: counts.valueMatchCount,
      accuracy: score
    },
    logs
  };
}
|
|
1833
|
+
/**
 * Reads the json-generation test cases (one JSON document per non-blank
 * line) for the schema-only benchmark.
 *
 * @returns {Promise<{tests: Array, error?: *}>} Parsed tests, or an empty
 *   list plus the caught value under `error` when reading or parsing fails.
 */
async function loadSchemaOnlyTests() {
  try {
    const filePath = path4.join(resolveDataDir(), "json_generation_tests.jsonl");
    const raw = await fs4.readFile(filePath, "utf-8");
    const tests = [];
    for (const line of raw.split(LINE_SPLIT_REGEX3)) {
      if (line.trim().length > 0) {
        tests.push(JSON.parse(line));
      }
    }
    return { tests };
  } catch (loadError) {
    return { tests: [], error: loadError };
  }
}
|
|
1846
|
+
/**
 * Runs one schema-only test case: prompts the model, extracts JSON from the
 * reply and validates it against the test case schema.
 *
 * @param {{id: *, schema: *}} tc - Test case.
 * @param {{model: *, config: *, ajv: *, logs: string[]}} context - Run
 *   context; a PASS or FAIL line is appended to `logs`.
 * @returns {Promise<boolean>} True when the reply parsed and satisfied the schema.
 */
async function processSchemaOnlyTestCase(tc, context) {
  const request = { model: context.model, messages: buildMessages(tc) };
  const configured = context.config == null ? void 0 : context.config.temperature;
  // Temperature is forwarded only when it was given as a number.
  if (typeof configured === "number") {
    request.temperature = configured;
  }
  const { text } = await generateText3(request);
  let parsed;
  try {
    parsed = extractFirstJsonBlock(text);
  } catch (_extractError) {
    parsed = void 0;
  }
  if (parsed === void 0) {
    context.logs.push(
      `[FAIL] ${tc.id}: Could not parse JSON from model output.`
    );
    return false;
  }
  const validator = context.ajv.compile(tc.schema);
  if (validator(parsed)) {
    context.logs.push(`[PASS] ${tc.id}`);
    return true;
  }
  const details = (validator.errors || [])
    .map((err) => `${err.instancePath} ${err.message}`)
    .join(", ") || "unknown";
  context.logs.push(`[FAIL] ${tc.id}: Schema validation errors: ${details}`);
  return false;
}
|
|
1878
|
+
/**
 * Evaluates every schema-only test case sequentially. A test case that
 * throws is logged as [ERROR] and not counted.
 *
 * @param {Iterable} tests - Test cases.
 * @param {{logs: string[]}} context - Shared run context.
 * @returns {Promise<number>} Number of cases whose output satisfied the schema.
 */
async function runSchemaOnlyTests(tests, context) {
  let passed = 0;
  for (const tc of tests) {
    try {
      if (await processSchemaOnlyTestCase(tc, context)) {
        passed += 1;
      }
    } catch (caught) {
      const reason = caught instanceof Error ? caught.message : String(caught);
      context.logs.push(`[ERROR] ${tc.id}: ${reason}`);
    }
  }
  return passed;
}
|
|
1392
1893
|
var jsonGenerationSchemaOnlyBenchmark = {
|
|
1393
1894
|
name: "json-generation-schema-only",
|
|
1394
1895
|
version: "1.0.1",
|
|
@@ -1396,76 +1897,19 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
1396
1897
|
async run(model, config) {
|
|
1397
1898
|
const logs = [];
|
|
1398
1899
|
const ajv = new Ajv({ allErrors: true, strict: false });
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
const
|
|
1402
|
-
const testsJsonl = await fs3.readFile(
|
|
1403
|
-
path3.join(dataDir, "json_generation_tests.jsonl"),
|
|
1404
|
-
"utf-8"
|
|
1405
|
-
);
|
|
1406
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1407
|
-
} catch (e) {
|
|
1408
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
1900
|
+
const { tests, error } = await loadSchemaOnlyTests();
|
|
1901
|
+
if (error) {
|
|
1902
|
+
const msg = error.message;
|
|
1409
1903
|
return {
|
|
1410
1904
|
score: 0,
|
|
1411
1905
|
success: false,
|
|
1412
1906
|
metrics: {},
|
|
1413
1907
|
logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
|
|
1414
|
-
error
|
|
1908
|
+
error
|
|
1415
1909
|
};
|
|
1416
1910
|
}
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
try {
|
|
1420
|
-
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
1421
|
-
const messages = [
|
|
1422
|
-
{
|
|
1423
|
-
role: "system",
|
|
1424
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
1425
|
-
},
|
|
1426
|
-
{
|
|
1427
|
-
role: "user",
|
|
1428
|
-
content: [
|
|
1429
|
-
"Generate a JSON object that reflects the following facts.",
|
|
1430
|
-
"JSON Schema:",
|
|
1431
|
-
schemaStr,
|
|
1432
|
-
"Facts:",
|
|
1433
|
-
tc.promptFacts,
|
|
1434
|
-
"Output must be a single JSON only, with no additional text."
|
|
1435
|
-
].join("\n\n")
|
|
1436
|
-
}
|
|
1437
|
-
];
|
|
1438
|
-
const temp = config?.temperature;
|
|
1439
|
-
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1440
|
-
const { text } = await generateText2({
|
|
1441
|
-
model,
|
|
1442
|
-
messages,
|
|
1443
|
-
...temperature !== void 0 ? { temperature } : {}
|
|
1444
|
-
});
|
|
1445
|
-
let parsed;
|
|
1446
|
-
try {
|
|
1447
|
-
parsed = extractFirstJsonBlock(text);
|
|
1448
|
-
} catch {
|
|
1449
|
-
}
|
|
1450
|
-
if (parsed === void 0) {
|
|
1451
|
-
logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
|
|
1452
|
-
continue;
|
|
1453
|
-
}
|
|
1454
|
-
const validate = ajv.compile(tc.schema);
|
|
1455
|
-
const valid = validate(parsed);
|
|
1456
|
-
if (valid) {
|
|
1457
|
-
schemaValidCount++;
|
|
1458
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
1459
|
-
} else {
|
|
1460
|
-
logs.push(
|
|
1461
|
-
`[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
1462
|
-
);
|
|
1463
|
-
}
|
|
1464
|
-
} catch (e) {
|
|
1465
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
1466
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
1467
|
-
}
|
|
1468
|
-
}
|
|
1911
|
+
const context = { model, config, ajv, logs };
|
|
1912
|
+
const schemaValidCount = await runSchemaOnlyTests(tests, context);
|
|
1469
1913
|
const total = tests.length;
|
|
1470
1914
|
const score = total > 0 ? schemaValidCount / total : 0;
|
|
1471
1915
|
return {
|
|
@@ -1480,11 +1924,577 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
1480
1924
|
};
|
|
1481
1925
|
}
|
|
1482
1926
|
};
|
|
1927
|
+
|
|
1928
|
+
// src/reporters/console.ts
|
|
1929
|
+
// ANSI escape sequences used by the console reporter.
var colors = {
  reset: "\x1B[0m",
  // foreground colours
  green: "\x1B[32m",
  red: "\x1B[31m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
  magenta: "\x1B[35m",
  gray: "\x1B[90m",
  white: "\x1B[37m",
  // background colour
  bgRed: "\x1B[41m"
};
|
|
1940
|
+
/**
 * Colourises diff lines by their prefix ("-" red, "+" green, "@@" cyan) and
 * joins them into one printable string.
 *
 * @param {string[]|null|undefined} diff - Diff lines.
 * @returns {string} Joined, colourised lines; "" for a missing or empty diff.
 */
function formatDiff(diff) {
  if (!diff || diff.length === 0) {
    return "";
  }
  const tintByPrefix = [
    ["-", colors.red],
    ["+", colors.green],
    ["@@", colors.cyan]
  ];
  const paint = (line) => {
    const hit = tintByPrefix.find(([prefix]) => line.startsWith(prefix));
    return hit ? `${hit[1]}${line}${colors.reset}` : line;
  };
  return diff.map((line) => paint(line)).join("\n ");
}
|
|
1957
|
+
/**
 * Prints a readable summary for every "[DEBUG-FAIL]" log line. Each such
 * line is expected to carry a JSON payload after the prefix; lines whose
 * payload cannot be processed are printed raw.
 *
 * @param {string[]} logs - Benchmark log lines.
 * @returns {void}
 */
function printFailLogs(logs) {
  const debugLines = logs.filter((entry) => entry.startsWith("[DEBUG-FAIL]"));
  for (const entry of debugLines) {
    try {
      const data = JSON.parse(entry.replace("[DEBUG-FAIL] ", ""));
      console.log(`\n${colors.red}FAILED CASE: ${data.id}${colors.reset}`);
      console.log(
        ` Error Type: ${colors.yellow}${data.error_type || "unknown"}${colors.reset}`
      );
      console.log(` Message: ${data.message}`);
      if (Array.isArray(data.diff)) {
        console.log(` Diff:\n${formatDiff(data.diff)}`);
      }
      if (data.expected && data.actual) {
        const expectedText = JSON.stringify(data.expected);
        const actualText = JSON.stringify(data.actual);
        // Only short values are echoed in full.
        const bothShort = expectedText.length < 100 && actualText.length < 100;
        if (bothShort) {
          console.log(` Expected: ${colors.gray}${expectedText}${colors.reset}`);
          console.log(` Actual: ${colors.gray}${actualText}${colors.reset}`);
        }
      }
    } catch (_e) {
      console.log(` Raw Log: ${entry}`);
    }
  }
}
|
|
1986
|
+
/**
 * Prints one evaluation result: header, status and score, metrics, error
 * message and, for failed runs, failure details or a sample of raw logs.
 *
 * @param {{model: *, modelKey: *, benchmark: *, result: object}} result -
 *   Evaluation record; `result.result` is the benchmark result.
 * @returns {void}
 */
function printResult(result) {
  const { model, modelKey, benchmark, result: benchmarkResult } = result;
  const status = benchmarkResult.success
    ? `${colors.green}\u2714 SUCCESS${colors.reset}`
    : `${colors.red}\u2716 FAILURE${colors.reset}`;
  const keySuffix = modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : "";
  console.log(
    `\n${colors.cyan}[${model}]${colors.reset}${keySuffix} - ${colors.magenta}${benchmark}${colors.reset}`
  );
  console.log(
    ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
  );
  const metricEntries = Object.entries(benchmarkResult.metrics);
  if (metricEntries.length > 0) {
    console.log(" Metrics:");
    for (const [key, value] of metricEntries) {
      console.log(` - ${key}: ${value}`);
    }
  }
  if (benchmarkResult.error) {
    console.log(
      ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
    );
  }
  if (benchmarkResult.success || !benchmarkResult.logs) {
    return;
  }
  const { logs } = benchmarkResult;
  printFailLogs(logs);
  // Without structured failure lines, fall back to the first ten raw lines.
  const hasDebugLines = logs.some((entry) => entry.startsWith("[DEBUG-FAIL]"));
  if (!hasDebugLines && logs.length > 0) {
    console.log(" Raw Logs (Sample):");
    for (const entry of logs.slice(0, 10)) {
      console.log(` ${entry}`);
    }
  }
}
|
|
2021
|
+
/**
 * Prints the full evaluation report: a banner, one section per result and a
 * closing rule.
 *
 * @param {Iterable} results - Evaluation records to print.
 * @returns {void}
 */
function consoleReporter(results) {
  const banner = "\n--- \u{1F4CA} Evaluation Report ---";
  const footer = "\n---------------------------\n";
  console.log(banner);
  for (const entry of results) {
    printResult(entry);
  }
  console.log(footer);
}
|
|
2028
|
+
|
|
2029
|
+
// src/reporters/console.debug.ts
|
|
2030
|
+
// Captures the test id that follows "[FAIL]" up to the first colon.
var FAIL_ID_REGEX = /^\[FAIL\]\s+([^:]+):/;
// Leading markers stripped before the JSON payload of debug lines is parsed.
var DEBUG_FAIL_PREFIX_REGEX = /^\[DEBUG-FAIL\] /;
var DEBUG_FAIL_CONTEXT_PREFIX_REGEX = /^\[DEBUG-FAIL-CONTEXT\] /;
|
|
2033
|
+
// ANSI escape sequences used by the debug console reporter.
var colors2 = {
  reset: "\x1B[0m",
  // foreground colours
  green: "\x1B[32m",
  red: "\x1B[31m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
  magenta: "\x1B[35m",
  gray: "\x1B[90m",
  // text attributes
  bold: "\x1B[1m",
  underline: "\x1B[4m"
};
|
|
2044
|
+
/**
 * Wraps a diff line in colour codes chosen by its first character:
 * "+" green, "-" red, "@" bold cyan. Other lines are returned unchanged.
 *
 * @param {string} line - One diff line.
 * @returns {string} The line, colourised when it carries a known prefix.
 */
function colorizeDiffLine(line) {
  const styles = [
    ["+", colors2.green],
    ["-", colors2.red],
    ["@", `${colors2.cyan}${colors2.bold}`]
  ];
  const hit = styles.find(([prefix]) => line.startsWith(prefix));
  return hit ? `${hit[1]}${line}${colors2.reset}` : line;
}
|
|
2056
|
+
/**
 * Removes duplicate entries while keeping the first occurrence of each in
 * its original position.
 *
 * @param {Iterable<string>} lines - Entries to de-duplicate.
 * @returns {string[]} New array without repeats.
 */
function uniqueLines(lines) {
  // A Set iterates in insertion order, so first occurrences win.
  return [...new Set(lines)];
}
|
|
2068
|
+
/**
 * Tells whether any diff entry mentions a function-name problem.
 *
 * @param {Array} diff - Diff entries (stringified before inspection).
 * @returns {boolean} True when an entry contains "function name" or
 *   "missing function:".
 */
function hasFunctionNameIssue(diff) {
  const markers = ["function name", "missing function:"];
  return diff.some((entry) => {
    const text = String(entry);
    return markers.some((marker) => text.includes(marker));
  });
}
|
|
2073
|
+
/**
 * Appends function-name advice: a rename hint when both sides name different
 * functions, and the list of required functions when `expected.functions`
 * is an array.
 *
 * @param {object|null|undefined} expected - Expected call description.
 * @param {object|null|undefined} actual - Actual call description.
 * @param {string[]} suggestions - Output list, mutated in place.
 * @returns {void}
 */
function suggestFunctionNameFix(expected, actual, suggestions) {
  const read = (source, key) => (source == null ? void 0 : source[key]);
  const wanted = read(expected, "function");
  const got = read(actual, "function");
  if (wanted && got && wanted !== got) {
    suggestions.push(`Call the function '${wanted}' instead of '${got}'.`);
  }
  const required = read(expected, "functions");
  if (Array.isArray(required)) {
    suggestions.push(`Ensure tool calls include: ${required.join(", ")}.`);
  }
}
|
|
2087
|
+
/**
 * Appends one suggestion listing every parameter reported by a
 * "- missing required param:" diff entry.
 *
 * @param {Array} diff - Diff entries (stringified before inspection).
 * @param {string[]} suggestions - Output list, mutated in place.
 * @returns {void}
 */
function suggestMissingParamFix(diff, suggestions) {
  const names = [];
  for (const entry of diff) {
    const text = String(entry);
    if (text.startsWith("- missing required param:")) {
      names.push(text.replace("- missing required param: ", ""));
    }
  }
  if (names.length > 0) {
    suggestions.push(`Add required parameter(s): ${names.join(", ")}.`);
  }
}
|
|
2093
|
+
/**
 * Appends one suggestion listing every parameter reported by a
 * "+ unexpected param:" diff entry.
 *
 * @param {Array} diff - Diff entries (stringified before inspection).
 * @param {string[]} suggestions - Output list, mutated in place.
 * @returns {void}
 */
function suggestUnexpectedParamFix(diff, suggestions) {
  const names = [];
  for (const entry of diff) {
    const text = String(entry);
    if (text.startsWith("+ unexpected param:")) {
      names.push(text.replace("+ unexpected param: ", ""));
    }
  }
  if (names.length > 0) {
    suggestions.push(`Remove unexpected parameter(s): ${names.join(", ")}.`);
  }
}
|
|
2099
|
+
/**
 * Appends one value suggestion per "@@ param <name>" header in the diff.
 *
 * For each header the expectation is taken from that parameter's own section
 * (the entries after its header, up to the next "@@ param " header), so a
 * diff covering several parameters no longer reuses the first parameter's
 * expected value for all of them. When the diff has a single header the
 * whole diff is searched, which keeps the previous behaviour for that case.
 * NOTE(review): assumes expectation lines follow their header -- confirm
 * against the code that produces the diff.
 *
 * Entries are stringified once up front, so non-string entries cannot break
 * the later `.replace` calls.
 *
 * @param {Array} diff - Diff entries.
 * @param {string[]} suggestions - Output list, mutated in place.
 * @returns {void}
 */
function suggestParamValueFix(diff, suggestions) {
  const HEADER = "@@ param ";
  const lines = diff.map((entry) => String(entry));
  const headerPositions = [];
  lines.forEach((text, position) => {
    if (text.startsWith(HEADER)) {
      headerPositions.push(position);
    }
  });
  headerPositions.forEach((position, ordinal) => {
    const param = lines[position].replace(HEADER, "");
    // slice() with an undefined end runs to the end of the diff.
    const scope = headerPositions.length === 1
      ? lines
      : lines.slice(position + 1, headerPositions[ordinal + 1]);
    const single = scope.find((text) => text.startsWith("- expected:"));
    const oneOf = scope.find((text) => text.startsWith("- expected one of:"));
    if (single !== void 0) {
      const value = single.replace("- expected: ", "");
      suggestions.push(`Set '${param}' to: ${value}.`);
    } else if (oneOf !== void 0) {
      const allowed = oneOf.replace("- expected one of: ", "");
      suggestions.push(`Set '${param}' to one of: ${allowed}.`);
    } else {
      suggestions.push(`Adjust '${param}' to an allowed value.`);
    }
  });
}
|
|
2119
|
+
/**
 * Appends a generic suggestion chosen from the error type. The first known
 * marker contained in `error_type` decides; unknown types add nothing.
 *
 * @param {string} error_type - Error classification string.
 * @param {string[]} suggestions - Output list, mutated in place.
 * @returns {void}
 */
function suggestFromErrorType(error_type, suggestions) {
  // Order matters: earlier markers take precedence.
  const adviceByMarker = [
    ["missing_required", "Add all required parameters defined by the tool schema."],
    ["unexpected_param", "Remove parameters not present in the tool schema."],
    ["wrong_count", "Adjust the number of tool calls to match expected count."],
    ["wrong_func_name", "Use the exact expected function name from the schema."],
    ["value_error", "Choose a value from the allowed options."]
  ];
  const hit = adviceByMarker.find(([marker]) => error_type.includes(marker));
  if (hit) {
    suggestions.push(hit[1]);
  }
}
|
|
2134
|
+
/**
 * Derives human-readable fix suggestions from a parsed debug-fail payload.
 * Diff-based advice is used when `diff` is an array; when that produces
 * nothing (or there is no diff) a generic hint is derived from `error_type`.
 *
 * @param {object|null|undefined} parsed - Payload with optional
 *   `error_type`, `expected`, `actual` and `diff` fields.
 * @returns {string[]} De-duplicated suggestions, possibly empty.
 */
function suggestFixFromDiff(parsed) {
  const suggestions = [];
  const { error_type, expected, actual, diff } = parsed != null ? parsed : {};
  if (Array.isArray(diff)) {
    const mentions = (prefix) =>
      diff.some((entry) => String(entry).startsWith(prefix));
    if (hasFunctionNameIssue(diff)) {
      suggestFunctionNameFix(expected, actual, suggestions);
    }
    if (mentions("- missing required param:")) {
      suggestMissingParamFix(diff, suggestions);
    }
    if (mentions("+ unexpected param:")) {
      suggestUnexpectedParamFix(diff, suggestions);
    }
    if (mentions("@@ param ")) {
      suggestParamValueFix(diff, suggestions);
    }
  }
  if (suggestions.length === 0 && typeof error_type === "string") {
    suggestFromErrorType(error_type, suggestions);
  }
  return uniqueLines(suggestions);
}
|
|
2160
|
+
/**
 * Extracts the test id a log line refers to.
 *
 * - "[FAIL] <id>: ..." lines yield the captured id.
 * - "[DEBUG-FAIL] {json}" and "[DEBUG-FAIL-CONTEXT] {json}" lines yield the
 *   payload's `id` as a string ("" when the payload has no id).
 *
 * @param {string} line - One log line.
 * @returns {string|undefined} The id, or undefined for other lines and for
 *   debug lines whose payload is not valid JSON.
 */
function getTestIdFromLogLine(line) {
  if (line.startsWith("[FAIL]")) {
    const match = line.match(FAIL_ID_REGEX);
    return match == null ? void 0 : match[1];
  }
  let prefixRegex;
  if (line.startsWith("[DEBUG-FAIL]")) {
    prefixRegex = DEBUG_FAIL_PREFIX_REGEX;
  } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
    prefixRegex = DEBUG_FAIL_CONTEXT_PREFIX_REGEX;
  } else {
    return;
  }
  try {
    const payload = JSON.parse(line.replace(prefixRegex, ""));
    const id = payload == null ? void 0 : payload.id;
    return String(id != null ? id : "");
  } catch (_parseError) {
    return;
  }
}
|
|
2184
|
+
/**
 * Groups log lines by the test id they refer to, preserving line order
 * within each group. Lines without an id go under "__general__".
 *
 * @param {Iterable<string>} failLogs - Log lines to group.
 * @returns {Map<string, string[]>} Lines keyed by test id.
 */
function groupLogsByTestId(failLogs) {
  const grouped = /* @__PURE__ */ new Map();
  for (const line of failLogs) {
    const id = getTestIdFromLogLine(line);
    const key = id != null ? id : "__general__";
    if (!grouped.has(key)) {
      grouped.set(key, []);
    }
    grouped.get(key).push(line);
  }
  return grouped;
}
|
|
2196
|
+
/**
 * Collects the ids carried by "[DEBUG-FAIL]" lines. Lines whose payload is
 * not valid JSON, or has no truthy `id`, are ignored.
 *
 * @param {Iterable<string>} lines - Log lines.
 * @returns {Set<string>} Ids, stringified.
 */
function collectDebugIds(lines) {
  const ids = /* @__PURE__ */ new Set();
  for (const entry of lines) {
    if (!entry.startsWith("[DEBUG-FAIL]")) {
      continue;
    }
    try {
      const payload = JSON.parse(entry.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
      const id = payload == null ? void 0 : payload.id;
      if (id) {
        ids.add(String(id));
      }
    } catch (_parseError) {
      // Unparseable payloads contribute no id.
    }
  }
  return ids;
}
|
|
2211
|
+
/**
 * Prints `data` as pretty JSON after `prefix`, wrapped in the given colour,
 * with continuation lines indented.
 *
 * JSON.stringify returns undefined for values it cannot serialise at the top
 * level (undefined, functions, symbols). In that case the value's string
 * form is printed instead of calling `.split` on undefined.
 *
 * @param {string} prefix - Text printed before the JSON.
 * @param {*} data - Value to print.
 * @param {string} color - Colour escape sequence placed before the prefix.
 * @returns {void}
 */
function printIndentedJson(prefix, data, color) {
  const serialized = JSON.stringify(data, null, 2);
  const text = serialized === void 0 ? String(data) : serialized;
  console.log(
    color + prefix + text.split("\n").join("\n ") + colors2.reset
  );
}
|
|
2216
|
+
/**
 * Prints the details carried by one "[DEBUG-FAIL]" line: the message, then
 * either the colourised diff or the expected/actual values, then any
 * suggested fixes. If the payload cannot be processed the raw line is printed.
 *
 * @param {string} line - A "[DEBUG-FAIL] {json}" log line.
 * @returns {void}
 */
function displayDebugFailLine(line) {
  const payload = line.replace(DEBUG_FAIL_PREFIX_REGEX, "");
  try {
    const parsed = JSON.parse(payload);
    const { message, diff, expected, actual } = parsed;
    if (message) {
      console.log(` ${colors2.bold}${message}${colors2.reset}`);
    }
    if (Array.isArray(diff)) {
      for (const diffLine of diff) {
        console.log(` ${colorizeDiffLine(diffLine)}`);
      }
    } else {
      console.log(" expected:");
      printIndentedJson(" ", expected, colors2.green);
      console.log(" actual:");
      printIndentedJson(" ", actual, colors2.red);
    }
    const hints = suggestFixFromDiff(parsed);
    if (hints.length) {
      console.log(` ${colors2.bold}Suggested fix:${colors2.reset}`);
      for (const hint of hints) {
        console.log(` \u2022 ${hint}`);
      }
    }
  } catch (_displayError) {
    console.log(` ${line}`);
  }
}
|
|
2245
|
+
/**
 * Print the truthy fields of a failure-context record in gray, in this order:
 * `tool_schema`, `last_user_query`, `raw_model_text`, `parsed_tool_calls`,
 * `ground_truth`, `finish_reason`.
 *
 * @param {object} ctx - Parsed context record; falsy fields are skipped.
 * @returns {void}
 */
function displayContextInfo(ctx) {
  // Wraps a piece of text in the gray / reset escape codes and logs it.
  const logGray = (text) => {
    console.log(colors2.gray + text + colors2.reset);
  };

  if (ctx.tool_schema) {
    printIndentedJson(" tool schema: ", ctx.tool_schema, colors2.gray);
  }
  if (ctx.last_user_query) {
    logGray(" last user: " + JSON.stringify(ctx.last_user_query));
  }
  if (ctx.raw_model_text) {
    const modelText = String(ctx.raw_model_text).split("\n").join("\n ");
    logGray(" raw model text (middleware parsed):\n " + modelText);
  }
  if (ctx.parsed_tool_calls) {
    printIndentedJson(" parsed tool calls: ", ctx.parsed_tool_calls, colors2.gray);
  }
  if (ctx.ground_truth) {
    printIndentedJson(" ground truth: ", ctx.ground_truth, colors2.gray);
  }
  if (ctx.finish_reason) {
    logGray(" finish reason: " + JSON.stringify(ctx.finish_reason));
  }
}
|
|
2279
|
+
/**
 * Render one `[DEBUG-FAIL-CONTEXT]` log line: strip the prefix with
 * DEBUG_FAIL_CONTEXT_PREFIX_REGEX, parse the rest as JSON, print a gray
 * "context:" heading and pass the record to `displayContextInfo`. When
 * parsing or rendering throws, the raw line is printed instead.
 *
 * @param {string} line - The raw log line.
 * @returns {void}
 */
function displayDebugFailContextLine(line) {
  const json = line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "");
  try {
    const context = JSON.parse(json);
    const { gray, reset } = colors2;
    console.log(" " + gray + "context:" + reset);
    displayContextInfo(context);
  } catch (e) {
    console.log(" " + line);
  }
}
|
|
2289
|
+
/**
 * Print a single failure-related log line according to its prefix.
 *
 * - `[FAIL]`: red, unless the id captured by FAIL_ID_REGEX (group 1) is in
 *   `debugIds`, in which case nothing is printed.
 * - `[ERROR]` / `[FATAL]`: yellow.
 * - `[STACK]`: gray.
 * - `[DEBUG-FAIL]` / `[DEBUG-FAIL-CONTEXT]`: delegated to their renderers.
 * Lines with any other prefix are ignored.
 *
 * @param {string} line - The log line.
 * @param {Set<string>} debugIds - Ids that already have a `[DEBUG-FAIL]` entry.
 * @returns {void}
 */
function displayLogLine(line, debugIds) {
  if (line.startsWith("[FAIL]")) {
    const match = line.match(FAIL_ID_REGEX);
    const failId = match ? match[1] : void 0;
    const alreadyDetailed = failId && debugIds.has(failId);
    if (!alreadyDetailed) {
      console.log(` ${colors2.red}${line}${colors2.reset}`);
    }
    return;
  }
  if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
    console.log(` ${colors2.yellow}${line}${colors2.reset}`);
    return;
  }
  if (line.startsWith("[STACK]")) {
    console.log(` ${colors2.gray}${line}${colors2.reset}`);
    return;
  }
  if (line.startsWith("[DEBUG-FAIL]")) {
    displayDebugFailLine(line);
    return;
  }
  if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
    displayDebugFailContextLine(line);
  }
}
|
|
2307
|
+
/**
 * Print failure logs group by group. Each group other than the
 * "__general__" one gets an underlined heading with its id; the group's
 * lines are then rendered through `displayLogLine`, with the debug ids
 * collected from that same group.
 *
 * @param {Iterable<[string, string[]]>} byId - Pairs of group id and log lines.
 * @returns {void}
 */
function displayGroupedFailures(byId) {
  console.log(` ${colors2.bold}Failure details (grouped):${colors2.reset}`);
  for (const [groupId, groupLines] of byId) {
    const isGeneralGroup = groupId === "__general__";
    if (!isGeneralGroup) {
      console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
    }
    const idsWithDebugEntry = collectDebugIds(groupLines);
    for (const entry of groupLines) {
      displayLogLine(entry, idsWithDebugEntry);
    }
  }
}
|
|
2319
|
+
/**
 * Print, in gray, every log line that starts with `[INFO]` or `[PASS]`.
 *
 * @param {string[]} logs - All log lines of a result.
 * @returns {void}
 */
function displaySuccessLogs(logs) {
  const isInformational = (entry) =>
    entry.startsWith("[INFO]") || entry.startsWith("[PASS]");
  // Filter first, then print, so nothing is logged if filtering throws.
  logs.filter(isInformational).forEach((entry) => {
    console.log(` ${colors2.gray}${entry}${colors2.reset}`);
  });
}
|
|
2327
|
+
/**
 * Keep only the log lines that start with one of the failure-related
 * prefixes: `[FAIL]`, `[ERROR]`, `[FATAL]`, `[STACK]`, `[DEBUG-FAIL]` or
 * `[DEBUG-FAIL-CONTEXT]`.
 *
 * @param {string[]} logs - All log lines of a result.
 * @returns {string[]} The matching lines, in their original order.
 */
function filterFailureLogs(logs) {
  const failurePrefixes = [
    "[FAIL]",
    "[ERROR]",
    "[FATAL]",
    "[STACK]",
    "[DEBUG-FAIL]",
    "[DEBUG-FAIL-CONTEXT]"
  ];
  return logs.filter(
    (entry) => failurePrefixes.some((prefix) => entry.startsWith(prefix))
  );
}
|
|
2332
|
+
/**
 * Print the logs of one result. When any failure-related line is present,
 * only those lines are shown, grouped via `groupLogsByTestId`; otherwise the
 * success logs are shown.
 *
 * @param {string[]} logs - All log lines of a result.
 * @returns {void}
 */
function displayResultLogs(logs) {
  const failures = filterFailureLogs(logs);
  if (failures.length === 0) {
    displaySuccessLogs(logs);
    return;
  }
  displayGroupedFailures(groupLogsByTestId(failures));
}
|
|
2342
|
+
/**
 * Print a "Metrics:" heading followed by one `- key: value` line per entry.
 * Nothing is printed when `metrics` is empty.
 *
 * @param {Array<[string, *]>} metrics - Key/value pairs to list.
 * @returns {void}
 */
function displayMetrics(metrics) {
  if (!(metrics.length > 0)) {
    return;
  }
  console.log(" Metrics:");
  for (const [name, value] of metrics) {
    console.log(` - ${name}: ${value}`);
  }
}
|
|
2350
|
+
/**
 * Print the two-line header of one evaluation result: the model id (plus the
 * model key in parentheses when present) and benchmark name, then the
 * SUCCESS / FAILURE status with the score formatted to two decimals.
 *
 * @param {{model: string, modelKey?: string, benchmark: string,
 *          result: {success: boolean, score: number}}} r - Evaluation result.
 * @returns {void}
 */
function displayResultHeader(r) {
  const { model, modelKey, benchmark, result } = r;
  const { reset } = colors2;

  let status;
  if (result.success) {
    status = `${colors2.green}\u2714 SUCCESS${reset}`;
  } else {
    status = `${colors2.red}\u2716 FAILURE${reset}`;
  }

  const keySuffix = modelKey ? ` ${colors2.gray}(${modelKey})${reset}` : "";
  console.log(
    `\n${colors2.cyan}[${model}]${reset}${keySuffix} - ${colors2.magenta}${benchmark}${reset}`
  );

  // The score is formatted only after the first line has been printed.
  const scoreText = result.score.toFixed(2);
  console.log(` \u2514 ${status} | Score: ${colors2.yellow}${scoreText}${reset}`);
}
|
|
2361
|
+
/**
 * Debug console reporter: prints a banner, then for every result its header,
 * its metrics and (when it has a non-empty `logs` array) its logs, followed
 * by a closing rule.
 *
 * @param {Array<object>} results - Evaluation results to report.
 * @returns {void}
 */
function consoleDebugReporter(results) {
  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
  for (const entry of results) {
    displayResultHeader(entry);
    displayMetrics(Object.entries(entry.result.metrics));
    const logs = entry.result.logs;
    const hasLogs = logs == null ? void 0 : logs.length;
    if (hasLogs) {
      displayResultLogs(entry.result.logs);
    }
  }
  console.log("\n------------------------------------\n");
}
|
|
2373
|
+
|
|
2374
|
+
// src/reporters/json.ts
|
|
2375
|
+
/**
 * JSON reporter: prints all results as a 2-space indented JSON document.
 * Each result's `error` is replaced by its `message` (or left undefined when
 * there is no error) so the output is serializable.
 *
 * @param {Array<object>} results - Evaluation results to report.
 * @returns {void}
 */
function jsonReporter(results) {
  // Copies one result, swapping the error object for its message.
  const toSerializable = (entry) => {
    const failure = entry.result.error;
    const errorMessage = failure == null ? void 0 : failure.message;
    return {
      ...entry,
      result: {
        ...entry.result,
        error: errorMessage
      }
    };
  };
  const serializableResults = results.map((entry) => toSerializable(entry));
  console.log(JSON.stringify(serializableResults, null, 2));
}
|
|
2388
|
+
|
|
2389
|
+
// src/reporters/index.ts
|
|
2390
|
+
// Registry of the available reporters, keyed by the name callers pass as the
// `reporter` option.
var reporters = {
  "console": consoleReporter,
  "json": jsonReporter,
  "console.debug": consoleDebugReporter
};
|
|
2395
|
+
|
|
2396
|
+
// src/evaluate.ts
|
|
2397
|
+
/**
 * Run one benchmark against one model and wrap the outcome.
 *
 * Progress is logged before and after the run. If the run throws, the error
 * is logged with `console.error` and a failed result (score 0, empty
 * metrics) carrying the error is returned instead of rethrowing.
 *
 * @param {*} model - Model under test; its string `modelId` is used as the
 *   display name, otherwise "unknown-model".
 * @param {{name: string, run: Function}} benchmark - Benchmark to execute.
 * @param {string|undefined} modelKey - Optional key shown next to the model id.
 * @param {object|undefined} config - Passed through to `benchmark.run`.
 * @returns {Promise<{model: string, modelKey: (string|undefined),
 *   benchmark: string, result: object}>} The evaluation result.
 */
async function runSingleBenchmark(model, benchmark, modelKey, config) {
  const hasModelId =
    typeof model === "object" &&
    model !== null &&
    "modelId" in model &&
    typeof model.modelId === "string";
  const modelId = hasModelId ? model.modelId : "unknown-model";
  // Builds the "[model] (key)" prefix shared by all three messages.
  const tag = () => `[${modelId}]${modelKey ? ` (${modelKey})` : ""}`;

  try {
    console.log(`${tag()} Running benchmark: ${benchmark.name}...`);
    const result = await benchmark.run(model, config);
    console.log(
      `${tag()} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
    );
    return { model: modelId, modelKey, benchmark: benchmark.name, result };
  } catch (error) {
    console.error(`${tag()} Error running benchmark: ${benchmark.name}`, error);
    const failure = error instanceof Error ? error : new Error(String(error));
    return {
      model: modelId,
      modelKey,
      benchmark: benchmark.name,
      result: {
        score: 0,
        success: false,
        metrics: {},
        error: failure
      }
    };
  }
}
|
|
2431
|
+
/**
 * Normalize the `models` option into a list of `[key, model]` pairs.
 *
 * - An array yields one pair per element with an undefined key.
 * - A single object that has a `modelId` property yields one pair with an
 *   undefined key.
 * - Anything else is treated as a keyed record and yields its own entries.
 *
 * @param {*} models - Array of models, a single model, or a keyed record.
 * @returns {Array<[string|undefined, *]>} The `[key, model]` pairs.
 */
function normalizeModels(models) {
  if (Array.isArray(models)) {
    return Array.from(models, (model) => [void 0, model]);
  }
  const isSingleModel =
    typeof models === "object" && models !== null && "modelId" in models;
  if (isSingleModel) {
    return [[void 0, models]];
  }
  return Object.entries(models).map(([key, model]) => [key, model]);
}
|
|
2448
|
+
/**
 * Build the optional run configuration from the given settings. Only the
 * settings that are not `undefined` are included; when none is provided the
 * function returns `undefined` rather than an empty object.
 *
 * @param {number|undefined} temperature - Sampling temperature, if any.
 * @param {number|undefined} maxTokens - Token limit, if any.
 * @returns {{temperature?: number, maxTokens?: number}|undefined}
 */
function buildConfig(temperature, maxTokens) {
  const hasTemperature = temperature !== void 0;
  const hasMaxTokens = maxTokens !== void 0;
  if (!hasTemperature && !hasMaxTokens) {
    return void 0;
  }
  const config = {};
  if (hasTemperature) {
    config.temperature = temperature;
  }
  if (hasMaxTokens) {
    config.maxTokens = maxTokens;
  }
  return config;
}
|
|
2458
|
+
/**
 * Run the reporter registered under `reporter`, falling back to the console
 * reporter (with a warning) when the name is not registered.
 *
 * Only own, callable entries of `reporters` count as registered. A plain
 * `reporters[reporter]` lookup would also match inherited members such as
 * "constructor" or "toString" (silently printing nothing) and would throw on
 * "__proto__" instead of falling back.
 *
 * @param {string} reporter - Name of the reporter to use.
 * @param {Array<object>} results - Evaluation results to report.
 * @returns {void}
 */
function executeReporter(reporter, results) {
  const isRegistered = Object.prototype.hasOwnProperty.call(reporters, reporter);
  const report = isRegistered ? reporters[reporter] : void 0;
  if (typeof report === "function") {
    report(results);
    return;
  }
  console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
  reporters.console(results);
}
|
|
2467
|
+
/**
 * Run every benchmark against every model, one after another, then hand the
 * collected results to the selected reporter.
 *
 * @param {object} options
 * @param {*} options.models - Models to evaluate (see `normalizeModels`).
 * @param {Iterable<object>} options.benchmarks - Benchmarks to run per model.
 * @param {string} [options.reporter="console"] - Reporter name.
 * @param {number} [options.temperature] - Forwarded through `buildConfig`.
 * @param {number} [options.maxTokens] - Forwarded through `buildConfig`.
 * @returns {Promise<Array<object>>} All evaluation results, model-major.
 */
async function evaluate(options) {
  const {
    models,
    benchmarks: benchmarkList,
    reporter: reporterName = "console",
    temperature,
    maxTokens
  } = options;

  const entries = normalizeModels(models);
  const runConfig = buildConfig(temperature, maxTokens);

  const collected = [];
  for (const [modelKey, model] of entries) {
    for (const benchmark of benchmarkList) {
      // Awaited inside the loop on purpose: runs are strictly sequential.
      collected.push(
        await runSingleBenchmark(model, benchmark, modelKey, runConfig)
      );
    }
  }

  executeReporter(reporterName, collected);
  return collected;
}
|
|
1483
2492
|
export {
|
|
1484
2493
|
bfclMultipleBenchmark,
|
|
1485
2494
|
bfclParallelBenchmark,
|
|
1486
2495
|
bfclParallelMultipleBenchmark,
|
|
1487
2496
|
bfclSimpleBenchmark,
|
|
2497
|
+
complexFuncBenchBenchmark,
|
|
1488
2498
|
evaluate,
|
|
1489
2499
|
jsonGenerationBenchmark,
|
|
1490
2500
|
jsonGenerationSchemaOnlyBenchmark
|