@ai-sdk-tool/eval 0.1.8 → 1.0.0-canary.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/data/{BFCL_v3_multiple.json → BFCL_v3_multiple.jsonl} +1 -1
- package/data/{BFCL_v3_multiple_possible_answer.json → BFCL_v3_multiple_possible_answer.jsonl} +1 -1
- package/data/{BFCL_v3_parallel.json → BFCL_v3_parallel.jsonl} +1 -1
- package/data/{BFCL_v3_parallel_multiple.json → BFCL_v3_parallel_multiple.jsonl} +1 -1
- package/data/{BFCL_v3_parallel_multiple_possible_answer.json → BFCL_v3_parallel_multiple_possible_answer.jsonl} +1 -1
- package/data/{BFCL_v3_parallel_possible_answer.json → BFCL_v3_parallel_possible_answer.jsonl} +1 -1
- package/data/{BFCL_v3_simple.json → BFCL_v3_simple.jsonl} +1 -1
- package/data/{BFCL_v3_simple_possible_answer.json → BFCL_v3_simple_possible_answer.jsonl} +1 -1
- package/dist/index.cjs +1611 -1104
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +17 -17
- package/dist/index.d.ts +17 -17
- package/dist/index.js +1592 -1081
- package/dist/index.js.map +1 -1
- package/package.json +7 -10
package/dist/index.cjs
CHANGED
|
@@ -40,471 +40,98 @@ __export(index_exports, {
|
|
|
40
40
|
});
|
|
41
41
|
module.exports = __toCommonJS(index_exports);
|
|
42
42
|
|
|
43
|
-
// src/
|
|
44
|
-
var
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
red: "\x1B[31m",
|
|
48
|
-
yellow: "\x1B[33m",
|
|
49
|
-
cyan: "\x1B[36m",
|
|
50
|
-
magenta: "\x1B[35m",
|
|
51
|
-
gray: "\x1B[90m"
|
|
52
|
-
};
|
|
53
|
-
function printResult(result) {
|
|
54
|
-
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
55
|
-
const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
|
|
56
|
-
console.log(
|
|
57
|
-
`
|
|
58
|
-
${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
|
|
59
|
-
);
|
|
60
|
-
console.log(
|
|
61
|
-
` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
|
|
62
|
-
);
|
|
63
|
-
const metrics = Object.entries(benchmarkResult.metrics);
|
|
64
|
-
if (metrics.length > 0) {
|
|
65
|
-
console.log(" Metrics:");
|
|
66
|
-
for (const [key, value] of metrics) {
|
|
67
|
-
console.log(` - ${key}: ${value}`);
|
|
68
|
-
}
|
|
69
|
-
}
|
|
70
|
-
if (benchmarkResult.error) {
|
|
71
|
-
console.log(
|
|
72
|
-
` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
|
|
73
|
-
);
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
function consoleReporter(results) {
|
|
77
|
-
console.log("\n--- \u{1F4CA} Evaluation Report ---");
|
|
78
|
-
for (const result of results) {
|
|
79
|
-
printResult(result);
|
|
80
|
-
}
|
|
81
|
-
console.log("\n---------------------------\n");
|
|
82
|
-
}
|
|
43
|
+
// src/benchmarks/bfcl.ts
|
|
44
|
+
var import_node_fs2 = require("fs");
|
|
45
|
+
var import_node_path2 = __toESM(require("path"), 1);
|
|
46
|
+
var import_ai = require("ai");
|
|
83
47
|
|
|
84
|
-
// src/
|
|
85
|
-
var
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
if (line.startsWith("@"))
|
|
100
|
-
return `${colors2.cyan}${colors2.bold}${line}${colors2.reset}`;
|
|
101
|
-
return line;
|
|
102
|
-
}
|
|
103
|
-
function uniqueLines(lines) {
|
|
104
|
-
const seen = /* @__PURE__ */ new Set();
|
|
105
|
-
const out = [];
|
|
106
|
-
for (const l of lines) {
|
|
107
|
-
if (seen.has(l)) continue;
|
|
108
|
-
seen.add(l);
|
|
109
|
-
out.push(l);
|
|
110
|
-
}
|
|
111
|
-
return out;
|
|
112
|
-
}
|
|
113
|
-
function suggestFixFromDiff(parsed) {
|
|
114
|
-
const suggestions = [];
|
|
115
|
-
const { error_type, expected, actual, diff } = parsed ?? {};
|
|
116
|
-
if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
|
|
117
|
-
const expectedName = expected?.function;
|
|
118
|
-
const actualName = actual?.function;
|
|
119
|
-
if (expectedName && actualName && expectedName !== actualName) {
|
|
120
|
-
suggestions.push(
|
|
121
|
-
`Call the function '${expectedName}' instead of '${actualName}'.`
|
|
122
|
-
);
|
|
123
|
-
}
|
|
124
|
-
if (Array.isArray(expected?.functions)) {
|
|
125
|
-
suggestions.push(
|
|
126
|
-
`Ensure tool calls include: ${expected.functions.join(", ")}.`
|
|
127
|
-
);
|
|
128
|
-
}
|
|
129
|
-
}
|
|
130
|
-
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
|
|
131
|
-
const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
|
|
132
|
-
if (missing.length) {
|
|
133
|
-
suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
|
|
134
|
-
}
|
|
135
|
-
}
|
|
136
|
-
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
|
|
137
|
-
const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
|
|
138
|
-
if (extras.length) {
|
|
139
|
-
suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
|
|
140
|
-
}
|
|
141
|
-
}
|
|
142
|
-
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
|
|
143
|
-
const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
|
|
144
|
-
for (const param of targets) {
|
|
145
|
-
const allowedOneOfLine = diff.find(
|
|
146
|
-
(d) => String(d).startsWith("- expected one of:")
|
|
147
|
-
);
|
|
148
|
-
const allowedSingleLine = diff.find(
|
|
149
|
-
(d) => String(d).startsWith("- expected:")
|
|
150
|
-
);
|
|
151
|
-
if (allowedSingleLine) {
|
|
152
|
-
const value = allowedSingleLine.replace("- expected: ", "");
|
|
153
|
-
suggestions.push(`Set '${param}' to: ${value}.`);
|
|
154
|
-
} else if (allowedOneOfLine) {
|
|
155
|
-
const allowed = allowedOneOfLine.replace("- expected one of: ", "");
|
|
156
|
-
suggestions.push(`Set '${param}' to one of: ${allowed}.`);
|
|
157
|
-
} else {
|
|
158
|
-
suggestions.push(`Adjust '${param}' to an allowed value.`);
|
|
159
|
-
}
|
|
160
|
-
}
|
|
161
|
-
}
|
|
162
|
-
if (suggestions.length === 0 && typeof error_type === "string") {
|
|
163
|
-
if (error_type.includes("missing_required")) {
|
|
164
|
-
suggestions.push(
|
|
165
|
-
"Add all required parameters defined by the tool schema."
|
|
166
|
-
);
|
|
167
|
-
} else if (error_type.includes("unexpected_param")) {
|
|
168
|
-
suggestions.push("Remove parameters not present in the tool schema.");
|
|
169
|
-
} else if (error_type.includes("wrong_count")) {
|
|
170
|
-
suggestions.push(
|
|
171
|
-
"Adjust the number of tool calls to match expected count."
|
|
172
|
-
);
|
|
173
|
-
} else if (error_type.includes("wrong_func_name")) {
|
|
174
|
-
suggestions.push("Use the exact expected function name from the schema.");
|
|
175
|
-
} else if (error_type.includes("value_error")) {
|
|
176
|
-
suggestions.push("Choose a value from the allowed options.");
|
|
48
|
+
// src/utils/paths.ts
|
|
49
|
+
var import_node_fs = __toESM(require("fs"), 1);
|
|
50
|
+
var import_node_module = require("module");
|
|
51
|
+
var import_node_path = __toESM(require("path"), 1);
|
|
52
|
+
var import_node_url = require("url");
|
|
53
|
+
function tryResolveViaPackageEntry(moduleUrl) {
|
|
54
|
+
try {
|
|
55
|
+
const baseForRequireEntry = typeof moduleUrl === "string" && moduleUrl || import_node_path.default.join(process.cwd(), "package.json");
|
|
56
|
+
const requireFromEntry = (0, import_node_module.createRequire)(baseForRequireEntry);
|
|
57
|
+
const entryPath = requireFromEntry.resolve("@ai-sdk-tool/eval");
|
|
58
|
+
const entryDir = import_node_path.default.dirname(entryPath);
|
|
59
|
+
const guessPkgRoot = import_node_fs.default.existsSync(import_node_path.default.join(entryDir, "..")) ? import_node_path.default.resolve(entryDir, "..") : entryDir;
|
|
60
|
+
const dataAtRoot = import_node_path.default.join(guessPkgRoot, "data");
|
|
61
|
+
if (import_node_fs.default.existsSync(dataAtRoot)) {
|
|
62
|
+
return dataAtRoot;
|
|
177
63
|
}
|
|
64
|
+
} catch {
|
|
178
65
|
}
|
|
179
|
-
return
|
|
66
|
+
return null;
|
|
180
67
|
}
|
|
181
|
-
function
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
const
|
|
185
|
-
const
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
console.log(
|
|
191
|
-
` \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
|
|
192
|
-
);
|
|
193
|
-
const metrics = Object.entries(result.metrics);
|
|
194
|
-
if (metrics.length > 0) {
|
|
195
|
-
console.log(" Metrics:");
|
|
196
|
-
for (const [k, v] of metrics) console.log(` - ${k}: ${v}`);
|
|
197
|
-
}
|
|
198
|
-
if (result.logs && result.logs.length) {
|
|
199
|
-
const failLogs = result.logs.filter(
|
|
200
|
-
(l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
|
|
201
|
-
);
|
|
202
|
-
const hasFails = failLogs.length > 0;
|
|
203
|
-
if (hasFails) {
|
|
204
|
-
let getTestIdFromLogLine2 = function(line) {
|
|
205
|
-
if (line.startsWith("[FAIL]")) {
|
|
206
|
-
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
207
|
-
return m?.[1];
|
|
208
|
-
}
|
|
209
|
-
if (line.startsWith("[DEBUG-FAIL]")) {
|
|
210
|
-
try {
|
|
211
|
-
const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
212
|
-
return String(parsed?.id ?? "");
|
|
213
|
-
} catch {
|
|
214
|
-
}
|
|
215
|
-
}
|
|
216
|
-
if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
217
|
-
try {
|
|
218
|
-
const parsed = JSON.parse(
|
|
219
|
-
line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
|
|
220
|
-
);
|
|
221
|
-
return String(parsed?.id ?? "");
|
|
222
|
-
} catch {
|
|
223
|
-
}
|
|
224
|
-
}
|
|
225
|
-
return void 0;
|
|
226
|
-
};
|
|
227
|
-
var getTestIdFromLogLine = getTestIdFromLogLine2;
|
|
228
|
-
const byId = /* @__PURE__ */ new Map();
|
|
229
|
-
for (const line of failLogs) {
|
|
230
|
-
const id = getTestIdFromLogLine2(line);
|
|
231
|
-
const key = id ?? "__general__";
|
|
232
|
-
const arr = byId.get(key) ?? [];
|
|
233
|
-
arr.push(line);
|
|
234
|
-
byId.set(key, arr);
|
|
235
|
-
}
|
|
236
|
-
console.log(
|
|
237
|
-
` ${colors2.bold}Failure details (grouped):${colors2.reset}`
|
|
238
|
-
);
|
|
239
|
-
for (const [groupId, lines] of byId) {
|
|
240
|
-
if (groupId !== "__general__") {
|
|
241
|
-
console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
|
|
242
|
-
}
|
|
243
|
-
const debugIds = /* @__PURE__ */ new Set();
|
|
244
|
-
for (const l of lines) {
|
|
245
|
-
if (l.startsWith("[DEBUG-FAIL]")) {
|
|
246
|
-
try {
|
|
247
|
-
const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
248
|
-
if (parsed?.id) debugIds.add(String(parsed.id));
|
|
249
|
-
} catch {
|
|
250
|
-
}
|
|
251
|
-
}
|
|
252
|
-
}
|
|
253
|
-
for (const line of lines) {
|
|
254
|
-
if (line.startsWith("[FAIL]")) {
|
|
255
|
-
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
256
|
-
const failId = m?.[1];
|
|
257
|
-
if (failId && debugIds.has(failId)) continue;
|
|
258
|
-
console.log(` ${colors2.red}${line}${colors2.reset}`);
|
|
259
|
-
} else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
|
|
260
|
-
console.log(` ${colors2.yellow}${line}${colors2.reset}`);
|
|
261
|
-
} else if (line.startsWith("[STACK]")) {
|
|
262
|
-
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
263
|
-
} else if (line.startsWith("[DEBUG-FAIL]")) {
|
|
264
|
-
const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
|
|
265
|
-
try {
|
|
266
|
-
const parsed = JSON.parse(payload);
|
|
267
|
-
const { message, diff, expected, actual } = parsed;
|
|
268
|
-
if (message)
|
|
269
|
-
console.log(
|
|
270
|
-
` ${colors2.bold}${message}${colors2.reset}`
|
|
271
|
-
);
|
|
272
|
-
if (diff && Array.isArray(diff)) {
|
|
273
|
-
for (const dLine of diff)
|
|
274
|
-
console.log(" " + colorizeDiffLine(dLine));
|
|
275
|
-
} else {
|
|
276
|
-
console.log(" expected:");
|
|
277
|
-
console.log(
|
|
278
|
-
colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
|
|
279
|
-
);
|
|
280
|
-
console.log(" actual:");
|
|
281
|
-
console.log(
|
|
282
|
-
colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
|
|
283
|
-
);
|
|
284
|
-
}
|
|
285
|
-
const suggestions = suggestFixFromDiff(parsed);
|
|
286
|
-
if (suggestions.length) {
|
|
287
|
-
console.log(
|
|
288
|
-
` ${colors2.bold}Suggested fix:${colors2.reset}`
|
|
289
|
-
);
|
|
290
|
-
for (const s of suggestions)
|
|
291
|
-
console.log(` \u2022 ${s}`);
|
|
292
|
-
}
|
|
293
|
-
} catch {
|
|
294
|
-
console.log(` ${line}`);
|
|
295
|
-
}
|
|
296
|
-
} else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
297
|
-
const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
|
|
298
|
-
try {
|
|
299
|
-
const ctx = JSON.parse(payload);
|
|
300
|
-
console.log(` ${colors2.gray}context:${colors2.reset}`);
|
|
301
|
-
if (ctx.tool_schema) {
|
|
302
|
-
console.log(
|
|
303
|
-
colors2.gray + " tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n ") + colors2.reset
|
|
304
|
-
);
|
|
305
|
-
}
|
|
306
|
-
if (ctx.last_user_query) {
|
|
307
|
-
console.log(
|
|
308
|
-
colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
|
|
309
|
-
);
|
|
310
|
-
}
|
|
311
|
-
if (ctx.raw_model_text) {
|
|
312
|
-
console.log(
|
|
313
|
-
colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
|
|
314
|
-
);
|
|
315
|
-
}
|
|
316
|
-
if (ctx.parsed_tool_calls) {
|
|
317
|
-
console.log(
|
|
318
|
-
colors2.gray + " parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n ") + colors2.reset
|
|
319
|
-
);
|
|
320
|
-
}
|
|
321
|
-
if (ctx.ground_truth) {
|
|
322
|
-
console.log(
|
|
323
|
-
colors2.gray + " ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n ") + colors2.reset
|
|
324
|
-
);
|
|
325
|
-
}
|
|
326
|
-
if (ctx.finish_reason) {
|
|
327
|
-
console.log(
|
|
328
|
-
colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
|
|
329
|
-
);
|
|
330
|
-
}
|
|
331
|
-
} catch {
|
|
332
|
-
console.log(` ${line}`);
|
|
333
|
-
}
|
|
334
|
-
}
|
|
335
|
-
}
|
|
336
|
-
}
|
|
337
|
-
} else {
|
|
338
|
-
const info = result.logs.filter(
|
|
339
|
-
(l) => l.startsWith("[INFO]") || l.startsWith("[PASS]")
|
|
340
|
-
);
|
|
341
|
-
for (const line of info)
|
|
342
|
-
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
343
|
-
}
|
|
68
|
+
function tryResolveViaPackageJson(moduleUrl) {
|
|
69
|
+
try {
|
|
70
|
+
const baseForRequire = typeof moduleUrl === "string" && moduleUrl || import_node_path.default.join(process.cwd(), "package.json");
|
|
71
|
+
const require2 = (0, import_node_module.createRequire)(baseForRequire);
|
|
72
|
+
const pkgJsonPath = require2.resolve("@ai-sdk-tool/eval/package.json");
|
|
73
|
+
const pkgDir = import_node_path.default.dirname(pkgJsonPath);
|
|
74
|
+
const dataAtPkg = import_node_path.default.join(pkgDir, "data");
|
|
75
|
+
if (import_node_fs.default.existsSync(dataAtPkg)) {
|
|
76
|
+
return dataAtPkg;
|
|
344
77
|
}
|
|
78
|
+
} catch {
|
|
345
79
|
}
|
|
346
|
-
|
|
80
|
+
return null;
|
|
347
81
|
}
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
...r.result,
|
|
355
|
-
error: r.result.error?.message
|
|
82
|
+
function getStartDir(moduleUrl) {
|
|
83
|
+
if (moduleUrl) {
|
|
84
|
+
try {
|
|
85
|
+
return import_node_path.default.dirname((0, import_node_url.fileURLToPath)(moduleUrl));
|
|
86
|
+
} catch {
|
|
87
|
+
return process.cwd();
|
|
356
88
|
}
|
|
357
|
-
}));
|
|
358
|
-
console.log(JSON.stringify(serializableResults, null, 2));
|
|
359
|
-
}
|
|
360
|
-
|
|
361
|
-
// src/reporters/index.ts
|
|
362
|
-
var reporters = {
|
|
363
|
-
console: consoleReporter,
|
|
364
|
-
json: jsonReporter,
|
|
365
|
-
"console.debug": consoleDebugReporter
|
|
366
|
-
};
|
|
367
|
-
|
|
368
|
-
// src/evaluate.ts
|
|
369
|
-
async function runSingleBenchmark(model, benchmark, modelKey, config) {
|
|
370
|
-
const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
|
|
371
|
-
try {
|
|
372
|
-
console.log(
|
|
373
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
|
|
374
|
-
);
|
|
375
|
-
const result = await benchmark.run(model, config);
|
|
376
|
-
console.log(
|
|
377
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
|
|
378
|
-
);
|
|
379
|
-
return {
|
|
380
|
-
model: modelId,
|
|
381
|
-
modelKey,
|
|
382
|
-
benchmark: benchmark.name,
|
|
383
|
-
result
|
|
384
|
-
};
|
|
385
|
-
} catch (error) {
|
|
386
|
-
console.error(
|
|
387
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
|
|
388
|
-
error
|
|
389
|
-
);
|
|
390
|
-
return {
|
|
391
|
-
model: modelId,
|
|
392
|
-
modelKey,
|
|
393
|
-
benchmark: benchmark.name,
|
|
394
|
-
result: {
|
|
395
|
-
score: 0,
|
|
396
|
-
success: false,
|
|
397
|
-
metrics: {},
|
|
398
|
-
error: error instanceof Error ? error : new Error(String(error))
|
|
399
|
-
}
|
|
400
|
-
};
|
|
401
89
|
}
|
|
90
|
+
return process.cwd();
|
|
402
91
|
}
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
} = options;
|
|
411
|
-
const modelEntries = [];
|
|
412
|
-
if (Array.isArray(models)) {
|
|
413
|
-
for (const m of models) modelEntries.push([void 0, m]);
|
|
414
|
-
} else if (typeof models === "object" && models !== null && "modelId" in models) {
|
|
415
|
-
modelEntries.push([void 0, models]);
|
|
416
|
-
} else {
|
|
417
|
-
for (const [key, m] of Object.entries(
|
|
418
|
-
models
|
|
419
|
-
)) {
|
|
420
|
-
modelEntries.push([key, m]);
|
|
92
|
+
function findDataDirByTraversal(startDir) {
|
|
93
|
+
let dir = startDir;
|
|
94
|
+
const MAX_PARENT_TRAVERSAL_DEPTH = 6;
|
|
95
|
+
for (let i = 0; i < MAX_PARENT_TRAVERSAL_DEPTH; i += 1) {
|
|
96
|
+
const dataCandidate = import_node_path.default.join(dir, "data");
|
|
97
|
+
if (import_node_fs.default.existsSync(dataCandidate)) {
|
|
98
|
+
return dataCandidate;
|
|
421
99
|
}
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
for (const benchmark of benchmarks) {
|
|
426
|
-
const config = {};
|
|
427
|
-
if (temperature !== void 0) config.temperature = temperature;
|
|
428
|
-
if (maxTokens !== void 0) config.maxTokens = maxTokens;
|
|
429
|
-
const evaluationResult = await runSingleBenchmark(
|
|
430
|
-
model,
|
|
431
|
-
benchmark,
|
|
432
|
-
modelKey,
|
|
433
|
-
Object.keys(config).length > 0 ? config : void 0
|
|
434
|
-
);
|
|
435
|
-
allResults.push(evaluationResult);
|
|
100
|
+
const parent = import_node_path.default.resolve(dir, "..");
|
|
101
|
+
if (parent === dir) {
|
|
102
|
+
break;
|
|
436
103
|
}
|
|
104
|
+
dir = parent;
|
|
437
105
|
}
|
|
438
|
-
|
|
439
|
-
if (report) {
|
|
440
|
-
report(allResults);
|
|
441
|
-
} else {
|
|
442
|
-
console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
|
|
443
|
-
reporters.console(allResults);
|
|
444
|
-
}
|
|
445
|
-
return allResults;
|
|
106
|
+
return null;
|
|
446
107
|
}
|
|
447
|
-
|
|
448
|
-
// src/benchmarks/bfcl.ts
|
|
449
|
-
var import_ai = require("ai");
|
|
450
|
-
var import_fs2 = require("fs");
|
|
451
|
-
var import_path2 = __toESM(require("path"), 1);
|
|
452
|
-
|
|
453
|
-
// src/utils/paths.ts
|
|
454
|
-
var import_fs = __toESM(require("fs"), 1);
|
|
455
|
-
var import_module = require("module");
|
|
456
|
-
var import_path = __toESM(require("path"), 1);
|
|
457
|
-
var import_url = require("url");
|
|
458
108
|
function resolveDataDir(fromModuleUrl) {
|
|
459
|
-
const moduleUrl = fromModuleUrl;
|
|
460
109
|
const override = process.env.BFCL_DATA_DIR;
|
|
461
110
|
if (override && override.trim().length > 0) {
|
|
462
111
|
return override;
|
|
463
112
|
}
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
const entryPath = requireFromEntry.resolve("@ai-sdk-tool/eval");
|
|
468
|
-
const entryDir = import_path.default.dirname(entryPath);
|
|
469
|
-
const guessPkgRoot = import_fs.default.existsSync(import_path.default.join(entryDir, "..")) ? import_path.default.resolve(entryDir, "..") : entryDir;
|
|
470
|
-
const dataAtRoot = import_path.default.join(guessPkgRoot, "data");
|
|
471
|
-
if (import_fs.default.existsSync(dataAtRoot)) return dataAtRoot;
|
|
472
|
-
} catch {
|
|
473
|
-
}
|
|
474
|
-
try {
|
|
475
|
-
const baseForRequire = typeof moduleUrl === "string" && moduleUrl || import_path.default.join(process.cwd(), "package.json");
|
|
476
|
-
const require2 = (0, import_module.createRequire)(baseForRequire);
|
|
477
|
-
const pkgJsonPath = require2.resolve("@ai-sdk-tool/eval/package.json");
|
|
478
|
-
const pkgDir = import_path.default.dirname(pkgJsonPath);
|
|
479
|
-
const dataAtPkg = import_path.default.join(pkgDir, "data");
|
|
480
|
-
if (import_fs.default.existsSync(dataAtPkg)) return dataAtPkg;
|
|
481
|
-
} catch {
|
|
113
|
+
const viaEntry = tryResolveViaPackageEntry(fromModuleUrl);
|
|
114
|
+
if (viaEntry) {
|
|
115
|
+
return viaEntry;
|
|
482
116
|
}
|
|
483
|
-
|
|
484
|
-
if (
|
|
485
|
-
|
|
486
|
-
startDir = import_path.default.dirname((0, import_url.fileURLToPath)(moduleUrl));
|
|
487
|
-
} catch {
|
|
488
|
-
startDir = process.cwd();
|
|
489
|
-
}
|
|
490
|
-
} else {
|
|
491
|
-
startDir = process.cwd();
|
|
117
|
+
const viaPackageJson = tryResolveViaPackageJson(fromModuleUrl);
|
|
118
|
+
if (viaPackageJson) {
|
|
119
|
+
return viaPackageJson;
|
|
492
120
|
}
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
const parent = import_path.default.resolve(dir, "..");
|
|
498
|
-
if (parent === dir) break;
|
|
499
|
-
dir = parent;
|
|
121
|
+
const startDir = getStartDir(fromModuleUrl);
|
|
122
|
+
const viaTraversal = findDataDirByTraversal(startDir);
|
|
123
|
+
if (viaTraversal) {
|
|
124
|
+
return viaTraversal;
|
|
500
125
|
}
|
|
501
|
-
const pkgRoot =
|
|
502
|
-
return
|
|
126
|
+
const pkgRoot = import_node_path.default.resolve(startDir, "..", "..");
|
|
127
|
+
return import_node_path.default.join(pkgRoot, "data");
|
|
503
128
|
}
|
|
504
129
|
|
|
505
130
|
// src/benchmarks/bfcl/ast-checker.ts
|
|
506
131
|
function standardizeString(input) {
|
|
507
|
-
if (typeof input !== "string")
|
|
132
|
+
if (typeof input !== "string") {
|
|
133
|
+
return input;
|
|
134
|
+
}
|
|
508
135
|
const regex = /[ ,./\\-_*^]/g;
|
|
509
136
|
return input.replace(regex, "").toLowerCase().replace(/'/g, '"');
|
|
510
137
|
}
|
|
@@ -524,131 +151,185 @@ function checkStringValue(param, modelValue, possibleAnswers) {
|
|
|
524
151
|
}
|
|
525
152
|
return { valid: true };
|
|
526
153
|
}
|
|
527
|
-
function
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
const expectedFuncName = funcDescription.name;
|
|
531
|
-
const expectedParams = funcDescription.parameters.properties;
|
|
532
|
-
const requiredParams = funcDescription.parameters.required;
|
|
533
|
-
if (modelFuncName !== expectedFuncName) {
|
|
534
|
-
return {
|
|
535
|
-
valid: false,
|
|
536
|
-
error: `Function name '${modelFuncName}' does not match expected '${expectedFuncName}'.`,
|
|
537
|
-
error_type: "simple_function_checker:wrong_func_name"
|
|
538
|
-
};
|
|
154
|
+
function normalizeObject(obj) {
|
|
155
|
+
if (Array.isArray(obj)) {
|
|
156
|
+
return obj.map(normalizeObject);
|
|
539
157
|
}
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
valid: false,
|
|
546
|
-
error: `Missing required parameter: '${param}'.`,
|
|
547
|
-
error_type: "simple_function_checker:missing_required"
|
|
548
|
-
};
|
|
549
|
-
}
|
|
550
|
-
}
|
|
551
|
-
if (modelArgs && typeof modelArgs === "object") {
|
|
552
|
-
for (const paramName of Object.keys(argsObj)) {
|
|
553
|
-
const modelValue = argsObj[paramName];
|
|
554
|
-
if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
|
|
555
|
-
return {
|
|
556
|
-
valid: false,
|
|
557
|
-
error: `Unexpected parameter: '${paramName}'.`,
|
|
558
|
-
error_type: "simple_function_checker:unexpected_param"
|
|
559
|
-
};
|
|
560
|
-
}
|
|
561
|
-
const possibleValues = possibleAnswerParams[paramName];
|
|
562
|
-
if (typeof modelValue === "string") {
|
|
563
|
-
const result = checkStringValue(
|
|
564
|
-
paramName,
|
|
565
|
-
modelValue,
|
|
566
|
-
possibleValues ?? []
|
|
567
|
-
);
|
|
568
|
-
if (!result.valid) return result;
|
|
569
|
-
} else if (Array.isArray(modelValue)) {
|
|
570
|
-
const modelValueStr = JSON.stringify(
|
|
571
|
-
modelValue.map((v) => standardizeString(String(v))).sort()
|
|
572
|
-
);
|
|
573
|
-
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
|
|
574
|
-
if (!Array.isArray(p)) return false;
|
|
575
|
-
return JSON.stringify(
|
|
576
|
-
p.map((v) => standardizeString(String(v))).sort()
|
|
577
|
-
) === modelValueStr;
|
|
578
|
-
}) : false;
|
|
579
|
-
if (!hasMatch) {
|
|
580
|
-
return {
|
|
581
|
-
valid: false,
|
|
582
|
-
error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
|
|
583
|
-
modelValue
|
|
584
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
585
|
-
error_type: "value_error:list"
|
|
586
|
-
};
|
|
587
|
-
}
|
|
158
|
+
if (obj && typeof obj === "object") {
|
|
159
|
+
const normalized = {};
|
|
160
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
161
|
+
if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
|
|
162
|
+
normalized[key] = value[0];
|
|
588
163
|
} else {
|
|
589
|
-
|
|
590
|
-
if (modelValue === possibleValue) return true;
|
|
591
|
-
if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
|
|
592
|
-
try {
|
|
593
|
-
const normalizeObject = (obj) => {
|
|
594
|
-
if (Array.isArray(obj)) {
|
|
595
|
-
return obj.map(normalizeObject);
|
|
596
|
-
}
|
|
597
|
-
if (obj && typeof obj === "object") {
|
|
598
|
-
const normalized = {};
|
|
599
|
-
for (const [key, value] of Object.entries(
|
|
600
|
-
obj
|
|
601
|
-
)) {
|
|
602
|
-
if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
|
|
603
|
-
normalized[key] = value[0];
|
|
604
|
-
} else {
|
|
605
|
-
normalized[key] = normalizeObject(value);
|
|
606
|
-
}
|
|
607
|
-
}
|
|
608
|
-
return normalized;
|
|
609
|
-
}
|
|
610
|
-
return obj;
|
|
611
|
-
};
|
|
612
|
-
const normalizedModel = normalizeObject(modelValue);
|
|
613
|
-
const normalizedPossible = normalizeObject(possibleValue);
|
|
614
|
-
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
615
|
-
} catch {
|
|
616
|
-
return false;
|
|
617
|
-
}
|
|
618
|
-
}
|
|
619
|
-
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
620
|
-
return modelValue.toString() === possibleValue;
|
|
621
|
-
}
|
|
622
|
-
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
623
|
-
return modelValue === possibleValue.toString();
|
|
624
|
-
}
|
|
625
|
-
return false;
|
|
626
|
-
}) : false;
|
|
627
|
-
if (!hasMatch) {
|
|
628
|
-
return {
|
|
629
|
-
valid: false,
|
|
630
|
-
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
631
|
-
modelValue
|
|
632
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
633
|
-
error_type: "value_error:other"
|
|
634
|
-
};
|
|
635
|
-
}
|
|
164
|
+
normalized[key] = normalizeObject(value);
|
|
636
165
|
}
|
|
637
166
|
}
|
|
167
|
+
return normalized;
|
|
638
168
|
}
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
169
|
+
return obj;
|
|
170
|
+
}
|
|
171
|
+
function valuesMatch(modelValue, possibleValue) {
|
|
172
|
+
if (modelValue === possibleValue) {
|
|
173
|
+
return true;
|
|
174
|
+
}
|
|
175
|
+
if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
|
|
176
|
+
try {
|
|
177
|
+
const normalizedModel = normalizeObject(modelValue);
|
|
178
|
+
const normalizedPossible = normalizeObject(possibleValue);
|
|
179
|
+
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
180
|
+
} catch {
|
|
181
|
+
return false;
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
185
|
+
return modelValue.toString() === possibleValue;
|
|
186
|
+
}
|
|
187
|
+
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
188
|
+
return modelValue === possibleValue.toString();
|
|
189
|
+
}
|
|
190
|
+
return false;
|
|
191
|
+
}
|
|
192
|
+
function checkArrayValue(paramName, modelValue, possibleValues) {
|
|
193
|
+
const modelValueStr = JSON.stringify(
|
|
194
|
+
modelValue.map((v) => standardizeString(String(v))).sort()
|
|
195
|
+
);
|
|
196
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
|
|
197
|
+
if (!Array.isArray(p)) {
|
|
198
|
+
return false;
|
|
199
|
+
}
|
|
200
|
+
return JSON.stringify(p.map((v) => standardizeString(String(v))).sort()) === modelValueStr;
|
|
201
|
+
}) : false;
|
|
202
|
+
if (!hasMatch) {
|
|
203
|
+
return {
|
|
204
|
+
valid: false,
|
|
205
|
+
error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
|
|
206
|
+
modelValue
|
|
207
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
208
|
+
error_type: "value_error:list"
|
|
209
|
+
};
|
|
210
|
+
}
|
|
211
|
+
return { valid: true };
|
|
212
|
+
}
|
|
213
|
+
function checkObjectValue(paramName, modelValue, possibleValues) {
|
|
214
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some(
|
|
215
|
+
(possibleValue) => valuesMatch(modelValue, possibleValue)
|
|
216
|
+
) : false;
|
|
217
|
+
if (!hasMatch) {
|
|
218
|
+
return {
|
|
219
|
+
valid: false,
|
|
220
|
+
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
221
|
+
modelValue
|
|
222
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
223
|
+
error_type: "value_error:other"
|
|
224
|
+
};
|
|
225
|
+
}
|
|
226
|
+
return { valid: true };
|
|
227
|
+
}
|
|
228
|
+
/**
 * Validates a single model tool call against one ground-truth answer.
 *
 * The checks run in a fixed order and the first failing result is returned
 * unchanged: function name, required parameters, every supplied parameter,
 * then parameters the answer lists but the model omitted.
 *
 * @param {object} funcDescription - Tool schema (`name`, `parameters.properties`,
 *   `parameters.required`).
 * @param {object} modelToolCall - Model call (`toolName`, `args`).
 * @param {object} possibleAnswer - Ground truth; only the value under its
 *   first key is used as the accepted-values map.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
  const nameResult = checkFunctionName(
    funcDescription.name,
    modelToolCall.toolName
  );
  if (!nameResult.valid) {
    return nameResult;
  }

  const firstAnswerKey = Object.keys(possibleAnswer)[0];
  const possibleAnswerParams = possibleAnswer[firstAnswerKey];
  const rawArgs = modelToolCall.args;
  // Non-object args (missing, string, number) are treated as "no arguments".
  const argsObj = rawArgs && typeof rawArgs === "object" ? rawArgs : {};
  const context = {
    funcDescription,
    modelToolCall,
    possibleAnswerParams,
    expectedParams: funcDescription.parameters.properties
  };

  // Each stage is a thunk so later stages only run when earlier ones pass.
  const stages = [
    () => checkRequiredParams(funcDescription.parameters.required, argsObj),
    () => checkAllParameters(argsObj, context),
    () => checkOptionalParams(argsObj, possibleAnswerParams)
  ];
  for (const runStage of stages) {
    const outcome = runStage();
    if (!outcome.valid) {
      return outcome;
    }
  }
  return { valid: true };
}
|
|
261
|
+
/**
 * Compares the tool name the model called with the expected one (strict
 * equality, no normalization).
 *
 * @param {*} expected - Name from the tool schema.
 * @param {*} actual - Name from the model's tool call.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function checkFunctionName(expected, actual) {
  if (actual === expected) {
    return { valid: true };
  }
  const error = `Function name '${actual}' does not match expected '${expected}'.`;
  return {
    valid: false,
    error,
    error_type: "simple_function_checker:wrong_func_name"
  };
}
|
|
271
|
+
/**
 * Verifies that every required parameter was supplied by the model.
 *
 * @param {Iterable<string>|null|undefined} requiredParams - Names from the
 *   schema's `required` list. A missing list is treated as "nothing is
 *   required" instead of throwing, since `required` is optional in a schema.
 * @param {object} argsObj - Arguments object from the model's tool call.
 * @returns {{valid: boolean, error?: string, error_type?: string}} The first
 *   missing parameter is reported; `{ valid: true }` when none is missing.
 */
function checkRequiredParams(requiredParams, argsObj) {
  for (const param of requiredParams ?? []) {
    // Own-property test: `in` would also accept inherited names such as
    // "toString" or "constructor" and report them as supplied.
    if (!Object.hasOwn(argsObj, param)) {
      return {
        valid: false,
        error: `Missing required parameter: '${param}'.`,
        error_type: "simple_function_checker:missing_required"
      };
    }
  }
  return { valid: true };
}
|
|
283
|
+
/**
 * Runs `checkSingleParameter` over every argument the model supplied and
 * returns the first failing result.
 *
 * @param {object} argsObj - Arguments from the model's tool call.
 * @param {object} context - Shared checker context passed through unchanged.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function checkAllParameters(argsObj, context) {
  for (const [name, value] of Object.entries(argsObj)) {
    const outcome = checkSingleParameter(name, value, context);
    if (!outcome.valid) {
      return outcome;
    }
  }
  return { valid: true };
}
|
|
296
|
+
/**
 * Validates one model-supplied argument.
 *
 * The parameter must be declared both in the tool schema
 * (`context.expectedParams`) and in the ground truth
 * (`context.possibleAnswerParams`); otherwise it is reported as unexpected.
 * The value is then dispatched by its runtime type to the string, array or
 * generic checker.
 *
 * @param {string} paramName - Argument name.
 * @param {*} modelValue - Argument value produced by the model.
 * @param {{expectedParams: object, possibleAnswerParams: object}} context
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function checkSingleParameter(paramName, modelValue, context) {
  const { expectedParams, possibleAnswerParams } = context;
  // Own-property tests: with `in`, an argument named e.g. "toString" would be
  // found on Object.prototype and slip past this guard.
  const isDeclared =
    Object.hasOwn(expectedParams, paramName) &&
    Object.hasOwn(possibleAnswerParams, paramName);
  if (!isDeclared) {
    return {
      valid: false,
      error: `Unexpected parameter: '${paramName}'.`,
      error_type: "simple_function_checker:unexpected_param"
    };
  }

  const possibleValues = possibleAnswerParams[paramName];
  if (typeof modelValue === "string") {
    // Only the string checker receives an empty-list fallback.
    return checkStringValue(paramName, modelValue, possibleValues ?? []);
  }
  if (Array.isArray(modelValue)) {
    return checkArrayValue(paramName, modelValue, possibleValues);
  }
  return checkObjectValue(paramName, modelValue, possibleValues);
}
|
|
317
|
+
/**
 * Ensures the model did not omit a parameter that the ground truth lists
 * without marking it optional.
 *
 * A ground-truth parameter counts as optional when its accepted-values array
 * contains the empty string `""`.
 *
 * @param {object} argsObj - Arguments from the model's tool call.
 * @param {object|null|undefined} possibleAnswerParams - Map of parameter name
 *   to accepted values. A missing map yields `{ valid: true }`.
 * @returns {{valid: boolean, error?: string, error_type?: string}}
 */
function checkOptionalParams(argsObj, possibleAnswerParams) {
  const answerEntries = Object.entries(possibleAnswerParams ?? {});
  for (const [paramName, acceptedValues] of answerEntries) {
    const isOptional =
      Array.isArray(acceptedValues) && acceptedValues.includes("");
    // Own-property test: `in` would treat inherited names such as "toString"
    // as supplied by the model.
    if (isOptional || Object.hasOwn(argsObj, paramName)) {
      continue;
    }
    return {
      valid: false,
      error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
      error_type: "simple_function_checker:missing_optional"
    };
  }
  return { valid: true };
}
|
|
652
333
|
function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possibleAnswers) {
|
|
653
334
|
if (modelToolCalls.length !== possibleAnswers.length) {
|
|
654
335
|
return {
|
|
@@ -671,8 +352,10 @@ function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possib
|
|
|
671
352
|
};
|
|
672
353
|
}
|
|
673
354
|
let foundMatch = false;
|
|
674
|
-
for (let i = 0; i < modelToolCalls.length; i
|
|
675
|
-
if (matchedModelCallIndices.has(i))
|
|
355
|
+
for (let i = 0; i < modelToolCalls.length; i += 1) {
|
|
356
|
+
if (matchedModelCallIndices.has(i)) {
|
|
357
|
+
continue;
|
|
358
|
+
}
|
|
676
359
|
const checkerResult = simpleFunctionChecker(
|
|
677
360
|
funcDescription,
|
|
678
361
|
modelToolCalls[i],
|
|
@@ -721,6 +404,8 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
|
|
|
721
404
|
}
|
|
722
405
|
|
|
723
406
|
// src/benchmarks/bfcl.ts
|
|
407
|
+
// Splits file contents into lines on either LF or CRLF line endings.
var LINE_SPLIT_REGEX = /\r?\n/;
|
|
408
|
+
// Matches strings made only of ASCII digits (no sign, no decimal point).
var NUMERIC_STRING_REGEX = /^\d+$/;
|
|
724
409
|
function check(testCase, modelOutput, possibleAnswer) {
|
|
725
410
|
const category = testCase.id.split("_")[0];
|
|
726
411
|
try {
|
|
@@ -737,19 +422,22 @@ function check(testCase, modelOutput, possibleAnswer) {
|
|
|
737
422
|
modelOutput[0],
|
|
738
423
|
possibleAnswer.ground_truth[0]
|
|
739
424
|
);
|
|
740
|
-
}
|
|
425
|
+
}
|
|
426
|
+
if (category === "parallel") {
|
|
741
427
|
return parallelFunctionCheckerNoOrder(
|
|
742
428
|
testCase.function,
|
|
743
429
|
modelOutput,
|
|
744
430
|
possibleAnswer.ground_truth
|
|
745
431
|
);
|
|
746
|
-
}
|
|
432
|
+
}
|
|
433
|
+
if (category === "multiple") {
|
|
747
434
|
return multipleFunctionChecker(
|
|
748
435
|
testCase.function,
|
|
749
436
|
modelOutput,
|
|
750
437
|
possibleAnswer.ground_truth
|
|
751
438
|
);
|
|
752
|
-
}
|
|
439
|
+
}
|
|
440
|
+
if (category.includes("parallel-multiple")) {
|
|
753
441
|
return parallelFunctionCheckerNoOrder(
|
|
754
442
|
testCase.function,
|
|
755
443
|
modelOutput,
|
|
@@ -777,16 +465,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
777
465
|
try {
|
|
778
466
|
const dataPath = resolveDataDir();
|
|
779
467
|
logs.push(`[INFO] Using data dir: ${dataPath}`);
|
|
780
|
-
const testCasesJson = await
|
|
781
|
-
|
|
468
|
+
const testCasesJson = await import_node_fs2.promises.readFile(
|
|
469
|
+
import_node_path2.default.join(dataPath, testDataFile),
|
|
782
470
|
"utf-8"
|
|
783
471
|
);
|
|
784
|
-
const possibleAnswersJson = await
|
|
785
|
-
|
|
472
|
+
const possibleAnswersJson = await import_node_fs2.promises.readFile(
|
|
473
|
+
import_node_path2.default.join(dataPath, answerDataFile),
|
|
786
474
|
"utf-8"
|
|
787
475
|
);
|
|
788
|
-
testCases = testCasesJson.split(
|
|
789
|
-
const possibleAnswers = possibleAnswersJson.split(
|
|
476
|
+
testCases = testCasesJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
477
|
+
const possibleAnswers = possibleAnswersJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
790
478
|
const possibleAnswersMap = new Map(
|
|
791
479
|
possibleAnswers.map((ans) => [ans.id, ans])
|
|
792
480
|
);
|
|
@@ -798,373 +486,600 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
798
486
|
`[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
|
|
799
487
|
);
|
|
800
488
|
}
|
|
489
|
+
/**
 * Rewrites dataset-specific type names on `copy.type` in place:
 * "dict" -> "object", "tuple" -> "array", "integer"/"float" -> "number".
 * Any other value (including a missing type) is left untouched.
 *
 * @param {object} copy - Schema node to mutate.
 * @returns {void}
 */
const fixSchemaType = (copy) => {
  switch (copy.type) {
    case "dict":
      copy.type = "object";
      break;
    case "tuple":
      copy.type = "array";
      break;
    case "integer":
    case "float":
      copy.type = "number";
      break;
    default:
      break;
  }
};
|
|
503
|
+
/**
 * Replaces `copy.properties` with a new container whose members have each
 * been passed through `fixSchemaFn`.
 *
 * A fresh container is built instead of assigning into the existing one:
 * when `copy` is a shallow clone of a schema (object spread), the
 * `properties` container is still shared with the source schema, and
 * assigning into it would silently rewrite the caller's original data.
 *
 * @param {object} copy - Schema node whose `properties` should be normalized.
 *   Nothing happens when `properties` is missing or not an object.
 * @param {(schema: *) => *} fixSchemaFn - Normalizer applied to each member.
 * @returns {void}
 */
const fixSchemaProperties = (copy, fixSchemaFn) => {
  const { properties } = copy;
  if (!properties || typeof properties !== "object") {
    return;
  }
  // Arrays keep their shape; plain objects are rebuilt key by key.
  copy.properties = Array.isArray(properties)
    ? properties.map((entry) => fixSchemaFn(entry))
    : Object.fromEntries(
        Object.entries(properties).map(([key, subSchema]) => [
          key,
          fixSchemaFn(subSchema)
        ])
      );
};
|
|
801
513
|
/**
 * Recursively normalizes a tool parameter schema.
 *
 * - Anything that is not a non-null object becomes an empty object schema.
 * - Arrays are normalized element by element.
 * - Objects are shallow-copied, then their `type`, `properties` and `items`
 *   are normalized via `fixSchemaType`, `fixSchemaProperties` and recursion.
 *
 * @param {*} schema - Raw schema node.
 * @returns {object|Array} Normalized copy of the node.
 */
const fixSchema = (schema) => {
  const isObjectLike = Boolean(schema) && typeof schema === "object";
  if (!isObjectLike) {
    return { type: "object", properties: {} };
  }
  if (Array.isArray(schema)) {
    return schema.map((entry) => fixSchema(entry));
  }
  const normalized = { ...schema };
  fixSchemaType(normalized);
  fixSchemaProperties(normalized, fixSchema);
  if (normalized.items) {
    normalized.items = fixSchema(normalized.items);
  }
  return normalized;
};
|
|
528
|
+
/**
 * Flattens one level of nesting when the message list contains at least one
 * nested array; otherwise returns the input untouched (same reference).
 *
 * @param {*} messages - Message list, possibly grouped into sub-arrays.
 * @returns {*} Flattened array, or the original value.
 */
const flattenMessages = (messages) => {
  if (!Array.isArray(messages)) {
    return messages;
  }
  const hasNestedGroup = messages.some((entry) => Array.isArray(entry));
  return hasNestedGroup ? messages.flat(1) : messages;
};
|
|
529
|
+
/**
 * Produces a tool name restricted to `[a-zA-Z0-9_-]` and at most 64
 * characters. Every other character becomes `_`; an empty result falls back
 * to "tool".
 *
 * @param {string} toolName - Original tool name.
 * @returns {string} Sanitized name.
 */
const sanitizeName = (toolName) => {
  const cleaned = toolName.replace(/[^a-zA-Z0-9_-]/g, "_");
  const truncated = cleaned.slice(0, 64);
  return truncated || "tool";
};
|
|
533
|
+
/**
 * Converts dataset tool descriptions into function-tool definitions with
 * sanitized names and normalized input schemas.
 *
 * @param {Array<object>} tools - Tools with `name`, `description`, `parameters`.
 * @param {(schema: *) => *} fixSchemaFn - Schema normalizer.
 * @returns {{transformedTools: Array<object>, nameMap: Map<string, string>}}
 *   `nameMap` maps each sanitized name back to the original name. If two
 *   tools sanitize to the same name, the later one overwrites the entry.
 */
const buildTransformedTools = (tools, fixSchemaFn) => {
  const nameMap = new Map();
  const toFunctionTool = (toolSpec) => {
    const fixed = fixSchemaFn(toolSpec.parameters);
    const isObjectSchema =
      typeof fixed === "object" && fixed !== null && fixed.type === "object";
    // Anything that is not an object schema is replaced by an empty one.
    const inputSchema = isObjectSchema
      ? fixed
      : { type: "object", properties: {} };
    const sanitized = sanitizeName(toolSpec.name);
    nameMap.set(sanitized, toolSpec.name);
    return {
      type: "function",
      name: sanitized,
      description: toolSpec.description,
      inputSchema
    };
  };
  const transformedTools = tools.map((toolSpec) => toFunctionTool(toolSpec));
  return { transformedTools, nameMap };
};
|
|
550
|
+
/**
 * Parses a JSON string that is expected to hold an array of tool calls.
 *
 * @param {string|undefined|null} raw - JSON text.
 * @returns {Array} The parsed array; an empty array when `raw` is falsy,
 *   is not valid JSON, or does not decode to an array.
 */
const parseDebugToolCalls = (raw) => {
  if (!raw) {
    return [];
  }
  let decoded;
  try {
    decoded = JSON.parse(raw);
  } catch {
    return [];
  }
  return Array.isArray(decoded) ? decoded : [];
};
|
|
561
|
+
/**
 * Resolves a tool name reported by the model. An all-digit string is treated
 * as an index into `transformedTools` and replaced by that tool's name when
 * such an entry exists; every other value is returned unchanged.
 *
 * @param {*} rawName - Name (or numeric index string) from the tool call.
 * @param {Array<object>} transformedTools - Tools in the order sent to the model.
 * @returns {*} Resolved name, or `rawName` itself.
 */
const getSanitizedName = (rawName, transformedTools) => {
  const looksLikeIndex =
    typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName);
  if (!looksLikeIndex) {
    return rawName;
  }
  const indexedTool = transformedTools[Number(rawName)];
  return indexedTool?.name ?? rawName;
};
|
|
567
|
+
/**
 * Decodes tool-call arguments that arrive as a JSON string.
 *
 * @param {*} extractedArgs - Arguments as received from the model.
 * @returns {*} The parsed JSON value when `extractedArgs` is a string holding
 *   valid JSON; otherwise `extractedArgs` unchanged.
 */
const parseToolArgs = (extractedArgs) => {
  if (typeof extractedArgs === "string") {
    try {
      return JSON.parse(extractedArgs);
    } catch {
      // Not JSON: fall through and hand back the raw string.
    }
  }
  return extractedArgs;
};
|
|
577
|
+
/**
 * Maps model tool calls back onto the dataset's original tool names and
 * normalizes where the arguments live.
 *
 * For each call: the name is taken from `toolName` or `name`, resolved via
 * `getSanitizedName`, then translated through `nameMap`. Arguments are taken
 * from the first non-nullish of `args`, `arguments`, `input`, `params`,
 * `parameters` and decoded with `parseToolArgs`.
 *
 * @param {Array<object>|null|undefined} toolCalls - Calls from the model.
 * @param {Map<string, string>} nameMap - Sanitized name -> original name.
 * @param {Array<object>} transformedTools - Tools in the order sent to the model.
 * @returns {Array<object>} Copies of the calls with `toolName`, `name` and
 *   `args` overwritten (`args` defaults to `{}`).
 */
const restoreToolCalls = (toolCalls, nameMap, transformedTools) => {
  const calls = toolCalls || [];
  return calls.map((call) => {
    const reportedName = call.toolName ?? call.name;
    const sanitizedName = getSanitizedName(reportedName, transformedTools);
    const originalName = nameMap.get(sanitizedName) ?? sanitizedName;
    const rawArgs =
      call.args ?? call.arguments ?? call.input ?? call.params ?? call.parameters;
    return {
      ...call,
      toolName: originalName,
      name: originalName,
      args: parseToolArgs(rawArgs) ?? {}
    };
  });
};
|
|
594
|
+
/**
 * Returns a shallow copy of `args` with its keys in sorted order, so that
 * serialized output is stable. Non-object and nullish inputs are returned
 * as-is.
 *
 * @param {*} args - Tool-call arguments.
 * @returns {*} Key-sorted plain object, or the input when it is not an object.
 */
const summarizeArgs = (args) => {
  const isObjectLike = args != null && typeof args === "object";
  if (!isObjectLike) {
    return args;
  }
  const ordered = {};
  for (const key of Object.keys(args).sort()) {
    ordered[key] = args[key];
  }
  return ordered;
};
|
|
609
|
+
/**
 * Builds the three diff lines describing a parameter whose value did not
 * match: a header, the expected value(s), and the received value.
 *
 * @param {string} paramName - Parameter name.
 * @param {*} allowed - Accepted values; a non-array is treated as a single value.
 * @param {*} got - Value the model supplied.
 * @returns {string[]} `["@@ param …", "- expected…", "+ got: …"]`
 */
const generateParamMismatchDiff = (paramName, allowed, got) => {
  const candidates = Array.isArray(allowed) ? allowed : [allowed];
  // Objects and arrays are shown as JSON; primitives via String().
  const render = (value) =>
    typeof value === "object" && value !== null
      ? JSON.stringify(value)
      : String(value);

  let expectedLine;
  if (candidates.length === 1) {
    expectedLine = `- expected: ${JSON.stringify(candidates[0])}`;
  } else {
    const listing = candidates.map((value) => render(value)).join(", ");
    expectedLine = `- expected one of: ${listing}`;
  }
  return [
    `@@ param ${paramName}`,
    expectedLine,
    `+ got: ${JSON.stringify(got)}`
  ];
};
|
|
626
|
+
/**
 * Lenient comparison used only for the debug diff: reports whether `got`
 * matches any accepted value.
 *
 * - When `got` is an array, it matches a candidate whose elements, converted
 *   to strings and sorted, are identical (order-insensitive).
 * - Otherwise (or when the candidate cannot be treated as a list) both sides
 *   are compared as lower-cased strings with all whitespace removed.
 *
 * @param {*} allowed - Accepted values; a non-array never matches.
 * @param {*} got - Value the model supplied.
 * @returns {boolean}
 */
const paramValueMatches = (allowed, got) => {
  if (!Array.isArray(allowed)) {
    return false;
  }
  const sortedKey = (list) =>
    JSON.stringify(list.map((item) => String(item)).sort());
  const looseText = (value) =>
    String(value).toLowerCase().replace(/\s+/g, "");

  return allowed.some((candidate) => {
    if (Array.isArray(got)) {
      try {
        return sortedKey(got) === sortedKey(candidate);
      } catch {
        // Candidate is not list-like; fall back to the text comparison.
      }
    }
    return looseText(candidate) === looseText(got);
  });
};
|
|
640
|
+
/**
 * Appends a "function name" section to `diff` when the expected and received
 * names differ (strict comparison). Does nothing when they are equal.
 *
 * @param {*} expectedName - Name from the tool schema.
 * @param {*} receivedName - Name from the model's call.
 * @param {string[]} diff - Diff lines, appended in place.
 * @returns {void}
 */
const checkFunctionNameMismatch = (expectedName, receivedName, diff) => {
  if (expectedName === receivedName) {
    return;
  }
  diff.push("@@ function name", `- ${expectedName}`, `+ ${receivedName}`);
};
|
|
647
|
+
/**
 * Appends one diff line per required parameter that is absent from the
 * received arguments (tested with the `in` operator).
 *
 * @param {Iterable<string>} required - Required parameter names.
 * @param {object} receivedArgs - Arguments from the model's call.
 * @param {string[]} diff - Diff lines, appended in place.
 * @returns {void}
 */
const checkMissingParams = (required, receivedArgs, diff) => {
  const absent = [...required].filter((name) => !(name in receivedArgs));
  for (const name of absent) {
    diff.push(`- missing required param: ${name}`);
  }
};
|
|
654
|
+
/**
 * Appends one diff line per received argument that the ground truth does not
 * list.
 *
 * Membership is an own-property test. The `in` operator also finds names
 * inherited from `Object.prototype`, so an argument called "toString" or
 * "constructor" would never be reported as unexpected.
 *
 * @param {object} expectedParams - Ground-truth map of parameter name to
 *   accepted values.
 * @param {object} receivedArgs - Arguments from the model's call.
 * @param {string[]} diff - Diff lines, appended in place.
 * @returns {void}
 */
const checkUnexpectedParams = (expectedParams, receivedArgs, diff) => {
  for (const name of Object.keys(receivedArgs)) {
    if (!Object.hasOwn(expectedParams, name)) {
      diff.push(`+ unexpected param: ${name}`);
    }
  }
};
|
|
661
|
+
/**
 * For every received argument that the ground truth lists, appends a
 * mismatch section to `diff` when the value is not among the accepted ones.
 *
 * Membership is an own-property test. With the `in` operator an argument
 * named e.g. "toString" would be looked up on `Object.prototype` and compared
 * against a built-in function instead of being skipped.
 *
 * @param {object} expectedParams - Ground-truth map of parameter name to
 *   accepted values.
 * @param {object} receivedArgs - Arguments from the model's call.
 * @param {string[]} diff - Diff lines, appended in place.
 * @returns {void}
 */
const checkParamValueMismatches = (expectedParams, receivedArgs, diff) => {
  for (const [name, got] of Object.entries(receivedArgs)) {
    if (!Object.hasOwn(expectedParams, name)) {
      continue;
    }
    const allowed = expectedParams[name];
    if (!paramValueMatches(allowed, got)) {
      diff.push(...generateParamMismatchDiff(name, allowed, got));
    }
  }
};
|
|
672
|
+
/**
 * Builds the debug summary for a single-call test case: what was expected,
 * what the model produced, and a line-based diff between the two.
 *
 * Only the first tool, the first ground-truth entry and the first restored
 * call are considered. Parameter-level checks run only when both the
 * expected parameters and an arguments object are available.
 *
 * @param {Array<object>} tools - Tool schemas of the test case.
 * @param {object} possibleAnswer - Ground truth (`ground_truth` array).
 * @param {Array<object>} restoredCalls - Model calls with original names.
 * @returns {{expected: object, actual: object, diff: string[]}}
 */
const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
  const toolSchema = tools[0];
  const firstTruth = possibleAnswer.ground_truth?.[0];
  const expectedFuncName = toolSchema?.name;
  const expectedParams = firstTruth
    ? firstTruth[Object.keys(firstTruth)[0]]
    : void 0;

  const firstCall = restoredCalls[0];
  const receivedName = firstCall?.toolName ?? firstCall?.name;
  const receivedArgs = summarizeArgs(firstCall?.args);

  const diff = [];
  checkFunctionNameMismatch(expectedFuncName, receivedName, diff);

  const canCompareParams =
    Boolean(expectedParams) &&
    Boolean(receivedArgs) &&
    typeof receivedArgs === "object";
  if (canCompareParams) {
    const required = toolSchema?.parameters?.required ?? [];
    checkMissingParams(required, receivedArgs, diff);
    checkUnexpectedParams(expectedParams, receivedArgs, diff);
    checkParamValueMismatches(expectedParams, receivedArgs, diff);
  }

  return {
    expected: { function: expectedFuncName, params: expectedParams },
    actual: { function: receivedName, args: receivedArgs },
    diff
  };
};
|
|
710
|
+
/**
 * Appends a "call count" section to `diff` when the number of expected and
 * actual calls differ.
 *
 * @param {number} expectedCount - Number of ground-truth calls.
 * @param {number} actualCount - Number of calls the model made.
 * @param {string[]} diff - Diff lines, appended in place.
 * @returns {void}
 */
const checkCallCountMismatch = (expectedCount, actualCount, diff) => {
  if (expectedCount === actualCount) {
    return;
  }
  diff.push(
    "@@ call count",
    `- expected ${expectedCount}`,
    `+ got ${actualCount}`
  );
};
|
|
717
|
+
/**
 * Appends diff lines for function names that were expected but never called,
 * followed by names that were called but not expected. Duplicates in either
 * list produce one line each.
 *
 * @param {Array<string>} expectedNames - Names from the ground truth.
 * @param {Array<string>} actualNames - Names the model called.
 * @param {string[]} diff - Diff lines, appended in place.
 * @returns {void}
 */
const addMissingAndExtraFunctions = (expectedNames, actualNames, diff) => {
  const notCalled = expectedNames.filter((name) => !actualNames.includes(name));
  const notExpected = actualNames.filter((name) => !expectedNames.includes(name));
  diff.push(
    ...notCalled.map((name) => `- missing function: ${name}`),
    ...notExpected.map((name) => `+ unexpected function: ${name}`)
  );
};
|
|
727
|
+
/**
 * Finds the first model call with the given function name that has not been
 * claimed yet.
 *
 * @param {string} fname - Function name to look for.
 * @param {Array<object>} restoredCalls - Model calls (`toolName` or `name`).
 * @param {Set<number>} usedActual - Indices already matched; these are skipped.
 * @returns {number} Index of the match, or -1 when there is none.
 */
const findMatchingCallIndex = (fname, restoredCalls, usedActual) =>
  restoredCalls.findIndex((call, index) => {
    if (usedActual.has(index)) {
      return false;
    }
    const callName = call?.toolName ?? call?.name;
    return callName === fname;
  });
|
|
740
|
+
/**
 * Runs the three parameter-level diff checks (missing required, unexpected,
 * value mismatch) for one matched call, appending to `options.diff`.
 *
 * @param {{receivedArgs: object, expectedParamsAllowed: object,
 *   requiredParams: Iterable<string>, diff: string[]}} options
 * @returns {void}
 */
const validateFunctionParams = ({
  receivedArgs,
  expectedParamsAllowed,
  requiredParams,
  diff
}) => {
  checkMissingParams(requiredParams, receivedArgs, diff);
  checkUnexpectedParams(expectedParamsAllowed, receivedArgs, diff);
  checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
};
|
|
746
|
+
/**
 * Diffs one ground-truth call against the first unclaimed model call with
 * the same function name.
 *
 * When no such call exists nothing is appended. Otherwise the call's index
 * is recorded in `usedActual`, a `@@ function <name>` header is appended,
 * and parameter checks run if both sides provide parameter objects.
 *
 * @param {{expectedObj: object, restoredCalls: Array<object>,
 *   tools: Array<object>, usedActual: Set<number>, diff: string[]}} options
 * @returns {void}
 */
const processExpectedCall = (options) => {
  const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
  const fname = Object.keys(expectedObj)[0];
  const matchedIndex = findMatchingCallIndex(fname, restoredCalls, usedActual);
  if (matchedIndex === -1) {
    return;
  }
  usedActual.add(matchedIndex);

  const receivedArgs = summarizeArgs(restoredCalls[matchedIndex]?.args);
  const expectedParamsAllowed = expectedObj[fname];
  const matchingTool = tools.find((toolSpec) => toolSpec.name === fname);
  const requiredParams = matchingTool?.parameters?.required ?? [];

  diff.push(`@@ function ${fname}`);
  const canCompareParams =
    Boolean(expectedParamsAllowed) &&
    Boolean(receivedArgs) &&
    typeof receivedArgs === "object";
  if (canCompareParams) {
    validateFunctionParams({
      receivedArgs,
      expectedParamsAllowed,
      requiredParams,
      diff
    });
  }
};
|
|
773
|
+
/**
 * Builds the debug summary for multi-call test cases: expected vs. actual
 * function names plus a diff covering call count, missing/extra functions
 * and per-call parameter differences.
 *
 * @param {Array<object>} tools - Tool schemas of the test case.
 * @param {object} possibleAnswer - Ground truth (`ground_truth` array,
 *   treated as empty when absent).
 * @param {Array<object>} restoredCalls - Model calls with original names.
 * @returns {{expected: {functions: Array}, actual: {functions: Array},
 *   diff: string[]}}
 */
const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
  const groundTruthCalls = possibleAnswer.ground_truth ?? [];
  const expectedNames = groundTruthCalls.map((entry) => Object.keys(entry)[0]);
  const actualNames = restoredCalls.map((call) => call.toolName ?? call.name);

  const diff = [];
  checkCallCountMismatch(expectedNames.length, actualNames.length, diff);
  addMissingAndExtraFunctions(expectedNames, actualNames, diff);

  // Each model call may satisfy at most one ground-truth entry.
  const usedActual = new Set();
  groundTruthCalls.forEach((expectedObj) => {
    processExpectedCall({
      expectedObj,
      restoredCalls,
      tools,
      usedActual,
      diff
    });
  });

  return {
    expected: { functions: expectedNames },
    actual: { functions: actualNames },
    diff
  };
};
|
|
823
802
|
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
|
|
824
803
|
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
|
|
825
804
|
logs.push(
|
|
826
805
|
`[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
|
|
827
806
|
);
|
|
828
|
-
const
|
|
829
|
-
const caseLogs = [];
|
|
830
|
-
const { function: tools, question: messages } = testCase;
|
|
831
|
-
const temp = config?.temperature;
|
|
832
|
-
const temperature = typeof temp === "number" ? temp : void 0;
|
|
833
|
-
const maxTok = config?.maxTokens;
|
|
834
|
-
const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
|
|
807
|
+
/**
 * Appends a debug line describing the first transformed tool and its schema
 * type. If building that line throws, a fallback line with the error message
 * is appended instead.
 *
 * @param {Array<object>} transformedTools - Tools as sent to the model.
 * @param {string} testCaseId - Identifier used as the log prefix.
 * @param {string[]} caseLogs - Log lines, appended in place.
 * @returns {void}
 */
const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
  let line;
  try {
    const firstTool = transformedTools[0];
    const schema = firstTool?.inputSchema;
    const schemaType = schema?.type ?? schema?.jsonSchema?.type;
    line = `[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`;
  } catch (e) {
    line = `[DEBUG] ${testCaseId}: failed to introspect tools: ${e.message}`;
  }
  caseLogs.push(line);
};
|
|
820
|
+
/**
 * Appends a debug line with the raw tool calls, finish reason and text the
 * model returned. If serialization fails (e.g. circular structure), a short
 * fallback line is appended instead.
 *
 * @param {{toolCalls: *, finishReason: *, text: *, testCaseId: string,
 *   caseLogs: string[]}} options
 * @returns {void}
 */
const logRawToolCalls = (options) => {
  const { toolCalls, finishReason, text, testCaseId, caseLogs } = options;
  let line;
  try {
    const callsJson = JSON.stringify(toolCalls);
    const textJson = JSON.stringify(text);
    line = `[DEBUG] ${testCaseId}: rawToolCalls=${callsJson}, finishReason=${finishReason}, text=${textJson}`;
  } catch {
    line = `[DEBUG] ${testCaseId}: failed to serialize toolCalls`;
  }
  caseLogs.push(line);
};
|
|
832
|
+
/**
 * Assembles the context object logged for a failed test case.
 *
 * - `last_user_query` is the content of the most recent message whose role
 *   is "user" (undefined when there is none).
 * - `raw_model_text` prefers a non-empty `mwOriginalText`, then `text` if it
 *   is a string, else "".
 * - `parsed_tool_calls` prefers `mwParsedToolCalls` when non-empty, else the
 *   restored calls.
 *
 * @param {object} options - See the destructured fields below.
 * @returns {object} Plain object with snake_case keys.
 */
const buildFailureContext = (options) => {
  const {
    testCase,
    tools,
    flatMessages,
    mwOriginalText,
    text,
    finishReason,
    mwParsedToolCalls,
    restoredCalls,
    possibleAnswer
  } = options;

  const newestFirst = [...flatMessages].reverse();
  const lastUserMessage = newestFirst.find((message) => message.role === "user");

  let rawModelText = "";
  if (mwOriginalText && mwOriginalText.length > 0) {
    rawModelText = mwOriginalText;
  } else if (typeof text === "string") {
    rawModelText = text;
  }

  const parsedToolCalls = mwParsedToolCalls.length
    ? mwParsedToolCalls
    : restoredCalls;

  return {
    id: testCase.id,
    tool_schema: tools,
    last_user_query: lastUserMessage?.content ?? undefined,
    raw_model_text: rawModelText,
    finish_reason: finishReason,
    parsed_tool_calls: parsedToolCalls,
    ground_truth: possibleAnswer.ground_truth
  };
};
|
|
870
|
+
/**
 * Appends detailed debug lines for a failed test case: a `[DEBUG-FAIL]` line
 * with expected/actual/diff, then a `[DEBUG-FAIL-CONTEXT]` line.
 *
 * The diff builder is chosen from the test-case id prefix (text before the
 * first "_"): "simple" uses the single-call diff, anything else the
 * multi-call diff. Logging is best-effort: a failure while building the
 * context line is ignored, and any other failure produces one fallback line.
 *
 * @param {object} options - See the destructured fields below.
 * @returns {void}
 */
const logFailureDetails = (options) => {
  const {
    testCase,
    tools,
    possibleAnswer,
    restoredCalls,
    checkerResult,
    flatMessages,
    mwOriginalText,
    text,
    finishReason,
    mwParsedToolCalls,
    caseLogs
  } = options;
  try {
    const [category] = testCase.id.split("_");
    const buildDiff = category === "simple" ? buildSimpleDiff : buildParallelDiff;
    const { expected, actual, diff } = buildDiff(
      tools,
      possibleAnswer,
      restoredCalls
    );
    const failureSummary = {
      id: testCase.id,
      message: checkerResult.error,
      error_type: checkerResult.error_type,
      expected,
      actual,
      diff
    };
    caseLogs.push(`[DEBUG-FAIL] ${JSON.stringify(failureSummary)}`);
    try {
      const contextPayload = buildFailureContext({
        testCase,
        tools,
        flatMessages,
        mwOriginalText,
        text,
        finishReason,
        mwParsedToolCalls,
        restoredCalls,
        possibleAnswer
      });
      caseLogs.push(`[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`);
    } catch {
      // The context line is optional; the diff line above is already logged.
    }
  } catch {
    caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
  }
};
|
|
926
|
+
/**
 * Builds the name-keyed tools object for the model call. Each entry wraps
 * the tool's input schema with `jsonSchema` and passes it to `tool`, both
 * from the imported `ai` module. A non-string description is dropped.
 *
 * @param {Array<{name: string, description: *, inputSchema: object}>} transformedTools
 * @returns {object} Map of tool name to the value returned by `tool(...)`.
 */
const buildToolsMap = (transformedTools) => {
  const entries = transformedTools.map((toolSpec) => {
    const description =
      typeof toolSpec.description === "string" ? toolSpec.description : void 0;
    const inputSchema = (0, import_ai.jsonSchema)(toolSpec.inputSchema);
    return [toolSpec.name, (0, import_ai.tool)({ description, inputSchema })];
  });
  return Object.fromEntries(entries);
};
|
|
937
|
+
/**
 * Calls `generateText` from the imported `ai` module for one test case.
 *
 * A fresh `debugSummaryRef` object is passed under
 * `providerOptions.toolCallMiddleware.debugSummary` and returned alongside
 * the result (presumably so middleware can write debug data into it — it is
 * never written here). `temperature` and `maxOutputTokens` are only sent
 * when defined.
 *
 * @param {{model: *, flatMessages: Array, toolsMap: object,
 *   temperature: (number|undefined), maxTokens: (number|undefined)}} options
 * @returns {Promise<{toolCalls: *, text: *, finishReason: *,
 *   debugSummaryRef: object}>}
 */
const executeModelGeneration = async (options) => {
  const {
    model: modelInstance,
    flatMessages,
    toolsMap,
    temperature,
    maxTokens
  } = options;
  const debugSummaryRef = {};
  const request = {
    model: modelInstance,
    messages: flatMessages,
    tools: toolsMap,
    toolChoice: "auto",
    providerOptions: {
      toolCallMiddleware: {
        debugSummary: debugSummaryRef
      }
    }
  };
  if (temperature !== void 0) {
    request.temperature = temperature;
  }
  if (maxTokens !== void 0) {
    request.maxOutputTokens = maxTokens;
  }
  const { toolCalls, text, finishReason } = await (0, import_ai.generateText)(request);
  return { toolCalls, text, finishReason, debugSummaryRef };
};
|
|
962
|
+
/**
 * Turns a checker result into the per-case outcome and log lines.
 *
 * A valid result appends `[PASS] <id>`. An invalid one appends
 * `[FAIL] <id>: <error>` and then the detailed failure logs.
 *
 * @param {object} options - Checker result plus everything needed for
 *   failure logging; see the destructured fields below.
 * @returns {{valid: boolean, logs: string[]}} `logs` is the same array that
 *   was passed in as `caseLogs`.
 */
const processValidationResult = (options) => {
  const {
    checkerResult,
    testCase,
    tools,
    possibleAnswer,
    restoredCalls,
    flatMessages,
    mwOriginalText,
    text,
    finishReason,
    mwParsedToolCalls,
    caseLogs
  } = options;

  if (!checkerResult.valid) {
    caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
    logFailureDetails({
      testCase,
      tools,
      possibleAnswer,
      restoredCalls,
      checkerResult,
      flatMessages,
      mwOriginalText,
      text,
      finishReason,
      mwParsedToolCalls,
      caseLogs
    });
    return { valid: false, logs: caseLogs };
  }

  caseLogs.push(`[PASS] ${testCase.id}`);
  return { valid: true, logs: caseLogs };
};
|
|
996
|
+
/**
 * Derives everything needed to run one test case against the model:
 * flattened messages, transformed tools, the sanitized-to-original name map
 * and the tools object passed to the model.
 *
 * @param {{function: Array<object>, question: *}} testCase
 * @returns {{flatMessages: *, transformedTools: Array<object>,
 *   nameMap: Map<string, string>, toolsMap: object}}
 */
const prepareTestCaseData = (testCase) => {
  const flatMessages = flattenMessages(testCase.question);
  const { transformedTools, nameMap } = buildTransformedTools(
    testCase.function,
    fixSchema
  );
  return {
    flatMessages,
    transformedTools,
    nameMap,
    toolsMap: buildToolsMap(transformedTools)
  };
};
|
|
1006
|
+
/**
 * Evaluates the model's response for one test case.
 *
 * Steps, in order: read the middleware debug summary, log the raw tool
 * calls, look up the ground truth for the case, restore original tool names,
 * run the checker, and convert the result into the per-case outcome.
 *
 * @param {object} options - See the destructured fields below.
 * @returns {{valid: boolean, logs: string[]}}
 * @throws {Error} When no ground truth exists for the test-case id (thrown
 *   after the raw tool calls have been logged).
 */
const processModelResponse = (options) => {
  const {
    testCase,
    toolCalls,
    text,
    finishReason,
    debugSummaryRef,
    nameMap,
    transformedTools,
    flatMessages,
    tools,
    caseLogs
  } = options;
  const { id: testCaseId } = testCase;

  const mwOriginalText = debugSummaryRef.originalText;
  const mwParsedToolCalls = parseDebugToolCalls(debugSummaryRef.toolCalls);
  logRawToolCalls({ toolCalls, finishReason, text, testCaseId, caseLogs });

  const possibleAnswer = possibleAnswersMap.get(testCase.id);
  if (!possibleAnswer) {
    throw new Error(`No possible answer for id: ${testCase.id}`);
  }

  const restoredCalls = restoreToolCalls(
    toolCalls || [],
    nameMap,
    transformedTools
  );
  return processValidationResult({
    checkerResult: check(testCase, restoredCalls, possibleAnswer),
    testCase,
    tools,
    possibleAnswer,
    restoredCalls,
    flatMessages,
    mwOriginalText,
    text,
    finishReason,
    mwParsedToolCalls,
    caseLogs
  });
};
|
|
1054
|
+
const runSingleCase = async (testCase) => {
|
|
1055
|
+
const caseLogs = [];
|
|
1056
|
+
const { function: tools } = testCase;
|
|
1057
|
+
const temp = config?.temperature;
|
|
1058
|
+
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1059
|
+
const maxTok = config?.maxTokens;
|
|
1060
|
+
const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
|
|
1061
|
+
try {
|
|
1062
|
+
const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
|
|
1063
|
+
logFirstToolDebug(transformedTools, testCase.id, caseLogs);
|
|
1064
|
+
const { toolCalls, text, finishReason, debugSummaryRef } = await executeModelGeneration({
|
|
1065
|
+
model,
|
|
1066
|
+
flatMessages,
|
|
1067
|
+
toolsMap,
|
|
1068
|
+
temperature,
|
|
1069
|
+
maxTokens
|
|
932
1070
|
});
|
|
933
|
-
|
|
1071
|
+
return processModelResponse({
|
|
934
1072
|
testCase,
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
const diffLines = [];
|
|
946
|
-
diffLines.push(`@@ param ${paramName}`);
|
|
947
|
-
const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
|
|
948
|
-
const expectedLine = (() => {
|
|
949
|
-
if (allowedArray.length === 1) {
|
|
950
|
-
return `- expected: ${JSON.stringify(allowedArray[0])}`;
|
|
951
|
-
}
|
|
952
|
-
const formatted = allowedArray.map(
|
|
953
|
-
(v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
|
|
954
|
-
).join(", ");
|
|
955
|
-
return `- expected one of: ${formatted}`;
|
|
956
|
-
})();
|
|
957
|
-
diffLines.push(expectedLine);
|
|
958
|
-
diffLines.push(`+ got: ${JSON.stringify(got)}`);
|
|
959
|
-
return diffLines;
|
|
960
|
-
};
|
|
961
|
-
var generateParamMismatchDiff = generateParamMismatchDiff2;
|
|
962
|
-
const category = testCase.id.split("_")[0];
|
|
963
|
-
const diff = [];
|
|
964
|
-
const summarizeArgs = (args) => {
|
|
965
|
-
if (args == null) return args;
|
|
966
|
-
if (typeof args !== "object") return args;
|
|
967
|
-
return Object.keys(args).sort().reduce(
|
|
968
|
-
(acc, k) => {
|
|
969
|
-
acc[k] = args[k];
|
|
970
|
-
return acc;
|
|
971
|
-
},
|
|
972
|
-
{}
|
|
973
|
-
);
|
|
974
|
-
};
|
|
975
|
-
const expected = {};
|
|
976
|
-
const actual = {};
|
|
977
|
-
if (category === "simple") {
|
|
978
|
-
const funcDesc = tools[0];
|
|
979
|
-
const gt = possibleAnswer.ground_truth?.[0];
|
|
980
|
-
const expectedFuncName = funcDesc?.name;
|
|
981
|
-
const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
|
|
982
|
-
const received = restoredCalls[0];
|
|
983
|
-
const receivedName = received?.toolName ?? received?.name;
|
|
984
|
-
const receivedArgs = summarizeArgs(received?.args);
|
|
985
|
-
expected.function = expectedFuncName;
|
|
986
|
-
expected.params = expectedParams;
|
|
987
|
-
actual.function = receivedName;
|
|
988
|
-
actual.args = receivedArgs;
|
|
989
|
-
if (expectedFuncName !== receivedName) {
|
|
990
|
-
diff.push(`@@ function name`);
|
|
991
|
-
diff.push(`- ${expectedFuncName}`);
|
|
992
|
-
diff.push(`+ ${receivedName}`);
|
|
993
|
-
}
|
|
994
|
-
if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
995
|
-
const required = funcDesc?.parameters?.required ?? [];
|
|
996
|
-
for (const req of required) {
|
|
997
|
-
if (!(req in receivedArgs)) {
|
|
998
|
-
diff.push(`- missing required param: ${req}`);
|
|
999
|
-
}
|
|
1000
|
-
}
|
|
1001
|
-
for (const k of Object.keys(
|
|
1002
|
-
receivedArgs
|
|
1003
|
-
)) {
|
|
1004
|
-
if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1005
|
-
diff.push(`+ unexpected param: ${k}`);
|
|
1006
|
-
}
|
|
1007
|
-
}
|
|
1008
|
-
for (const k of Object.keys(
|
|
1009
|
-
receivedArgs
|
|
1010
|
-
)) {
|
|
1011
|
-
if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
1012
|
-
const allowed = expectedParams[k];
|
|
1013
|
-
const got = receivedArgs[k];
|
|
1014
|
-
const includes = Array.isArray(allowed) && allowed.some((v) => {
|
|
1015
|
-
try {
|
|
1016
|
-
if (Array.isArray(got)) {
|
|
1017
|
-
return JSON.stringify(
|
|
1018
|
-
got.map((x) => String(x)).sort()
|
|
1019
|
-
) === JSON.stringify(
|
|
1020
|
-
v.map((x) => String(x)).sort()
|
|
1021
|
-
);
|
|
1022
|
-
}
|
|
1023
|
-
} catch {
|
|
1024
|
-
}
|
|
1025
|
-
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
1026
|
-
});
|
|
1027
|
-
if (!includes) {
|
|
1028
|
-
diff.push(
|
|
1029
|
-
...generateParamMismatchDiff2(k, allowed, got)
|
|
1030
|
-
);
|
|
1031
|
-
}
|
|
1032
|
-
}
|
|
1033
|
-
}
|
|
1034
|
-
}
|
|
1035
|
-
} else {
|
|
1036
|
-
const gtArr = possibleAnswer.ground_truth ?? [];
|
|
1037
|
-
const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
|
|
1038
|
-
const actualNames = restoredCalls.map(
|
|
1039
|
-
(c) => c.toolName ?? c.name
|
|
1040
|
-
);
|
|
1041
|
-
expected.functions = expectedNames;
|
|
1042
|
-
actual.functions = actualNames;
|
|
1043
|
-
if (expectedNames.length !== actualNames.length) {
|
|
1044
|
-
diff.push(`@@ call count`);
|
|
1045
|
-
diff.push(`- expected ${expectedNames.length}`);
|
|
1046
|
-
diff.push(`+ got ${actualNames.length}`);
|
|
1047
|
-
}
|
|
1048
|
-
const missing = expectedNames.filter(
|
|
1049
|
-
(n) => !actualNames.includes(n)
|
|
1050
|
-
);
|
|
1051
|
-
const extra = actualNames.filter(
|
|
1052
|
-
(n) => !expectedNames.includes(n)
|
|
1053
|
-
);
|
|
1054
|
-
for (const m of missing)
|
|
1055
|
-
diff.push(`- missing function: ${m}`);
|
|
1056
|
-
for (const e of extra)
|
|
1057
|
-
diff.push(`+ unexpected function: ${e}`);
|
|
1058
|
-
const usedActual = /* @__PURE__ */ new Set();
|
|
1059
|
-
for (const expectedObj of gtArr) {
|
|
1060
|
-
const fname = Object.keys(expectedObj)[0];
|
|
1061
|
-
let matchedIndex = -1;
|
|
1062
|
-
for (let i = 0; i < restoredCalls.length; i++) {
|
|
1063
|
-
if (usedActual.has(i)) continue;
|
|
1064
|
-
const rc = restoredCalls[i];
|
|
1065
|
-
const rcName = rc?.toolName ?? rc?.name;
|
|
1066
|
-
if (rcName === fname) {
|
|
1067
|
-
matchedIndex = i;
|
|
1068
|
-
break;
|
|
1069
|
-
}
|
|
1070
|
-
}
|
|
1071
|
-
if (matchedIndex === -1) continue;
|
|
1072
|
-
usedActual.add(matchedIndex);
|
|
1073
|
-
const received = restoredCalls[matchedIndex];
|
|
1074
|
-
const receivedArgs = summarizeArgs(received?.args);
|
|
1075
|
-
const expectedParamsAllowed = expectedObj[fname];
|
|
1076
|
-
const funcDesc = tools.find(
|
|
1077
|
-
(t) => t.name === fname
|
|
1078
|
-
);
|
|
1079
|
-
const requiredParams = funcDesc?.parameters?.required ?? [];
|
|
1080
|
-
diff.push(`@@ function ${fname}`);
|
|
1081
|
-
if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
1082
|
-
for (const req of requiredParams) {
|
|
1083
|
-
if (!(req in receivedArgs)) {
|
|
1084
|
-
diff.push(`- missing required param: ${req}`);
|
|
1085
|
-
}
|
|
1086
|
-
}
|
|
1087
|
-
for (const k of Object.keys(
|
|
1088
|
-
receivedArgs
|
|
1089
|
-
)) {
|
|
1090
|
-
if (!Object.prototype.hasOwnProperty.call(
|
|
1091
|
-
expectedParamsAllowed,
|
|
1092
|
-
k
|
|
1093
|
-
)) {
|
|
1094
|
-
diff.push(`+ unexpected param: ${k}`);
|
|
1095
|
-
}
|
|
1096
|
-
}
|
|
1097
|
-
for (const k of Object.keys(
|
|
1098
|
-
receivedArgs
|
|
1099
|
-
)) {
|
|
1100
|
-
if (Object.prototype.hasOwnProperty.call(
|
|
1101
|
-
expectedParamsAllowed,
|
|
1102
|
-
k
|
|
1103
|
-
)) {
|
|
1104
|
-
const allowed = expectedParamsAllowed[k];
|
|
1105
|
-
const got = receivedArgs[k];
|
|
1106
|
-
const includes = Array.isArray(allowed) && allowed.some((v) => {
|
|
1107
|
-
try {
|
|
1108
|
-
if (Array.isArray(got)) {
|
|
1109
|
-
return JSON.stringify(
|
|
1110
|
-
got.map((x) => String(x)).sort()
|
|
1111
|
-
) === JSON.stringify(
|
|
1112
|
-
v.map((x) => String(x)).sort()
|
|
1113
|
-
);
|
|
1114
|
-
}
|
|
1115
|
-
} catch {
|
|
1116
|
-
}
|
|
1117
|
-
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
1118
|
-
});
|
|
1119
|
-
if (!includes) {
|
|
1120
|
-
diff.push(
|
|
1121
|
-
...generateParamMismatchDiff2(k, allowed, got)
|
|
1122
|
-
);
|
|
1123
|
-
}
|
|
1124
|
-
}
|
|
1125
|
-
}
|
|
1126
|
-
}
|
|
1127
|
-
}
|
|
1128
|
-
}
|
|
1129
|
-
caseLogs.push(
|
|
1130
|
-
`[DEBUG-FAIL] ${JSON.stringify({
|
|
1131
|
-
id: testCase.id,
|
|
1132
|
-
message: checkerResult.error,
|
|
1133
|
-
error_type: checkerResult.error_type,
|
|
1134
|
-
expected,
|
|
1135
|
-
actual,
|
|
1136
|
-
diff
|
|
1137
|
-
})}`
|
|
1138
|
-
);
|
|
1139
|
-
try {
|
|
1140
|
-
const lastUser = (() => {
|
|
1141
|
-
const reversed = [...flatMessages].reverse();
|
|
1142
|
-
const found = reversed.find(
|
|
1143
|
-
(m) => m.role === "user"
|
|
1144
|
-
);
|
|
1145
|
-
return found?.content ?? void 0;
|
|
1146
|
-
})();
|
|
1147
|
-
const contextPayload = {
|
|
1148
|
-
id: testCase.id,
|
|
1149
|
-
tool_schema: tools,
|
|
1150
|
-
last_user_query: lastUser,
|
|
1151
|
-
raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
|
|
1152
|
-
finish_reason: finishReason,
|
|
1153
|
-
parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
|
|
1154
|
-
ground_truth: possibleAnswer.ground_truth
|
|
1155
|
-
};
|
|
1156
|
-
caseLogs.push(
|
|
1157
|
-
`[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
|
|
1158
|
-
);
|
|
1159
|
-
} catch {
|
|
1160
|
-
}
|
|
1161
|
-
} catch {
|
|
1162
|
-
caseLogs.push(
|
|
1163
|
-
`[DEBUG] ${testCase.id}: failed to build debug diff`
|
|
1164
|
-
);
|
|
1165
|
-
}
|
|
1166
|
-
return { valid: false, logs: caseLogs };
|
|
1167
|
-
}
|
|
1073
|
+
toolCalls,
|
|
1074
|
+
text,
|
|
1075
|
+
finishReason,
|
|
1076
|
+
debugSummaryRef,
|
|
1077
|
+
nameMap,
|
|
1078
|
+
transformedTools,
|
|
1079
|
+
flatMessages,
|
|
1080
|
+
tools,
|
|
1081
|
+
caseLogs
|
|
1082
|
+
});
|
|
1168
1083
|
} catch (e) {
|
|
1169
1084
|
caseLogs.push(
|
|
1170
1085
|
`[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
|
|
@@ -1175,13 +1090,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1175
1090
|
return { valid: false, logs: caseLogs };
|
|
1176
1091
|
}
|
|
1177
1092
|
};
|
|
1178
|
-
const mapWithConcurrency = async (items,
|
|
1093
|
+
const mapWithConcurrency = async (items, concurrencyLimit, mapper) => {
|
|
1179
1094
|
const results = new Array(items.length);
|
|
1180
1095
|
let idx = 0;
|
|
1181
|
-
const workers = new Array(Math.min(
|
|
1096
|
+
const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
|
|
1182
1097
|
while (true) {
|
|
1183
|
-
const current = idx
|
|
1184
|
-
|
|
1098
|
+
const current = idx;
|
|
1099
|
+
idx += 1;
|
|
1100
|
+
if (current >= items.length) {
|
|
1101
|
+
break;
|
|
1102
|
+
}
|
|
1185
1103
|
results[current] = await mapper(items[current], current);
|
|
1186
1104
|
}
|
|
1187
1105
|
});
|
|
@@ -1197,7 +1115,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1197
1115
|
(acc, r) => acc + (r.valid ? 1 : 0),
|
|
1198
1116
|
0
|
|
1199
1117
|
);
|
|
1200
|
-
for (const r of resultsPerCase)
|
|
1118
|
+
for (const r of resultsPerCase) {
|
|
1119
|
+
logs.push(...r.logs);
|
|
1120
|
+
}
|
|
1201
1121
|
if (testCases.length === 0) {
|
|
1202
1122
|
return {
|
|
1203
1123
|
score: 0,
|
|
@@ -1224,7 +1144,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1224
1144
|
success: false,
|
|
1225
1145
|
metrics: {},
|
|
1226
1146
|
error: e,
|
|
1227
|
-
logs: [
|
|
1147
|
+
logs: [
|
|
1148
|
+
`[FATAL] Failed to run benchmark ${name}: ${e.message}`
|
|
1149
|
+
]
|
|
1228
1150
|
};
|
|
1229
1151
|
}
|
|
1230
1152
|
}
|
|
@@ -1233,87 +1155,222 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1233
1155
|
var bfclSimpleBenchmark = createBfclBenchmark(
|
|
1234
1156
|
"bfcl-simple",
|
|
1235
1157
|
"BFCL Simple Function Calling",
|
|
1236
|
-
"BFCL_v3_simple.
|
|
1237
|
-
"BFCL_v3_simple_possible_answer.
|
|
1158
|
+
"BFCL_v3_simple.jsonl",
|
|
1159
|
+
"BFCL_v3_simple_possible_answer.jsonl"
|
|
1238
1160
|
);
|
|
1239
1161
|
var bfclParallelBenchmark = createBfclBenchmark(
|
|
1240
1162
|
"bfcl-parallel",
|
|
1241
1163
|
"BFCL Parallel Function Calling",
|
|
1242
|
-
"BFCL_v3_parallel.
|
|
1243
|
-
"BFCL_v3_parallel_possible_answer.
|
|
1164
|
+
"BFCL_v3_parallel.jsonl",
|
|
1165
|
+
"BFCL_v3_parallel_possible_answer.jsonl"
|
|
1244
1166
|
);
|
|
1245
1167
|
var bfclMultipleBenchmark = createBfclBenchmark(
|
|
1246
1168
|
"bfcl-multiple",
|
|
1247
1169
|
"BFCL Multiple Function Calling",
|
|
1248
|
-
"BFCL_v3_multiple.
|
|
1249
|
-
"BFCL_v3_multiple_possible_answer.
|
|
1170
|
+
"BFCL_v3_multiple.jsonl",
|
|
1171
|
+
"BFCL_v3_multiple_possible_answer.jsonl"
|
|
1250
1172
|
);
|
|
1251
1173
|
var bfclParallelMultipleBenchmark = createBfclBenchmark(
|
|
1252
1174
|
"bfcl-parallel-multiple",
|
|
1253
1175
|
"BFCL Parallel & Multiple Function Calling",
|
|
1254
|
-
"BFCL_v3_parallel_multiple.
|
|
1255
|
-
"BFCL_v3_parallel_multiple_possible_answer.
|
|
1176
|
+
"BFCL_v3_parallel_multiple.jsonl",
|
|
1177
|
+
"BFCL_v3_parallel_multiple_possible_answer.jsonl"
|
|
1256
1178
|
);
|
|
1257
1179
|
|
|
1258
1180
|
// src/benchmarks/json-generation.ts
|
|
1181
|
+
var import_node_fs3 = require("fs");
|
|
1182
|
+
var import_node_path3 = __toESM(require("path"), 1);
|
|
1259
1183
|
var import_ai2 = require("ai");
|
|
1260
1184
|
var import_ajv = __toESM(require("ajv"), 1);
|
|
1261
|
-
var
|
|
1262
|
-
var
|
|
1263
|
-
|
|
1185
|
+
var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
|
|
1186
|
+
var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
|
|
1187
|
+
var NEWLINE_REGEX = /\r?\n/;
|
|
1188
|
+
var LINE_SPLIT_REGEX2 = /\r?\n/;
|
|
1189
|
+
function tryDirectParse(text) {
|
|
1264
1190
|
try {
|
|
1265
1191
|
return JSON.parse(text);
|
|
1266
1192
|
} catch {
|
|
1193
|
+
return;
|
|
1267
1194
|
}
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1195
|
+
}
|
|
1196
|
+
function tryCodeFenceParse(text) {
|
|
1197
|
+
const fenceMatch = text.match(JSON_FENCE_REGEX) || text.match(CODE_FENCE_REGEX);
|
|
1198
|
+
if (!fenceMatch) {
|
|
1199
|
+
return;
|
|
1200
|
+
}
|
|
1201
|
+
const inner = fenceMatch[1].trim();
|
|
1202
|
+
try {
|
|
1203
|
+
return JSON.parse(inner);
|
|
1204
|
+
} catch {
|
|
1205
|
+
return;
|
|
1275
1206
|
}
|
|
1207
|
+
}
|
|
1208
|
+
function tryBracketScan(text) {
|
|
1276
1209
|
const startIdxObj = text.indexOf("{");
|
|
1277
1210
|
const startIdxArr = text.indexOf("[");
|
|
1278
1211
|
const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
|
|
1279
|
-
if (start === void 0)
|
|
1212
|
+
if (start === void 0) {
|
|
1213
|
+
return;
|
|
1214
|
+
}
|
|
1280
1215
|
const open = text[start] === "{" ? "{" : "[";
|
|
1281
1216
|
const close = open === "{" ? "}" : "]";
|
|
1282
1217
|
let depth = 0;
|
|
1283
|
-
for (let i = start; i < text.length; i
|
|
1218
|
+
for (let i = start; i < text.length; i += 1) {
|
|
1284
1219
|
const ch = text[i];
|
|
1285
|
-
if (ch === open)
|
|
1286
|
-
|
|
1220
|
+
if (ch === open) {
|
|
1221
|
+
depth += 1;
|
|
1222
|
+
} else if (ch === close) {
|
|
1223
|
+
depth -= 1;
|
|
1224
|
+
}
|
|
1287
1225
|
if (depth === 0) {
|
|
1288
1226
|
const candidate = text.slice(start, i + 1);
|
|
1289
1227
|
try {
|
|
1290
1228
|
return JSON.parse(candidate);
|
|
1291
1229
|
} catch {
|
|
1230
|
+
return;
|
|
1292
1231
|
}
|
|
1293
|
-
break;
|
|
1294
1232
|
}
|
|
1295
1233
|
}
|
|
1296
|
-
return
|
|
1234
|
+
return;
|
|
1235
|
+
}
|
|
1236
|
+
function extractFirstJsonBlock(text) {
|
|
1237
|
+
const directResult = tryDirectParse(text);
|
|
1238
|
+
if (directResult !== void 0) {
|
|
1239
|
+
return directResult;
|
|
1240
|
+
}
|
|
1241
|
+
const fenceResult = tryCodeFenceParse(text);
|
|
1242
|
+
if (fenceResult !== void 0) {
|
|
1243
|
+
return fenceResult;
|
|
1244
|
+
}
|
|
1245
|
+
return tryBracketScan(text);
|
|
1297
1246
|
}
|
|
1298
1247
|
function subsetMatch(expected, actual) {
|
|
1299
1248
|
if (expected === null || typeof expected !== "object") {
|
|
1300
1249
|
return expected === actual;
|
|
1301
1250
|
}
|
|
1302
1251
|
if (Array.isArray(expected)) {
|
|
1303
|
-
if (!Array.isArray(actual))
|
|
1304
|
-
|
|
1305
|
-
|
|
1252
|
+
if (!Array.isArray(actual)) {
|
|
1253
|
+
return false;
|
|
1254
|
+
}
|
|
1255
|
+
for (let i = 0; i < expected.length; i += 1) {
|
|
1256
|
+
if (!subsetMatch(expected[i], actual[i])) {
|
|
1257
|
+
return false;
|
|
1258
|
+
}
|
|
1306
1259
|
}
|
|
1307
1260
|
return true;
|
|
1308
1261
|
}
|
|
1309
|
-
if (actual === null || typeof actual !== "object")
|
|
1262
|
+
if (actual === null || typeof actual !== "object") {
|
|
1263
|
+
return false;
|
|
1264
|
+
}
|
|
1310
1265
|
const eObj = expected;
|
|
1311
1266
|
const aObj = actual;
|
|
1312
1267
|
for (const key of Object.keys(eObj)) {
|
|
1313
|
-
if (!subsetMatch(eObj[key], aObj[key]))
|
|
1268
|
+
if (!subsetMatch(eObj[key], aObj[key])) {
|
|
1269
|
+
return false;
|
|
1270
|
+
}
|
|
1314
1271
|
}
|
|
1315
1272
|
return true;
|
|
1316
1273
|
}
|
|
1274
|
+
async function loadDatasets() {
|
|
1275
|
+
try {
|
|
1276
|
+
const dataDir = resolveDataDir();
|
|
1277
|
+
const testsJsonl = await import_node_fs3.promises.readFile(
|
|
1278
|
+
import_node_path3.default.join(dataDir, "json_generation_tests.jsonl"),
|
|
1279
|
+
"utf-8"
|
|
1280
|
+
);
|
|
1281
|
+
const expectedJsonl = await import_node_fs3.promises.readFile(
|
|
1282
|
+
import_node_path3.default.join(dataDir, "json_generation_expected.jsonl"),
|
|
1283
|
+
"utf-8"
|
|
1284
|
+
);
|
|
1285
|
+
const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1286
|
+
const expecteds = expectedJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1287
|
+
const expectedMap = /* @__PURE__ */ new Map();
|
|
1288
|
+
for (const r of expecteds) {
|
|
1289
|
+
expectedMap.set(r.id, r);
|
|
1290
|
+
}
|
|
1291
|
+
return { tests, expectedMap };
|
|
1292
|
+
} catch (e) {
|
|
1293
|
+
return {
|
|
1294
|
+
tests: [],
|
|
1295
|
+
expectedMap: /* @__PURE__ */ new Map(),
|
|
1296
|
+
error: e
|
|
1297
|
+
};
|
|
1298
|
+
}
|
|
1299
|
+
}
|
|
1300
|
+
function buildMessages(tc) {
|
|
1301
|
+
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
1302
|
+
return [
|
|
1303
|
+
{
|
|
1304
|
+
role: "system",
|
|
1305
|
+
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
1306
|
+
},
|
|
1307
|
+
{
|
|
1308
|
+
role: "user",
|
|
1309
|
+
content: [
|
|
1310
|
+
"Generate a JSON object that reflects the following facts.",
|
|
1311
|
+
"JSON Schema:",
|
|
1312
|
+
schemaStr,
|
|
1313
|
+
"Facts:",
|
|
1314
|
+
tc.promptFacts,
|
|
1315
|
+
"Output must be a single JSON only, with no additional text."
|
|
1316
|
+
].join("\n\n")
|
|
1317
|
+
}
|
|
1318
|
+
];
|
|
1319
|
+
}
|
|
1320
|
+
function validateTestCase(tc, parsed, context) {
|
|
1321
|
+
const validate = context.ajv.compile(tc.schema);
|
|
1322
|
+
const valid = validate(parsed);
|
|
1323
|
+
if (!valid) {
|
|
1324
|
+
context.logs.push(
|
|
1325
|
+
`[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
1326
|
+
);
|
|
1327
|
+
}
|
|
1328
|
+
const expectedRec = context.expectedMap.get(tc.id);
|
|
1329
|
+
if (!expectedRec) {
|
|
1330
|
+
context.logs.push(
|
|
1331
|
+
`[WARN] ${tc.id}: No expected record found. Skipping value match.`
|
|
1332
|
+
);
|
|
1333
|
+
}
|
|
1334
|
+
const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
|
|
1335
|
+
return { valid, valuesOk, parsed };
|
|
1336
|
+
}
|
|
1337
|
+
async function processTestCase(tc, context) {
|
|
1338
|
+
const messages = buildMessages(tc);
|
|
1339
|
+
const temp = context.config?.temperature;
|
|
1340
|
+
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1341
|
+
const { text } = await (0, import_ai2.generateText)({
|
|
1342
|
+
model: context.model,
|
|
1343
|
+
messages,
|
|
1344
|
+
...temperature !== void 0 ? { temperature } : {}
|
|
1345
|
+
});
|
|
1346
|
+
let parsed;
|
|
1347
|
+
try {
|
|
1348
|
+
parsed = extractFirstJsonBlock(text);
|
|
1349
|
+
} catch {
|
|
1350
|
+
}
|
|
1351
|
+
if (parsed === void 0) {
|
|
1352
|
+
context.validation.logs.push(
|
|
1353
|
+
`[FAIL] ${tc.id}: Unable to parse JSON from model output.`
|
|
1354
|
+
);
|
|
1355
|
+
return { schemaValid: false, valueMatch: false, correct: false };
|
|
1356
|
+
}
|
|
1357
|
+
const {
|
|
1358
|
+
valid,
|
|
1359
|
+
valuesOk,
|
|
1360
|
+
parsed: validatedParsed
|
|
1361
|
+
} = validateTestCase(tc, parsed, context.validation);
|
|
1362
|
+
const correct = valid && valuesOk;
|
|
1363
|
+
if (correct) {
|
|
1364
|
+
context.validation.logs.push(`[PASS] ${tc.id}`);
|
|
1365
|
+
} else {
|
|
1366
|
+
context.validation.logs.push(
|
|
1367
|
+
`[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
|
|
1368
|
+
validatedParsed
|
|
1369
|
+
)}`
|
|
1370
|
+
);
|
|
1371
|
+
}
|
|
1372
|
+
return { schemaValid: valid, valueMatch: valuesOk, correct };
|
|
1373
|
+
}
|
|
1317
1374
|
var jsonGenerationBenchmark = {
|
|
1318
1375
|
name: "json-generation",
|
|
1319
1376
|
version: "2.1.0",
|
|
@@ -1321,116 +1378,124 @@ var jsonGenerationBenchmark = {
|
|
|
1321
1378
|
async run(model, config) {
|
|
1322
1379
|
const logs = [];
|
|
1323
1380
|
const ajv = new import_ajv.default({ allErrors: true, strict: false });
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
let correctCount = 0;
|
|
1327
|
-
let tests = [];
|
|
1328
|
-
const expectedMap = /* @__PURE__ */ new Map();
|
|
1329
|
-
try {
|
|
1330
|
-
const dataDir = resolveDataDir();
|
|
1331
|
-
const testsJsonl = await import_fs3.promises.readFile(
|
|
1332
|
-
import_path3.default.join(dataDir, "json_generation_tests.jsonl"),
|
|
1333
|
-
"utf-8"
|
|
1334
|
-
);
|
|
1335
|
-
const expectedJsonl = await import_fs3.promises.readFile(
|
|
1336
|
-
import_path3.default.join(dataDir, "json_generation_expected.jsonl"),
|
|
1337
|
-
"utf-8"
|
|
1338
|
-
);
|
|
1339
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1340
|
-
const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1341
|
-
for (const r of expecteds) expectedMap.set(r.id, r);
|
|
1342
|
-
} catch (e) {
|
|
1343
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
1381
|
+
const { tests, expectedMap, error } = await loadDatasets();
|
|
1382
|
+
if (error) {
|
|
1344
1383
|
return {
|
|
1345
1384
|
score: 0,
|
|
1346
1385
|
success: false,
|
|
1347
1386
|
metrics: {},
|
|
1348
|
-
logs: [
|
|
1349
|
-
|
|
1387
|
+
logs: [
|
|
1388
|
+
`[FATAL] Failed to load json-generation datasets: ${error.message}`
|
|
1389
|
+
],
|
|
1390
|
+
error
|
|
1350
1391
|
};
|
|
1351
1392
|
}
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
{
|
|
1357
|
-
role: "system",
|
|
1358
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
1359
|
-
},
|
|
1360
|
-
{
|
|
1361
|
-
role: "user",
|
|
1362
|
-
content: [
|
|
1363
|
-
"Generate a JSON object that reflects the following facts.",
|
|
1364
|
-
"JSON Schema:",
|
|
1365
|
-
schemaStr,
|
|
1366
|
-
"Facts:",
|
|
1367
|
-
tc.promptFacts,
|
|
1368
|
-
"Output must be a single JSON only, with no additional text."
|
|
1369
|
-
].join("\n\n")
|
|
1370
|
-
}
|
|
1371
|
-
];
|
|
1372
|
-
const temp = config?.temperature;
|
|
1373
|
-
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1374
|
-
const { text } = await (0, import_ai2.generateText)({
|
|
1375
|
-
model,
|
|
1376
|
-
messages,
|
|
1377
|
-
...temperature !== void 0 ? { temperature } : {}
|
|
1378
|
-
});
|
|
1379
|
-
let parsed;
|
|
1380
|
-
try {
|
|
1381
|
-
parsed = extractFirstJsonBlock(text);
|
|
1382
|
-
} catch {
|
|
1383
|
-
}
|
|
1384
|
-
if (parsed === void 0) {
|
|
1385
|
-
logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
|
|
1386
|
-
continue;
|
|
1387
|
-
}
|
|
1388
|
-
const validate = ajv.compile(tc.schema);
|
|
1389
|
-
const valid = validate(parsed);
|
|
1390
|
-
if (valid) schemaValidCount++;
|
|
1391
|
-
else
|
|
1392
|
-
logs.push(
|
|
1393
|
-
`[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
1394
|
-
);
|
|
1395
|
-
const expectedRec = expectedMap.get(tc.id);
|
|
1396
|
-
if (!expectedRec) {
|
|
1397
|
-
logs.push(
|
|
1398
|
-
`[WARN] ${tc.id}: No expected record found. Skipping value match.`
|
|
1399
|
-
);
|
|
1400
|
-
}
|
|
1401
|
-
const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
|
|
1402
|
-
if (valuesOk) valueMatchCount++;
|
|
1403
|
-
if (valid && valuesOk) {
|
|
1404
|
-
correctCount++;
|
|
1405
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
1406
|
-
} else {
|
|
1407
|
-
logs.push(
|
|
1408
|
-
`[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
|
|
1409
|
-
parsed
|
|
1410
|
-
)}`
|
|
1411
|
-
);
|
|
1412
|
-
}
|
|
1413
|
-
} catch (e) {
|
|
1414
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
1415
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
1416
|
-
}
|
|
1417
|
-
}
|
|
1418
|
-
const total = tests.length;
|
|
1419
|
-
const score = correctCount / total;
|
|
1420
|
-
return {
|
|
1421
|
-
score,
|
|
1422
|
-
success: score >= 0.8,
|
|
1423
|
-
metrics: {
|
|
1424
|
-
total_cases: total,
|
|
1425
|
-
correct_count: correctCount,
|
|
1426
|
-
schema_valid_count: schemaValidCount,
|
|
1427
|
-
value_match_count: valueMatchCount,
|
|
1428
|
-
accuracy: score
|
|
1429
|
-
},
|
|
1430
|
-
logs
|
|
1393
|
+
const context = {
|
|
1394
|
+
model,
|
|
1395
|
+
config,
|
|
1396
|
+
validation: { expectedMap, ajv, logs }
|
|
1431
1397
|
};
|
|
1398
|
+
const counts = await processAllTests(tests, context);
|
|
1399
|
+
return buildBenchmarkResult(tests.length, counts, logs);
|
|
1432
1400
|
}
|
|
1433
1401
|
};
|
|
1402
|
+
async function processAllTests(tests, context) {
|
|
1403
|
+
let schemaValidCount = 0;
|
|
1404
|
+
let valueMatchCount = 0;
|
|
1405
|
+
let correctCount = 0;
|
|
1406
|
+
for (const tc of tests) {
|
|
1407
|
+
try {
|
|
1408
|
+
const result = await processTestCase(tc, context);
|
|
1409
|
+
if (result.schemaValid) {
|
|
1410
|
+
schemaValidCount += 1;
|
|
1411
|
+
}
|
|
1412
|
+
if (result.valueMatch) {
|
|
1413
|
+
valueMatchCount += 1;
|
|
1414
|
+
}
|
|
1415
|
+
if (result.correct) {
|
|
1416
|
+
correctCount += 1;
|
|
1417
|
+
}
|
|
1418
|
+
} catch (e) {
|
|
1419
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1420
|
+
context.validation.logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
1421
|
+
}
|
|
1422
|
+
}
|
|
1423
|
+
return { schemaValidCount, valueMatchCount, correctCount };
|
|
1424
|
+
}
|
|
1425
|
+
function buildBenchmarkResult(total, counts, logs) {
|
|
1426
|
+
const score = counts.correctCount / total;
|
|
1427
|
+
return {
|
|
1428
|
+
score,
|
|
1429
|
+
success: score >= 0.8,
|
|
1430
|
+
metrics: {
|
|
1431
|
+
total_cases: total,
|
|
1432
|
+
correct_count: counts.correctCount,
|
|
1433
|
+
schema_valid_count: counts.schemaValidCount,
|
|
1434
|
+
value_match_count: counts.valueMatchCount,
|
|
1435
|
+
accuracy: score
|
|
1436
|
+
},
|
|
1437
|
+
logs
|
|
1438
|
+
};
|
|
1439
|
+
}
|
|
1440
|
+
async function loadSchemaOnlyTests() {
|
|
1441
|
+
try {
|
|
1442
|
+
const dataDir = resolveDataDir();
|
|
1443
|
+
const testsJsonl = await import_node_fs3.promises.readFile(
|
|
1444
|
+
import_node_path3.default.join(dataDir, "json_generation_tests.jsonl"),
|
|
1445
|
+
"utf-8"
|
|
1446
|
+
);
|
|
1447
|
+
const tests = testsJsonl.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1448
|
+
return { tests };
|
|
1449
|
+
} catch (e) {
|
|
1450
|
+
return { tests: [], error: e };
|
|
1451
|
+
}
|
|
1452
|
+
}
|
|
1453
|
+
async function processSchemaOnlyTestCase(tc, context) {
|
|
1454
|
+
const messages = buildMessages(tc);
|
|
1455
|
+
const temp = context.config?.temperature;
|
|
1456
|
+
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1457
|
+
const { text } = await (0, import_ai2.generateText)({
|
|
1458
|
+
model: context.model,
|
|
1459
|
+
messages,
|
|
1460
|
+
...temperature !== void 0 ? { temperature } : {}
|
|
1461
|
+
});
|
|
1462
|
+
let parsed;
|
|
1463
|
+
try {
|
|
1464
|
+
parsed = extractFirstJsonBlock(text);
|
|
1465
|
+
} catch {
|
|
1466
|
+
}
|
|
1467
|
+
if (parsed === void 0) {
|
|
1468
|
+
context.logs.push(
|
|
1469
|
+
`[FAIL] ${tc.id}: Could not parse JSON from model output.`
|
|
1470
|
+
);
|
|
1471
|
+
return false;
|
|
1472
|
+
}
|
|
1473
|
+
const validate = context.ajv.compile(tc.schema);
|
|
1474
|
+
const valid = validate(parsed);
|
|
1475
|
+
if (valid) {
|
|
1476
|
+
context.logs.push(`[PASS] ${tc.id}`);
|
|
1477
|
+
return true;
|
|
1478
|
+
}
|
|
1479
|
+
context.logs.push(
|
|
1480
|
+
`[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
1481
|
+
);
|
|
1482
|
+
return false;
|
|
1483
|
+
}
|
|
1484
|
+
async function runSchemaOnlyTests(tests, context) {
|
|
1485
|
+
let schemaValidCount = 0;
|
|
1486
|
+
for (const tc of tests) {
|
|
1487
|
+
try {
|
|
1488
|
+
const isValid = await processSchemaOnlyTestCase(tc, context);
|
|
1489
|
+
if (isValid) {
|
|
1490
|
+
schemaValidCount += 1;
|
|
1491
|
+
}
|
|
1492
|
+
} catch (e) {
|
|
1493
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1494
|
+
context.logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
1495
|
+
}
|
|
1496
|
+
}
|
|
1497
|
+
return schemaValidCount;
|
|
1498
|
+
}
|
|
1434
1499
|
var jsonGenerationSchemaOnlyBenchmark = {
|
|
1435
1500
|
name: "json-generation-schema-only",
|
|
1436
1501
|
version: "1.0.1",
|
|
@@ -1438,76 +1503,19 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
1438
1503
|
async run(model, config) {
|
|
1439
1504
|
const logs = [];
|
|
1440
1505
|
const ajv = new import_ajv.default({ allErrors: true, strict: false });
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
const
|
|
1444
|
-
const testsJsonl = await import_fs3.promises.readFile(
|
|
1445
|
-
import_path3.default.join(dataDir, "json_generation_tests.jsonl"),
|
|
1446
|
-
"utf-8"
|
|
1447
|
-
);
|
|
1448
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1449
|
-
} catch (e) {
|
|
1450
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
1506
|
+
const { tests, error } = await loadSchemaOnlyTests();
|
|
1507
|
+
if (error) {
|
|
1508
|
+
const msg = error.message;
|
|
1451
1509
|
return {
|
|
1452
1510
|
score: 0,
|
|
1453
1511
|
success: false,
|
|
1454
1512
|
metrics: {},
|
|
1455
1513
|
logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
|
|
1456
|
-
error
|
|
1514
|
+
error
|
|
1457
1515
|
};
|
|
1458
1516
|
}
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
try {
|
|
1462
|
-
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
1463
|
-
const messages = [
|
|
1464
|
-
{
|
|
1465
|
-
role: "system",
|
|
1466
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
1467
|
-
},
|
|
1468
|
-
{
|
|
1469
|
-
role: "user",
|
|
1470
|
-
content: [
|
|
1471
|
-
"Generate a JSON object that reflects the following facts.",
|
|
1472
|
-
"JSON Schema:",
|
|
1473
|
-
schemaStr,
|
|
1474
|
-
"Facts:",
|
|
1475
|
-
tc.promptFacts,
|
|
1476
|
-
"Output must be a single JSON only, with no additional text."
|
|
1477
|
-
].join("\n\n")
|
|
1478
|
-
}
|
|
1479
|
-
];
|
|
1480
|
-
const temp = config?.temperature;
|
|
1481
|
-
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1482
|
-
const { text } = await (0, import_ai2.generateText)({
|
|
1483
|
-
model,
|
|
1484
|
-
messages,
|
|
1485
|
-
...temperature !== void 0 ? { temperature } : {}
|
|
1486
|
-
});
|
|
1487
|
-
let parsed;
|
|
1488
|
-
try {
|
|
1489
|
-
parsed = extractFirstJsonBlock(text);
|
|
1490
|
-
} catch {
|
|
1491
|
-
}
|
|
1492
|
-
if (parsed === void 0) {
|
|
1493
|
-
logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
|
|
1494
|
-
continue;
|
|
1495
|
-
}
|
|
1496
|
-
const validate = ajv.compile(tc.schema);
|
|
1497
|
-
const valid = validate(parsed);
|
|
1498
|
-
if (valid) {
|
|
1499
|
-
schemaValidCount++;
|
|
1500
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
1501
|
-
} else {
|
|
1502
|
-
logs.push(
|
|
1503
|
-
`[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
1504
|
-
);
|
|
1505
|
-
}
|
|
1506
|
-
} catch (e) {
|
|
1507
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
1508
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
1509
|
-
}
|
|
1510
|
-
}
|
|
1517
|
+
const context = { model, config, ajv, logs };
|
|
1518
|
+
const schemaValidCount = await runSchemaOnlyTests(tests, context);
|
|
1511
1519
|
const total = tests.length;
|
|
1512
1520
|
const score = total > 0 ? schemaValidCount / total : 0;
|
|
1513
1521
|
return {
|
|
@@ -1522,6 +1530,505 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
1522
1530
|
};
|
|
1523
1531
|
}
|
|
1524
1532
|
};
|
|
1533
|
+
|
|
1534
|
+
// src/reporters/console.ts
|
|
1535
|
+
// ANSI SGR escape sequences used by the plain console reporter.
// Each entry is the string "\x1B[<code>m" for the listed code.
var colors = (() => {
  const sgr = (code) => `\x1B[${code}m`;
  return {
    reset: sgr(0),
    green: sgr(32),
    red: sgr(31),
    yellow: sgr(33),
    cyan: sgr(36),
    magenta: sgr(35),
    gray: sgr(90)
  };
})();
|
|
1544
|
+
/**
 * Prints a single evaluation result to the console: a header line with the
 * model / optional model key / benchmark name, a status-and-score line, the
 * metrics (if any) and the error message (if any).
 *
 * @param {object} result - Evaluation entry with `model`, `modelKey`,
 *   `benchmark` and a nested `result` holding `success`, `score`, `metrics`
 *   and optionally `error`.
 */
function printResult(result) {
  const outcome = result.result;
  const statusLabel = outcome.success
    ? `${colors.green}\u2714 SUCCESS${colors.reset}`
    : `${colors.red}\u2716 FAILURE${colors.reset}`;
  const keySuffix = result.modelKey
    ? ` ${colors.gray}(${result.modelKey})${colors.reset}`
    : "";
  // The header deliberately starts with a newline to separate results.
  console.log(
    `
${colors.cyan}[${result.model}]${colors.reset}${keySuffix} - ${colors.magenta}${result.benchmark}${colors.reset}`
  );
  console.log(
    ` \u2514 ${statusLabel} | Score: ${colors.yellow}${outcome.score.toFixed(2)}${colors.reset}`
  );
  const metricPairs = Object.entries(outcome.metrics);
  if (metricPairs.length > 0) {
    console.log(" Metrics:");
    metricPairs.forEach(([name, amount]) => {
      console.log(` - ${name}: ${amount}`);
    });
  }
  if (outcome.error) {
    console.log(
      ` ${colors.red}Error: ${outcome.error.message}${colors.reset}`
    );
  }
}
|
|
1567
|
+
/**
 * Console reporter: prints a banner, one section per evaluation result
 * (via printResult), and a closing rule.
 *
 * @param {Iterable<object>} results - Evaluation results to print.
 */
function consoleReporter(results) {
  const banner = "\n--- \u{1F4CA} Evaluation Report ---";
  const footer = "\n---------------------------\n";
  console.log(banner);
  for (const entry of results) {
    printResult(entry);
  }
  console.log(footer);
}
|
|
1574
|
+
|
|
1575
|
+
// src/reporters/console.debug.ts
|
|
1576
|
+
// Captures the test id from a "[FAIL] <id>: ..." log line; group 1 is the
// text between the whitespace after "[FAIL]" and the first colon.
var FAIL_ID_REGEX = /^\[FAIL\]\s+([^:]+):/;
// Matches the leading "[DEBUG-FAIL] " tag so it can be stripped before the
// remainder of the line is parsed as JSON.
var DEBUG_FAIL_PREFIX_REGEX = /^\[DEBUG-FAIL\] /;
// Matches the leading "[DEBUG-FAIL-CONTEXT] " tag, stripped the same way.
var DEBUG_FAIL_CONTEXT_PREFIX_REGEX = /^\[DEBUG-FAIL-CONTEXT\] /;
|
|
1579
|
+
// ANSI SGR escape sequences used by the debug console reporter.
// Same palette as the plain reporter plus bold and underline.
var colors2 = (() => {
  const sgr = (code) => `\x1B[${code}m`;
  return {
    reset: sgr(0),
    green: sgr(32),
    red: sgr(31),
    yellow: sgr(33),
    cyan: sgr(36),
    magenta: sgr(35),
    gray: sgr(90),
    bold: sgr(1),
    underline: sgr(4)
  };
})();
|
|
1590
|
+
/**
 * Wraps a diff line in a color based on its first character:
 * "+" is green, "-" is red, "@" is bold cyan. Anything else is returned
 * unchanged.
 *
 * The entry is coerced with String() before inspection, matching how the
 * other diff helpers in this file treat diff entries, so a non-string entry
 * no longer throws.
 *
 * @param {*} line - One diff entry (normally a string).
 * @returns {*} The colorized string, or the original value when no marker matches.
 */
function colorizeDiffLine(line) {
  const text = String(line);
  if (text.startsWith("+")) {
    return `${colors2.green}${text}${colors2.reset}`;
  }
  if (text.startsWith("-")) {
    return `${colors2.red}${text}${colors2.reset}`;
  }
  if (text.startsWith("@")) {
    return `${colors2.cyan}${colors2.bold}${text}${colors2.reset}`;
  }
  return line;
}
|
|
1602
|
+
/**
 * Returns the entries of `lines` with duplicates removed, keeping the first
 * occurrence of each value in its original position.
 *
 * @param {Iterable<*>} lines - Values to de-duplicate.
 * @returns {Array<*>} A new array of distinct values.
 */
function uniqueLines(lines) {
  // A Set iterates in insertion order, so spreading it keeps first occurrences.
  return [...new Set(lines)];
}
|
|
1614
|
+
/**
 * Reports whether any diff entry mentions a function-name problem, i.e.
 * contains the text "function name" or "missing function:".
 *
 * @param {Array<*>} diff - Diff entries; each is stringified before matching.
 * @returns {boolean} True when at least one entry contains a marker.
 */
function hasFunctionNameIssue(diff) {
  const markers = ["function name", "missing function:"];
  return diff.some((entry) => {
    const text = String(entry);
    return markers.some((marker) => text.includes(marker));
  });
}
|
|
1619
|
+
/**
 * Appends function-name suggestions to `suggestions`.
 *
 * - When both `expected.function` and `actual.function` are truthy and
 *   differ, suggests calling the expected function.
 * - When `expected.functions` is an array, suggests including all of them.
 *
 * @param {object} [expected] - Expected call description.
 * @param {object} [actual] - Actual call description.
 * @param {string[]} suggestions - Output list, mutated in place.
 */
function suggestFunctionNameFix(expected, actual, suggestions) {
  const wanted = expected?.function;
  const got = actual?.function;
  const namesDiffer = Boolean(wanted) && Boolean(got) && wanted !== got;
  if (namesDiffer) {
    suggestions.push(`Call the function '${wanted}' instead of '${got}'.`);
  }
  const wantedList = expected?.functions;
  if (Array.isArray(wantedList)) {
    suggestions.push(`Ensure tool calls include: ${wantedList.join(", ")}.`);
  }
}
|
|
1633
|
+
/**
 * Collects every "- missing required param: <name>" diff entry and, when any
 * exist, appends one suggestion listing the missing parameter names.
 *
 * @param {Array<*>} diff - Diff entries; each is stringified before matching.
 * @param {string[]} suggestions - Output list, mutated in place.
 */
function suggestMissingParamFix(diff, suggestions) {
  const marker = "- missing required param:";
  const names = [];
  for (const entry of diff) {
    const text = String(entry);
    if (text.startsWith(marker)) {
      names.push(text.replace("- missing required param: ", ""));
    }
  }
  if (names.length) {
    suggestions.push(`Add required parameter(s): ${names.join(", ")}.`);
  }
}
|
|
1639
|
+
/**
 * Collects every "+ unexpected param: <name>" diff entry and, when any
 * exist, appends one suggestion listing the parameters to remove.
 *
 * @param {Array<*>} diff - Diff entries; each is stringified before matching.
 * @param {string[]} suggestions - Output list, mutated in place.
 */
function suggestUnexpectedParamFix(diff, suggestions) {
  const marker = "+ unexpected param:";
  const names = [];
  for (const entry of diff) {
    const text = String(entry);
    if (text.startsWith(marker)) {
      names.push(text.replace("+ unexpected param: ", ""));
    }
  }
  if (names.length) {
    suggestions.push(`Remove unexpected parameter(s): ${names.join(", ")}.`);
  }
}
|
|
1645
|
+
/**
 * Appends one value suggestion per "@@ param <name>" header found in `diff`.
 *
 * For each header, the "- expected: ..." / "- expected one of: ..." line is
 * looked up inside that header's own section (the entries after the header
 * up to the next entry starting with "@@"). Previously every header used the
 * first matching line of the whole diff, so with several mismatched
 * parameters they all received the first parameter's expected value.
 *
 * When the diff contains a single param header and its section holds no
 * expected line, the whole diff is searched instead, which keeps the earlier
 * behavior for single-parameter diffs.
 *
 * @param {Array<*>} diff - Diff entries; each is stringified before matching.
 * @param {string[]} suggestions - Output list, mutated in place.
 */
function suggestParamValueFix(diff, suggestions) {
  const lines = Array.from(diff, (d) => String(d));
  const headerIndexes = [];
  lines.forEach((text, index) => {
    if (text.startsWith("@@ param ")) {
      headerIndexes.push(index);
    }
  });
  const findExpected = (scope) => ({
    single: scope.find((text) => text.startsWith("- expected:")),
    oneOf: scope.find((text) => text.startsWith("- expected one of:"))
  });
  for (const start of headerIndexes) {
    const param = lines[start].replace("@@ param ", "");
    // A section ends at the next hunk header of any kind, or at the end of the diff.
    let end = lines.length;
    for (let i = start + 1; i < lines.length; i++) {
      if (lines[i].startsWith("@@")) {
        end = i;
        break;
      }
    }
    let found = findExpected(lines.slice(start + 1, end));
    if (!found.single && !found.oneOf && headerIndexes.length === 1) {
      found = findExpected(lines);
    }
    if (found.single) {
      const value = found.single.replace("- expected: ", "");
      suggestions.push(`Set '${param}' to: ${value}.`);
    } else if (found.oneOf) {
      const allowed = found.oneOf.replace("- expected one of: ", "");
      suggestions.push(`Set '${param}' to one of: ${allowed}.`);
    } else {
      suggestions.push(`Adjust '${param}' to an allowed value.`);
    }
  }
}
|
|
1665
|
+
/**
 * Appends a generic suggestion chosen from the error type string. The first
 * matching substring (checked in the order listed below) wins; nothing is
 * appended when none match.
 *
 * @param {string} error_type - Error type identifier.
 * @param {string[]} suggestions - Output list, mutated in place.
 */
function suggestFromErrorType(error_type, suggestions) {
  const hints = [
    ["missing_required", "Add all required parameters defined by the tool schema."],
    ["unexpected_param", "Remove parameters not present in the tool schema."],
    ["wrong_count", "Adjust the number of tool calls to match expected count."],
    ["wrong_func_name", "Use the exact expected function name from the schema."],
    ["value_error", "Choose a value from the allowed options."]
  ];
  const match = hints.find(([needle]) => error_type.includes(needle));
  if (match) {
    suggestions.push(match[1]);
  }
}
|
|
1680
|
+
/**
 * Builds a de-duplicated list of fix suggestions for a parsed debug-failure
 * payload.
 *
 * When `parsed.diff` is an array, the diff-specific helpers run for each
 * kind of marker present in it. If that yields nothing (or there is no diff
 * array) and `error_type` is a string, a generic suggestion derived from the
 * error type is used instead.
 *
 * @param {object|null|undefined} parsed - Payload with optional
 *   `error_type`, `expected`, `actual` and `diff`.
 * @returns {string[]} Distinct suggestions in the order they were produced.
 */
function suggestFixFromDiff(parsed) {
  const { error_type, expected, actual, diff } = parsed ?? {};
  const suggestions = [];
  if (Array.isArray(diff)) {
    const anyStartsWith = (prefix) => diff.some((d) => String(d).startsWith(prefix));
    if (hasFunctionNameIssue(diff)) {
      suggestFunctionNameFix(expected, actual, suggestions);
    }
    if (anyStartsWith("- missing required param:")) {
      suggestMissingParamFix(diff, suggestions);
    }
    if (anyStartsWith("+ unexpected param:")) {
      suggestUnexpectedParamFix(diff, suggestions);
    }
    if (anyStartsWith("@@ param ")) {
      suggestParamValueFix(diff, suggestions);
    }
  }
  // Fall back to the coarse error type only when nothing specific was found.
  const needsFallback = suggestions.length === 0 && typeof error_type === "string";
  if (needsFallback) {
    suggestFromErrorType(error_type, suggestions);
  }
  return uniqueLines(suggestions);
}
|
|
1706
|
+
/**
 * Extracts the test id a log line refers to.
 *
 * - "[FAIL] <id>: ..." lines yield the id captured by FAIL_ID_REGEX.
 * - "[DEBUG-FAIL] {json}" and "[DEBUG-FAIL-CONTEXT] {json}" lines yield the
 *   stringified `id` property of the JSON payload.
 *
 * Returns undefined when the line has no recognizable id. This includes
 * debug payloads whose `id` is missing, null or empty: those used to return
 * "", which is not nullish, so callers using `??` filed the line under an
 * empty-named group instead of their fallback.
 *
 * @param {string} line - A log line.
 * @returns {string|undefined} The test id, or undefined when none is found.
 */
function getTestIdFromLogLine(line) {
  if (line.startsWith("[FAIL]")) {
    return line.match(FAIL_ID_REGEX)?.[1];
  }
  let prefixRegex;
  if (line.startsWith("[DEBUG-FAIL]")) {
    prefixRegex = DEBUG_FAIL_PREFIX_REGEX;
  } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
    prefixRegex = DEBUG_FAIL_CONTEXT_PREFIX_REGEX;
  } else {
    return;
  }
  try {
    const id = JSON.parse(line.replace(prefixRegex, ""))?.id;
    if (id === void 0 || id === null || id === "") {
      return;
    }
    return String(id);
  } catch {
    // Payload is not valid JSON: treat the line as having no id.
    return;
  }
}
|
|
1729
|
+
/**
 * Groups log lines by the test id reported by getTestIdFromLogLine. Lines
 * for which that helper returns null/undefined go under "__general__".
 *
 * @param {Iterable<string>} failLogs - Failure-related log lines.
 * @returns {Map<string, string[]>} Lines per group, in first-seen group order.
 */
function groupLogsByTestId(failLogs) {
  const groups = /* @__PURE__ */ new Map();
  for (const entry of failLogs) {
    const groupKey = getTestIdFromLogLine(entry) ?? "__general__";
    const bucket = groups.get(groupKey);
    if (bucket === void 0 || bucket === null) {
      groups.set(groupKey, [entry]);
    } else {
      bucket.push(entry);
    }
  }
  return groups;
}
|
|
1740
|
+
/**
 * Collects the ids of all "[DEBUG-FAIL]" lines whose JSON payload has a
 * truthy `id`. Lines with unparsable payloads are ignored.
 *
 * @param {Iterable<string>} lines - Log lines to scan.
 * @returns {Set<string>} Stringified ids found in debug-failure lines.
 */
function collectDebugIds(lines) {
  const found = /* @__PURE__ */ new Set();
  for (const entry of lines) {
    if (!entry.startsWith("[DEBUG-FAIL]")) {
      continue;
    }
    let payload;
    try {
      payload = JSON.parse(entry.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
    } catch {
      // Not JSON: this line contributes no id.
      continue;
    }
    const id = payload?.id;
    if (id) {
      found.add(String(id));
    }
  }
  return found;
}
|
|
1755
|
+
/**
 * Logs `data` as pretty-printed JSON on one console.log call, preceded by
 * `prefix` and wrapped in `color` ... reset. Continuation lines of the JSON
 * are prefixed with a single space.
 *
 * JSON.stringify returns undefined (not a string) for undefined, functions
 * and symbols; that case is printed as the text "undefined" instead of
 * throwing on `.split`.
 *
 * @param {string} prefix - Text printed before the JSON.
 * @param {*} data - Value to serialize.
 * @param {string} color - Escape sequence emitted before the prefix.
 */
function printIndentedJson(prefix, data, color) {
  const serialized = JSON.stringify(data, null, 2) ?? "undefined";
  console.log(
    color + prefix + serialized.split("\n").join("\n ") + colors2.reset
  );
}
|
|
1760
|
+
/**
 * Renders one "[DEBUG-FAIL] {json}" log line.
 *
 * Prints the payload's message (if any), then either the colorized diff
 * entries (when `diff` is an array) or the expected/actual values as JSON,
 * followed by any suggested fixes. If parsing or rendering throws, the raw
 * line is printed instead.
 *
 * @param {string} line - The full log line including its prefix.
 */
function displayDebugFailLine(line) {
  const json = line.replace(DEBUG_FAIL_PREFIX_REGEX, "");
  try {
    const failure = JSON.parse(json);
    const { message, diff, expected, actual } = failure;
    if (message) {
      console.log(` ${colors2.bold}${message}${colors2.reset}`);
    }
    if (Array.isArray(diff)) {
      diff.forEach((entry) => {
        console.log(` ${colorizeDiffLine(entry)}`);
      });
    } else {
      console.log(" expected:");
      printIndentedJson(" ", expected, colors2.green);
      console.log(" actual:");
      printIndentedJson(" ", actual, colors2.red);
    }
    const fixes = suggestFixFromDiff(failure);
    if (fixes.length) {
      console.log(` ${colors2.bold}Suggested fix:${colors2.reset}`);
      fixes.forEach((fix) => {
        console.log(` \u2022 ${fix}`);
      });
    }
  } catch {
    // Anything that could not be parsed or rendered is shown verbatim.
    console.log(` ${line}`);
  }
}
|
|
1789
|
+
/**
 * Prints the truthy fields of a debug-failure context object in gray, in a
 * fixed order: tool schema, last user query, raw model text, parsed tool
 * calls, ground truth, finish reason.
 *
 * @param {object} ctx - Context payload; absent or falsy fields are skipped.
 */
function displayContextInfo(ctx) {
  const grayLine = (text) => console.log(colors2.gray + text + colors2.reset);
  const grayJson = (label, value) => printIndentedJson(label, value, colors2.gray);

  if (ctx.tool_schema) {
    grayJson(" tool schema: ", ctx.tool_schema);
  }
  if (ctx.last_user_query) {
    grayLine(" last user: " + JSON.stringify(ctx.last_user_query));
  }
  if (ctx.raw_model_text) {
    // Continuation lines of multi-line model text get a one-space indent.
    const indented = String(ctx.raw_model_text).split("\n").join("\n ");
    grayLine(" raw model text (middleware parsed):\n " + indented);
  }
  if (ctx.parsed_tool_calls) {
    grayJson(" parsed tool calls: ", ctx.parsed_tool_calls);
  }
  if (ctx.ground_truth) {
    grayJson(" ground truth: ", ctx.ground_truth);
  }
  if (ctx.finish_reason) {
    grayLine(" finish reason: " + JSON.stringify(ctx.finish_reason));
  }
}
|
|
1823
|
+
/**
 * Renders one "[DEBUG-FAIL-CONTEXT] {json}" log line: a gray "context:"
 * heading followed by the context fields. If the payload cannot be parsed
 * or rendering throws, the raw line is printed instead.
 *
 * @param {string} line - The full log line including its prefix.
 */
function displayDebugFailContextLine(line) {
  const json = line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "");
  try {
    const context = JSON.parse(json);
    const heading = ` ${colors2.gray}context:${colors2.reset}`;
    console.log(heading);
    displayContextInfo(context);
  } catch {
    console.log(` ${line}`);
  }
}
|
|
1833
|
+
/**
 * Prints one failure-related log line according to its tag.
 *
 * - "[FAIL]": red, unless its id is in `debugIds` (then it is skipped).
 * - "[ERROR]" / "[FATAL]": yellow.
 * - "[STACK]": gray.
 * - "[DEBUG-FAIL]" / "[DEBUG-FAIL-CONTEXT]": delegated to their renderers.
 * Lines with any other tag are ignored.
 *
 * @param {string} line - The log line.
 * @param {Set<string>} debugIds - Ids that already have a debug-failure entry.
 */
function displayLogLine(line, debugIds) {
  const paint = (tint) => console.log(` ${tint}${line}${colors2.reset}`);

  if (line.startsWith("[FAIL]")) {
    const failId = line.match(FAIL_ID_REGEX)?.[1];
    const coveredByDebugEntry = failId && debugIds.has(failId);
    if (!coveredByDebugEntry) {
      paint(colors2.red);
    }
    return;
  }
  if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
    paint(colors2.yellow);
    return;
  }
  if (line.startsWith("[STACK]")) {
    paint(colors2.gray);
    return;
  }
  if (line.startsWith("[DEBUG-FAIL]")) {
    displayDebugFailLine(line);
    return;
  }
  if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
    displayDebugFailContextLine(line);
  }
}
|
|
1851
|
+
/**
 * Prints grouped failure logs: a bold heading, then for each group an
 * underlined group id (omitted for the "__general__" group) followed by the
 * group's lines.
 *
 * @param {Map<string, string[]>} byId - Log lines keyed by group id.
 */
function displayGroupedFailures(byId) {
  console.log(` ${colors2.bold}Failure details (grouped):${colors2.reset}`);
  for (const [groupId, groupLines] of byId) {
    const isNamedGroup = groupId !== "__general__";
    if (isNamedGroup) {
      console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
    }
    // Ids with a debug entry in this group; passed to displayLogLine per line.
    const idsWithDebugEntry = collectDebugIds(groupLines);
    for (const entry of groupLines) {
      displayLogLine(entry, idsWithDebugEntry);
    }
  }
}
|
|
1863
|
+
/**
 * Prints, in gray, every log line tagged "[INFO]" or "[PASS]".
 *
 * @param {string[]} logs - All log lines of a result.
 */
function displaySuccessLogs(logs) {
  const shownTags = ["[INFO]", "[PASS]"];
  const isShown = (entry) => shownTags.some((tag) => entry.startsWith(tag));
  logs.filter((entry) => isShown(entry)).forEach((entry) => {
    console.log(` ${colors2.gray}${entry}${colors2.reset}`);
  });
}
|
|
1871
|
+
/**
 * Returns the log lines that carry a failure-related tag: "[FAIL]",
 * "[ERROR]", "[FATAL]", "[STACK]", "[DEBUG-FAIL]" or "[DEBUG-FAIL-CONTEXT]".
 *
 * @param {string[]} logs - All log lines of a result.
 * @returns {string[]} The failure-related lines, in their original order.
 */
function filterFailureLogs(logs) {
  const failureTags = [
    "[FAIL]",
    "[ERROR]",
    "[FATAL]",
    "[STACK]",
    "[DEBUG-FAIL]",
    "[DEBUG-FAIL-CONTEXT]"
  ];
  return logs.filter((entry) => failureTags.some((tag) => entry.startsWith(tag)));
}
|
|
1876
|
+
/**
 * Prints the logs of one result. If any failure-related lines exist they
 * are shown grouped by test id; otherwise the informational lines are shown.
 *
 * @param {string[]} logs - All log lines of a result.
 */
function displayResultLogs(logs) {
  const failureLines = filterFailureLogs(logs);
  if (failureLines.length === 0) {
    displaySuccessLogs(logs);
    return;
  }
  displayGroupedFailures(groupLogsByTestId(failureLines));
}
|
|
1886
|
+
/**
 * Prints a "Metrics:" heading followed by one "- key: value" line per
 * entry. Prints nothing when there are no entries.
 *
 * @param {Array<[string, *]>} metrics - Key/value pairs to print.
 */
function displayMetrics(metrics) {
  if (!(metrics.length > 0)) {
    return;
  }
  console.log(" Metrics:");
  for (const [name, amount] of metrics) {
    console.log(` - ${name}: ${amount}`);
  }
}
|
|
1894
|
+
/**
 * Prints the two header lines of a result in the debug report: the model /
 * optional model key / benchmark name, then the status and two-decimal score.
 *
 * @param {object} r - Evaluation entry with `model`, `modelKey`, `benchmark`
 *   and a nested `result` holding `success` and `score`.
 */
function displayResultHeader(r) {
  const outcome = r.result;
  const statusLabel = outcome.success
    ? `${colors2.green}\u2714 SUCCESS${colors2.reset}`
    : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
  const keySuffix = r.modelKey
    ? ` ${colors2.gray}(${r.modelKey})${colors2.reset}`
    : "";
  // The header deliberately starts with a newline to separate results.
  console.log(
    `
${colors2.cyan}[${r.model}]${colors2.reset}${keySuffix} - ${colors2.magenta}${r.benchmark}${colors2.reset}`
  );
  console.log(
    ` \u2514 ${statusLabel} | Score: ${colors2.yellow}${outcome.score.toFixed(2)}${colors2.reset}`
  );
}
|
|
1905
|
+
/**
 * Debug console reporter: prints a banner, then for every result its
 * header, its metrics and (when it has any log lines) its logs, and finally
 * a closing rule.
 *
 * @param {Iterable<object>} results - Evaluation results to print.
 */
function consoleDebugReporter(results) {
  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
  for (const entry of results) {
    displayResultHeader(entry);
    displayMetrics(Object.entries(entry.result.metrics));
    const hasLogs = Boolean(entry.result.logs?.length);
    if (hasLogs) {
      displayResultLogs(entry.result.logs);
    }
  }
  console.log("\n------------------------------------\n");
}
|
|
1916
|
+
|
|
1917
|
+
// src/reporters/json.ts
|
|
1918
|
+
/**
 * JSON reporter: prints all results as one pretty-printed JSON document.
 * Each result's `error` is replaced by its `message` so it survives
 * serialization.
 *
 * @param {Array<object>} results - Evaluation results to print.
 */
function jsonReporter(results) {
  const toSerializable = (entry) => {
    const flattenedResult = { ...entry.result, error: entry.result.error?.message };
    return { ...entry, result: flattenedResult };
  };
  const document = JSON.stringify(results.map((entry) => toSerializable(entry)), null, 2);
  console.log(document);
}
|
|
1928
|
+
|
|
1929
|
+
// src/reporters/index.ts
|
|
1930
|
+
// Registry of the built-in reporters, keyed by reporter name. Each value is
// a function that takes the array of evaluation results and prints a report.
var reporters = {
  console: consoleReporter,
  json: jsonReporter,
  "console.debug": consoleDebugReporter
};
|
|
1935
|
+
|
|
1936
|
+
// src/evaluate.ts
|
|
1937
|
+
/**
 * Runs one benchmark against one model and wraps the outcome in an
 * evaluation entry. Progress is logged before and after the run.
 *
 * A failure of `benchmark.run` is logged with console.error and converted
 * into a result with score 0, `success: false`, empty metrics and the error
 * (non-Error throwables are wrapped in an Error).
 *
 * @param {*} model - Model passed through to `benchmark.run`; its string
 *   `modelId` property, when present, is used as the display name.
 * @param {object} benchmark - Object with a `name` and an async `run(model, config)`.
 * @param {string|undefined} modelKey - Optional key the model was registered under.
 * @param {object|undefined} config - Optional run configuration.
 * @returns {Promise<object>} `{ model, modelKey, benchmark, result }`.
 */
async function runSingleBenchmark(model, benchmark, modelKey, config) {
  let modelId = "unknown-model";
  if (typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string") {
    modelId = model.modelId;
  }
  const tag = `[${modelId}]${modelKey ? ` (${modelKey})` : ""}`;
  const wrap = (result) => ({
    model: modelId,
    modelKey,
    benchmark: benchmark.name,
    result
  });
  try {
    console.log(`${tag} Running benchmark: ${benchmark.name}...`);
    const result = await benchmark.run(model, config);
    console.log(`${tag} Finished benchmark: ${benchmark.name}. Score: ${result.score}`);
    return wrap(result);
  } catch (error) {
    console.error(`${tag} Error running benchmark: ${benchmark.name}`, error);
    const failure = error instanceof Error ? error : new Error(String(error));
    return wrap({
      score: 0,
      success: false,
      metrics: {},
      error: failure
    });
  }
}
|
|
1971
|
+
/**
 * Normalizes the `models` option into a list of `[key, model]` pairs.
 *
 * - An array yields one pair per element with an undefined key.
 * - A single object that has a `modelId` property yields one pair with an
 *   undefined key.
 * - Anything else is treated as a keyed record and yields its own
 *   enumerable entries.
 *
 * @param {*} models - Array of models, a single model, or a record of models.
 * @returns {Array<[string|undefined, *]>} Key/model pairs.
 */
function normalizeModels(models) {
  if (Array.isArray(models)) {
    return Array.from(models, (m) => [void 0, m]);
  }
  const isSingleModel = typeof models === "object" && models !== null && "modelId" in models;
  if (isSingleModel) {
    return [[void 0, models]];
  }
  return Object.entries(models).map(([key, m]) => [key, m]);
}
|
|
1988
|
+
/**
 * Builds the optional run configuration from the evaluate options. Only
 * values that are not undefined are included; when neither is given the
 * result is undefined rather than an empty object.
 *
 * @param {*} temperature - Sampling temperature, or undefined to omit.
 * @param {*} maxTokens - Token limit, or undefined to omit.
 * @returns {object|undefined} The config object, or undefined when empty.
 */
function buildConfig(temperature, maxTokens) {
  const config = {
    ...(temperature !== void 0 ? { temperature } : {}),
    ...(maxTokens !== void 0 ? { maxTokens } : {})
  };
  if (Object.keys(config).length === 0) {
    return void 0;
  }
  return config;
}
|
|
1998
|
+
/**
 * Runs the reporter registered under `reporter`. An unknown name logs a
 * warning and falls back to the console reporter.
 *
 * Only the registry's own properties are considered. A plain
 * `reporters[reporter]` lookup also walks the prototype chain, so names such
 * as "constructor" or "toString" used to resolve to Object.prototype members
 * and were invoked as if they were reporters, producing no report.
 *
 * @param {string} reporter - Name of the reporter to run.
 * @param {Array<object>} results - Evaluation results to report.
 */
function executeReporter(reporter, results) {
  const isRegistered = Object.prototype.hasOwnProperty.call(reporters, reporter);
  const report = isRegistered ? reporters[reporter] : void 0;
  if (report) {
    report(results);
  } else {
    console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
    reporters.console(results);
  }
}
|
|
2007
|
+
/**
 * Runs every benchmark against every model, one run at a time, then hands
 * all results to the selected reporter and returns them.
 *
 * @param {object} options
 * @param {*} options.models - Array of models, a single model, or a keyed record of models.
 * @param {Iterable<object>} options.benchmarks - Benchmarks to run for each model.
 * @param {string} [options.reporter="console"] - Name of the reporter to use.
 * @param {*} [options.temperature] - Forwarded into the run config when defined.
 * @param {*} [options.maxTokens] - Forwarded into the run config when defined.
 * @returns {Promise<object[]>} Results ordered model by model, benchmark by benchmark.
 */
async function evaluate(options) {
  const reporter = options.reporter === void 0 ? "console" : options.reporter;
  const pairs = normalizeModels(options.models);
  const config = buildConfig(options.temperature, options.maxTokens);
  const collected = [];
  for (const [modelKey, model] of pairs) {
    for (const benchmark of options.benchmarks) {
      // Runs are awaited one by one, so they never overlap.
      collected.push(await runSingleBenchmark(model, benchmark, modelKey, config));
    }
  }
  executeReporter(reporter, collected);
  return collected;
}
|
|
1525
2032
|
// Annotate the CommonJS export names for ESM import in node:
|
|
1526
2033
|
0 && (module.exports = {
|
|
1527
2034
|
bfclMultipleBenchmark,
|