@ai-sdk-tool/eval 0.1.7 → 1.0.0-canary.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/data/{BFCL_v3_multiple.json → BFCL_v3_multiple.jsonl} +1 -1
- package/data/{BFCL_v3_multiple_possible_answer.json → BFCL_v3_multiple_possible_answer.jsonl} +1 -1
- package/data/{BFCL_v3_parallel.json → BFCL_v3_parallel.jsonl} +1 -1
- package/data/{BFCL_v3_parallel_multiple.json → BFCL_v3_parallel_multiple.jsonl} +1 -1
- package/data/{BFCL_v3_parallel_multiple_possible_answer.json → BFCL_v3_parallel_multiple_possible_answer.jsonl} +1 -1
- package/data/{BFCL_v3_parallel_possible_answer.json → BFCL_v3_parallel_possible_answer.jsonl} +1 -1
- package/data/{BFCL_v3_simple.json → BFCL_v3_simple.jsonl} +1 -1
- package/data/{BFCL_v3_simple_possible_answer.json → BFCL_v3_simple_possible_answer.jsonl} +1 -1
- package/dist/index.cjs +1610 -964
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +17 -17
- package/dist/index.d.ts +17 -17
- package/dist/index.js +1595 -945
- package/dist/index.js.map +1 -1
- package/package.json +7 -10
package/dist/index.js
CHANGED
|
@@ -1,339 +1,18 @@
|
|
|
1
|
-
// src/reporters/console.ts
|
|
2
|
-
var colors = {
|
|
3
|
-
reset: "\x1B[0m",
|
|
4
|
-
green: "\x1B[32m",
|
|
5
|
-
red: "\x1B[31m",
|
|
6
|
-
yellow: "\x1B[33m",
|
|
7
|
-
cyan: "\x1B[36m",
|
|
8
|
-
magenta: "\x1B[35m",
|
|
9
|
-
gray: "\x1B[90m"
|
|
10
|
-
};
|
|
11
|
-
function printResult(result) {
|
|
12
|
-
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
13
|
-
const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
|
|
14
|
-
console.log(
|
|
15
|
-
`
|
|
16
|
-
${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
|
|
17
|
-
);
|
|
18
|
-
console.log(
|
|
19
|
-
` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
|
|
20
|
-
);
|
|
21
|
-
const metrics = Object.entries(benchmarkResult.metrics);
|
|
22
|
-
if (metrics.length > 0) {
|
|
23
|
-
console.log(" Metrics:");
|
|
24
|
-
for (const [key, value] of metrics) {
|
|
25
|
-
console.log(` - ${key}: ${value}`);
|
|
26
|
-
}
|
|
27
|
-
}
|
|
28
|
-
if (benchmarkResult.error) {
|
|
29
|
-
console.log(
|
|
30
|
-
` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
|
|
31
|
-
);
|
|
32
|
-
}
|
|
33
|
-
}
|
|
34
|
-
function consoleReporter(results) {
|
|
35
|
-
console.log("\n--- \u{1F4CA} Evaluation Report ---");
|
|
36
|
-
for (const result of results) {
|
|
37
|
-
printResult(result);
|
|
38
|
-
}
|
|
39
|
-
console.log("\n---------------------------\n");
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
// src/reporters/console.debug.ts
|
|
43
|
-
var colors2 = {
|
|
44
|
-
reset: "\x1B[0m",
|
|
45
|
-
green: "\x1B[32m",
|
|
46
|
-
red: "\x1B[31m",
|
|
47
|
-
yellow: "\x1B[33m",
|
|
48
|
-
cyan: "\x1B[36m",
|
|
49
|
-
magenta: "\x1B[35m",
|
|
50
|
-
gray: "\x1B[90m",
|
|
51
|
-
bold: "\x1B[1m",
|
|
52
|
-
underline: "\x1B[4m"
|
|
53
|
-
};
|
|
54
|
-
function colorizeDiffLine(line) {
|
|
55
|
-
if (line.startsWith("+")) return `${colors2.green}${line}${colors2.reset}`;
|
|
56
|
-
if (line.startsWith("-")) return `${colors2.red}${line}${colors2.reset}`;
|
|
57
|
-
if (line.startsWith("@"))
|
|
58
|
-
return `${colors2.cyan}${colors2.bold}${line}${colors2.reset}`;
|
|
59
|
-
return line;
|
|
60
|
-
}
|
|
61
|
-
function uniqueLines(lines) {
|
|
62
|
-
const seen = /* @__PURE__ */ new Set();
|
|
63
|
-
const out = [];
|
|
64
|
-
for (const l of lines) {
|
|
65
|
-
if (seen.has(l)) continue;
|
|
66
|
-
seen.add(l);
|
|
67
|
-
out.push(l);
|
|
68
|
-
}
|
|
69
|
-
return out;
|
|
70
|
-
}
|
|
71
|
-
function suggestFixFromDiff(parsed) {
|
|
72
|
-
const suggestions = [];
|
|
73
|
-
const { error_type, expected, actual, diff } = parsed ?? {};
|
|
74
|
-
if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
|
|
75
|
-
const expectedName = expected?.function;
|
|
76
|
-
const actualName = actual?.function;
|
|
77
|
-
if (expectedName && actualName && expectedName !== actualName) {
|
|
78
|
-
suggestions.push(
|
|
79
|
-
`Call the function '${expectedName}' instead of '${actualName}'.`
|
|
80
|
-
);
|
|
81
|
-
}
|
|
82
|
-
if (Array.isArray(expected?.functions)) {
|
|
83
|
-
suggestions.push(
|
|
84
|
-
`Ensure tool calls include: ${expected.functions.join(", ")}.`
|
|
85
|
-
);
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
|
|
89
|
-
const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
|
|
90
|
-
if (missing.length) {
|
|
91
|
-
suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
|
|
95
|
-
const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
|
|
96
|
-
if (extras.length) {
|
|
97
|
-
suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
|
|
98
|
-
}
|
|
99
|
-
}
|
|
100
|
-
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
|
|
101
|
-
const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
|
|
102
|
-
for (const param of targets) {
|
|
103
|
-
const allowedLine = diff.find(
|
|
104
|
-
(d) => String(d).startsWith("- expected one of:")
|
|
105
|
-
);
|
|
106
|
-
if (allowedLine) {
|
|
107
|
-
const allowed = allowedLine.replace("- expected one of: ", "");
|
|
108
|
-
suggestions.push(`Set '${param}' to one of: ${allowed}.`);
|
|
109
|
-
} else {
|
|
110
|
-
suggestions.push(`Adjust '${param}' to an allowed value.`);
|
|
111
|
-
}
|
|
112
|
-
}
|
|
113
|
-
}
|
|
114
|
-
if (suggestions.length === 0 && typeof error_type === "string") {
|
|
115
|
-
if (error_type.includes("missing_required")) {
|
|
116
|
-
suggestions.push(
|
|
117
|
-
"Add all required parameters defined by the tool schema."
|
|
118
|
-
);
|
|
119
|
-
} else if (error_type.includes("unexpected_param")) {
|
|
120
|
-
suggestions.push("Remove parameters not present in the tool schema.");
|
|
121
|
-
} else if (error_type.includes("wrong_count")) {
|
|
122
|
-
suggestions.push(
|
|
123
|
-
"Adjust the number of tool calls to match expected count."
|
|
124
|
-
);
|
|
125
|
-
} else if (error_type.includes("wrong_func_name")) {
|
|
126
|
-
suggestions.push("Use the exact expected function name from the schema.");
|
|
127
|
-
} else if (error_type.includes("value_error")) {
|
|
128
|
-
suggestions.push("Choose a value from the allowed options.");
|
|
129
|
-
}
|
|
130
|
-
}
|
|
131
|
-
return uniqueLines(suggestions);
|
|
132
|
-
}
|
|
133
|
-
function consoleDebugReporter(results) {
|
|
134
|
-
console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
|
|
135
|
-
for (const r of results) {
|
|
136
|
-
const { model, modelKey, benchmark, result } = r;
|
|
137
|
-
const status = result.success ? `${colors2.green}\u2714 SUCCESS${colors2.reset}` : `${colors2.red}\u2716 FAILURE${colors2.reset}`;
|
|
138
|
-
console.log(
|
|
139
|
-
`
|
|
140
|
-
${colors2.cyan}[${model}]${colors2.reset}${modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : ""} - ${colors2.magenta}${benchmark}${colors2.reset}`
|
|
141
|
-
);
|
|
142
|
-
console.log(
|
|
143
|
-
` \u2514 ${status} | Score: ${colors2.yellow}${result.score.toFixed(2)}${colors2.reset}`
|
|
144
|
-
);
|
|
145
|
-
const metrics = Object.entries(result.metrics);
|
|
146
|
-
if (metrics.length > 0) {
|
|
147
|
-
console.log(" Metrics:");
|
|
148
|
-
for (const [k, v] of metrics) console.log(` - ${k}: ${v}`);
|
|
149
|
-
}
|
|
150
|
-
if (result.logs && result.logs.length) {
|
|
151
|
-
const failLogs = result.logs.filter(
|
|
152
|
-
(l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
|
|
153
|
-
);
|
|
154
|
-
const hasFails = failLogs.length > 0;
|
|
155
|
-
if (hasFails) {
|
|
156
|
-
console.log(` ${colors2.bold}Failure details:${colors2.reset}`);
|
|
157
|
-
const debugIds = /* @__PURE__ */ new Set();
|
|
158
|
-
for (const l of failLogs) {
|
|
159
|
-
if (l.startsWith("[DEBUG-FAIL]")) {
|
|
160
|
-
try {
|
|
161
|
-
const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
162
|
-
if (parsed?.id) debugIds.add(String(parsed.id));
|
|
163
|
-
} catch {
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
}
|
|
167
|
-
for (const line of failLogs) {
|
|
168
|
-
if (line.startsWith("[FAIL]")) {
|
|
169
|
-
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
170
|
-
const failId = m?.[1];
|
|
171
|
-
if (failId && debugIds.has(failId)) continue;
|
|
172
|
-
console.log(` ${colors2.red}${line}${colors2.reset}`);
|
|
173
|
-
} else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
|
|
174
|
-
console.log(` ${colors2.yellow}${line}${colors2.reset}`);
|
|
175
|
-
} else if (line.startsWith("[STACK]")) {
|
|
176
|
-
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
177
|
-
} else if (line.startsWith("[DEBUG-FAIL]")) {
|
|
178
|
-
const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
|
|
179
|
-
try {
|
|
180
|
-
const parsed = JSON.parse(payload);
|
|
181
|
-
const { id, expected, actual, message, diff } = parsed;
|
|
182
|
-
console.log(
|
|
183
|
-
` ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
|
|
184
|
-
);
|
|
185
|
-
if (diff && Array.isArray(diff)) {
|
|
186
|
-
for (const dLine of diff)
|
|
187
|
-
console.log(" " + colorizeDiffLine(dLine));
|
|
188
|
-
} else {
|
|
189
|
-
console.log(" expected:");
|
|
190
|
-
console.log(
|
|
191
|
-
colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
|
|
192
|
-
);
|
|
193
|
-
console.log(" actual:");
|
|
194
|
-
console.log(
|
|
195
|
-
colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
|
|
196
|
-
);
|
|
197
|
-
}
|
|
198
|
-
const suggestions = suggestFixFromDiff(parsed);
|
|
199
|
-
if (suggestions.length) {
|
|
200
|
-
console.log(
|
|
201
|
-
` ${colors2.bold}Suggested fix:${colors2.reset}`
|
|
202
|
-
);
|
|
203
|
-
for (const s of suggestions) console.log(` \u2022 ${s}`);
|
|
204
|
-
}
|
|
205
|
-
} catch {
|
|
206
|
-
console.log(` ${line}`);
|
|
207
|
-
}
|
|
208
|
-
}
|
|
209
|
-
}
|
|
210
|
-
} else {
|
|
211
|
-
const info = result.logs.filter(
|
|
212
|
-
(l) => l.startsWith("[INFO]") || l.startsWith("[PASS]")
|
|
213
|
-
);
|
|
214
|
-
for (const line of info)
|
|
215
|
-
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
}
|
|
219
|
-
console.log("\n------------------------------------\n");
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
// src/reporters/json.ts
|
|
223
|
-
function jsonReporter(results) {
|
|
224
|
-
const serializableResults = results.map((r) => ({
|
|
225
|
-
...r,
|
|
226
|
-
result: {
|
|
227
|
-
...r.result,
|
|
228
|
-
error: r.result.error?.message
|
|
229
|
-
}
|
|
230
|
-
}));
|
|
231
|
-
console.log(JSON.stringify(serializableResults, null, 2));
|
|
232
|
-
}
|
|
233
|
-
|
|
234
|
-
// src/reporters/index.ts
|
|
235
|
-
var reporters = {
|
|
236
|
-
console: consoleReporter,
|
|
237
|
-
json: jsonReporter,
|
|
238
|
-
"console.debug": consoleDebugReporter
|
|
239
|
-
};
|
|
240
|
-
|
|
241
|
-
// src/evaluate.ts
|
|
242
|
-
async function runSingleBenchmark(model, benchmark, modelKey, config) {
|
|
243
|
-
const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
|
|
244
|
-
try {
|
|
245
|
-
console.log(
|
|
246
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
|
|
247
|
-
);
|
|
248
|
-
const result = await benchmark.run(model, config);
|
|
249
|
-
console.log(
|
|
250
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
|
|
251
|
-
);
|
|
252
|
-
return {
|
|
253
|
-
model: modelId,
|
|
254
|
-
modelKey,
|
|
255
|
-
benchmark: benchmark.name,
|
|
256
|
-
result
|
|
257
|
-
};
|
|
258
|
-
} catch (error) {
|
|
259
|
-
console.error(
|
|
260
|
-
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
|
|
261
|
-
error
|
|
262
|
-
);
|
|
263
|
-
return {
|
|
264
|
-
model: modelId,
|
|
265
|
-
modelKey,
|
|
266
|
-
benchmark: benchmark.name,
|
|
267
|
-
result: {
|
|
268
|
-
score: 0,
|
|
269
|
-
success: false,
|
|
270
|
-
metrics: {},
|
|
271
|
-
error: error instanceof Error ? error : new Error(String(error))
|
|
272
|
-
}
|
|
273
|
-
};
|
|
274
|
-
}
|
|
275
|
-
}
|
|
276
|
-
async function evaluate(options) {
|
|
277
|
-
const {
|
|
278
|
-
models,
|
|
279
|
-
benchmarks,
|
|
280
|
-
reporter = "console",
|
|
281
|
-
temperature,
|
|
282
|
-
maxTokens
|
|
283
|
-
} = options;
|
|
284
|
-
const modelEntries = [];
|
|
285
|
-
if (Array.isArray(models)) {
|
|
286
|
-
for (const m of models) modelEntries.push([void 0, m]);
|
|
287
|
-
} else if (typeof models === "object" && models !== null && "modelId" in models) {
|
|
288
|
-
modelEntries.push([void 0, models]);
|
|
289
|
-
} else {
|
|
290
|
-
for (const [key, m] of Object.entries(
|
|
291
|
-
models
|
|
292
|
-
)) {
|
|
293
|
-
modelEntries.push([key, m]);
|
|
294
|
-
}
|
|
295
|
-
}
|
|
296
|
-
const allResults = [];
|
|
297
|
-
for (const [modelKey, model] of modelEntries) {
|
|
298
|
-
for (const benchmark of benchmarks) {
|
|
299
|
-
const config = {};
|
|
300
|
-
if (temperature !== void 0) config.temperature = temperature;
|
|
301
|
-
if (maxTokens !== void 0) config.maxTokens = maxTokens;
|
|
302
|
-
const evaluationResult = await runSingleBenchmark(
|
|
303
|
-
model,
|
|
304
|
-
benchmark,
|
|
305
|
-
modelKey,
|
|
306
|
-
Object.keys(config).length > 0 ? config : void 0
|
|
307
|
-
);
|
|
308
|
-
allResults.push(evaluationResult);
|
|
309
|
-
}
|
|
310
|
-
}
|
|
311
|
-
const report = reporters[reporter];
|
|
312
|
-
if (report) {
|
|
313
|
-
report(allResults);
|
|
314
|
-
} else {
|
|
315
|
-
console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
|
|
316
|
-
reporters.console(allResults);
|
|
317
|
-
}
|
|
318
|
-
return allResults;
|
|
319
|
-
}
|
|
320
|
-
|
|
321
1
|
// src/benchmarks/bfcl.ts
|
|
322
|
-
import { generateText, jsonSchema, tool } from "ai";
|
|
323
2
|
import { promises as fs2 } from "fs";
|
|
324
3
|
import path2 from "path";
|
|
4
|
+
import {
|
|
5
|
+
generateText,
|
|
6
|
+
jsonSchema,
|
|
7
|
+
tool
|
|
8
|
+
} from "ai";
|
|
325
9
|
|
|
326
10
|
// src/utils/paths.ts
|
|
327
11
|
import fs from "fs";
|
|
328
12
|
import { createRequire } from "module";
|
|
329
13
|
import path from "path";
|
|
330
14
|
import { fileURLToPath } from "url";
|
|
331
|
-
function
|
|
332
|
-
const moduleUrl = fromModuleUrl;
|
|
333
|
-
const override = process.env.BFCL_DATA_DIR;
|
|
334
|
-
if (override && override.trim().length > 0) {
|
|
335
|
-
return override;
|
|
336
|
-
}
|
|
15
|
+
function tryResolveViaPackageEntry(moduleUrl) {
|
|
337
16
|
try {
|
|
338
17
|
const baseForRequireEntry = typeof moduleUrl === "string" && moduleUrl || path.join(process.cwd(), "package.json");
|
|
339
18
|
const requireFromEntry = createRequire(baseForRequireEntry);
|
|
@@ -341,43 +20,80 @@ function resolveDataDir(fromModuleUrl) {
|
|
|
341
20
|
const entryDir = path.dirname(entryPath);
|
|
342
21
|
const guessPkgRoot = fs.existsSync(path.join(entryDir, "..")) ? path.resolve(entryDir, "..") : entryDir;
|
|
343
22
|
const dataAtRoot = path.join(guessPkgRoot, "data");
|
|
344
|
-
if (fs.existsSync(dataAtRoot))
|
|
23
|
+
if (fs.existsSync(dataAtRoot)) {
|
|
24
|
+
return dataAtRoot;
|
|
25
|
+
}
|
|
345
26
|
} catch {
|
|
346
27
|
}
|
|
28
|
+
return null;
|
|
29
|
+
}
|
|
30
|
+
function tryResolveViaPackageJson(moduleUrl) {
|
|
347
31
|
try {
|
|
348
32
|
const baseForRequire = typeof moduleUrl === "string" && moduleUrl || path.join(process.cwd(), "package.json");
|
|
349
33
|
const require2 = createRequire(baseForRequire);
|
|
350
34
|
const pkgJsonPath = require2.resolve("@ai-sdk-tool/eval/package.json");
|
|
351
35
|
const pkgDir = path.dirname(pkgJsonPath);
|
|
352
36
|
const dataAtPkg = path.join(pkgDir, "data");
|
|
353
|
-
if (fs.existsSync(dataAtPkg))
|
|
37
|
+
if (fs.existsSync(dataAtPkg)) {
|
|
38
|
+
return dataAtPkg;
|
|
39
|
+
}
|
|
354
40
|
} catch {
|
|
355
41
|
}
|
|
356
|
-
|
|
42
|
+
return null;
|
|
43
|
+
}
|
|
44
|
+
function getStartDir(moduleUrl) {
|
|
357
45
|
if (moduleUrl) {
|
|
358
46
|
try {
|
|
359
|
-
|
|
47
|
+
return path.dirname(fileURLToPath(moduleUrl));
|
|
360
48
|
} catch {
|
|
361
|
-
|
|
49
|
+
return process.cwd();
|
|
362
50
|
}
|
|
363
|
-
} else {
|
|
364
|
-
startDir = process.cwd();
|
|
365
51
|
}
|
|
52
|
+
return process.cwd();
|
|
53
|
+
}
|
|
54
|
+
function findDataDirByTraversal(startDir) {
|
|
366
55
|
let dir = startDir;
|
|
367
|
-
|
|
56
|
+
const MAX_PARENT_TRAVERSAL_DEPTH = 6;
|
|
57
|
+
for (let i = 0; i < MAX_PARENT_TRAVERSAL_DEPTH; i += 1) {
|
|
368
58
|
const dataCandidate = path.join(dir, "data");
|
|
369
|
-
if (fs.existsSync(dataCandidate))
|
|
59
|
+
if (fs.existsSync(dataCandidate)) {
|
|
60
|
+
return dataCandidate;
|
|
61
|
+
}
|
|
370
62
|
const parent = path.resolve(dir, "..");
|
|
371
|
-
if (parent === dir)
|
|
63
|
+
if (parent === dir) {
|
|
64
|
+
break;
|
|
65
|
+
}
|
|
372
66
|
dir = parent;
|
|
373
67
|
}
|
|
68
|
+
return null;
|
|
69
|
+
}
|
|
70
|
+
function resolveDataDir(fromModuleUrl) {
|
|
71
|
+
const override = process.env.BFCL_DATA_DIR;
|
|
72
|
+
if (override && override.trim().length > 0) {
|
|
73
|
+
return override;
|
|
74
|
+
}
|
|
75
|
+
const viaEntry = tryResolveViaPackageEntry(fromModuleUrl);
|
|
76
|
+
if (viaEntry) {
|
|
77
|
+
return viaEntry;
|
|
78
|
+
}
|
|
79
|
+
const viaPackageJson = tryResolveViaPackageJson(fromModuleUrl);
|
|
80
|
+
if (viaPackageJson) {
|
|
81
|
+
return viaPackageJson;
|
|
82
|
+
}
|
|
83
|
+
const startDir = getStartDir(fromModuleUrl);
|
|
84
|
+
const viaTraversal = findDataDirByTraversal(startDir);
|
|
85
|
+
if (viaTraversal) {
|
|
86
|
+
return viaTraversal;
|
|
87
|
+
}
|
|
374
88
|
const pkgRoot = path.resolve(startDir, "..", "..");
|
|
375
89
|
return path.join(pkgRoot, "data");
|
|
376
90
|
}
|
|
377
91
|
|
|
378
92
|
// src/benchmarks/bfcl/ast-checker.ts
|
|
379
93
|
function standardizeString(input) {
|
|
380
|
-
if (typeof input !== "string")
|
|
94
|
+
if (typeof input !== "string") {
|
|
95
|
+
return input;
|
|
96
|
+
}
|
|
381
97
|
const regex = /[ ,./\\-_*^]/g;
|
|
382
98
|
return input.replace(regex, "").toLowerCase().replace(/'/g, '"');
|
|
383
99
|
}
|
|
@@ -397,131 +113,185 @@ function checkStringValue(param, modelValue, possibleAnswers) {
|
|
|
397
113
|
}
|
|
398
114
|
return { valid: true };
|
|
399
115
|
}
|
|
400
|
-
function
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
const expectedFuncName = funcDescription.name;
|
|
404
|
-
const expectedParams = funcDescription.parameters.properties;
|
|
405
|
-
const requiredParams = funcDescription.parameters.required;
|
|
406
|
-
if (modelFuncName !== expectedFuncName) {
|
|
407
|
-
return {
|
|
408
|
-
valid: false,
|
|
409
|
-
error: `Function name '${modelFuncName}' does not match expected '${expectedFuncName}'.`,
|
|
410
|
-
error_type: "simple_function_checker:wrong_func_name"
|
|
411
|
-
};
|
|
116
|
+
function normalizeObject(obj) {
|
|
117
|
+
if (Array.isArray(obj)) {
|
|
118
|
+
return obj.map(normalizeObject);
|
|
412
119
|
}
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
};
|
|
120
|
+
if (obj && typeof obj === "object") {
|
|
121
|
+
const normalized = {};
|
|
122
|
+
for (const [key, value] of Object.entries(obj)) {
|
|
123
|
+
if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
|
|
124
|
+
normalized[key] = value[0];
|
|
125
|
+
} else {
|
|
126
|
+
normalized[key] = normalizeObject(value);
|
|
127
|
+
}
|
|
422
128
|
}
|
|
129
|
+
return normalized;
|
|
423
130
|
}
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
paramName,
|
|
438
|
-
modelValue,
|
|
439
|
-
possibleValues ?? []
|
|
440
|
-
);
|
|
441
|
-
if (!result.valid) return result;
|
|
442
|
-
} else if (Array.isArray(modelValue)) {
|
|
443
|
-
const modelValueStr = JSON.stringify(
|
|
444
|
-
modelValue.map((v) => standardizeString(String(v))).sort()
|
|
445
|
-
);
|
|
446
|
-
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
|
|
447
|
-
if (!Array.isArray(p)) return false;
|
|
448
|
-
return JSON.stringify(
|
|
449
|
-
p.map((v) => standardizeString(String(v))).sort()
|
|
450
|
-
) === modelValueStr;
|
|
451
|
-
}) : false;
|
|
452
|
-
if (!hasMatch) {
|
|
453
|
-
return {
|
|
454
|
-
valid: false,
|
|
455
|
-
error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
|
|
456
|
-
modelValue
|
|
457
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
458
|
-
error_type: "value_error:list"
|
|
459
|
-
};
|
|
460
|
-
}
|
|
461
|
-
} else {
|
|
462
|
-
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
|
|
463
|
-
if (modelValue === possibleValue) return true;
|
|
464
|
-
if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
|
|
465
|
-
try {
|
|
466
|
-
const normalizeObject = (obj) => {
|
|
467
|
-
if (Array.isArray(obj)) {
|
|
468
|
-
return obj.map(normalizeObject);
|
|
469
|
-
}
|
|
470
|
-
if (obj && typeof obj === "object") {
|
|
471
|
-
const normalized = {};
|
|
472
|
-
for (const [key, value] of Object.entries(
|
|
473
|
-
obj
|
|
474
|
-
)) {
|
|
475
|
-
if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
|
|
476
|
-
normalized[key] = value[0];
|
|
477
|
-
} else {
|
|
478
|
-
normalized[key] = normalizeObject(value);
|
|
479
|
-
}
|
|
480
|
-
}
|
|
481
|
-
return normalized;
|
|
482
|
-
}
|
|
483
|
-
return obj;
|
|
484
|
-
};
|
|
485
|
-
const normalizedModel = normalizeObject(modelValue);
|
|
486
|
-
const normalizedPossible = normalizeObject(possibleValue);
|
|
487
|
-
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
488
|
-
} catch {
|
|
489
|
-
return false;
|
|
490
|
-
}
|
|
491
|
-
}
|
|
492
|
-
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
493
|
-
return modelValue.toString() === possibleValue;
|
|
494
|
-
}
|
|
495
|
-
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
496
|
-
return modelValue === possibleValue.toString();
|
|
497
|
-
}
|
|
498
|
-
return false;
|
|
499
|
-
}) : false;
|
|
500
|
-
if (!hasMatch) {
|
|
501
|
-
return {
|
|
502
|
-
valid: false,
|
|
503
|
-
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
504
|
-
modelValue
|
|
505
|
-
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
506
|
-
error_type: "value_error:other"
|
|
507
|
-
};
|
|
508
|
-
}
|
|
509
|
-
}
|
|
131
|
+
return obj;
|
|
132
|
+
}
|
|
133
|
+
function valuesMatch(modelValue, possibleValue) {
|
|
134
|
+
if (modelValue === possibleValue) {
|
|
135
|
+
return true;
|
|
136
|
+
}
|
|
137
|
+
if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
|
|
138
|
+
try {
|
|
139
|
+
const normalizedModel = normalizeObject(modelValue);
|
|
140
|
+
const normalizedPossible = normalizeObject(possibleValue);
|
|
141
|
+
return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
|
|
142
|
+
} catch {
|
|
143
|
+
return false;
|
|
510
144
|
}
|
|
511
145
|
}
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
146
|
+
if (typeof modelValue === "number" && typeof possibleValue === "string") {
|
|
147
|
+
return modelValue.toString() === possibleValue;
|
|
148
|
+
}
|
|
149
|
+
if (typeof modelValue === "string" && typeof possibleValue === "number") {
|
|
150
|
+
return modelValue === possibleValue.toString();
|
|
151
|
+
}
|
|
152
|
+
return false;
|
|
153
|
+
}
|
|
154
|
+
function checkArrayValue(paramName, modelValue, possibleValues) {
|
|
155
|
+
const modelValueStr = JSON.stringify(
|
|
156
|
+
modelValue.map((v) => standardizeString(String(v))).sort()
|
|
157
|
+
);
|
|
158
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
|
|
159
|
+
if (!Array.isArray(p)) {
|
|
160
|
+
return false;
|
|
161
|
+
}
|
|
162
|
+
return JSON.stringify(p.map((v) => standardizeString(String(v))).sort()) === modelValueStr;
|
|
163
|
+
}) : false;
|
|
164
|
+
if (!hasMatch) {
|
|
165
|
+
return {
|
|
166
|
+
valid: false,
|
|
167
|
+
error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
|
|
168
|
+
modelValue
|
|
169
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
170
|
+
error_type: "value_error:list"
|
|
171
|
+
};
|
|
172
|
+
}
|
|
173
|
+
return { valid: true };
|
|
174
|
+
}
|
|
175
|
+
function checkObjectValue(paramName, modelValue, possibleValues) {
|
|
176
|
+
const hasMatch = Array.isArray(possibleValues) ? possibleValues.some(
|
|
177
|
+
(possibleValue) => valuesMatch(modelValue, possibleValue)
|
|
178
|
+
) : false;
|
|
179
|
+
if (!hasMatch) {
|
|
180
|
+
return {
|
|
181
|
+
valid: false,
|
|
182
|
+
error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
|
|
183
|
+
modelValue
|
|
184
|
+
)}. Expected one of ${JSON.stringify(possibleValues)}.`,
|
|
185
|
+
error_type: "value_error:other"
|
|
186
|
+
};
|
|
187
|
+
}
|
|
188
|
+
return { valid: true };
|
|
189
|
+
}
|
|
190
|
+
function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
|
|
191
|
+
const funcNameCheck = checkFunctionName(
|
|
192
|
+
funcDescription.name,
|
|
193
|
+
modelToolCall.toolName
|
|
194
|
+
);
|
|
195
|
+
if (!funcNameCheck.valid) {
|
|
196
|
+
return funcNameCheck;
|
|
197
|
+
}
|
|
198
|
+
const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
|
|
199
|
+
const argsObj = modelToolCall.args && typeof modelToolCall.args === "object" ? modelToolCall.args : {};
|
|
200
|
+
const context = {
|
|
201
|
+
funcDescription,
|
|
202
|
+
modelToolCall,
|
|
203
|
+
possibleAnswerParams,
|
|
204
|
+
expectedParams: funcDescription.parameters.properties
|
|
205
|
+
};
|
|
206
|
+
const requiredCheck = checkRequiredParams(
|
|
207
|
+
funcDescription.parameters.required,
|
|
208
|
+
argsObj
|
|
209
|
+
);
|
|
210
|
+
if (!requiredCheck.valid) {
|
|
211
|
+
return requiredCheck;
|
|
212
|
+
}
|
|
213
|
+
const paramsCheck = checkAllParameters(argsObj, context);
|
|
214
|
+
if (!paramsCheck.valid) {
|
|
215
|
+
return paramsCheck;
|
|
216
|
+
}
|
|
217
|
+
const optionalCheck = checkOptionalParams(argsObj, possibleAnswerParams);
|
|
218
|
+
if (!optionalCheck.valid) {
|
|
219
|
+
return optionalCheck;
|
|
220
|
+
}
|
|
221
|
+
return { valid: true };
|
|
222
|
+
}
|
|
223
|
+
function checkFunctionName(expected, actual) {
|
|
224
|
+
if (actual !== expected) {
|
|
225
|
+
return {
|
|
226
|
+
valid: false,
|
|
227
|
+
error: `Function name '${actual}' does not match expected '${expected}'.`,
|
|
228
|
+
error_type: "simple_function_checker:wrong_func_name"
|
|
229
|
+
};
|
|
230
|
+
}
|
|
231
|
+
return { valid: true };
|
|
232
|
+
}
|
|
233
|
+
function checkRequiredParams(requiredParams, argsObj) {
|
|
234
|
+
for (const param of requiredParams) {
|
|
235
|
+
if (!(param in argsObj)) {
|
|
516
236
|
return {
|
|
517
237
|
valid: false,
|
|
518
|
-
error: `Missing
|
|
519
|
-
error_type: "simple_function_checker:
|
|
238
|
+
error: `Missing required parameter: '${param}'.`,
|
|
239
|
+
error_type: "simple_function_checker:missing_required"
|
|
520
240
|
};
|
|
521
241
|
}
|
|
522
242
|
}
|
|
523
243
|
return { valid: true };
|
|
524
244
|
}
|
|
245
|
+
function checkAllParameters(argsObj, context) {
|
|
246
|
+
for (const paramName of Object.keys(argsObj)) {
|
|
247
|
+
const paramCheck = checkSingleParameter(
|
|
248
|
+
paramName,
|
|
249
|
+
argsObj[paramName],
|
|
250
|
+
context
|
|
251
|
+
);
|
|
252
|
+
if (!paramCheck.valid) {
|
|
253
|
+
return paramCheck;
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
return { valid: true };
|
|
257
|
+
}
|
|
258
|
+
function checkSingleParameter(paramName, modelValue, context) {
|
|
259
|
+
if (!(paramName in context.expectedParams && paramName in context.possibleAnswerParams)) {
|
|
260
|
+
return {
|
|
261
|
+
valid: false,
|
|
262
|
+
error: `Unexpected parameter: '${paramName}'.`,
|
|
263
|
+
error_type: "simple_function_checker:unexpected_param"
|
|
264
|
+
};
|
|
265
|
+
}
|
|
266
|
+
const possibleValues = context.possibleAnswerParams[paramName];
|
|
267
|
+
if (typeof modelValue === "string") {
|
|
268
|
+
return checkStringValue(
|
|
269
|
+
paramName,
|
|
270
|
+
modelValue,
|
|
271
|
+
possibleValues ?? []
|
|
272
|
+
);
|
|
273
|
+
}
|
|
274
|
+
if (Array.isArray(modelValue)) {
|
|
275
|
+
return checkArrayValue(paramName, modelValue, possibleValues);
|
|
276
|
+
}
|
|
277
|
+
return checkObjectValue(paramName, modelValue, possibleValues);
|
|
278
|
+
}
|
|
279
|
+
function checkOptionalParams(argsObj, possibleAnswerParams) {
|
|
280
|
+
for (const paramName in possibleAnswerParams) {
|
|
281
|
+
if (Object.hasOwn(possibleAnswerParams, paramName)) {
|
|
282
|
+
const val = possibleAnswerParams[paramName];
|
|
283
|
+
const isOptional = Array.isArray(val) && val.includes("");
|
|
284
|
+
if (!(paramName in argsObj || isOptional)) {
|
|
285
|
+
return {
|
|
286
|
+
valid: false,
|
|
287
|
+
error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
|
|
288
|
+
error_type: "simple_function_checker:missing_optional"
|
|
289
|
+
};
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
return { valid: true };
|
|
294
|
+
}
|
|
525
295
|
function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possibleAnswers) {
|
|
526
296
|
if (modelToolCalls.length !== possibleAnswers.length) {
|
|
527
297
|
return {
|
|
@@ -544,8 +314,10 @@ function parallelFunctionCheckerNoOrder(funcDescriptions, modelToolCalls, possib
|
|
|
544
314
|
};
|
|
545
315
|
}
|
|
546
316
|
let foundMatch = false;
|
|
547
|
-
for (let i = 0; i < modelToolCalls.length; i
|
|
548
|
-
if (matchedModelCallIndices.has(i))
|
|
317
|
+
for (let i = 0; i < modelToolCalls.length; i += 1) {
|
|
318
|
+
if (matchedModelCallIndices.has(i)) {
|
|
319
|
+
continue;
|
|
320
|
+
}
|
|
549
321
|
const checkerResult = simpleFunctionChecker(
|
|
550
322
|
funcDescription,
|
|
551
323
|
modelToolCalls[i],
|
|
@@ -594,6 +366,8 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
|
|
|
594
366
|
}
|
|
595
367
|
|
|
596
368
|
// src/benchmarks/bfcl.ts
|
|
369
|
+
var LINE_SPLIT_REGEX = /\r?\n/;
|
|
370
|
+
var NUMERIC_STRING_REGEX = /^\d+$/;
|
|
597
371
|
function check(testCase, modelOutput, possibleAnswer) {
|
|
598
372
|
const category = testCase.id.split("_")[0];
|
|
599
373
|
try {
|
|
@@ -610,19 +384,22 @@ function check(testCase, modelOutput, possibleAnswer) {
|
|
|
610
384
|
modelOutput[0],
|
|
611
385
|
possibleAnswer.ground_truth[0]
|
|
612
386
|
);
|
|
613
|
-
}
|
|
387
|
+
}
|
|
388
|
+
if (category === "parallel") {
|
|
614
389
|
return parallelFunctionCheckerNoOrder(
|
|
615
390
|
testCase.function,
|
|
616
391
|
modelOutput,
|
|
617
392
|
possibleAnswer.ground_truth
|
|
618
393
|
);
|
|
619
|
-
}
|
|
394
|
+
}
|
|
395
|
+
if (category === "multiple") {
|
|
620
396
|
return multipleFunctionChecker(
|
|
621
397
|
testCase.function,
|
|
622
398
|
modelOutput,
|
|
623
399
|
possibleAnswer.ground_truth
|
|
624
400
|
);
|
|
625
|
-
}
|
|
401
|
+
}
|
|
402
|
+
if (category.includes("parallel-multiple")) {
|
|
626
403
|
return parallelFunctionCheckerNoOrder(
|
|
627
404
|
testCase.function,
|
|
628
405
|
modelOutput,
|
|
@@ -658,8 +435,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
658
435
|
path2.join(dataPath, answerDataFile),
|
|
659
436
|
"utf-8"
|
|
660
437
|
);
|
|
661
|
-
testCases = testCasesJson.split(
|
|
662
|
-
const possibleAnswers = possibleAnswersJson.split(
|
|
438
|
+
testCases = testCasesJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
439
|
+
const possibleAnswers = possibleAnswersJson.split(LINE_SPLIT_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
663
440
|
const possibleAnswersMap = new Map(
|
|
664
441
|
possibleAnswers.map((ans) => [ans.id, ans])
|
|
665
442
|
);
|
|
@@ -671,319 +448,600 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
671
448
|
`[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
|
|
672
449
|
);
|
|
673
450
|
}
|
|
451
|
+
/**
 * Renames the `type` of a schema node, in place, to its JSON Schema equivalent:
 * "dict" -> "object", "tuple" -> "array", "integer"/"float" -> "number".
 * Any other (or missing) type is left untouched.
 *
 * @param {object} copy - Schema node to mutate.
 * @returns {void}
 */
const fixSchemaType = (copy) => {
  const renamed = new Map([
    ["dict", "object"],
    ["tuple", "array"],
    ["integer", "number"],
    ["float", "number"]
  ]).get(copy.type);
  if (renamed !== undefined) {
    copy.type = renamed;
  }
};
|
|
465
|
+
/**
 * Replaces every entry of `copy.properties` with the result of `fixSchemaFn`.
 * Does nothing when `properties` is missing or is not an object.
 *
 * NOTE(review): entries are assigned into the existing `properties` object, so
 * if `copy` is only a shallow clone the caller's nested `properties` object is
 * updated as well — confirm that this is intended.
 *
 * @param {object} copy - Schema node whose properties are rewritten.
 * @param {(schema: unknown) => unknown} fixSchemaFn - Normaliser applied to each property schema.
 * @returns {void}
 */
const fixSchemaProperties = (copy, fixSchemaFn) => {
  const props = copy.properties;
  if (props && typeof props === "object") {
    Object.keys(props).forEach((key) => {
      props[key] = fixSchemaFn(props[key]);
    });
  }
};
|
|
674
475
|
/**
 * Produces a normalised copy of a tool parameter schema.
 *
 * - A missing or non-object schema becomes an empty object schema.
 * - An array is mapped element by element through `fixSchema`.
 * - An object is shallow-copied, its `type` renamed by `fixSchemaType`, and its
 *   `properties` and `items` normalised recursively.
 *
 * @param {unknown} schema - Schema node to normalise.
 * @returns {object | Array} The normalised schema.
 */
const fixSchema = (schema) => {
  if (!schema || typeof schema !== "object") {
    return { type: "object", properties: {} };
  }
  if (Array.isArray(schema)) {
    return schema.map((entry) => fixSchema(entry));
  }
  const copy = { ...schema };
  fixSchemaType(copy);
  fixSchemaProperties(copy, fixSchema);
  if (copy.items) {
    copy.items = fixSchema(copy.items);
  }
  return copy;
};
|
|
490
|
+
/**
 * Flattens one level of nesting when `messages` is an array containing at least
 * one array; every other value is returned unchanged (same reference).
 *
 * @param {unknown} messages - Conversation messages, possibly grouped into nested arrays.
 * @returns {unknown} The flattened array, or the input itself.
 */
const flattenMessages = (messages) => {
  if (!Array.isArray(messages)) {
    return messages;
  }
  const hasNestedGroup = messages.some((m) => Array.isArray(m));
  return hasNestedGroup ? messages.flat(1) : messages;
};
|
|
491
|
+
/**
 * Turns a tool name into one that uses only ASCII letters, digits, "_" and "-"
 * and is at most 64 characters long. Every other character becomes "_".
 * An empty result falls back to "tool".
 *
 * @param {string} toolName - Original tool name.
 * @returns {string} The sanitised name.
 */
const sanitizeName = (toolName) => {
  const replaced = toolName.replace(/[^a-zA-Z0-9_-]/g, "_");
  const truncated = replaced.slice(0, 64);
  if (truncated.length > 0) {
    return truncated;
  }
  return "tool";
};
|
|
495
|
+
/**
 * Converts benchmark tool descriptions into function-tool definitions and
 * records how each sanitised name maps back to the original name.
 *
 * A tool whose normalised parameters are not an object schema gets an empty
 * object schema instead. When two tools sanitise to the same name, the later
 * one overwrites the earlier entry in `nameMap`.
 *
 * @param {Array<{name: string, description?: unknown, parameters?: unknown}>} tools - Tool descriptions.
 * @param {(schema: unknown) => any} fixSchemaFn - Schema normaliser applied to `parameters`.
 * @returns {{transformedTools: object[], nameMap: Map<string, string>}}
 */
const buildTransformedTools = (tools, fixSchemaFn) => {
  const nameMap = new Map();
  const emptyObjectSchema = () => ({ type: "object", properties: {} });
  const toFunctionTool = (t) => {
    const fixed = fixSchemaFn(t.parameters);
    const usable = fixed && typeof fixed === "object" && fixed.type === "object";
    const name = sanitizeName(t.name);
    nameMap.set(name, t.name);
    return {
      type: "function",
      name,
      description: t.description,
      inputSchema: usable ? fixed : emptyObjectSchema()
    };
  };
  const transformedTools = tools.map((t) => toFunctionTool(t));
  return { transformedTools, nameMap };
};
|
|
512
|
+
/**
 * Parses the JSON text of the tool calls recorded in the debug summary.
 *
 * @param {string | undefined | null} raw - JSON text, or a falsy value when absent.
 * @returns {Array} The parsed array; an empty array when `raw` is falsy, is not
 *   valid JSON, or does not decode to an array.
 */
const parseDebugToolCalls = (raw) => {
  if (!raw) {
    return [];
  }
  let decoded;
  try {
    decoded = JSON.parse(raw);
  } catch {
    return [];
  }
  if (Array.isArray(decoded)) {
    return decoded;
  }
  return [];
};
|
|
523
|
+
/**
 * Resolves a tool name given as an index. When `rawName` is a string of digits
 * only, it is read as an index into `transformedTools` and that tool's name is
 * returned; if no such tool (or name) exists, or `rawName` is anything else,
 * `rawName` is returned unchanged.
 *
 * @param {unknown} rawName - Name reported on the tool call.
 * @param {Array<{name?: string}>} transformedTools - Tools offered to the model.
 * @returns {unknown} The resolved name.
 */
const getSanitizedName = (rawName, transformedTools) => {
  const looksLikeIndex = typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName);
  if (!looksLikeIndex) {
    return rawName;
  }
  const indexedTool = transformedTools[Number(rawName)];
  return indexedTool?.name ?? rawName;
};
|
|
529
|
+
/**
 * Decodes tool-call arguments delivered as a JSON string.
 *
 * @param {unknown} extractedArgs - Arguments as found on the tool call.
 * @returns {unknown} The parsed value for a valid JSON string; otherwise the
 *   input unchanged (non-strings and unparsable strings alike).
 */
const parseToolArgs = (extractedArgs) => {
  if (typeof extractedArgs === "string") {
    try {
      return JSON.parse(extractedArgs);
    } catch {
      // Not JSON: hand the raw string back to the caller.
      return extractedArgs;
    }
  }
  return extractedArgs;
};
|
|
539
|
+
/**
 * Maps model tool calls back onto the benchmark's original tool names and
 * normalises their arguments.
 *
 * The name is read from `toolName` or `name`, resolved through
 * `getSanitizedName`, then looked up in `nameMap` (falling back to the resolved
 * name). Arguments come from the first non-nullish of `args`, `arguments`,
 * `input`, `params`, `parameters`, are passed through `parseToolArgs`, and
 * default to `{}`.
 *
 * @param {Array<object> | null | undefined} toolCalls - Tool calls returned by the model.
 * @param {Map<string, string>} nameMap - Sanitised name -> original name.
 * @param {Array<{name?: string}>} transformedTools - Tools offered to the model.
 * @returns {Array<object>} Copies of the calls with `toolName`, `name` and `args` overwritten.
 */
const restoreToolCalls = (toolCalls, nameMap, transformedTools) => {
  const calls = toolCalls || [];
  return calls.map((call) => {
    const reportedName = getSanitizedName(call.toolName ?? call.name, transformedTools);
    const originalName = nameMap.get(reportedName) ?? reportedName;
    const rawArgs = call.args ?? call.arguments ?? call.input ?? call.params ?? call.parameters;
    const args = parseToolArgs(rawArgs) ?? {};
    return { ...call, toolName: originalName, name: originalName, args };
  });
};
|
|
556
|
+
/**
 * Returns a copy of `args` as a plain object whose keys are in sorted order.
 * Nullish and non-object values are returned unchanged.
 *
 * @param {unknown} args - Tool-call arguments.
 * @returns {unknown} Key-sorted plain-object copy, or the input itself.
 */
const summarizeArgs = (args) => {
  if (args == null || typeof args !== "object") {
    return args;
  }
  const ordered = {};
  for (const key of Object.keys(args).sort()) {
    ordered[key] = args[key];
  }
  return ordered;
};
|
|
571
|
+
/**
 * Builds the three diff lines describing a parameter whose value did not match:
 * a header, the expected value(s), and the value received.
 *
 * A single allowed value is printed as JSON after "expected:". Several values
 * are printed after "expected one of:", joined by ", ", with non-null objects
 * as JSON and everything else via `String`.
 *
 * @param {string} paramName - Name of the mismatching parameter.
 * @param {unknown} allowed - Allowed value or array of allowed values.
 * @param {unknown} got - Value supplied by the model.
 * @returns {string[]} The diff lines.
 */
const generateParamMismatchDiff = (paramName, allowed, got) => {
  const header = `@@ param ${paramName}`;
  const candidates = Array.isArray(allowed) ? allowed : [allowed];
  let expectedLine;
  if (candidates.length === 1) {
    expectedLine = `- expected: ${JSON.stringify(candidates[0])}`;
  } else {
    const render = (v) => {
      if (typeof v === "object" && v !== null) {
        return JSON.stringify(v);
      }
      return String(v);
    };
    expectedLine = `- expected one of: ${candidates.map((v) => render(v)).join(", ")}`;
  }
  return [header, expectedLine, `+ got: ${JSON.stringify(got)}`];
};
|
|
588
|
+
/**
 * Reports whether `got` matches any entry of `allowed`.
 *
 * When `got` is an array, an entry matches if both sides hold the same
 * stringified elements regardless of order. If that comparison throws (for
 * example because the entry has no `map`), or `got` is not an array, the two
 * values are compared as strings, ignoring case and all whitespace.
 *
 * @param {unknown} allowed - Allowed values; anything but an array never matches.
 * @param {unknown} got - Value supplied by the model.
 * @returns {boolean} True when at least one allowed value matches.
 */
const paramValueMatches = (allowed, got) => {
  if (!Array.isArray(allowed)) {
    return false;
  }
  const asSortedList = (list) => JSON.stringify(list.map((x) => String(x)).sort());
  const asLooseText = (value) => String(value).toLowerCase().replace(/\s+/g, "");
  return allowed.some((candidate) => {
    if (Array.isArray(got)) {
      try {
        return asSortedList(got) === asSortedList(candidate);
      } catch {
        // Candidate could not be compared as a list; use the text comparison below.
      }
    }
    return asLooseText(candidate) === asLooseText(got);
  });
};
|
|
602
|
+
/**
 * Appends a "function name" section to `diff` when the two names differ.
 *
 * @param {unknown} expectedName - Function name expected by the ground truth.
 * @param {unknown} receivedName - Function name the model called.
 * @param {string[]} diff - Diff lines, appended to in place.
 * @returns {void}
 */
const checkFunctionNameMismatch = (expectedName, receivedName, diff) => {
  if (expectedName === receivedName) {
    return;
  }
  diff.push("@@ function name", `- ${expectedName}`, `+ ${receivedName}`);
};
|
|
609
|
+
/**
 * Appends one diff line for every required parameter the model did not supply.
 *
 * @param {Iterable<string>} required - Names of the required parameters.
 * @param {object} receivedArgs - Arguments supplied by the model.
 * @param {string[]} diff - Diff lines, appended to in place.
 * @returns {void}
 */
const checkMissingParams = (required, receivedArgs, diff) => {
  for (const req of required) {
    // Own-property check: `in` would also accept names inherited from
    // Object.prototype ("toString", "constructor", ...) and hide a missing one.
    if (!Object.hasOwn(receivedArgs, req)) {
      diff.push(`- missing required param: ${req}`);
    }
  }
};
|
|
616
|
+
/**
 * Appends one diff line for every argument the model supplied that the ground
 * truth does not list.
 *
 * @param {object} expectedParams - Ground-truth parameters keyed by name.
 * @param {object} receivedArgs - Arguments supplied by the model.
 * @param {string[]} diff - Diff lines, appended to in place.
 * @returns {void}
 */
const checkUnexpectedParams = (expectedParams, receivedArgs, diff) => {
  for (const k of Object.keys(receivedArgs)) {
    // Own-property check: with `in`, an argument named like an inherited member
    // ("constructor", "toString", ...) would wrongly count as expected.
    if (!Object.hasOwn(expectedParams, k)) {
      diff.push(`+ unexpected param: ${k}`);
    }
  }
};
|
|
623
|
+
/**
 * For every supplied argument that the ground truth lists, appends the
 * mismatch diff lines when its value matches none of the allowed values.
 *
 * @param {object} expectedParams - Ground-truth parameters mapped to allowed values.
 * @param {object} receivedArgs - Arguments supplied by the model.
 * @param {string[]} diff - Diff lines, appended to in place.
 * @returns {void}
 */
const checkParamValueMismatches = (expectedParams, receivedArgs, diff) => {
  for (const k of Object.keys(receivedArgs)) {
    // Own-property check: with `in`, an argument named like an inherited member
    // would read an Object.prototype function as its "allowed" values and
    // produce a bogus mismatch entry.
    if (!Object.hasOwn(expectedParams, k)) {
      continue;
    }
    const allowed = expectedParams[k];
    const got = receivedArgs[k];
    if (!paramValueMatches(allowed, got)) {
      diff.push(...generateParamMismatchDiff(k, allowed, got));
    }
  }
};
|
|
634
|
+
/**
 * Builds the expected/actual summary and diff lines for a single-call case.
 *
 * The expected function is the first tool and the expected parameters are the
 * value under the first key of the first ground-truth entry. The actual side
 * is the first restored call. Parameter checks only run when both the expected
 * parameters and an argument object are available.
 *
 * @param {Array<object>} tools - Tool descriptions of the test case.
 * @param {{ground_truth?: Array<object>}} possibleAnswer - Ground truth for the case.
 * @param {Array<object>} restoredCalls - Tool calls with original names restored.
 * @returns {{expected: object, actual: object, diff: string[]}}
 */
const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
  const funcDesc = tools[0];
  const expectedFuncName = funcDesc?.name;
  const gt = possibleAnswer.ground_truth?.[0];
  const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;

  const received = restoredCalls[0];
  const receivedName = received?.toolName ?? received?.name;
  const receivedArgs = summarizeArgs(received?.args);

  const diff = [];
  checkFunctionNameMismatch(expectedFuncName, receivedName, diff);

  const canCompareParams = Boolean(expectedParams) && Boolean(receivedArgs) && typeof receivedArgs === "object";
  if (canCompareParams) {
    const required = funcDesc?.parameters?.required ?? [];
    checkMissingParams(required, receivedArgs, diff);
    checkUnexpectedParams(expectedParams, receivedArgs, diff);
    checkParamValueMismatches(expectedParams, receivedArgs, diff);
  }

  return {
    expected: { function: expectedFuncName, params: expectedParams },
    actual: { function: receivedName, args: receivedArgs },
    diff
  };
};
|
|
672
|
+
/**
 * Appends a "call count" section to `diff` when the two counts differ.
 *
 * @param {number} expectedCount - Number of calls in the ground truth.
 * @param {number} actualCount - Number of calls the model made.
 * @param {string[]} diff - Diff lines, appended to in place.
 * @returns {void}
 */
const checkCallCountMismatch = (expectedCount, actualCount, diff) => {
  if (expectedCount === actualCount) {
    return;
  }
  diff.push("@@ call count", `- expected ${expectedCount}`, `+ got ${actualCount}`);
};
|
|
679
|
+
/**
 * Appends a line for each expected function name that was never called,
 * followed by a line for each called name that was not expected. Names are
 * compared by membership only, so repeats on one side are not counted.
 *
 * @param {Array<string>} expectedNames - Function names in the ground truth.
 * @param {Array<string>} actualNames - Function names the model called.
 * @param {string[]} diff - Diff lines, appended to in place.
 * @returns {void}
 */
const addMissingAndExtraFunctions = (expectedNames, actualNames, diff) => {
  const expectedSet = new Set(expectedNames);
  const actualSet = new Set(actualNames);
  expectedNames.forEach((name) => {
    if (!actualSet.has(name)) {
      diff.push(`- missing function: ${name}`);
    }
  });
  actualNames.forEach((name) => {
    if (!expectedSet.has(name)) {
      diff.push(`+ unexpected function: ${name}`);
    }
  });
};
|
|
689
|
+
/**
 * Finds the first call, not already claimed, whose name equals `fname`.
 * A call's name is its `toolName`, or `name` when `toolName` is nullish.
 *
 * @param {string} fname - Function name to look for.
 * @param {Array<object>} restoredCalls - Tool calls with original names restored.
 * @param {Set<number>} usedActual - Indices of calls already matched.
 * @returns {number} Index of the matching call, or -1 when there is none.
 */
const findMatchingCallIndex = (fname, restoredCalls, usedActual) => restoredCalls.findIndex(
  (call, index) => !usedActual.has(index) && (call?.toolName ?? call?.name) === fname
);
|
|
702
|
+
/**
 * Runs the three parameter checks for one matched call, in order: missing
 * required parameters, unexpected parameters, then value mismatches.
 *
 * @param {{receivedArgs: object, expectedParamsAllowed: object, requiredParams: string[], diff: string[]}} options
 *   Arguments supplied by the model, allowed values from the ground truth,
 *   required parameter names, and the diff lines to append to.
 * @returns {void}
 */
const validateFunctionParams = (options) => {
  const received = options.receivedArgs;
  const allowedByName = options.expectedParamsAllowed;
  const sink = options.diff;
  checkMissingParams(options.requiredParams, received, sink);
  checkUnexpectedParams(allowedByName, received, sink);
  checkParamValueMismatches(allowedByName, received, sink);
};
|
|
708
|
+
/**
 * Pairs one ground-truth call with the first unclaimed model call of the same
 * name and appends that function's parameter diff.
 *
 * Nothing is appended when no call matches. Otherwise the call is marked as
 * used, a "@@ function <name>" header is added, and parameters are validated
 * when both allowed values and an argument object are available.
 *
 * @param {{expectedObj: object, restoredCalls: Array<object>, tools: Array<object>, usedActual: Set<number>, diff: string[]}} options
 * @returns {void}
 */
const processExpectedCall = (options) => {
  const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
  const [fname] = Object.keys(expectedObj);
  const matchedIndex = findMatchingCallIndex(fname, restoredCalls, usedActual);
  if (matchedIndex === -1) {
    return;
  }
  usedActual.add(matchedIndex);

  const receivedArgs = summarizeArgs(restoredCalls[matchedIndex]?.args);
  const expectedParamsAllowed = expectedObj[fname];
  const requiredParams = tools.find((t) => t.name === fname)?.parameters?.required ?? [];

  diff.push(`@@ function ${fname}`);
  const canValidate = Boolean(expectedParamsAllowed) && Boolean(receivedArgs) && typeof receivedArgs === "object";
  if (canValidate) {
    validateFunctionParams({ receivedArgs, expectedParamsAllowed, requiredParams, diff });
  }
};
|
|
735
|
+
/**
 * Builds the expected/actual summary and diff lines for a multi-call case.
 *
 * Compares the number of calls, lists missing and unexpected function names,
 * then diffs the parameters of each ground-truth call against the first
 * still-unmatched model call with the same name.
 *
 * @param {Array<object>} tools - Tool descriptions of the test case.
 * @param {{ground_truth?: Array<object>}} possibleAnswer - Ground truth for the case.
 * @param {Array<object>} restoredCalls - Tool calls with original names restored.
 * @returns {{expected: {functions: string[]}, actual: {functions: string[]}, diff: string[]}}
 */
const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
  const gtArr = possibleAnswer.ground_truth ?? [];
  const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
  const actualNames = restoredCalls.map((c) => c.toolName ?? c.name);

  const diff = [];
  checkCallCountMismatch(expectedNames.length, actualNames.length, diff);
  addMissingAndExtraFunctions(expectedNames, actualNames, diff);

  // Tracks which model calls are already paired so repeated names pair one-to-one.
  const usedActual = new Set();
  for (const expectedObj of gtArr) {
    processExpectedCall({ expectedObj, restoredCalls, tools, usedActual, diff });
  }

  return {
    expected: { functions: expectedNames },
    actual: { functions: actualNames },
    diff
  };
};
|
|
696
764
|
// Number of test cases processed at once: BFCL_CONCURRENCY when it is set and
// parses to a finite number, otherwise 4. The value is floored and clamped to
// at least 1 so a fractional setting such as "2.5" cannot yield a non-integer
// worker count.
const concurrencyEnv = process.env.BFCL_CONCURRENCY;
const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Math.floor(Number(concurrencyEnv))) : 4;
logs.push(
  `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
);
|
|
701
|
-
const
|
|
702
|
-
const caseLogs = [];
|
|
703
|
-
const { function: tools, question: messages } = testCase;
|
|
704
|
-
const temp = config?.temperature;
|
|
705
|
-
const temperature = typeof temp === "number" ? temp : void 0;
|
|
706
|
-
const maxTok = config?.maxTokens;
|
|
707
|
-
const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
|
|
769
|
+
/**
 * Appends a debug line describing the first transformed tool and its schema
 * type (`inputSchema.type`, else `inputSchema.jsonSchema.type`). If building
 * that line throws, a "failed to introspect tools" line is appended instead.
 *
 * @param {Array<object>} transformedTools - Tools offered to the model.
 * @param {string} testCaseId - Id of the test case, used as log prefix.
 * @param {string[]} caseLogs - Log lines of the case, appended to in place.
 * @returns {void}
 */
const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
  let line;
  try {
    const firstTool = transformedTools[0];
    const schema = firstTool?.inputSchema;
    const schemaType = schema?.type ?? schema?.jsonSchema?.type;
    line = `[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`;
  } catch (e) {
    line = `[DEBUG] ${testCaseId}: failed to introspect tools: ${e.message}`;
  }
  caseLogs.push(line);
};
|
|
782
|
+
/**
 * Appends a debug line with the raw tool calls, finish reason and text of a
 * model response. If the line cannot be built (for example the tool calls are
 * not JSON-serialisable), a "failed to serialize toolCalls" line is appended.
 *
 * @param {{toolCalls: unknown, finishReason: unknown, text: unknown, testCaseId: string, caseLogs: string[]}} options
 * @returns {void}
 */
const logRawToolCalls = (options) => {
  const { toolCalls, finishReason, text, testCaseId, caseLogs } = options;
  let line;
  try {
    line = `[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`;
  } catch {
    line = `[DEBUG] ${testCaseId}: failed to serialize toolCalls`;
  }
  caseLogs.push(line);
};
|
|
794
|
+
/**
 * Assembles the context record logged alongside a failed case.
 *
 * - `last_user_query` is the content of the last message with role "user".
 * - `raw_model_text` is the middleware's original text when non-empty, else
 *   `text` when it is a string, else "".
 * - `parsed_tool_calls` is the middleware's parsed calls when there are any,
 *   else the restored calls.
 *
 * @param {object} options - testCase, tools, flatMessages, mwOriginalText, text,
 *   finishReason, mwParsedToolCalls, restoredCalls and possibleAnswer.
 * @returns {object} The context payload.
 */
const buildFailureContext = (options) => {
  const {
    testCase,
    tools,
    flatMessages,
    mwOriginalText,
    text,
    finishReason,
    mwParsedToolCalls,
    restoredCalls,
    possibleAnswer
  } = options;

  const lastUserMessage = [...flatMessages].reverse().find((m) => m.role === "user");

  let rawModelText = "";
  if (mwOriginalText && mwOriginalText.length > 0) {
    rawModelText = mwOriginalText;
  } else if (typeof text === "string") {
    rawModelText = text;
  }

  return {
    id: testCase.id,
    tool_schema: tools,
    last_user_query: lastUserMessage?.content ?? void 0,
    raw_model_text: rawModelText,
    finish_reason: finishReason,
    parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
    ground_truth: possibleAnswer.ground_truth
  };
};
|
|
832
|
+
/**
 * Appends the diagnostic log lines for a failed case.
 *
 * First a "[DEBUG-FAIL]" line with the checker message and an expected/actual
 * diff (the simple diff when the id's prefix before the first "_" is "simple",
 * the parallel diff otherwise), then a "[DEBUG-FAIL-CONTEXT]" line. A failure
 * while building the context is ignored; a failure while building the diff is
 * reported as a single "[DEBUG]" line.
 *
 * @param {object} options - testCase, tools, possibleAnswer, restoredCalls,
 *   checkerResult, flatMessages, mwOriginalText, text, finishReason,
 *   mwParsedToolCalls and caseLogs (appended to in place).
 * @returns {void}
 */
const logFailureDetails = (options) => {
  const {
    testCase,
    tools,
    possibleAnswer,
    restoredCalls,
    checkerResult,
    flatMessages,
    mwOriginalText,
    text,
    finishReason,
    mwParsedToolCalls,
    caseLogs
  } = options;
  try {
    const [category] = testCase.id.split("_");
    const buildDiff = category === "simple" ? buildSimpleDiff : buildParallelDiff;
    const { expected, actual, diff } = buildDiff(tools, possibleAnswer, restoredCalls);
    const failure = {
      id: testCase.id,
      message: checkerResult.error,
      error_type: checkerResult.error_type,
      expected,
      actual,
      diff
    };
    caseLogs.push(`[DEBUG-FAIL] ${JSON.stringify(failure)}`);
    try {
      const contextPayload = buildFailureContext({
        testCase,
        tools,
        flatMessages,
        mwOriginalText,
        text,
        finishReason,
        mwParsedToolCalls,
        restoredCalls,
        possibleAnswer
      });
      caseLogs.push(`[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`);
    } catch {
      // The context line is best-effort; the diff line above is already recorded.
    }
  } catch {
    caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
  }
};
|
|
888
|
+
/**
 * Builds the name -> tool object passed to the model call. Each transformed
 * tool is wrapped with `tool(...)`, using its description only when it is a
 * string and its input schema wrapped by `jsonSchema(...)`.
 *
 * @param {Array<{name: string, description?: unknown, inputSchema: object}>} transformedTools
 * @returns {Record<string, unknown>} Tools keyed by (sanitised) name.
 */
const buildToolsMap = (transformedTools) => {
  const entries = transformedTools.map((t) => {
    const description = typeof t.description === "string" ? t.description : void 0;
    const wrapped = tool({
      description,
      inputSchema: jsonSchema(t.inputSchema)
    });
    return [t.name, wrapped];
  });
  return Object.fromEntries(entries);
};
|
|
899
|
+
/**
 * Calls `generateText` for one test case and returns its tool calls, text and
 * finish reason, together with the `debugSummaryRef` object that was handed to
 * the call under `providerOptions.toolCallMiddleware.debugSummary`.
 *
 * `temperature` and `maxTokens` (sent as `maxOutputTokens`) are only included
 * in the request when they are not undefined.
 *
 * @param {{model: unknown, flatMessages: unknown, toolsMap: object, temperature?: number, maxTokens?: number}} options
 * @returns {Promise<{toolCalls: unknown, text: unknown, finishReason: unknown, debugSummaryRef: object}>}
 */
const executeModelGeneration = async (options) => {
  const {
    model: modelInstance,
    flatMessages,
    toolsMap,
    temperature,
    maxTokens
  } = options;
  const debugSummaryRef = {};
  const request = {
    model: modelInstance,
    messages: flatMessages,
    tools: toolsMap,
    toolChoice: "auto",
    providerOptions: {
      toolCallMiddleware: {
        debugSummary: debugSummaryRef
      }
    }
  };
  if (temperature !== void 0) {
    request.temperature = temperature;
  }
  if (maxTokens !== void 0) {
    request.maxOutputTokens = maxTokens;
  }
  const { toolCalls, text, finishReason } = await generateText(request);
  return { toolCalls, text, finishReason, debugSummaryRef };
};
|
|
924
|
+
/**
 * Turns a checker result into the per-case outcome and log lines.
 *
 * A valid result logs "[PASS] <id>". An invalid one logs "[FAIL] <id>: <error>"
 * and then the failure details.
 *
 * @param {object} options - checkerResult, testCase, tools, possibleAnswer,
 *   restoredCalls, flatMessages, mwOriginalText, text, finishReason,
 *   mwParsedToolCalls and caseLogs (appended to in place).
 * @returns {{valid: boolean, logs: string[]}} Outcome plus the same `caseLogs` array.
 */
const processValidationResult = (options) => {
  const { checkerResult, testCase, caseLogs, ...failureDetails } = options;
  const passed = Boolean(checkerResult.valid);
  if (passed) {
    caseLogs.push(`[PASS] ${testCase.id}`);
  } else {
    caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
    logFailureDetails({
      testCase,
      tools: failureDetails.tools,
      possibleAnswer: failureDetails.possibleAnswer,
      restoredCalls: failureDetails.restoredCalls,
      checkerResult,
      flatMessages: failureDetails.flatMessages,
      mwOriginalText: failureDetails.mwOriginalText,
      text: failureDetails.text,
      finishReason: failureDetails.finishReason,
      mwParsedToolCalls: failureDetails.mwParsedToolCalls,
      caseLogs
    });
  }
  return { valid: passed, logs: caseLogs };
};
|
|
958
|
+
/**
 * Derives everything needed to run one test case: the flattened messages, the
 * transformed tools with their sanitised -> original name map, and the tools
 * map handed to the model call.
 *
 * @param {{function: Array<object>, question: unknown}} testCase - Benchmark test case.
 * @returns {{flatMessages: unknown, transformedTools: object[], nameMap: Map<string, string>, toolsMap: object}}
 */
const prepareTestCaseData = (testCase) => {
  const flatMessages = flattenMessages(testCase.question);
  const built = buildTransformedTools(testCase.function, fixSchema);
  return {
    flatMessages,
    transformedTools: built.transformedTools,
    nameMap: built.nameMap,
    toolsMap: buildToolsMap(built.transformedTools)
  };
};
|
|
968
|
+
/**
 * Evaluates one model response: logs the raw tool calls, restores the original
 * tool names, runs the checker against the stored ground truth and converts
 * the result into the per-case outcome.
 *
 * @param {object} options - testCase, toolCalls, text, finishReason,
 *   debugSummaryRef, nameMap, transformedTools, flatMessages, tools and caseLogs.
 * @returns {{valid: boolean, logs: string[]}} Outcome of the case.
 * @throws {Error} When no ground-truth answer exists for the test case id.
 */
const processModelResponse = (options) => {
  const {
    testCase,
    toolCalls,
    text,
    finishReason,
    debugSummaryRef,
    nameMap,
    transformedTools,
    flatMessages,
    tools,
    caseLogs
  } = options;
  const { originalText: mwOriginalText, toolCalls: mwRawToolCalls } = debugSummaryRef;
  const mwParsedToolCalls = parseDebugToolCalls(mwRawToolCalls);
  logRawToolCalls({ toolCalls, finishReason, text, testCaseId: testCase.id, caseLogs });

  const possibleAnswer = possibleAnswersMap.get(testCase.id);
  if (!possibleAnswer) {
    throw new Error(`No possible answer for id: ${testCase.id}`);
  }

  const restoredCalls = restoreToolCalls(toolCalls || [], nameMap, transformedTools);
  const checkerResult = check(testCase, restoredCalls, possibleAnswer);
  return processValidationResult({
    checkerResult,
    testCase,
    tools,
    possibleAnswer,
    restoredCalls,
    flatMessages,
    mwOriginalText,
    text,
    finishReason,
    mwParsedToolCalls,
    caseLogs
  });
};
|
|
1016
|
+
const runSingleCase = async (testCase) => {
|
|
1017
|
+
const caseLogs = [];
|
|
1018
|
+
const { function: tools } = testCase;
|
|
1019
|
+
const temp = config?.temperature;
|
|
1020
|
+
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1021
|
+
const maxTok = config?.maxTokens;
|
|
1022
|
+
const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
|
|
1023
|
+
try {
|
|
1024
|
+
const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
|
|
1025
|
+
logFirstToolDebug(transformedTools, testCase.id, caseLogs);
|
|
1026
|
+
const { toolCalls, text, finishReason, debugSummaryRef } = await executeModelGeneration({
|
|
1027
|
+
model,
|
|
1028
|
+
flatMessages,
|
|
1029
|
+
toolsMap,
|
|
1030
|
+
temperature,
|
|
1031
|
+
maxTokens
|
|
787
1032
|
});
|
|
788
|
-
|
|
1033
|
+
return processModelResponse({
|
|
789
1034
|
testCase,
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
const diff = [];
|
|
801
|
-
const summarizeArgs = (args) => {
|
|
802
|
-
if (args == null) return args;
|
|
803
|
-
if (typeof args !== "object") return args;
|
|
804
|
-
return Object.keys(args).sort().reduce(
|
|
805
|
-
(acc, k) => {
|
|
806
|
-
acc[k] = args[k];
|
|
807
|
-
return acc;
|
|
808
|
-
},
|
|
809
|
-
{}
|
|
810
|
-
);
|
|
811
|
-
};
|
|
812
|
-
const expected = {};
|
|
813
|
-
const actual = {};
|
|
814
|
-
if (category === "simple") {
|
|
815
|
-
const funcDesc = tools[0];
|
|
816
|
-
const gt = possibleAnswer.ground_truth?.[0];
|
|
817
|
-
const expectedFuncName = funcDesc?.name;
|
|
818
|
-
const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
|
|
819
|
-
const received = restoredCalls[0];
|
|
820
|
-
const receivedName = received?.toolName ?? received?.name;
|
|
821
|
-
const receivedArgs = summarizeArgs(received?.args);
|
|
822
|
-
expected.function = expectedFuncName;
|
|
823
|
-
expected.params = expectedParams;
|
|
824
|
-
actual.function = receivedName;
|
|
825
|
-
actual.args = receivedArgs;
|
|
826
|
-
if (expectedFuncName !== receivedName) {
|
|
827
|
-
diff.push(`@@ function name`);
|
|
828
|
-
diff.push(`- ${expectedFuncName}`);
|
|
829
|
-
diff.push(`+ ${receivedName}`);
|
|
830
|
-
}
|
|
831
|
-
if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
832
|
-
const required = funcDesc?.parameters?.required ?? [];
|
|
833
|
-
for (const req of required) {
|
|
834
|
-
if (!(req in receivedArgs)) {
|
|
835
|
-
diff.push(`- missing required param: ${req}`);
|
|
836
|
-
}
|
|
837
|
-
}
|
|
838
|
-
for (const k of Object.keys(
|
|
839
|
-
receivedArgs
|
|
840
|
-
)) {
|
|
841
|
-
if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
842
|
-
diff.push(`+ unexpected param: ${k}`);
|
|
843
|
-
}
|
|
844
|
-
}
|
|
845
|
-
for (const k of Object.keys(
|
|
846
|
-
receivedArgs
|
|
847
|
-
)) {
|
|
848
|
-
if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
|
|
849
|
-
const allowed = expectedParams[k];
|
|
850
|
-
const got = receivedArgs[k];
|
|
851
|
-
const includes = Array.isArray(allowed) && allowed.some((v) => {
|
|
852
|
-
try {
|
|
853
|
-
if (Array.isArray(got)) {
|
|
854
|
-
return JSON.stringify(
|
|
855
|
-
got.map((x) => String(x)).sort()
|
|
856
|
-
) === JSON.stringify(
|
|
857
|
-
v.map((x) => String(x)).sort()
|
|
858
|
-
);
|
|
859
|
-
}
|
|
860
|
-
} catch {
|
|
861
|
-
}
|
|
862
|
-
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
863
|
-
});
|
|
864
|
-
if (!includes) {
|
|
865
|
-
diff.push(`@@ param ${k}`);
|
|
866
|
-
diff.push(
|
|
867
|
-
`- expected one of: ${JSON.stringify(allowed)}`
|
|
868
|
-
);
|
|
869
|
-
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
870
|
-
}
|
|
871
|
-
}
|
|
872
|
-
}
|
|
873
|
-
}
|
|
874
|
-
} else {
|
|
875
|
-
const gtArr = possibleAnswer.ground_truth ?? [];
|
|
876
|
-
const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
|
|
877
|
-
const actualNames = restoredCalls.map(
|
|
878
|
-
(c) => c.toolName ?? c.name
|
|
879
|
-
);
|
|
880
|
-
expected.functions = expectedNames;
|
|
881
|
-
actual.functions = actualNames;
|
|
882
|
-
if (expectedNames.length !== actualNames.length) {
|
|
883
|
-
diff.push(`@@ call count`);
|
|
884
|
-
diff.push(`- expected ${expectedNames.length}`);
|
|
885
|
-
diff.push(`+ got ${actualNames.length}`);
|
|
886
|
-
}
|
|
887
|
-
const missing = expectedNames.filter(
|
|
888
|
-
(n) => !actualNames.includes(n)
|
|
889
|
-
);
|
|
890
|
-
const extra = actualNames.filter(
|
|
891
|
-
(n) => !expectedNames.includes(n)
|
|
892
|
-
);
|
|
893
|
-
for (const m of missing)
|
|
894
|
-
diff.push(`- missing function: ${m}`);
|
|
895
|
-
for (const e of extra)
|
|
896
|
-
diff.push(`+ unexpected function: ${e}`);
|
|
897
|
-
const usedActual = /* @__PURE__ */ new Set();
|
|
898
|
-
for (const expectedObj of gtArr) {
|
|
899
|
-
const fname = Object.keys(expectedObj)[0];
|
|
900
|
-
let matchedIndex = -1;
|
|
901
|
-
for (let i = 0; i < restoredCalls.length; i++) {
|
|
902
|
-
if (usedActual.has(i)) continue;
|
|
903
|
-
const rc = restoredCalls[i];
|
|
904
|
-
const rcName = rc?.toolName ?? rc?.name;
|
|
905
|
-
if (rcName === fname) {
|
|
906
|
-
matchedIndex = i;
|
|
907
|
-
break;
|
|
908
|
-
}
|
|
909
|
-
}
|
|
910
|
-
if (matchedIndex === -1) continue;
|
|
911
|
-
usedActual.add(matchedIndex);
|
|
912
|
-
const received = restoredCalls[matchedIndex];
|
|
913
|
-
const receivedArgs = summarizeArgs(received?.args);
|
|
914
|
-
const expectedParamsAllowed = expectedObj[fname];
|
|
915
|
-
const funcDesc = tools.find(
|
|
916
|
-
(t) => t.name === fname
|
|
917
|
-
);
|
|
918
|
-
const requiredParams = funcDesc?.parameters?.required ?? [];
|
|
919
|
-
diff.push(`@@ function ${fname}`);
|
|
920
|
-
if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
|
|
921
|
-
for (const req of requiredParams) {
|
|
922
|
-
if (!(req in receivedArgs)) {
|
|
923
|
-
diff.push(`- missing required param: ${req}`);
|
|
924
|
-
}
|
|
925
|
-
}
|
|
926
|
-
for (const k of Object.keys(
|
|
927
|
-
receivedArgs
|
|
928
|
-
)) {
|
|
929
|
-
if (!Object.prototype.hasOwnProperty.call(
|
|
930
|
-
expectedParamsAllowed,
|
|
931
|
-
k
|
|
932
|
-
)) {
|
|
933
|
-
diff.push(`+ unexpected param: ${k}`);
|
|
934
|
-
}
|
|
935
|
-
}
|
|
936
|
-
for (const k of Object.keys(
|
|
937
|
-
receivedArgs
|
|
938
|
-
)) {
|
|
939
|
-
if (Object.prototype.hasOwnProperty.call(
|
|
940
|
-
expectedParamsAllowed,
|
|
941
|
-
k
|
|
942
|
-
)) {
|
|
943
|
-
const allowed = expectedParamsAllowed[k];
|
|
944
|
-
const got = receivedArgs[k];
|
|
945
|
-
const includes = Array.isArray(allowed) && allowed.some((v) => {
|
|
946
|
-
try {
|
|
947
|
-
if (Array.isArray(got)) {
|
|
948
|
-
return JSON.stringify(
|
|
949
|
-
got.map((x) => String(x)).sort()
|
|
950
|
-
) === JSON.stringify(
|
|
951
|
-
v.map((x) => String(x)).sort()
|
|
952
|
-
);
|
|
953
|
-
}
|
|
954
|
-
} catch {
|
|
955
|
-
}
|
|
956
|
-
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
957
|
-
});
|
|
958
|
-
if (!includes) {
|
|
959
|
-
diff.push(`@@ param ${k}`);
|
|
960
|
-
diff.push(
|
|
961
|
-
`- expected one of: ${JSON.stringify(allowed)}`
|
|
962
|
-
);
|
|
963
|
-
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
964
|
-
}
|
|
965
|
-
}
|
|
966
|
-
}
|
|
967
|
-
}
|
|
968
|
-
}
|
|
969
|
-
}
|
|
970
|
-
caseLogs.push(
|
|
971
|
-
`[DEBUG-FAIL] ${JSON.stringify({
|
|
972
|
-
id: testCase.id,
|
|
973
|
-
message: checkerResult.error,
|
|
974
|
-
error_type: checkerResult.error_type,
|
|
975
|
-
expected,
|
|
976
|
-
actual,
|
|
977
|
-
diff
|
|
978
|
-
})}`
|
|
979
|
-
);
|
|
980
|
-
} catch {
|
|
981
|
-
caseLogs.push(
|
|
982
|
-
`[DEBUG] ${testCase.id}: failed to build debug diff`
|
|
983
|
-
);
|
|
984
|
-
}
|
|
985
|
-
return { valid: false, logs: caseLogs };
|
|
986
|
-
}
|
|
1035
|
+
toolCalls,
|
|
1036
|
+
text,
|
|
1037
|
+
finishReason,
|
|
1038
|
+
debugSummaryRef,
|
|
1039
|
+
nameMap,
|
|
1040
|
+
transformedTools,
|
|
1041
|
+
flatMessages,
|
|
1042
|
+
tools,
|
|
1043
|
+
caseLogs
|
|
1044
|
+
});
|
|
987
1045
|
} catch (e) {
|
|
988
1046
|
caseLogs.push(
|
|
989
1047
|
`[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
|
|
@@ -994,13 +1052,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
994
1052
|
return { valid: false, logs: caseLogs };
|
|
995
1053
|
}
|
|
996
1054
|
};
|
|
997
|
-
const mapWithConcurrency = async (items,
|
|
1055
|
+
const mapWithConcurrency = async (items, concurrencyLimit, mapper) => {
|
|
998
1056
|
const results = new Array(items.length);
|
|
999
1057
|
let idx = 0;
|
|
1000
|
-
const workers = new Array(Math.min(
|
|
1058
|
+
const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
|
|
1001
1059
|
while (true) {
|
|
1002
|
-
const current = idx
|
|
1003
|
-
|
|
1060
|
+
const current = idx;
|
|
1061
|
+
idx += 1;
|
|
1062
|
+
if (current >= items.length) {
|
|
1063
|
+
break;
|
|
1064
|
+
}
|
|
1004
1065
|
results[current] = await mapper(items[current], current);
|
|
1005
1066
|
}
|
|
1006
1067
|
});
|
|
@@ -1016,7 +1077,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1016
1077
|
(acc, r) => acc + (r.valid ? 1 : 0),
|
|
1017
1078
|
0
|
|
1018
1079
|
);
|
|
1019
|
-
for (const r of resultsPerCase)
|
|
1080
|
+
for (const r of resultsPerCase) {
|
|
1081
|
+
logs.push(...r.logs);
|
|
1082
|
+
}
|
|
1020
1083
|
if (testCases.length === 0) {
|
|
1021
1084
|
return {
|
|
1022
1085
|
score: 0,
|
|
@@ -1043,7 +1106,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1043
1106
|
success: false,
|
|
1044
1107
|
metrics: {},
|
|
1045
1108
|
error: e,
|
|
1046
|
-
logs: [
|
|
1109
|
+
logs: [
|
|
1110
|
+
`[FATAL] Failed to run benchmark ${name}: ${e.message}`
|
|
1111
|
+
]
|
|
1047
1112
|
};
|
|
1048
1113
|
}
|
|
1049
1114
|
}
|
|
@@ -1052,87 +1117,222 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1052
1117
|
var bfclSimpleBenchmark = createBfclBenchmark(
|
|
1053
1118
|
"bfcl-simple",
|
|
1054
1119
|
"BFCL Simple Function Calling",
|
|
1055
|
-
"BFCL_v3_simple.
|
|
1056
|
-
"BFCL_v3_simple_possible_answer.
|
|
1120
|
+
"BFCL_v3_simple.jsonl",
|
|
1121
|
+
"BFCL_v3_simple_possible_answer.jsonl"
|
|
1057
1122
|
);
|
|
1058
1123
|
var bfclParallelBenchmark = createBfclBenchmark(
|
|
1059
1124
|
"bfcl-parallel",
|
|
1060
1125
|
"BFCL Parallel Function Calling",
|
|
1061
|
-
"BFCL_v3_parallel.
|
|
1062
|
-
"BFCL_v3_parallel_possible_answer.
|
|
1126
|
+
"BFCL_v3_parallel.jsonl",
|
|
1127
|
+
"BFCL_v3_parallel_possible_answer.jsonl"
|
|
1063
1128
|
);
|
|
1064
1129
|
var bfclMultipleBenchmark = createBfclBenchmark(
|
|
1065
1130
|
"bfcl-multiple",
|
|
1066
1131
|
"BFCL Multiple Function Calling",
|
|
1067
|
-
"BFCL_v3_multiple.
|
|
1068
|
-
"BFCL_v3_multiple_possible_answer.
|
|
1132
|
+
"BFCL_v3_multiple.jsonl",
|
|
1133
|
+
"BFCL_v3_multiple_possible_answer.jsonl"
|
|
1069
1134
|
);
|
|
1070
1135
|
var bfclParallelMultipleBenchmark = createBfclBenchmark(
|
|
1071
1136
|
"bfcl-parallel-multiple",
|
|
1072
1137
|
"BFCL Parallel & Multiple Function Calling",
|
|
1073
|
-
"BFCL_v3_parallel_multiple.
|
|
1074
|
-
"BFCL_v3_parallel_multiple_possible_answer.
|
|
1138
|
+
"BFCL_v3_parallel_multiple.jsonl",
|
|
1139
|
+
"BFCL_v3_parallel_multiple_possible_answer.jsonl"
|
|
1075
1140
|
);
|
|
1076
1141
|
|
|
1077
1142
|
// src/benchmarks/json-generation.ts
|
|
1078
|
-
import { generateText as generateText2 } from "ai";
|
|
1079
|
-
import Ajv from "ajv";
|
|
1080
1143
|
import { promises as fs3 } from "fs";
|
|
1081
1144
|
import path3 from "path";
|
|
1082
|
-
|
|
1145
|
+
import { generateText as generateText2 } from "ai";
|
|
1146
|
+
import Ajv from "ajv";
|
|
1147
|
+
var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
|
|
1148
|
+
var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
|
|
1149
|
+
var NEWLINE_REGEX = /\r?\n/;
|
|
1150
|
+
var LINE_SPLIT_REGEX2 = /\r?\n/;
|
|
1151
|
+
function tryDirectParse(text) {
|
|
1083
1152
|
try {
|
|
1084
1153
|
return JSON.parse(text);
|
|
1085
1154
|
} catch {
|
|
1155
|
+
return;
|
|
1086
1156
|
}
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1157
|
+
}
|
|
1158
|
+
function tryCodeFenceParse(text) {
|
|
1159
|
+
const fenceMatch = text.match(JSON_FENCE_REGEX) || text.match(CODE_FENCE_REGEX);
|
|
1160
|
+
if (!fenceMatch) {
|
|
1161
|
+
return;
|
|
1162
|
+
}
|
|
1163
|
+
const inner = fenceMatch[1].trim();
|
|
1164
|
+
try {
|
|
1165
|
+
return JSON.parse(inner);
|
|
1166
|
+
} catch {
|
|
1167
|
+
return;
|
|
1094
1168
|
}
|
|
1169
|
+
}
|
|
1170
|
+
function tryBracketScan(text) {
|
|
1095
1171
|
const startIdxObj = text.indexOf("{");
|
|
1096
1172
|
const startIdxArr = text.indexOf("[");
|
|
1097
1173
|
const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
|
|
1098
|
-
if (start === void 0)
|
|
1174
|
+
if (start === void 0) {
|
|
1175
|
+
return;
|
|
1176
|
+
}
|
|
1099
1177
|
const open = text[start] === "{" ? "{" : "[";
|
|
1100
1178
|
const close = open === "{" ? "}" : "]";
|
|
1101
1179
|
let depth = 0;
|
|
1102
|
-
for (let i = start; i < text.length; i
|
|
1180
|
+
for (let i = start; i < text.length; i += 1) {
|
|
1103
1181
|
const ch = text[i];
|
|
1104
|
-
if (ch === open)
|
|
1105
|
-
|
|
1182
|
+
if (ch === open) {
|
|
1183
|
+
depth += 1;
|
|
1184
|
+
} else if (ch === close) {
|
|
1185
|
+
depth -= 1;
|
|
1186
|
+
}
|
|
1106
1187
|
if (depth === 0) {
|
|
1107
1188
|
const candidate = text.slice(start, i + 1);
|
|
1108
1189
|
try {
|
|
1109
1190
|
return JSON.parse(candidate);
|
|
1110
1191
|
} catch {
|
|
1192
|
+
return;
|
|
1111
1193
|
}
|
|
1112
|
-
break;
|
|
1113
1194
|
}
|
|
1114
1195
|
}
|
|
1115
|
-
return
|
|
1196
|
+
return;
|
|
1197
|
+
}
|
|
1198
|
+
function extractFirstJsonBlock(text) {
|
|
1199
|
+
const directResult = tryDirectParse(text);
|
|
1200
|
+
if (directResult !== void 0) {
|
|
1201
|
+
return directResult;
|
|
1202
|
+
}
|
|
1203
|
+
const fenceResult = tryCodeFenceParse(text);
|
|
1204
|
+
if (fenceResult !== void 0) {
|
|
1205
|
+
return fenceResult;
|
|
1206
|
+
}
|
|
1207
|
+
return tryBracketScan(text);
|
|
1116
1208
|
}
|
|
1117
1209
|
function subsetMatch(expected, actual) {
|
|
1118
1210
|
if (expected === null || typeof expected !== "object") {
|
|
1119
1211
|
return expected === actual;
|
|
1120
1212
|
}
|
|
1121
1213
|
if (Array.isArray(expected)) {
|
|
1122
|
-
if (!Array.isArray(actual))
|
|
1123
|
-
|
|
1124
|
-
|
|
1214
|
+
if (!Array.isArray(actual)) {
|
|
1215
|
+
return false;
|
|
1216
|
+
}
|
|
1217
|
+
for (let i = 0; i < expected.length; i += 1) {
|
|
1218
|
+
if (!subsetMatch(expected[i], actual[i])) {
|
|
1219
|
+
return false;
|
|
1220
|
+
}
|
|
1125
1221
|
}
|
|
1126
1222
|
return true;
|
|
1127
1223
|
}
|
|
1128
|
-
if (actual === null || typeof actual !== "object")
|
|
1224
|
+
if (actual === null || typeof actual !== "object") {
|
|
1225
|
+
return false;
|
|
1226
|
+
}
|
|
1129
1227
|
const eObj = expected;
|
|
1130
1228
|
const aObj = actual;
|
|
1131
1229
|
for (const key of Object.keys(eObj)) {
|
|
1132
|
-
if (!subsetMatch(eObj[key], aObj[key]))
|
|
1230
|
+
if (!subsetMatch(eObj[key], aObj[key])) {
|
|
1231
|
+
return false;
|
|
1232
|
+
}
|
|
1133
1233
|
}
|
|
1134
1234
|
return true;
|
|
1135
1235
|
}
|
|
1236
|
+
async function loadDatasets() {
|
|
1237
|
+
try {
|
|
1238
|
+
const dataDir = resolveDataDir();
|
|
1239
|
+
const testsJsonl = await fs3.readFile(
|
|
1240
|
+
path3.join(dataDir, "json_generation_tests.jsonl"),
|
|
1241
|
+
"utf-8"
|
|
1242
|
+
);
|
|
1243
|
+
const expectedJsonl = await fs3.readFile(
|
|
1244
|
+
path3.join(dataDir, "json_generation_expected.jsonl"),
|
|
1245
|
+
"utf-8"
|
|
1246
|
+
);
|
|
1247
|
+
const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1248
|
+
const expecteds = expectedJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1249
|
+
const expectedMap = /* @__PURE__ */ new Map();
|
|
1250
|
+
for (const r of expecteds) {
|
|
1251
|
+
expectedMap.set(r.id, r);
|
|
1252
|
+
}
|
|
1253
|
+
return { tests, expectedMap };
|
|
1254
|
+
} catch (e) {
|
|
1255
|
+
return {
|
|
1256
|
+
tests: [],
|
|
1257
|
+
expectedMap: /* @__PURE__ */ new Map(),
|
|
1258
|
+
error: e
|
|
1259
|
+
};
|
|
1260
|
+
}
|
|
1261
|
+
}
|
|
1262
|
+
function buildMessages(tc) {
|
|
1263
|
+
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
1264
|
+
return [
|
|
1265
|
+
{
|
|
1266
|
+
role: "system",
|
|
1267
|
+
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
1268
|
+
},
|
|
1269
|
+
{
|
|
1270
|
+
role: "user",
|
|
1271
|
+
content: [
|
|
1272
|
+
"Generate a JSON object that reflects the following facts.",
|
|
1273
|
+
"JSON Schema:",
|
|
1274
|
+
schemaStr,
|
|
1275
|
+
"Facts:",
|
|
1276
|
+
tc.promptFacts,
|
|
1277
|
+
"Output must be a single JSON only, with no additional text."
|
|
1278
|
+
].join("\n\n")
|
|
1279
|
+
}
|
|
1280
|
+
];
|
|
1281
|
+
}
|
|
1282
|
+
function validateTestCase(tc, parsed, context) {
|
|
1283
|
+
const validate = context.ajv.compile(tc.schema);
|
|
1284
|
+
const valid = validate(parsed);
|
|
1285
|
+
if (!valid) {
|
|
1286
|
+
context.logs.push(
|
|
1287
|
+
`[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
1288
|
+
);
|
|
1289
|
+
}
|
|
1290
|
+
const expectedRec = context.expectedMap.get(tc.id);
|
|
1291
|
+
if (!expectedRec) {
|
|
1292
|
+
context.logs.push(
|
|
1293
|
+
`[WARN] ${tc.id}: No expected record found. Skipping value match.`
|
|
1294
|
+
);
|
|
1295
|
+
}
|
|
1296
|
+
const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
|
|
1297
|
+
return { valid, valuesOk, parsed };
|
|
1298
|
+
}
|
|
1299
|
+
async function processTestCase(tc, context) {
|
|
1300
|
+
const messages = buildMessages(tc);
|
|
1301
|
+
const temp = context.config?.temperature;
|
|
1302
|
+
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1303
|
+
const { text } = await generateText2({
|
|
1304
|
+
model: context.model,
|
|
1305
|
+
messages,
|
|
1306
|
+
...temperature !== void 0 ? { temperature } : {}
|
|
1307
|
+
});
|
|
1308
|
+
let parsed;
|
|
1309
|
+
try {
|
|
1310
|
+
parsed = extractFirstJsonBlock(text);
|
|
1311
|
+
} catch {
|
|
1312
|
+
}
|
|
1313
|
+
if (parsed === void 0) {
|
|
1314
|
+
context.validation.logs.push(
|
|
1315
|
+
`[FAIL] ${tc.id}: Unable to parse JSON from model output.`
|
|
1316
|
+
);
|
|
1317
|
+
return { schemaValid: false, valueMatch: false, correct: false };
|
|
1318
|
+
}
|
|
1319
|
+
const {
|
|
1320
|
+
valid,
|
|
1321
|
+
valuesOk,
|
|
1322
|
+
parsed: validatedParsed
|
|
1323
|
+
} = validateTestCase(tc, parsed, context.validation);
|
|
1324
|
+
const correct = valid && valuesOk;
|
|
1325
|
+
if (correct) {
|
|
1326
|
+
context.validation.logs.push(`[PASS] ${tc.id}`);
|
|
1327
|
+
} else {
|
|
1328
|
+
context.validation.logs.push(
|
|
1329
|
+
`[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
|
|
1330
|
+
validatedParsed
|
|
1331
|
+
)}`
|
|
1332
|
+
);
|
|
1333
|
+
}
|
|
1334
|
+
return { schemaValid: valid, valueMatch: valuesOk, correct };
|
|
1335
|
+
}
|
|
1136
1336
|
var jsonGenerationBenchmark = {
|
|
1137
1337
|
name: "json-generation",
|
|
1138
1338
|
version: "2.1.0",
|
|
@@ -1140,116 +1340,124 @@ var jsonGenerationBenchmark = {
|
|
|
1140
1340
|
async run(model, config) {
|
|
1141
1341
|
const logs = [];
|
|
1142
1342
|
const ajv = new Ajv({ allErrors: true, strict: false });
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
let correctCount = 0;
|
|
1146
|
-
let tests = [];
|
|
1147
|
-
const expectedMap = /* @__PURE__ */ new Map();
|
|
1148
|
-
try {
|
|
1149
|
-
const dataDir = resolveDataDir();
|
|
1150
|
-
const testsJsonl = await fs3.readFile(
|
|
1151
|
-
path3.join(dataDir, "json_generation_tests.jsonl"),
|
|
1152
|
-
"utf-8"
|
|
1153
|
-
);
|
|
1154
|
-
const expectedJsonl = await fs3.readFile(
|
|
1155
|
-
path3.join(dataDir, "json_generation_expected.jsonl"),
|
|
1156
|
-
"utf-8"
|
|
1157
|
-
);
|
|
1158
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1159
|
-
const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1160
|
-
for (const r of expecteds) expectedMap.set(r.id, r);
|
|
1161
|
-
} catch (e) {
|
|
1162
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
1343
|
+
const { tests, expectedMap, error } = await loadDatasets();
|
|
1344
|
+
if (error) {
|
|
1163
1345
|
return {
|
|
1164
1346
|
score: 0,
|
|
1165
1347
|
success: false,
|
|
1166
1348
|
metrics: {},
|
|
1167
|
-
logs: [
|
|
1168
|
-
|
|
1349
|
+
logs: [
|
|
1350
|
+
`[FATAL] Failed to load json-generation datasets: ${error.message}`
|
|
1351
|
+
],
|
|
1352
|
+
error
|
|
1169
1353
|
};
|
|
1170
1354
|
}
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
{
|
|
1176
|
-
role: "system",
|
|
1177
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
1178
|
-
},
|
|
1179
|
-
{
|
|
1180
|
-
role: "user",
|
|
1181
|
-
content: [
|
|
1182
|
-
"Generate a JSON object that reflects the following facts.",
|
|
1183
|
-
"JSON Schema:",
|
|
1184
|
-
schemaStr,
|
|
1185
|
-
"Facts:",
|
|
1186
|
-
tc.promptFacts,
|
|
1187
|
-
"Output must be a single JSON only, with no additional text."
|
|
1188
|
-
].join("\n\n")
|
|
1189
|
-
}
|
|
1190
|
-
];
|
|
1191
|
-
const temp = config?.temperature;
|
|
1192
|
-
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1193
|
-
const { text } = await generateText2({
|
|
1194
|
-
model,
|
|
1195
|
-
messages,
|
|
1196
|
-
...temperature !== void 0 ? { temperature } : {}
|
|
1197
|
-
});
|
|
1198
|
-
let parsed;
|
|
1199
|
-
try {
|
|
1200
|
-
parsed = extractFirstJsonBlock(text);
|
|
1201
|
-
} catch {
|
|
1202
|
-
}
|
|
1203
|
-
if (parsed === void 0) {
|
|
1204
|
-
logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
|
|
1205
|
-
continue;
|
|
1206
|
-
}
|
|
1207
|
-
const validate = ajv.compile(tc.schema);
|
|
1208
|
-
const valid = validate(parsed);
|
|
1209
|
-
if (valid) schemaValidCount++;
|
|
1210
|
-
else
|
|
1211
|
-
logs.push(
|
|
1212
|
-
`[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
1213
|
-
);
|
|
1214
|
-
const expectedRec = expectedMap.get(tc.id);
|
|
1215
|
-
if (!expectedRec) {
|
|
1216
|
-
logs.push(
|
|
1217
|
-
`[WARN] ${tc.id}: No expected record found. Skipping value match.`
|
|
1218
|
-
);
|
|
1219
|
-
}
|
|
1220
|
-
const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
|
|
1221
|
-
if (valuesOk) valueMatchCount++;
|
|
1222
|
-
if (valid && valuesOk) {
|
|
1223
|
-
correctCount++;
|
|
1224
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
1225
|
-
} else {
|
|
1226
|
-
logs.push(
|
|
1227
|
-
`[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
|
|
1228
|
-
parsed
|
|
1229
|
-
)}`
|
|
1230
|
-
);
|
|
1231
|
-
}
|
|
1232
|
-
} catch (e) {
|
|
1233
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
1234
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
1235
|
-
}
|
|
1236
|
-
}
|
|
1237
|
-
const total = tests.length;
|
|
1238
|
-
const score = correctCount / total;
|
|
1239
|
-
return {
|
|
1240
|
-
score,
|
|
1241
|
-
success: score >= 0.8,
|
|
1242
|
-
metrics: {
|
|
1243
|
-
total_cases: total,
|
|
1244
|
-
correct_count: correctCount,
|
|
1245
|
-
schema_valid_count: schemaValidCount,
|
|
1246
|
-
value_match_count: valueMatchCount,
|
|
1247
|
-
accuracy: score
|
|
1248
|
-
},
|
|
1249
|
-
logs
|
|
1355
|
+
const context = {
|
|
1356
|
+
model,
|
|
1357
|
+
config,
|
|
1358
|
+
validation: { expectedMap, ajv, logs }
|
|
1250
1359
|
};
|
|
1360
|
+
const counts = await processAllTests(tests, context);
|
|
1361
|
+
return buildBenchmarkResult(tests.length, counts, logs);
|
|
1251
1362
|
}
|
|
1252
1363
|
};
|
|
1364
|
+
async function processAllTests(tests, context) {
|
|
1365
|
+
let schemaValidCount = 0;
|
|
1366
|
+
let valueMatchCount = 0;
|
|
1367
|
+
let correctCount = 0;
|
|
1368
|
+
for (const tc of tests) {
|
|
1369
|
+
try {
|
|
1370
|
+
const result = await processTestCase(tc, context);
|
|
1371
|
+
if (result.schemaValid) {
|
|
1372
|
+
schemaValidCount += 1;
|
|
1373
|
+
}
|
|
1374
|
+
if (result.valueMatch) {
|
|
1375
|
+
valueMatchCount += 1;
|
|
1376
|
+
}
|
|
1377
|
+
if (result.correct) {
|
|
1378
|
+
correctCount += 1;
|
|
1379
|
+
}
|
|
1380
|
+
} catch (e) {
|
|
1381
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1382
|
+
context.validation.logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
1383
|
+
}
|
|
1384
|
+
}
|
|
1385
|
+
return { schemaValidCount, valueMatchCount, correctCount };
|
|
1386
|
+
}
|
|
1387
|
+
function buildBenchmarkResult(total, counts, logs) {
|
|
1388
|
+
const score = counts.correctCount / total;
|
|
1389
|
+
return {
|
|
1390
|
+
score,
|
|
1391
|
+
success: score >= 0.8,
|
|
1392
|
+
metrics: {
|
|
1393
|
+
total_cases: total,
|
|
1394
|
+
correct_count: counts.correctCount,
|
|
1395
|
+
schema_valid_count: counts.schemaValidCount,
|
|
1396
|
+
value_match_count: counts.valueMatchCount,
|
|
1397
|
+
accuracy: score
|
|
1398
|
+
},
|
|
1399
|
+
logs
|
|
1400
|
+
};
|
|
1401
|
+
}
|
|
1402
|
+
async function loadSchemaOnlyTests() {
|
|
1403
|
+
try {
|
|
1404
|
+
const dataDir = resolveDataDir();
|
|
1405
|
+
const testsJsonl = await fs3.readFile(
|
|
1406
|
+
path3.join(dataDir, "json_generation_tests.jsonl"),
|
|
1407
|
+
"utf-8"
|
|
1408
|
+
);
|
|
1409
|
+
const tests = testsJsonl.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1410
|
+
return { tests };
|
|
1411
|
+
} catch (e) {
|
|
1412
|
+
return { tests: [], error: e };
|
|
1413
|
+
}
|
|
1414
|
+
}
|
|
1415
|
+
async function processSchemaOnlyTestCase(tc, context) {
|
|
1416
|
+
const messages = buildMessages(tc);
|
|
1417
|
+
const temp = context.config?.temperature;
|
|
1418
|
+
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1419
|
+
const { text } = await generateText2({
|
|
1420
|
+
model: context.model,
|
|
1421
|
+
messages,
|
|
1422
|
+
...temperature !== void 0 ? { temperature } : {}
|
|
1423
|
+
});
|
|
1424
|
+
let parsed;
|
|
1425
|
+
try {
|
|
1426
|
+
parsed = extractFirstJsonBlock(text);
|
|
1427
|
+
} catch {
|
|
1428
|
+
}
|
|
1429
|
+
if (parsed === void 0) {
|
|
1430
|
+
context.logs.push(
|
|
1431
|
+
`[FAIL] ${tc.id}: Could not parse JSON from model output.`
|
|
1432
|
+
);
|
|
1433
|
+
return false;
|
|
1434
|
+
}
|
|
1435
|
+
const validate = context.ajv.compile(tc.schema);
|
|
1436
|
+
const valid = validate(parsed);
|
|
1437
|
+
if (valid) {
|
|
1438
|
+
context.logs.push(`[PASS] ${tc.id}`);
|
|
1439
|
+
return true;
|
|
1440
|
+
}
|
|
1441
|
+
context.logs.push(
|
|
1442
|
+
`[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
1443
|
+
);
|
|
1444
|
+
return false;
|
|
1445
|
+
}
|
|
1446
|
+
async function runSchemaOnlyTests(tests, context) {
|
|
1447
|
+
let schemaValidCount = 0;
|
|
1448
|
+
for (const tc of tests) {
|
|
1449
|
+
try {
|
|
1450
|
+
const isValid = await processSchemaOnlyTestCase(tc, context);
|
|
1451
|
+
if (isValid) {
|
|
1452
|
+
schemaValidCount += 1;
|
|
1453
|
+
}
|
|
1454
|
+
} catch (e) {
|
|
1455
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
1456
|
+
context.logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
1457
|
+
}
|
|
1458
|
+
}
|
|
1459
|
+
return schemaValidCount;
|
|
1460
|
+
}
|
|
1253
1461
|
var jsonGenerationSchemaOnlyBenchmark = {
|
|
1254
1462
|
name: "json-generation-schema-only",
|
|
1255
1463
|
version: "1.0.1",
|
|
@@ -1257,76 +1465,19 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
1257
1465
|
async run(model, config) {
|
|
1258
1466
|
const logs = [];
|
|
1259
1467
|
const ajv = new Ajv({ allErrors: true, strict: false });
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
const
|
|
1263
|
-
const testsJsonl = await fs3.readFile(
|
|
1264
|
-
path3.join(dataDir, "json_generation_tests.jsonl"),
|
|
1265
|
-
"utf-8"
|
|
1266
|
-
);
|
|
1267
|
-
tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
|
|
1268
|
-
} catch (e) {
|
|
1269
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
1468
|
+
const { tests, error } = await loadSchemaOnlyTests();
|
|
1469
|
+
if (error) {
|
|
1470
|
+
const msg = error.message;
|
|
1270
1471
|
return {
|
|
1271
1472
|
score: 0,
|
|
1272
1473
|
success: false,
|
|
1273
1474
|
metrics: {},
|
|
1274
1475
|
logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
|
|
1275
|
-
error
|
|
1476
|
+
error
|
|
1276
1477
|
};
|
|
1277
1478
|
}
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
try {
|
|
1281
|
-
const schemaStr = JSON.stringify(tc.schema, null, 2);
|
|
1282
|
-
const messages = [
|
|
1283
|
-
{
|
|
1284
|
-
role: "system",
|
|
1285
|
-
content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
|
|
1286
|
-
},
|
|
1287
|
-
{
|
|
1288
|
-
role: "user",
|
|
1289
|
-
content: [
|
|
1290
|
-
"Generate a JSON object that reflects the following facts.",
|
|
1291
|
-
"JSON Schema:",
|
|
1292
|
-
schemaStr,
|
|
1293
|
-
"Facts:",
|
|
1294
|
-
tc.promptFacts,
|
|
1295
|
-
"Output must be a single JSON only, with no additional text."
|
|
1296
|
-
].join("\n\n")
|
|
1297
|
-
}
|
|
1298
|
-
];
|
|
1299
|
-
const temp = config?.temperature;
|
|
1300
|
-
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1301
|
-
const { text } = await generateText2({
|
|
1302
|
-
model,
|
|
1303
|
-
messages,
|
|
1304
|
-
...temperature !== void 0 ? { temperature } : {}
|
|
1305
|
-
});
|
|
1306
|
-
let parsed;
|
|
1307
|
-
try {
|
|
1308
|
-
parsed = extractFirstJsonBlock(text);
|
|
1309
|
-
} catch {
|
|
1310
|
-
}
|
|
1311
|
-
if (parsed === void 0) {
|
|
1312
|
-
logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
|
|
1313
|
-
continue;
|
|
1314
|
-
}
|
|
1315
|
-
const validate = ajv.compile(tc.schema);
|
|
1316
|
-
const valid = validate(parsed);
|
|
1317
|
-
if (valid) {
|
|
1318
|
-
schemaValidCount++;
|
|
1319
|
-
logs.push(`[PASS] ${tc.id}`);
|
|
1320
|
-
} else {
|
|
1321
|
-
logs.push(
|
|
1322
|
-
`[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
|
|
1323
|
-
);
|
|
1324
|
-
}
|
|
1325
|
-
} catch (e) {
|
|
1326
|
-
const msg = e instanceof Error ? e.message : String(e);
|
|
1327
|
-
logs.push(`[ERROR] ${tc.id}: ${msg}`);
|
|
1328
|
-
}
|
|
1329
|
-
}
|
|
1479
|
+
const context = { model, config, ajv, logs };
|
|
1480
|
+
const schemaValidCount = await runSchemaOnlyTests(tests, context);
|
|
1330
1481
|
const total = tests.length;
|
|
1331
1482
|
const score = total > 0 ? schemaValidCount / total : 0;
|
|
1332
1483
|
return {
|
|
@@ -1341,6 +1492,505 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
1341
1492
|
};
|
|
1342
1493
|
}
|
|
1343
1494
|
};
|
|
1495
|
+
|
|
1496
|
+
// src/reporters/console.ts
|
|
1497
|
+
var colors = {
|
|
1498
|
+
reset: "\x1B[0m",
|
|
1499
|
+
green: "\x1B[32m",
|
|
1500
|
+
red: "\x1B[31m",
|
|
1501
|
+
yellow: "\x1B[33m",
|
|
1502
|
+
cyan: "\x1B[36m",
|
|
1503
|
+
magenta: "\x1B[35m",
|
|
1504
|
+
gray: "\x1B[90m"
|
|
1505
|
+
};
|
|
1506
|
+
function printResult(result) {
|
|
1507
|
+
const { model, modelKey, benchmark, result: benchmarkResult } = result;
|
|
1508
|
+
const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
|
|
1509
|
+
console.log(
|
|
1510
|
+
`
|
|
1511
|
+
${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
|
|
1512
|
+
);
|
|
1513
|
+
console.log(
|
|
1514
|
+
` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
|
|
1515
|
+
);
|
|
1516
|
+
const metrics = Object.entries(benchmarkResult.metrics);
|
|
1517
|
+
if (metrics.length > 0) {
|
|
1518
|
+
console.log(" Metrics:");
|
|
1519
|
+
for (const [key, value] of metrics) {
|
|
1520
|
+
console.log(` - ${key}: ${value}`);
|
|
1521
|
+
}
|
|
1522
|
+
}
|
|
1523
|
+
if (benchmarkResult.error) {
|
|
1524
|
+
console.log(
|
|
1525
|
+
` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
|
|
1526
|
+
);
|
|
1527
|
+
}
|
|
1528
|
+
}
|
|
1529
|
+
function consoleReporter(results) {
|
|
1530
|
+
console.log("\n--- \u{1F4CA} Evaluation Report ---");
|
|
1531
|
+
for (const result of results) {
|
|
1532
|
+
printResult(result);
|
|
1533
|
+
}
|
|
1534
|
+
console.log("\n---------------------------\n");
|
|
1535
|
+
}
|
|
1536
|
+
|
|
1537
|
+
// src/reporters/console.debug.ts
|
|
1538
|
+
var FAIL_ID_REGEX = /^\[FAIL\]\s+([^:]+):/;
|
|
1539
|
+
var DEBUG_FAIL_PREFIX_REGEX = /^\[DEBUG-FAIL\] /;
|
|
1540
|
+
var DEBUG_FAIL_CONTEXT_PREFIX_REGEX = /^\[DEBUG-FAIL-CONTEXT\] /;
|
|
1541
|
+
var colors2 = {
|
|
1542
|
+
reset: "\x1B[0m",
|
|
1543
|
+
green: "\x1B[32m",
|
|
1544
|
+
red: "\x1B[31m",
|
|
1545
|
+
yellow: "\x1B[33m",
|
|
1546
|
+
cyan: "\x1B[36m",
|
|
1547
|
+
magenta: "\x1B[35m",
|
|
1548
|
+
gray: "\x1B[90m",
|
|
1549
|
+
bold: "\x1B[1m",
|
|
1550
|
+
underline: "\x1B[4m"
|
|
1551
|
+
};
|
|
1552
|
+
function colorizeDiffLine(line) {
|
|
1553
|
+
if (line.startsWith("+")) {
|
|
1554
|
+
return `${colors2.green}${line}${colors2.reset}`;
|
|
1555
|
+
}
|
|
1556
|
+
if (line.startsWith("-")) {
|
|
1557
|
+
return `${colors2.red}${line}${colors2.reset}`;
|
|
1558
|
+
}
|
|
1559
|
+
if (line.startsWith("@")) {
|
|
1560
|
+
return `${colors2.cyan}${colors2.bold}${line}${colors2.reset}`;
|
|
1561
|
+
}
|
|
1562
|
+
return line;
|
|
1563
|
+
}
|
|
1564
|
+
function uniqueLines(lines) {
|
|
1565
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1566
|
+
const out = [];
|
|
1567
|
+
for (const l of lines) {
|
|
1568
|
+
if (seen.has(l)) {
|
|
1569
|
+
continue;
|
|
1570
|
+
}
|
|
1571
|
+
seen.add(l);
|
|
1572
|
+
out.push(l);
|
|
1573
|
+
}
|
|
1574
|
+
return out;
|
|
1575
|
+
}
|
|
1576
|
+
function hasFunctionNameIssue(diff) {
|
|
1577
|
+
return diff.some(
|
|
1578
|
+
(d) => String(d).includes("function name") || String(d).includes("missing function:")
|
|
1579
|
+
);
|
|
1580
|
+
}
|
|
1581
|
+
function suggestFunctionNameFix(expected, actual, suggestions) {
|
|
1582
|
+
const expectedName = expected?.function;
|
|
1583
|
+
const actualName = actual?.function;
|
|
1584
|
+
if (expectedName && actualName && expectedName !== actualName) {
|
|
1585
|
+
suggestions.push(
|
|
1586
|
+
`Call the function '${expectedName}' instead of '${actualName}'.`
|
|
1587
|
+
);
|
|
1588
|
+
}
|
|
1589
|
+
if (Array.isArray(expected?.functions)) {
|
|
1590
|
+
suggestions.push(
|
|
1591
|
+
`Ensure tool calls include: ${expected.functions.join(", ")}.`
|
|
1592
|
+
);
|
|
1593
|
+
}
|
|
1594
|
+
}
|
|
1595
|
+
function suggestMissingParamFix(diff, suggestions) {
|
|
1596
|
+
const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
|
|
1597
|
+
if (missing.length) {
|
|
1598
|
+
suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
|
|
1599
|
+
}
|
|
1600
|
+
}
|
|
1601
|
+
function suggestUnexpectedParamFix(diff, suggestions) {
|
|
1602
|
+
const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
|
|
1603
|
+
if (extras.length) {
|
|
1604
|
+
suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
|
|
1605
|
+
}
|
|
1606
|
+
}
|
|
1607
|
+
function suggestParamValueFix(diff, suggestions) {
|
|
1608
|
+
const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
|
|
1609
|
+
for (const param of targets) {
|
|
1610
|
+
const allowedOneOfLine = diff.find(
|
|
1611
|
+
(d) => String(d).startsWith("- expected one of:")
|
|
1612
|
+
);
|
|
1613
|
+
const allowedSingleLine = diff.find(
|
|
1614
|
+
(d) => String(d).startsWith("- expected:")
|
|
1615
|
+
);
|
|
1616
|
+
if (allowedSingleLine) {
|
|
1617
|
+
const value = allowedSingleLine.replace("- expected: ", "");
|
|
1618
|
+
suggestions.push(`Set '${param}' to: ${value}.`);
|
|
1619
|
+
} else if (allowedOneOfLine) {
|
|
1620
|
+
const allowed = allowedOneOfLine.replace("- expected one of: ", "");
|
|
1621
|
+
suggestions.push(`Set '${param}' to one of: ${allowed}.`);
|
|
1622
|
+
} else {
|
|
1623
|
+
suggestions.push(`Adjust '${param}' to an allowed value.`);
|
|
1624
|
+
}
|
|
1625
|
+
}
|
|
1626
|
+
}
|
|
1627
|
+
function suggestFromErrorType(error_type, suggestions) {
|
|
1628
|
+
if (error_type.includes("missing_required")) {
|
|
1629
|
+
suggestions.push("Add all required parameters defined by the tool schema.");
|
|
1630
|
+
} else if (error_type.includes("unexpected_param")) {
|
|
1631
|
+
suggestions.push("Remove parameters not present in the tool schema.");
|
|
1632
|
+
} else if (error_type.includes("wrong_count")) {
|
|
1633
|
+
suggestions.push(
|
|
1634
|
+
"Adjust the number of tool calls to match expected count."
|
|
1635
|
+
);
|
|
1636
|
+
} else if (error_type.includes("wrong_func_name")) {
|
|
1637
|
+
suggestions.push("Use the exact expected function name from the schema.");
|
|
1638
|
+
} else if (error_type.includes("value_error")) {
|
|
1639
|
+
suggestions.push("Choose a value from the allowed options.");
|
|
1640
|
+
}
|
|
1641
|
+
}
|
|
1642
|
+
/**
 * Build the list of fix suggestions for a parsed debug-failure payload.
 *
 * When the payload carries a `diff` array, the specialised helpers run in a
 * fixed order: function name, missing params, unexpected params, param
 * values. If they produce nothing (or there is no diff array) and
 * `error_type` is a string, the generic hint for that error type is used.
 *
 * @param {object | null | undefined} parsed - Payload with optional
 *   `error_type`, `expected`, `actual` and `diff` fields.
 * @returns {string[]} Whatever `uniqueLines` returns for the collected hints.
 */
function suggestFixFromDiff(parsed) {
  const hints = [];
  const {
    error_type: errorType,
    expected: wanted,
    actual: got,
    diff: entries
  } = parsed ?? {};
  if (Array.isArray(entries)) {
    const anyStartsWith = (prefix) => entries.some((e) => String(e).startsWith(prefix));
    if (hasFunctionNameIssue(entries)) {
      suggestFunctionNameFix(wanted, got, hints);
    }
    if (anyStartsWith("- missing required param:")) {
      suggestMissingParamFix(entries, hints);
    }
    if (anyStartsWith("+ unexpected param:")) {
      suggestUnexpectedParamFix(entries, hints);
    }
    if (anyStartsWith("@@ param ")) {
      suggestParamValueFix(entries, hints);
    }
  }
  // Generic fallback, used only when nothing more specific was produced.
  if (hints.length === 0 && typeof errorType === "string") {
    suggestFromErrorType(errorType, hints);
  }
  return uniqueLines(hints);
}
|
|
1668
|
+
/**
 * Extract the test id a log line refers to.
 *
 * - "[FAIL]" lines: first capture group of FAIL_ID_REGEX.
 * - "[DEBUG-FAIL]" / "[DEBUG-FAIL-CONTEXT]" lines: the `id` field of the JSON
 *   payload that remains after the matching prefix regex is stripped.
 *
 * A payload without an id yields `undefined` rather than an empty string, so
 * callers that fall back with `??` treat the line as having no id.
 *
 * @param {string} line - One log line.
 * @returns {string | undefined} The id, or undefined when none can be read.
 */
function getTestIdFromLogLine(line) {
  if (line.startsWith("[FAIL]")) {
    return line.match(FAIL_ID_REGEX)?.[1];
  }
  let prefixRegex;
  if (line.startsWith("[DEBUG-FAIL]")) {
    prefixRegex = DEBUG_FAIL_PREFIX_REGEX;
  } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
    prefixRegex = DEBUG_FAIL_CONTEXT_PREFIX_REGEX;
  } else {
    return;
  }
  try {
    const id = JSON.parse(line.replace(prefixRegex, ""))?.id;
    if (id === void 0 || id === null || id === "") {
      return;
    }
    return String(id);
  } catch {
    // Best effort: a payload that is not valid JSON simply has no id.
    return;
  }
}
|
|
1691
|
+
/**
 * Bucket failure log lines by the test id they refer to.
 *
 * Lines for which `getTestIdFromLogLine` returns null/undefined go into the
 * "__general__" bucket. Buckets appear in first-seen order and each keeps its
 * lines in input order.
 *
 * @param {string[]} failLogs - Failure-related log lines.
 * @returns {Map<string, string[]>} Lines grouped by id.
 */
function groupLogsByTestId(failLogs) {
  const groups = /* @__PURE__ */ new Map();
  for (const entry of failLogs) {
    const groupKey = getTestIdFromLogLine(entry) ?? "__general__";
    const bucket = groups.get(groupKey) ?? [];
    bucket.push(entry);
    groups.set(groupKey, bucket);
  }
  return groups;
}
|
|
1702
|
+
/**
 * Collect the ids carried by "[DEBUG-FAIL]" lines.
 *
 * Only lines starting with "[DEBUG-FAIL]" are inspected. Payloads that fail
 * to parse, or whose `id` is falsy, contribute nothing.
 *
 * @param {string[]} lines - Log lines to scan.
 * @returns {Set<string>} Ids, stringified.
 */
function collectDebugIds(lines) {
  const ids = /* @__PURE__ */ new Set();
  const debugLines = lines.filter((entry) => entry.startsWith("[DEBUG-FAIL]"));
  for (const entry of debugLines) {
    try {
      const payload = JSON.parse(entry.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
      const id = payload?.id;
      if (id) {
        ids.add(String(id));
      }
    } catch {
    }
  }
  return ids;
}
|
|
1717
|
+
/**
 * Print `data` as pretty JSON on the console, wrapped in a colour code.
 *
 * Every newline of the serialised text is followed by "\n " so continuation
 * lines are indented. `JSON.stringify` returns `undefined` for inputs it
 * cannot serialise (e.g. `undefined` or a function); in that case the plain
 * `String(data)` form is printed instead of throwing on `.split`.
 *
 * @param {string} prefix - Text printed before the JSON.
 * @param {unknown} data - Value to serialise.
 * @param {string} color - Escape sequence emitted first; `colors2.reset` is emitted last.
 * @returns {void}
 */
function printIndentedJson(prefix, data, color) {
  const serialized = JSON.stringify(data, null, 2) ?? String(data);
  console.log(
    color + prefix + serialized.split("\n").join("\n ") + colors2.reset
  );
}
|
|
1722
|
+
/**
 * Render one "[DEBUG-FAIL]" log line.
 *
 * The JSON payload left after stripping DEBUG_FAIL_PREFIX_REGEX is printed as:
 * the message (when truthy), then either each diff line through
 * `colorizeDiffLine` or the expected/actual values, then any suggestions from
 * `suggestFixFromDiff`. If anything inside the try block throws — parsing or
 * rendering — the raw line is printed instead.
 *
 * @param {string} line - Log line starting with "[DEBUG-FAIL]".
 * @returns {void}
 */
function displayDebugFailLine(line) {
  const rawJson = line.replace(DEBUG_FAIL_PREFIX_REGEX, "");
  try {
    const failure = JSON.parse(rawJson);
    const { message: headline, diff: diffLines, expected: wanted, actual: got } = failure;
    if (headline) {
      console.log(` ${colors2.bold}${headline}${colors2.reset}`);
    }
    if (Array.isArray(diffLines)) {
      for (const entry of diffLines) {
        console.log(` ${colorizeDiffLine(entry)}`);
      }
    } else {
      console.log(" expected:");
      printIndentedJson(" ", wanted, colors2.green);
      console.log(" actual:");
      printIndentedJson(" ", got, colors2.red);
    }
    const hints = suggestFixFromDiff(failure);
    if (hints.length) {
      console.log(` ${colors2.bold}Suggested fix:${colors2.reset}`);
      for (const hint of hints) {
        console.log(` \u2022 ${hint}`);
      }
    }
  } catch {
    console.log(` ${line}`);
  }
}
|
|
1751
|
+
/**
 * Print the fields of a failure-context object that are present (truthy).
 *
 * Output order is fixed: tool schema, last user query, raw model text,
 * parsed tool calls, ground truth, finish reason. Everything is printed in
 * gray.
 *
 * @param {object} ctx - Parsed "[DEBUG-FAIL-CONTEXT]" payload.
 * @returns {void}
 */
function displayContextInfo(ctx) {
  const grayLine = (text) => {
    console.log(colors2.gray + text + colors2.reset);
  };
  const {
    tool_schema: toolSchema,
    last_user_query: lastUserQuery,
    raw_model_text: rawModelText,
    parsed_tool_calls: parsedToolCalls,
    ground_truth: groundTruth,
    finish_reason: finishReason
  } = ctx;
  if (toolSchema) {
    printIndentedJson(" tool schema: ", toolSchema, colors2.gray);
  }
  if (lastUserQuery) {
    grayLine(" last user: " + JSON.stringify(lastUserQuery));
  }
  if (rawModelText) {
    // Re-indent each line of the model text by one space.
    const indented = String(rawModelText).split("\n").join("\n ");
    grayLine(" raw model text (middleware parsed):\n " + indented);
  }
  if (parsedToolCalls) {
    printIndentedJson(" parsed tool calls: ", parsedToolCalls, colors2.gray);
  }
  if (groundTruth) {
    printIndentedJson(" ground truth: ", groundTruth, colors2.gray);
  }
  if (finishReason) {
    grayLine(" finish reason: " + JSON.stringify(finishReason));
  }
}
|
|
1785
|
+
/**
 * Render one "[DEBUG-FAIL-CONTEXT]" log line.
 *
 * Prints a "context:" heading followed by the parsed payload via
 * `displayContextInfo`. If parsing or rendering throws, the raw line is
 * printed instead.
 *
 * @param {string} line - Log line starting with "[DEBUG-FAIL-CONTEXT]".
 * @returns {void}
 */
function displayDebugFailContextLine(line) {
  const rawJson = line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "");
  try {
    const context = JSON.parse(rawJson);
    console.log(` ${colors2.gray}context:${colors2.reset}`);
    displayContextInfo(context);
  } catch {
    console.log(` ${line}`);
  }
}
|
|
1795
|
+
/**
 * Print a single failure-related log line according to its prefix.
 *
 * "[FAIL]" lines are skipped when their id is in `debugIds`, since the
 * matching "[DEBUG-FAIL]" line is rendered separately. Lines with any other
 * prefix than the ones handled here are ignored.
 *
 * @param {string} line - Log line.
 * @param {Set<string>} debugIds - Ids that have a "[DEBUG-FAIL]" line.
 * @returns {void}
 */
function displayLogLine(line, debugIds) {
  const paint = (color) => {
    console.log(` ${color}${line}${colors2.reset}`);
  };
  if (line.startsWith("[FAIL]")) {
    const failId = line.match(FAIL_ID_REGEX)?.[1];
    const coveredByDebug = Boolean(failId) && debugIds.has(failId);
    if (!coveredByDebug) {
      paint(colors2.red);
    }
    return;
  }
  if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
    paint(colors2.yellow);
    return;
  }
  if (line.startsWith("[STACK]")) {
    paint(colors2.gray);
    return;
  }
  if (line.startsWith("[DEBUG-FAIL]")) {
    displayDebugFailLine(line);
    return;
  }
  if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
    displayDebugFailContextLine(line);
  }
}
|
|
1813
|
+
/**
 * Print grouped failure logs, one group after another.
 *
 * Each group other than "__general__" gets an underlined heading with its
 * id. The debug ids are collected per group before its lines are printed.
 *
 * @param {Map<string, string[]>} byId - Log lines keyed by test id.
 * @returns {void}
 */
function displayGroupedFailures(byId) {
  console.log(` ${colors2.bold}Failure details (grouped):${colors2.reset}`);
  for (const [testId, entries] of byId) {
    const isGeneral = testId === "__general__";
    if (!isGeneral) {
      console.log(` ${colors2.underline}${testId}${colors2.reset}`);
    }
    const idsWithDebug = collectDebugIds(entries);
    for (const entry of entries) {
      displayLogLine(entry, idsWithDebug);
    }
  }
}
|
|
1825
|
+
/**
 * Print the "[INFO]" and "[PASS]" lines of a log in gray.
 *
 * @param {string[]} logs - All log lines of a benchmark result.
 * @returns {void}
 */
function displaySuccessLogs(logs) {
  const isInformational = (entry) => entry.startsWith("[INFO]") || entry.startsWith("[PASS]");
  const shown = logs.filter(isInformational);
  for (const entry of shown) {
    console.log(` ${colors2.gray}${entry}${colors2.reset}`);
  }
}
|
|
1833
|
+
/**
 * Keep only the log lines that describe a failure.
 *
 * A line qualifies when it starts with one of the failure prefixes listed in
 * the function body.
 *
 * @param {string[]} logs - All log lines of a benchmark result.
 * @returns {string[]} The failure-related lines, in input order.
 */
function filterFailureLogs(logs) {
  const failurePrefixes = [
    "[FAIL]",
    "[ERROR]",
    "[FATAL]",
    "[STACK]",
    "[DEBUG-FAIL]",
    "[DEBUG-FAIL-CONTEXT]"
  ];
  return logs.filter(
    (entry) => failurePrefixes.some((prefix) => entry.startsWith(prefix))
  );
}
|
|
1838
|
+
/**
 * Print the logs of one benchmark result.
 *
 * When the logs contain any failure-related line, only those lines are
 * printed, grouped by test id; otherwise the informational lines are printed.
 *
 * @param {string[]} logs - All log lines of a benchmark result.
 * @returns {void}
 */
function displayResultLogs(logs) {
  const failures = filterFailureLogs(logs);
  if (failures.length > 0) {
    displayGroupedFailures(groupLogsByTestId(failures));
    return;
  }
  displaySuccessLogs(logs);
}
|
|
1848
|
+
/**
 * Print a "Metrics:" heading followed by one line per metric.
 * Nothing is printed when there are no metrics.
 *
 * @param {Array<[string, unknown]>} metrics - Name/value pairs.
 * @returns {void}
 */
function displayMetrics(metrics) {
  if (!(metrics.length > 0)) {
    return;
  }
  console.log(" Metrics:");
  for (const pair of metrics) {
    const [name, value] = pair;
    console.log(` - ${name}: ${value}`);
  }
}
|
|
1856
|
+
/**
 * Print the two header lines of one evaluation result: the model (with its
 * key, when truthy) and benchmark name, then the success/failure badge and
 * the score formatted with two decimals.
 *
 * @param {{model: string, modelKey?: string, benchmark: string, result: {success: boolean, score: number}}} r
 *   Evaluation result entry.
 * @returns {void}
 */
function displayResultHeader(r) {
  const { model, modelKey, benchmark, result: outcome } = r;
  let badge;
  if (outcome.success) {
    badge = `${colors2.green}\u2714 SUCCESS${colors2.reset}`;
  } else {
    badge = `${colors2.red}\u2716 FAILURE${colors2.reset}`;
  }
  const keyLabel = modelKey ? ` ${colors2.gray}(${modelKey})${colors2.reset}` : "";
  console.log(
    `
${colors2.cyan}[${model}]${colors2.reset}${keyLabel} - ${colors2.magenta}${benchmark}${colors2.reset}`
  );
  const scoreText = outcome.score.toFixed(2);
  console.log(
    ` \u2514 ${badge} | Score: ${colors2.yellow}${scoreText}${colors2.reset}`
  );
}
|
|
1867
|
+
/**
 * Console reporter with failure details.
 *
 * For every result it prints the header, the metrics, and — when the result
 * has a non-empty `logs` array — the logs, framed by a report banner and a
 * closing rule.
 *
 * @param {Array<object>} results - Evaluation results.
 * @returns {void}
 */
function consoleDebugReporter(results) {
  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
  for (const entry of results) {
    displayResultHeader(entry);
    const metricPairs = Object.entries(entry.result.metrics);
    displayMetrics(metricPairs);
    const logs = entry.result.logs;
    if (logs?.length) {
      displayResultLogs(logs);
    }
  }
  console.log("\n------------------------------------\n");
}
|
|
1878
|
+
|
|
1879
|
+
// src/reporters/json.ts
|
|
1880
|
+
/**
 * Print all results as pretty-printed JSON on the console.
 *
 * Each result's `error` is replaced by its `message` (or `undefined` when
 * there is no error) before serialising.
 *
 * @param {Array<object>} results - Evaluation results.
 * @returns {void}
 */
function jsonReporter(results) {
  const toSerializable = (entry) => {
    const { result: outcome } = entry;
    return {
      ...entry,
      result: {
        ...outcome,
        error: outcome.error?.message
      }
    };
  };
  const payload = results.map((entry) => toSerializable(entry));
  console.log(JSON.stringify(payload, null, 2));
}
|
|
1890
|
+
|
|
1891
|
+
// src/reporters/index.ts
|
|
1892
|
+
/**
 * Registry of the built-in reporters, keyed by the reporter name that is
 * looked up in `executeReporter`.
 */
var reporters = {
  // Plain console summary.
  console: consoleReporter,
  // Pretty-printed JSON dump.
  json: jsonReporter,
  // Console summary with per-test failure details.
  "console.debug": consoleDebugReporter
};
|
|
1897
|
+
|
|
1898
|
+
// src/evaluate.ts
|
|
1899
|
+
/**
 * Run one benchmark against one model and wrap the outcome.
 *
 * Progress is logged before and after the run. If the run throws, the error
 * is logged and a failed result (score 0, empty metrics) carrying the error
 * is returned instead of propagating it.
 *
 * @param {unknown} model - Model handed to `benchmark.run`; its string
 *   `modelId` is used as label, otherwise "unknown-model".
 * @param {{name: string, run: Function}} benchmark - Benchmark to execute.
 * @param {string | undefined} modelKey - Optional key shown next to the model id.
 * @param {object | undefined} config - Passed through to `benchmark.run`.
 * @returns {Promise<{model: string, modelKey: string | undefined, benchmark: string, result: object}>}
 */
async function runSingleBenchmark(model, benchmark, modelKey, config) {
  const hasStringId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string";
  const modelId = hasStringId ? model.modelId : "unknown-model";
  // Builds the "[id] (key)" label used by every progress message.
  const label = () => `[${modelId}]${modelKey ? ` (${modelKey})` : ""}`;
  const wrap = (result) => ({
    model: modelId,
    modelKey,
    benchmark: benchmark.name,
    result
  });
  try {
    console.log(`${label()} Running benchmark: ${benchmark.name}...`);
    const outcome = await benchmark.run(model, config);
    console.log(
      `${label()} Finished benchmark: ${benchmark.name}. Score: ${outcome.score}`
    );
    return wrap(outcome);
  } catch (error) {
    console.error(
      `${label()} Error running benchmark: ${benchmark.name}`,
      error
    );
    const failure = error instanceof Error ? error : new Error(String(error));
    return wrap({
      score: 0,
      success: false,
      metrics: {},
      error: failure
    });
  }
}
|
|
1933
|
+
/**
 * Normalise the `models` option into `[key, model]` pairs.
 *
 * - An array yields one pair per element with an undefined key.
 * - A single object that has a `modelId` property yields one pair with an
 *   undefined key.
 * - Anything else is treated as a record and yields one pair per own
 *   enumerable entry, keyed by the property name.
 *
 * @param {unknown} models - Array of models, a single model, or a keyed record.
 * @returns {Array<[string | undefined, unknown]>} Key/model pairs.
 */
function normalizeModels(models) {
  if (Array.isArray(models)) {
    return Array.from(models, (model) => [void 0, model]);
  }
  const isSingleModel = typeof models === "object" && models !== null && "modelId" in models;
  if (isSingleModel) {
    return [[void 0, models]];
  }
  return Object.entries(models).map(([key, model]) => [key, model]);
}
|
|
1950
|
+
/**
 * Build the optional run configuration from the values that were provided.
 *
 * @param {number | undefined} temperature - Included unless undefined.
 * @param {number | undefined} maxTokens - Included unless undefined.
 * @returns {{temperature?: number, maxTokens?: number} | undefined}
 *   The config, or undefined when neither value was provided.
 */
function buildConfig(temperature, maxTokens) {
  const settings = {
    ...(temperature !== void 0 ? { temperature } : {}),
    ...(maxTokens !== void 0 ? { maxTokens } : {})
  };
  if (Object.keys(settings).length === 0) {
    return void 0;
  }
  return settings;
}
|
|
1960
|
+
/**
 * Run the reporter registered under `reporter`, falling back to the console
 * reporter (with a warning) when the name is unknown.
 *
 * The lookup is restricted to the registry's own properties: a plain
 * `reporters[reporter]` would also resolve inherited names such as
 * "toString" or "constructor" and call them instead of warning.
 *
 * @param {string} reporter - Reporter name.
 * @param {Array<object>} results - Evaluation results to report.
 * @returns {void}
 */
function executeReporter(reporter, results) {
  const isRegistered = Object.prototype.hasOwnProperty.call(reporters, reporter);
  const report = isRegistered ? reporters[reporter] : void 0;
  if (report) {
    report(results);
  } else {
    console.warn(`Unknown reporter: '${reporter}'. Defaulting to console.`);
    reporters.console(results);
  }
}
|
|
1969
|
+
/**
 * Run every benchmark against every model, report, and return the results.
 *
 * Runs are awaited one at a time, model by model and benchmark by benchmark,
 * so results are in that order. The reporter (default "console") is invoked
 * once after all runs have finished.
 *
 * @param {{models: unknown, benchmarks: Iterable<object>, reporter?: string, temperature?: number, maxTokens?: number}} options
 *   Evaluation options.
 * @returns {Promise<Array<object>>} One result per model/benchmark pair.
 */
async function evaluate(options) {
  const {
    models: modelSpec,
    benchmarks: suite,
    reporter: reporterName = "console",
    temperature,
    maxTokens
  } = options;
  const entries = normalizeModels(modelSpec);
  const runConfig = buildConfig(temperature, maxTokens);
  const collected = [];
  for (const [key, candidate] of entries) {
    for (const bench of suite) {
      // Sequential on purpose: each run is awaited before the next starts.
      const outcome = await runSingleBenchmark(candidate, bench, key, runConfig);
      collected.push(outcome);
    }
  }
  executeReporter(reporterName, collected);
  return collected;
}
|
|
1344
1994
|
export {
|
|
1345
1995
|
bfclMultipleBenchmark,
|
|
1346
1996
|
bfclParallelBenchmark,
|