@ai-sdk-tool/eval 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +206 -66
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +206 -66
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -142,11 +142,17 @@ function suggestFixFromDiff(parsed) {
|
|
|
142
142
|
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
|
|
143
143
|
const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
|
|
144
144
|
for (const param of targets) {
|
|
145
|
-
const
|
|
145
|
+
const allowedOneOfLine = diff.find(
|
|
146
146
|
(d) => String(d).startsWith("- expected one of:")
|
|
147
147
|
);
|
|
148
|
-
|
|
149
|
-
|
|
148
|
+
const allowedSingleLine = diff.find(
|
|
149
|
+
(d) => String(d).startsWith("- expected:")
|
|
150
|
+
);
|
|
151
|
+
if (allowedSingleLine) {
|
|
152
|
+
const value = allowedSingleLine.replace("- expected: ", "");
|
|
153
|
+
suggestions.push(`Set '${param}' to: ${value}.`);
|
|
154
|
+
} else if (allowedOneOfLine) {
|
|
155
|
+
const allowed = allowedOneOfLine.replace("- expected one of: ", "");
|
|
150
156
|
suggestions.push(`Set '${param}' to one of: ${allowed}.`);
|
|
151
157
|
} else {
|
|
152
158
|
suggestions.push(`Adjust '${param}' to an allowed value.`);
|
|
@@ -191,61 +197,140 @@ function consoleDebugReporter(results) {
|
|
|
191
197
|
}
|
|
192
198
|
if (result.logs && result.logs.length) {
|
|
193
199
|
const failLogs = result.logs.filter(
|
|
194
|
-
(l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
|
|
200
|
+
(l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
|
|
195
201
|
);
|
|
196
202
|
const hasFails = failLogs.length > 0;
|
|
197
203
|
if (hasFails) {
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
204
|
+
let getTestIdFromLogLine2 = function(line) {
|
|
205
|
+
if (line.startsWith("[FAIL]")) {
|
|
206
|
+
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
207
|
+
return m?.[1];
|
|
208
|
+
}
|
|
209
|
+
if (line.startsWith("[DEBUG-FAIL]")) {
|
|
202
210
|
try {
|
|
203
|
-
const parsed = JSON.parse(
|
|
204
|
-
|
|
211
|
+
const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
212
|
+
return String(parsed?.id ?? "");
|
|
205
213
|
} catch {
|
|
206
214
|
}
|
|
207
215
|
}
|
|
208
|
-
|
|
209
|
-
for (const line of failLogs) {
|
|
210
|
-
if (line.startsWith("[FAIL]")) {
|
|
211
|
-
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
212
|
-
const failId = m?.[1];
|
|
213
|
-
if (failId && debugIds.has(failId)) continue;
|
|
214
|
-
console.log(` ${colors2.red}${line}${colors2.reset}`);
|
|
215
|
-
} else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
|
|
216
|
-
console.log(` ${colors2.yellow}${line}${colors2.reset}`);
|
|
217
|
-
} else if (line.startsWith("[STACK]")) {
|
|
218
|
-
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
219
|
-
} else if (line.startsWith("[DEBUG-FAIL]")) {
|
|
220
|
-
const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
|
|
216
|
+
if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
221
217
|
try {
|
|
222
|
-
const parsed = JSON.parse(
|
|
223
|
-
|
|
224
|
-
console.log(
|
|
225
|
-
` ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
|
|
218
|
+
const parsed = JSON.parse(
|
|
219
|
+
line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
|
|
226
220
|
);
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
221
|
+
return String(parsed?.id ?? "");
|
|
222
|
+
} catch {
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
return void 0;
|
|
226
|
+
};
|
|
227
|
+
var getTestIdFromLogLine = getTestIdFromLogLine2;
|
|
228
|
+
const byId = /* @__PURE__ */ new Map();
|
|
229
|
+
for (const line of failLogs) {
|
|
230
|
+
const id = getTestIdFromLogLine2(line);
|
|
231
|
+
const key = id ?? "__general__";
|
|
232
|
+
const arr = byId.get(key) ?? [];
|
|
233
|
+
arr.push(line);
|
|
234
|
+
byId.set(key, arr);
|
|
235
|
+
}
|
|
236
|
+
console.log(
|
|
237
|
+
` ${colors2.bold}Failure details (grouped):${colors2.reset}`
|
|
238
|
+
);
|
|
239
|
+
for (const [groupId, lines] of byId) {
|
|
240
|
+
if (groupId !== "__general__") {
|
|
241
|
+
console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
|
|
242
|
+
}
|
|
243
|
+
const debugIds = /* @__PURE__ */ new Set();
|
|
244
|
+
for (const l of lines) {
|
|
245
|
+
if (l.startsWith("[DEBUG-FAIL]")) {
|
|
246
|
+
try {
|
|
247
|
+
const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
248
|
+
if (parsed?.id) debugIds.add(String(parsed.id));
|
|
249
|
+
} catch {
|
|
239
250
|
}
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
for (const line of lines) {
|
|
254
|
+
if (line.startsWith("[FAIL]")) {
|
|
255
|
+
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
256
|
+
const failId = m?.[1];
|
|
257
|
+
if (failId && debugIds.has(failId)) continue;
|
|
258
|
+
console.log(` ${colors2.red}${line}${colors2.reset}`);
|
|
259
|
+
} else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
|
|
260
|
+
console.log(` ${colors2.yellow}${line}${colors2.reset}`);
|
|
261
|
+
} else if (line.startsWith("[STACK]")) {
|
|
262
|
+
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
263
|
+
} else if (line.startsWith("[DEBUG-FAIL]")) {
|
|
264
|
+
const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
|
|
265
|
+
try {
|
|
266
|
+
const parsed = JSON.parse(payload);
|
|
267
|
+
const { message, diff, expected, actual } = parsed;
|
|
268
|
+
if (message)
|
|
269
|
+
console.log(
|
|
270
|
+
` ${colors2.bold}${message}${colors2.reset}`
|
|
271
|
+
);
|
|
272
|
+
if (diff && Array.isArray(diff)) {
|
|
273
|
+
for (const dLine of diff)
|
|
274
|
+
console.log(" " + colorizeDiffLine(dLine));
|
|
275
|
+
} else {
|
|
276
|
+
console.log(" expected:");
|
|
277
|
+
console.log(
|
|
278
|
+
colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
|
|
279
|
+
);
|
|
280
|
+
console.log(" actual:");
|
|
281
|
+
console.log(
|
|
282
|
+
colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
|
|
283
|
+
);
|
|
284
|
+
}
|
|
285
|
+
const suggestions = suggestFixFromDiff(parsed);
|
|
286
|
+
if (suggestions.length) {
|
|
287
|
+
console.log(
|
|
288
|
+
` ${colors2.bold}Suggested fix:${colors2.reset}`
|
|
289
|
+
);
|
|
290
|
+
for (const s of suggestions)
|
|
291
|
+
console.log(` \u2022 ${s}`);
|
|
292
|
+
}
|
|
293
|
+
} catch {
|
|
294
|
+
console.log(` ${line}`);
|
|
295
|
+
}
|
|
296
|
+
} else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
297
|
+
const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
|
|
298
|
+
try {
|
|
299
|
+
const ctx = JSON.parse(payload);
|
|
300
|
+
console.log(` ${colors2.gray}context:${colors2.reset}`);
|
|
301
|
+
if (ctx.tool_schema) {
|
|
302
|
+
console.log(
|
|
303
|
+
colors2.gray + " tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n ") + colors2.reset
|
|
304
|
+
);
|
|
305
|
+
}
|
|
306
|
+
if (ctx.last_user_query) {
|
|
307
|
+
console.log(
|
|
308
|
+
colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
|
|
309
|
+
);
|
|
310
|
+
}
|
|
311
|
+
if (ctx.raw_model_text) {
|
|
312
|
+
console.log(
|
|
313
|
+
colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
|
|
314
|
+
);
|
|
315
|
+
}
|
|
316
|
+
if (ctx.parsed_tool_calls) {
|
|
317
|
+
console.log(
|
|
318
|
+
colors2.gray + " parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n ") + colors2.reset
|
|
319
|
+
);
|
|
320
|
+
}
|
|
321
|
+
if (ctx.ground_truth) {
|
|
322
|
+
console.log(
|
|
323
|
+
colors2.gray + " ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n ") + colors2.reset
|
|
324
|
+
);
|
|
325
|
+
}
|
|
326
|
+
if (ctx.finish_reason) {
|
|
327
|
+
console.log(
|
|
328
|
+
colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
|
|
329
|
+
);
|
|
330
|
+
}
|
|
331
|
+
} catch {
|
|
332
|
+
console.log(` ${line}`);
|
|
246
333
|
}
|
|
247
|
-
} catch {
|
|
248
|
-
console.log(` ${line}`);
|
|
249
334
|
}
|
|
250
335
|
}
|
|
251
336
|
}
|
|
@@ -316,7 +401,13 @@ async function runSingleBenchmark(model, benchmark, modelKey, config) {
|
|
|
316
401
|
}
|
|
317
402
|
}
|
|
318
403
|
async function evaluate(options) {
|
|
319
|
-
const {
|
|
404
|
+
const {
|
|
405
|
+
models,
|
|
406
|
+
benchmarks,
|
|
407
|
+
reporter = "console",
|
|
408
|
+
temperature,
|
|
409
|
+
maxTokens
|
|
410
|
+
} = options;
|
|
320
411
|
const modelEntries = [];
|
|
321
412
|
if (Array.isArray(models)) {
|
|
322
413
|
for (const m of models) modelEntries.push([void 0, m]);
|
|
@@ -332,11 +423,14 @@ async function evaluate(options) {
|
|
|
332
423
|
const allResults = [];
|
|
333
424
|
for (const [modelKey, model] of modelEntries) {
|
|
334
425
|
for (const benchmark of benchmarks) {
|
|
426
|
+
const config = {};
|
|
427
|
+
if (temperature !== void 0) config.temperature = temperature;
|
|
428
|
+
if (maxTokens !== void 0) config.maxTokens = maxTokens;
|
|
335
429
|
const evaluationResult = await runSingleBenchmark(
|
|
336
430
|
model,
|
|
337
431
|
benchmark,
|
|
338
432
|
modelKey,
|
|
339
|
-
|
|
433
|
+
Object.keys(config).length > 0 ? config : void 0
|
|
340
434
|
);
|
|
341
435
|
allResults.push(evaluationResult);
|
|
342
436
|
}
|
|
@@ -736,6 +830,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
736
830
|
const { function: tools, question: messages } = testCase;
|
|
737
831
|
const temp = config?.temperature;
|
|
738
832
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
833
|
+
const maxTok = config?.maxTokens;
|
|
834
|
+
const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
|
|
739
835
|
try {
|
|
740
836
|
const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
|
|
741
837
|
const nameMap = /* @__PURE__ */ new Map();
|
|
@@ -776,24 +872,32 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
776
872
|
`[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
|
|
777
873
|
);
|
|
778
874
|
}
|
|
875
|
+
const debugSummaryRef = {};
|
|
876
|
+
const providerOptions = {
|
|
877
|
+
toolCallMiddleware: {
|
|
878
|
+
debugSummary: debugSummaryRef
|
|
879
|
+
}
|
|
880
|
+
};
|
|
779
881
|
const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
|
|
780
882
|
model,
|
|
781
883
|
messages: flatMessages,
|
|
782
884
|
tools: toolsMap,
|
|
783
885
|
toolChoice: "auto",
|
|
886
|
+
providerOptions,
|
|
784
887
|
...temperature !== void 0 ? { temperature } : {},
|
|
785
|
-
|
|
786
|
-
providerOptions: {
|
|
787
|
-
toolCallMiddleware: {
|
|
788
|
-
originalToolSchemas: Object.fromEntries(
|
|
789
|
-
transformedTools.map((t) => [
|
|
790
|
-
t.name,
|
|
791
|
-
t.inputSchema
|
|
792
|
-
])
|
|
793
|
-
)
|
|
794
|
-
}
|
|
795
|
-
}
|
|
888
|
+
...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
|
|
796
889
|
});
|
|
890
|
+
const mwOriginalText = debugSummaryRef.originalText;
|
|
891
|
+
const mwParsedToolCalls = (() => {
|
|
892
|
+
const raw = debugSummaryRef.toolCalls;
|
|
893
|
+
if (!raw) return [];
|
|
894
|
+
try {
|
|
895
|
+
const arr = JSON.parse(raw);
|
|
896
|
+
return Array.isArray(arr) ? arr : [];
|
|
897
|
+
} catch {
|
|
898
|
+
return [];
|
|
899
|
+
}
|
|
900
|
+
})();
|
|
797
901
|
try {
|
|
798
902
|
caseLogs.push(
|
|
799
903
|
`[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
|
|
@@ -837,6 +941,24 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
837
941
|
} else {
|
|
838
942
|
caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
|
|
839
943
|
try {
|
|
944
|
+
let generateParamMismatchDiff2 = function(paramName, allowed, got) {
|
|
945
|
+
const diffLines = [];
|
|
946
|
+
diffLines.push(`@@ param ${paramName}`);
|
|
947
|
+
const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
|
|
948
|
+
const expectedLine = (() => {
|
|
949
|
+
if (allowedArray.length === 1) {
|
|
950
|
+
return `- expected: ${JSON.stringify(allowedArray[0])}`;
|
|
951
|
+
}
|
|
952
|
+
const formatted = allowedArray.map(
|
|
953
|
+
(v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
|
|
954
|
+
).join(", ");
|
|
955
|
+
return `- expected one of: ${formatted}`;
|
|
956
|
+
})();
|
|
957
|
+
diffLines.push(expectedLine);
|
|
958
|
+
diffLines.push(`+ got: ${JSON.stringify(got)}`);
|
|
959
|
+
return diffLines;
|
|
960
|
+
};
|
|
961
|
+
var generateParamMismatchDiff = generateParamMismatchDiff2;
|
|
840
962
|
const category = testCase.id.split("_")[0];
|
|
841
963
|
const diff = [];
|
|
842
964
|
const summarizeArgs = (args) => {
|
|
@@ -903,11 +1025,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
903
1025
|
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
904
1026
|
});
|
|
905
1027
|
if (!includes) {
|
|
906
|
-
diff.push(`@@ param ${k}`);
|
|
907
1028
|
diff.push(
|
|
908
|
-
|
|
1029
|
+
...generateParamMismatchDiff2(k, allowed, got)
|
|
909
1030
|
);
|
|
910
|
-
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
911
1031
|
}
|
|
912
1032
|
}
|
|
913
1033
|
}
|
|
@@ -997,11 +1117,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
997
1117
|
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
998
1118
|
});
|
|
999
1119
|
if (!includes) {
|
|
1000
|
-
diff.push(`@@ param ${k}`);
|
|
1001
1120
|
diff.push(
|
|
1002
|
-
|
|
1121
|
+
...generateParamMismatchDiff2(k, allowed, got)
|
|
1003
1122
|
);
|
|
1004
|
-
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
1005
1123
|
}
|
|
1006
1124
|
}
|
|
1007
1125
|
}
|
|
@@ -1018,6 +1136,28 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1018
1136
|
diff
|
|
1019
1137
|
})}`
|
|
1020
1138
|
);
|
|
1139
|
+
try {
|
|
1140
|
+
const lastUser = (() => {
|
|
1141
|
+
const reversed = [...flatMessages].reverse();
|
|
1142
|
+
const found = reversed.find(
|
|
1143
|
+
(m) => m.role === "user"
|
|
1144
|
+
);
|
|
1145
|
+
return found?.content ?? void 0;
|
|
1146
|
+
})();
|
|
1147
|
+
const contextPayload = {
|
|
1148
|
+
id: testCase.id,
|
|
1149
|
+
tool_schema: tools,
|
|
1150
|
+
last_user_query: lastUser,
|
|
1151
|
+
raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
|
|
1152
|
+
finish_reason: finishReason,
|
|
1153
|
+
parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
|
|
1154
|
+
ground_truth: possibleAnswer.ground_truth
|
|
1155
|
+
};
|
|
1156
|
+
caseLogs.push(
|
|
1157
|
+
`[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
|
|
1158
|
+
);
|
|
1159
|
+
} catch {
|
|
1160
|
+
}
|
|
1021
1161
|
} catch {
|
|
1022
1162
|
caseLogs.push(
|
|
1023
1163
|
`[DEBUG] ${testCase.id}: failed to build debug diff`
|