@ai-sdk-tool/eval 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +192 -53
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +192 -53
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -142,11 +142,17 @@ function suggestFixFromDiff(parsed) {
|
|
|
142
142
|
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
|
|
143
143
|
const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
|
|
144
144
|
for (const param of targets) {
|
|
145
|
-
const
|
|
145
|
+
const allowedOneOfLine = diff.find(
|
|
146
146
|
(d) => String(d).startsWith("- expected one of:")
|
|
147
147
|
);
|
|
148
|
-
|
|
149
|
-
|
|
148
|
+
const allowedSingleLine = diff.find(
|
|
149
|
+
(d) => String(d).startsWith("- expected:")
|
|
150
|
+
);
|
|
151
|
+
if (allowedSingleLine) {
|
|
152
|
+
const value = allowedSingleLine.replace("- expected: ", "");
|
|
153
|
+
suggestions.push(`Set '${param}' to: ${value}.`);
|
|
154
|
+
} else if (allowedOneOfLine) {
|
|
155
|
+
const allowed = allowedOneOfLine.replace("- expected one of: ", "");
|
|
150
156
|
suggestions.push(`Set '${param}' to one of: ${allowed}.`);
|
|
151
157
|
} else {
|
|
152
158
|
suggestions.push(`Adjust '${param}' to an allowed value.`);
|
|
@@ -191,61 +197,140 @@ function consoleDebugReporter(results) {
|
|
|
191
197
|
}
|
|
192
198
|
if (result.logs && result.logs.length) {
|
|
193
199
|
const failLogs = result.logs.filter(
|
|
194
|
-
(l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
|
|
200
|
+
(l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
|
|
195
201
|
);
|
|
196
202
|
const hasFails = failLogs.length > 0;
|
|
197
203
|
if (hasFails) {
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
204
|
+
let getTestIdFromLogLine2 = function(line) {
|
|
205
|
+
if (line.startsWith("[FAIL]")) {
|
|
206
|
+
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
207
|
+
return m?.[1];
|
|
208
|
+
}
|
|
209
|
+
if (line.startsWith("[DEBUG-FAIL]")) {
|
|
202
210
|
try {
|
|
203
|
-
const parsed = JSON.parse(
|
|
204
|
-
|
|
211
|
+
const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
212
|
+
return String(parsed?.id ?? "");
|
|
205
213
|
} catch {
|
|
206
214
|
}
|
|
207
215
|
}
|
|
208
|
-
|
|
209
|
-
for (const line of failLogs) {
|
|
210
|
-
if (line.startsWith("[FAIL]")) {
|
|
211
|
-
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
212
|
-
const failId = m?.[1];
|
|
213
|
-
if (failId && debugIds.has(failId)) continue;
|
|
214
|
-
console.log(` ${colors2.red}${line}${colors2.reset}`);
|
|
215
|
-
} else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
|
|
216
|
-
console.log(` ${colors2.yellow}${line}${colors2.reset}`);
|
|
217
|
-
} else if (line.startsWith("[STACK]")) {
|
|
218
|
-
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
219
|
-
} else if (line.startsWith("[DEBUG-FAIL]")) {
|
|
220
|
-
const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
|
|
216
|
+
if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
221
217
|
try {
|
|
222
|
-
const parsed = JSON.parse(
|
|
223
|
-
|
|
224
|
-
console.log(
|
|
225
|
-
` ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
|
|
218
|
+
const parsed = JSON.parse(
|
|
219
|
+
line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
|
|
226
220
|
);
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
221
|
+
return String(parsed?.id ?? "");
|
|
222
|
+
} catch {
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
return void 0;
|
|
226
|
+
};
|
|
227
|
+
var getTestIdFromLogLine = getTestIdFromLogLine2;
|
|
228
|
+
const byId = /* @__PURE__ */ new Map();
|
|
229
|
+
for (const line of failLogs) {
|
|
230
|
+
const id = getTestIdFromLogLine2(line);
|
|
231
|
+
const key = id ?? "__general__";
|
|
232
|
+
const arr = byId.get(key) ?? [];
|
|
233
|
+
arr.push(line);
|
|
234
|
+
byId.set(key, arr);
|
|
235
|
+
}
|
|
236
|
+
console.log(
|
|
237
|
+
` ${colors2.bold}Failure details (grouped):${colors2.reset}`
|
|
238
|
+
);
|
|
239
|
+
for (const [groupId, lines] of byId) {
|
|
240
|
+
if (groupId !== "__general__") {
|
|
241
|
+
console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
|
|
242
|
+
}
|
|
243
|
+
const debugIds = /* @__PURE__ */ new Set();
|
|
244
|
+
for (const l of lines) {
|
|
245
|
+
if (l.startsWith("[DEBUG-FAIL]")) {
|
|
246
|
+
try {
|
|
247
|
+
const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
248
|
+
if (parsed?.id) debugIds.add(String(parsed.id));
|
|
249
|
+
} catch {
|
|
239
250
|
}
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
for (const line of lines) {
|
|
254
|
+
if (line.startsWith("[FAIL]")) {
|
|
255
|
+
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
256
|
+
const failId = m?.[1];
|
|
257
|
+
if (failId && debugIds.has(failId)) continue;
|
|
258
|
+
console.log(` ${colors2.red}${line}${colors2.reset}`);
|
|
259
|
+
} else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
|
|
260
|
+
console.log(` ${colors2.yellow}${line}${colors2.reset}`);
|
|
261
|
+
} else if (line.startsWith("[STACK]")) {
|
|
262
|
+
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
263
|
+
} else if (line.startsWith("[DEBUG-FAIL]")) {
|
|
264
|
+
const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
|
|
265
|
+
try {
|
|
266
|
+
const parsed = JSON.parse(payload);
|
|
267
|
+
const { message, diff, expected, actual } = parsed;
|
|
268
|
+
if (message)
|
|
269
|
+
console.log(
|
|
270
|
+
` ${colors2.bold}${message}${colors2.reset}`
|
|
271
|
+
);
|
|
272
|
+
if (diff && Array.isArray(diff)) {
|
|
273
|
+
for (const dLine of diff)
|
|
274
|
+
console.log(" " + colorizeDiffLine(dLine));
|
|
275
|
+
} else {
|
|
276
|
+
console.log(" expected:");
|
|
277
|
+
console.log(
|
|
278
|
+
colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
|
|
279
|
+
);
|
|
280
|
+
console.log(" actual:");
|
|
281
|
+
console.log(
|
|
282
|
+
colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
|
|
283
|
+
);
|
|
284
|
+
}
|
|
285
|
+
const suggestions = suggestFixFromDiff(parsed);
|
|
286
|
+
if (suggestions.length) {
|
|
287
|
+
console.log(
|
|
288
|
+
` ${colors2.bold}Suggested fix:${colors2.reset}`
|
|
289
|
+
);
|
|
290
|
+
for (const s of suggestions)
|
|
291
|
+
console.log(` \u2022 ${s}`);
|
|
292
|
+
}
|
|
293
|
+
} catch {
|
|
294
|
+
console.log(` ${line}`);
|
|
295
|
+
}
|
|
296
|
+
} else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
297
|
+
const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
|
|
298
|
+
try {
|
|
299
|
+
const ctx = JSON.parse(payload);
|
|
300
|
+
console.log(` ${colors2.gray}context:${colors2.reset}`);
|
|
301
|
+
if (ctx.tool_schema) {
|
|
302
|
+
console.log(
|
|
303
|
+
colors2.gray + " tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n ") + colors2.reset
|
|
304
|
+
);
|
|
305
|
+
}
|
|
306
|
+
if (ctx.last_user_query) {
|
|
307
|
+
console.log(
|
|
308
|
+
colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
|
|
309
|
+
);
|
|
310
|
+
}
|
|
311
|
+
if (ctx.raw_model_text) {
|
|
312
|
+
console.log(
|
|
313
|
+
colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
|
|
314
|
+
);
|
|
315
|
+
}
|
|
316
|
+
if (ctx.parsed_tool_calls) {
|
|
317
|
+
console.log(
|
|
318
|
+
colors2.gray + " parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n ") + colors2.reset
|
|
319
|
+
);
|
|
320
|
+
}
|
|
321
|
+
if (ctx.ground_truth) {
|
|
322
|
+
console.log(
|
|
323
|
+
colors2.gray + " ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n ") + colors2.reset
|
|
324
|
+
);
|
|
325
|
+
}
|
|
326
|
+
if (ctx.finish_reason) {
|
|
327
|
+
console.log(
|
|
328
|
+
colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
|
|
329
|
+
);
|
|
330
|
+
}
|
|
331
|
+
} catch {
|
|
332
|
+
console.log(` ${line}`);
|
|
246
333
|
}
|
|
247
|
-
} catch {
|
|
248
|
-
console.log(` ${line}`);
|
|
249
334
|
}
|
|
250
335
|
}
|
|
251
336
|
}
|
|
@@ -787,14 +872,32 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
787
872
|
`[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
|
|
788
873
|
);
|
|
789
874
|
}
|
|
875
|
+
const debugSummaryRef = {};
|
|
876
|
+
const providerOptions = {
|
|
877
|
+
toolCallMiddleware: {
|
|
878
|
+
debugSummary: debugSummaryRef
|
|
879
|
+
}
|
|
880
|
+
};
|
|
790
881
|
const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
|
|
791
882
|
model,
|
|
792
883
|
messages: flatMessages,
|
|
793
884
|
tools: toolsMap,
|
|
794
885
|
toolChoice: "auto",
|
|
886
|
+
providerOptions,
|
|
795
887
|
...temperature !== void 0 ? { temperature } : {},
|
|
796
888
|
...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
|
|
797
889
|
});
|
|
890
|
+
const mwOriginalText = debugSummaryRef.originalText;
|
|
891
|
+
const mwParsedToolCalls = (() => {
|
|
892
|
+
const raw = debugSummaryRef.toolCalls;
|
|
893
|
+
if (!raw) return [];
|
|
894
|
+
try {
|
|
895
|
+
const arr = JSON.parse(raw);
|
|
896
|
+
return Array.isArray(arr) ? arr : [];
|
|
897
|
+
} catch {
|
|
898
|
+
return [];
|
|
899
|
+
}
|
|
900
|
+
})();
|
|
798
901
|
try {
|
|
799
902
|
caseLogs.push(
|
|
800
903
|
`[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
|
|
@@ -838,6 +941,24 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
838
941
|
} else {
|
|
839
942
|
caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
|
|
840
943
|
try {
|
|
944
|
+
let generateParamMismatchDiff2 = function(paramName, allowed, got) {
|
|
945
|
+
const diffLines = [];
|
|
946
|
+
diffLines.push(`@@ param ${paramName}`);
|
|
947
|
+
const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
|
|
948
|
+
const expectedLine = (() => {
|
|
949
|
+
if (allowedArray.length === 1) {
|
|
950
|
+
return `- expected: ${JSON.stringify(allowedArray[0])}`;
|
|
951
|
+
}
|
|
952
|
+
const formatted = allowedArray.map(
|
|
953
|
+
(v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
|
|
954
|
+
).join(", ");
|
|
955
|
+
return `- expected one of: ${formatted}`;
|
|
956
|
+
})();
|
|
957
|
+
diffLines.push(expectedLine);
|
|
958
|
+
diffLines.push(`+ got: ${JSON.stringify(got)}`);
|
|
959
|
+
return diffLines;
|
|
960
|
+
};
|
|
961
|
+
var generateParamMismatchDiff = generateParamMismatchDiff2;
|
|
841
962
|
const category = testCase.id.split("_")[0];
|
|
842
963
|
const diff = [];
|
|
843
964
|
const summarizeArgs = (args) => {
|
|
@@ -904,11 +1025,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
904
1025
|
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
905
1026
|
});
|
|
906
1027
|
if (!includes) {
|
|
907
|
-
diff.push(`@@ param ${k}`);
|
|
908
1028
|
diff.push(
|
|
909
|
-
|
|
1029
|
+
...generateParamMismatchDiff2(k, allowed, got)
|
|
910
1030
|
);
|
|
911
|
-
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
912
1031
|
}
|
|
913
1032
|
}
|
|
914
1033
|
}
|
|
@@ -998,11 +1117,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
998
1117
|
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
999
1118
|
});
|
|
1000
1119
|
if (!includes) {
|
|
1001
|
-
diff.push(`@@ param ${k}`);
|
|
1002
1120
|
diff.push(
|
|
1003
|
-
|
|
1121
|
+
...generateParamMismatchDiff2(k, allowed, got)
|
|
1004
1122
|
);
|
|
1005
|
-
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
1006
1123
|
}
|
|
1007
1124
|
}
|
|
1008
1125
|
}
|
|
@@ -1019,6 +1136,28 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
1019
1136
|
diff
|
|
1020
1137
|
})}`
|
|
1021
1138
|
);
|
|
1139
|
+
try {
|
|
1140
|
+
const lastUser = (() => {
|
|
1141
|
+
const reversed = [...flatMessages].reverse();
|
|
1142
|
+
const found = reversed.find(
|
|
1143
|
+
(m) => m.role === "user"
|
|
1144
|
+
);
|
|
1145
|
+
return found?.content ?? void 0;
|
|
1146
|
+
})();
|
|
1147
|
+
const contextPayload = {
|
|
1148
|
+
id: testCase.id,
|
|
1149
|
+
tool_schema: tools,
|
|
1150
|
+
last_user_query: lastUser,
|
|
1151
|
+
raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
|
|
1152
|
+
finish_reason: finishReason,
|
|
1153
|
+
parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
|
|
1154
|
+
ground_truth: possibleAnswer.ground_truth
|
|
1155
|
+
};
|
|
1156
|
+
caseLogs.push(
|
|
1157
|
+
`[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
|
|
1158
|
+
);
|
|
1159
|
+
} catch {
|
|
1160
|
+
}
|
|
1022
1161
|
} catch {
|
|
1023
1162
|
caseLogs.push(
|
|
1024
1163
|
`[DEBUG] ${testCase.id}: failed to build debug diff`
|