@ai-sdk-tool/eval 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +192 -53
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +192 -53
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -100,11 +100,17 @@ function suggestFixFromDiff(parsed) {
|
|
|
100
100
|
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
|
|
101
101
|
const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
|
|
102
102
|
for (const param of targets) {
|
|
103
|
-
const
|
|
103
|
+
const allowedOneOfLine = diff.find(
|
|
104
104
|
(d) => String(d).startsWith("- expected one of:")
|
|
105
105
|
);
|
|
106
|
-
|
|
107
|
-
|
|
106
|
+
const allowedSingleLine = diff.find(
|
|
107
|
+
(d) => String(d).startsWith("- expected:")
|
|
108
|
+
);
|
|
109
|
+
if (allowedSingleLine) {
|
|
110
|
+
const value = allowedSingleLine.replace("- expected: ", "");
|
|
111
|
+
suggestions.push(`Set '${param}' to: ${value}.`);
|
|
112
|
+
} else if (allowedOneOfLine) {
|
|
113
|
+
const allowed = allowedOneOfLine.replace("- expected one of: ", "");
|
|
108
114
|
suggestions.push(`Set '${param}' to one of: ${allowed}.`);
|
|
109
115
|
} else {
|
|
110
116
|
suggestions.push(`Adjust '${param}' to an allowed value.`);
|
|
@@ -149,61 +155,140 @@ function consoleDebugReporter(results) {
|
|
|
149
155
|
}
|
|
150
156
|
if (result.logs && result.logs.length) {
|
|
151
157
|
const failLogs = result.logs.filter(
|
|
152
|
-
(l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
|
|
158
|
+
(l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
|
|
153
159
|
);
|
|
154
160
|
const hasFails = failLogs.length > 0;
|
|
155
161
|
if (hasFails) {
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
162
|
+
let getTestIdFromLogLine2 = function(line) {
|
|
163
|
+
if (line.startsWith("[FAIL]")) {
|
|
164
|
+
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
165
|
+
return m?.[1];
|
|
166
|
+
}
|
|
167
|
+
if (line.startsWith("[DEBUG-FAIL]")) {
|
|
160
168
|
try {
|
|
161
|
-
const parsed = JSON.parse(
|
|
162
|
-
|
|
169
|
+
const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
170
|
+
return String(parsed?.id ?? "");
|
|
163
171
|
} catch {
|
|
164
172
|
}
|
|
165
173
|
}
|
|
166
|
-
|
|
167
|
-
for (const line of failLogs) {
|
|
168
|
-
if (line.startsWith("[FAIL]")) {
|
|
169
|
-
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
170
|
-
const failId = m?.[1];
|
|
171
|
-
if (failId && debugIds.has(failId)) continue;
|
|
172
|
-
console.log(` ${colors2.red}${line}${colors2.reset}`);
|
|
173
|
-
} else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
|
|
174
|
-
console.log(` ${colors2.yellow}${line}${colors2.reset}`);
|
|
175
|
-
} else if (line.startsWith("[STACK]")) {
|
|
176
|
-
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
177
|
-
} else if (line.startsWith("[DEBUG-FAIL]")) {
|
|
178
|
-
const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
|
|
174
|
+
if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
179
175
|
try {
|
|
180
|
-
const parsed = JSON.parse(
|
|
181
|
-
|
|
182
|
-
console.log(
|
|
183
|
-
` ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
|
|
176
|
+
const parsed = JSON.parse(
|
|
177
|
+
line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
|
|
184
178
|
);
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
179
|
+
return String(parsed?.id ?? "");
|
|
180
|
+
} catch {
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
return void 0;
|
|
184
|
+
};
|
|
185
|
+
var getTestIdFromLogLine = getTestIdFromLogLine2;
|
|
186
|
+
const byId = /* @__PURE__ */ new Map();
|
|
187
|
+
for (const line of failLogs) {
|
|
188
|
+
const id = getTestIdFromLogLine2(line);
|
|
189
|
+
const key = id ?? "__general__";
|
|
190
|
+
const arr = byId.get(key) ?? [];
|
|
191
|
+
arr.push(line);
|
|
192
|
+
byId.set(key, arr);
|
|
193
|
+
}
|
|
194
|
+
console.log(
|
|
195
|
+
` ${colors2.bold}Failure details (grouped):${colors2.reset}`
|
|
196
|
+
);
|
|
197
|
+
for (const [groupId, lines] of byId) {
|
|
198
|
+
if (groupId !== "__general__") {
|
|
199
|
+
console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
|
|
200
|
+
}
|
|
201
|
+
const debugIds = /* @__PURE__ */ new Set();
|
|
202
|
+
for (const l of lines) {
|
|
203
|
+
if (l.startsWith("[DEBUG-FAIL]")) {
|
|
204
|
+
try {
|
|
205
|
+
const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
206
|
+
if (parsed?.id) debugIds.add(String(parsed.id));
|
|
207
|
+
} catch {
|
|
197
208
|
}
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
for (const line of lines) {
|
|
212
|
+
if (line.startsWith("[FAIL]")) {
|
|
213
|
+
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
214
|
+
const failId = m?.[1];
|
|
215
|
+
if (failId && debugIds.has(failId)) continue;
|
|
216
|
+
console.log(` ${colors2.red}${line}${colors2.reset}`);
|
|
217
|
+
} else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
|
|
218
|
+
console.log(` ${colors2.yellow}${line}${colors2.reset}`);
|
|
219
|
+
} else if (line.startsWith("[STACK]")) {
|
|
220
|
+
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
221
|
+
} else if (line.startsWith("[DEBUG-FAIL]")) {
|
|
222
|
+
const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
|
|
223
|
+
try {
|
|
224
|
+
const parsed = JSON.parse(payload);
|
|
225
|
+
const { message, diff, expected, actual } = parsed;
|
|
226
|
+
if (message)
|
|
227
|
+
console.log(
|
|
228
|
+
` ${colors2.bold}${message}${colors2.reset}`
|
|
229
|
+
);
|
|
230
|
+
if (diff && Array.isArray(diff)) {
|
|
231
|
+
for (const dLine of diff)
|
|
232
|
+
console.log(" " + colorizeDiffLine(dLine));
|
|
233
|
+
} else {
|
|
234
|
+
console.log(" expected:");
|
|
235
|
+
console.log(
|
|
236
|
+
colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
|
|
237
|
+
);
|
|
238
|
+
console.log(" actual:");
|
|
239
|
+
console.log(
|
|
240
|
+
colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
|
|
241
|
+
);
|
|
242
|
+
}
|
|
243
|
+
const suggestions = suggestFixFromDiff(parsed);
|
|
244
|
+
if (suggestions.length) {
|
|
245
|
+
console.log(
|
|
246
|
+
` ${colors2.bold}Suggested fix:${colors2.reset}`
|
|
247
|
+
);
|
|
248
|
+
for (const s of suggestions)
|
|
249
|
+
console.log(` \u2022 ${s}`);
|
|
250
|
+
}
|
|
251
|
+
} catch {
|
|
252
|
+
console.log(` ${line}`);
|
|
253
|
+
}
|
|
254
|
+
} else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
255
|
+
const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
|
|
256
|
+
try {
|
|
257
|
+
const ctx = JSON.parse(payload);
|
|
258
|
+
console.log(` ${colors2.gray}context:${colors2.reset}`);
|
|
259
|
+
if (ctx.tool_schema) {
|
|
260
|
+
console.log(
|
|
261
|
+
colors2.gray + " tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n ") + colors2.reset
|
|
262
|
+
);
|
|
263
|
+
}
|
|
264
|
+
if (ctx.last_user_query) {
|
|
265
|
+
console.log(
|
|
266
|
+
colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
|
|
267
|
+
);
|
|
268
|
+
}
|
|
269
|
+
if (ctx.raw_model_text) {
|
|
270
|
+
console.log(
|
|
271
|
+
colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
|
|
272
|
+
);
|
|
273
|
+
}
|
|
274
|
+
if (ctx.parsed_tool_calls) {
|
|
275
|
+
console.log(
|
|
276
|
+
colors2.gray + " parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n ") + colors2.reset
|
|
277
|
+
);
|
|
278
|
+
}
|
|
279
|
+
if (ctx.ground_truth) {
|
|
280
|
+
console.log(
|
|
281
|
+
colors2.gray + " ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n ") + colors2.reset
|
|
282
|
+
);
|
|
283
|
+
}
|
|
284
|
+
if (ctx.finish_reason) {
|
|
285
|
+
console.log(
|
|
286
|
+
colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
|
|
287
|
+
);
|
|
288
|
+
}
|
|
289
|
+
} catch {
|
|
290
|
+
console.log(` ${line}`);
|
|
204
291
|
}
|
|
205
|
-
} catch {
|
|
206
|
-
console.log(` ${line}`);
|
|
207
292
|
}
|
|
208
293
|
}
|
|
209
294
|
}
|
|
@@ -745,14 +830,32 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
745
830
|
`[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
|
|
746
831
|
);
|
|
747
832
|
}
|
|
833
|
+
const debugSummaryRef = {};
|
|
834
|
+
const providerOptions = {
|
|
835
|
+
toolCallMiddleware: {
|
|
836
|
+
debugSummary: debugSummaryRef
|
|
837
|
+
}
|
|
838
|
+
};
|
|
748
839
|
const { toolCalls, text, finishReason } = await generateText({
|
|
749
840
|
model,
|
|
750
841
|
messages: flatMessages,
|
|
751
842
|
tools: toolsMap,
|
|
752
843
|
toolChoice: "auto",
|
|
844
|
+
providerOptions,
|
|
753
845
|
...temperature !== void 0 ? { temperature } : {},
|
|
754
846
|
...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
|
|
755
847
|
});
|
|
848
|
+
const mwOriginalText = debugSummaryRef.originalText;
|
|
849
|
+
const mwParsedToolCalls = (() => {
|
|
850
|
+
const raw = debugSummaryRef.toolCalls;
|
|
851
|
+
if (!raw) return [];
|
|
852
|
+
try {
|
|
853
|
+
const arr = JSON.parse(raw);
|
|
854
|
+
return Array.isArray(arr) ? arr : [];
|
|
855
|
+
} catch {
|
|
856
|
+
return [];
|
|
857
|
+
}
|
|
858
|
+
})();
|
|
756
859
|
try {
|
|
757
860
|
caseLogs.push(
|
|
758
861
|
`[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
|
|
@@ -796,6 +899,24 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
796
899
|
} else {
|
|
797
900
|
caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
|
|
798
901
|
try {
|
|
902
|
+
/**
 * Build the diff lines describing one parameter whose value did not match
 * any allowed value.
 *
 * The result always has three entries:
 *   1. `@@ param <name>`                 - header naming the parameter
 *   2. `- expected: <value>`             - when exactly one value is allowed, or
 *      `- expected one of: <v1>, <v2>…`  - otherwise
 *   3. `+ got: <value>`                  - the value that was actually received
 *
 * Every value is JSON-encoded so the output is unambiguous: strings are
 * quoted, which keeps an allowed empty string visible as `""` and lets a
 * reader tell the string "1" apart from the number 1.
 *
 * @param {string} paramName - Name of the mismatching parameter.
 * @param {*} allowed - A single allowed value, or an array of allowed values.
 * @param {*} got - The value that was received.
 * @returns {string[]} The diff lines, in display order.
 */
const generateParamMismatchDiff2 = function(paramName, allowed, got) {
  // JSON.stringify returns undefined for values it cannot encode
  // (undefined, functions, symbols); fall back to String() so the
  // candidate is still shown instead of silently vanishing from the list.
  const render = (value) => JSON.stringify(value) ?? String(value);

  // A bare (non-array) value is treated as a single allowed candidate.
  const candidates = Array.isArray(allowed) ? allowed : [allowed];

  const expectedLine = candidates.length === 1
    ? `- expected: ${render(candidates[0])}`
    : `- expected one of: ${candidates.map(render).join(", ")}`;

  return [
    `@@ param ${paramName}`,
    expectedLine,
    `+ got: ${render(got)}`
  ];
};
|
|
919
|
+
var generateParamMismatchDiff = generateParamMismatchDiff2;
|
|
799
920
|
const category = testCase.id.split("_")[0];
|
|
800
921
|
const diff = [];
|
|
801
922
|
const summarizeArgs = (args) => {
|
|
@@ -862,11 +983,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
862
983
|
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
863
984
|
});
|
|
864
985
|
if (!includes) {
|
|
865
|
-
diff.push(`@@ param ${k}`);
|
|
866
986
|
diff.push(
|
|
867
|
-
|
|
987
|
+
...generateParamMismatchDiff2(k, allowed, got)
|
|
868
988
|
);
|
|
869
|
-
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
870
989
|
}
|
|
871
990
|
}
|
|
872
991
|
}
|
|
@@ -956,11 +1075,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
956
1075
|
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
957
1076
|
});
|
|
958
1077
|
if (!includes) {
|
|
959
|
-
diff.push(`@@ param ${k}`);
|
|
960
1078
|
diff.push(
|
|
961
|
-
|
|
1079
|
+
...generateParamMismatchDiff2(k, allowed, got)
|
|
962
1080
|
);
|
|
963
|
-
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
964
1081
|
}
|
|
965
1082
|
}
|
|
966
1083
|
}
|
|
@@ -977,6 +1094,28 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
977
1094
|
diff
|
|
978
1095
|
})}`
|
|
979
1096
|
);
|
|
1097
|
+
try {
|
|
1098
|
+
const lastUser = (() => {
|
|
1099
|
+
const reversed = [...flatMessages].reverse();
|
|
1100
|
+
const found = reversed.find(
|
|
1101
|
+
(m) => m.role === "user"
|
|
1102
|
+
);
|
|
1103
|
+
return found?.content ?? void 0;
|
|
1104
|
+
})();
|
|
1105
|
+
const contextPayload = {
|
|
1106
|
+
id: testCase.id,
|
|
1107
|
+
tool_schema: tools,
|
|
1108
|
+
last_user_query: lastUser,
|
|
1109
|
+
raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
|
|
1110
|
+
finish_reason: finishReason,
|
|
1111
|
+
parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
|
|
1112
|
+
ground_truth: possibleAnswer.ground_truth
|
|
1113
|
+
};
|
|
1114
|
+
caseLogs.push(
|
|
1115
|
+
`[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
|
|
1116
|
+
);
|
|
1117
|
+
} catch {
|
|
1118
|
+
}
|
|
980
1119
|
} catch {
|
|
981
1120
|
caseLogs.push(
|
|
982
1121
|
`[DEBUG] ${testCase.id}: failed to build debug diff`
|