@ai-sdk-tool/eval 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +206 -66
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +206 -66
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -100,11 +100,17 @@ function suggestFixFromDiff(parsed) {
|
|
|
100
100
|
if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
|
|
101
101
|
const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
|
|
102
102
|
for (const param of targets) {
|
|
103
|
-
const
|
|
103
|
+
const allowedOneOfLine = diff.find(
|
|
104
104
|
(d) => String(d).startsWith("- expected one of:")
|
|
105
105
|
);
|
|
106
|
-
|
|
107
|
-
|
|
106
|
+
const allowedSingleLine = diff.find(
|
|
107
|
+
(d) => String(d).startsWith("- expected:")
|
|
108
|
+
);
|
|
109
|
+
if (allowedSingleLine) {
|
|
110
|
+
const value = allowedSingleLine.replace("- expected: ", "");
|
|
111
|
+
suggestions.push(`Set '${param}' to: ${value}.`);
|
|
112
|
+
} else if (allowedOneOfLine) {
|
|
113
|
+
const allowed = allowedOneOfLine.replace("- expected one of: ", "");
|
|
108
114
|
suggestions.push(`Set '${param}' to one of: ${allowed}.`);
|
|
109
115
|
} else {
|
|
110
116
|
suggestions.push(`Adjust '${param}' to an allowed value.`);
|
|
@@ -149,61 +155,140 @@ function consoleDebugReporter(results) {
|
|
|
149
155
|
}
|
|
150
156
|
if (result.logs && result.logs.length) {
|
|
151
157
|
const failLogs = result.logs.filter(
|
|
152
|
-
(l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
|
|
158
|
+
(l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
|
|
153
159
|
);
|
|
154
160
|
const hasFails = failLogs.length > 0;
|
|
155
161
|
if (hasFails) {
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
162
|
+
let getTestIdFromLogLine2 = function(line) {
|
|
163
|
+
if (line.startsWith("[FAIL]")) {
|
|
164
|
+
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
165
|
+
return m?.[1];
|
|
166
|
+
}
|
|
167
|
+
if (line.startsWith("[DEBUG-FAIL]")) {
|
|
160
168
|
try {
|
|
161
|
-
const parsed = JSON.parse(
|
|
162
|
-
|
|
169
|
+
const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
170
|
+
return String(parsed?.id ?? "");
|
|
163
171
|
} catch {
|
|
164
172
|
}
|
|
165
173
|
}
|
|
166
|
-
|
|
167
|
-
for (const line of failLogs) {
|
|
168
|
-
if (line.startsWith("[FAIL]")) {
|
|
169
|
-
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
170
|
-
const failId = m?.[1];
|
|
171
|
-
if (failId && debugIds.has(failId)) continue;
|
|
172
|
-
console.log(` ${colors2.red}${line}${colors2.reset}`);
|
|
173
|
-
} else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
|
|
174
|
-
console.log(` ${colors2.yellow}${line}${colors2.reset}`);
|
|
175
|
-
} else if (line.startsWith("[STACK]")) {
|
|
176
|
-
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
177
|
-
} else if (line.startsWith("[DEBUG-FAIL]")) {
|
|
178
|
-
const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
|
|
174
|
+
if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
179
175
|
try {
|
|
180
|
-
const parsed = JSON.parse(
|
|
181
|
-
|
|
182
|
-
console.log(
|
|
183
|
-
` ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
|
|
176
|
+
const parsed = JSON.parse(
|
|
177
|
+
line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
|
|
184
178
|
);
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
179
|
+
return String(parsed?.id ?? "");
|
|
180
|
+
} catch {
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
return void 0;
|
|
184
|
+
};
|
|
185
|
+
var getTestIdFromLogLine = getTestIdFromLogLine2;
|
|
186
|
+
const byId = /* @__PURE__ */ new Map();
|
|
187
|
+
for (const line of failLogs) {
|
|
188
|
+
const id = getTestIdFromLogLine2(line);
|
|
189
|
+
const key = id ?? "__general__";
|
|
190
|
+
const arr = byId.get(key) ?? [];
|
|
191
|
+
arr.push(line);
|
|
192
|
+
byId.set(key, arr);
|
|
193
|
+
}
|
|
194
|
+
console.log(
|
|
195
|
+
` ${colors2.bold}Failure details (grouped):${colors2.reset}`
|
|
196
|
+
);
|
|
197
|
+
for (const [groupId, lines] of byId) {
|
|
198
|
+
if (groupId !== "__general__") {
|
|
199
|
+
console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
|
|
200
|
+
}
|
|
201
|
+
const debugIds = /* @__PURE__ */ new Set();
|
|
202
|
+
for (const l of lines) {
|
|
203
|
+
if (l.startsWith("[DEBUG-FAIL]")) {
|
|
204
|
+
try {
|
|
205
|
+
const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
|
|
206
|
+
if (parsed?.id) debugIds.add(String(parsed.id));
|
|
207
|
+
} catch {
|
|
197
208
|
}
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
for (const line of lines) {
|
|
212
|
+
if (line.startsWith("[FAIL]")) {
|
|
213
|
+
const m = line.match(/^\[FAIL\]\s+([^:]+):/);
|
|
214
|
+
const failId = m?.[1];
|
|
215
|
+
if (failId && debugIds.has(failId)) continue;
|
|
216
|
+
console.log(` ${colors2.red}${line}${colors2.reset}`);
|
|
217
|
+
} else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
|
|
218
|
+
console.log(` ${colors2.yellow}${line}${colors2.reset}`);
|
|
219
|
+
} else if (line.startsWith("[STACK]")) {
|
|
220
|
+
console.log(` ${colors2.gray}${line}${colors2.reset}`);
|
|
221
|
+
} else if (line.startsWith("[DEBUG-FAIL]")) {
|
|
222
|
+
const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
|
|
223
|
+
try {
|
|
224
|
+
const parsed = JSON.parse(payload);
|
|
225
|
+
const { message, diff, expected, actual } = parsed;
|
|
226
|
+
if (message)
|
|
227
|
+
console.log(
|
|
228
|
+
` ${colors2.bold}${message}${colors2.reset}`
|
|
229
|
+
);
|
|
230
|
+
if (diff && Array.isArray(diff)) {
|
|
231
|
+
for (const dLine of diff)
|
|
232
|
+
console.log(" " + colorizeDiffLine(dLine));
|
|
233
|
+
} else {
|
|
234
|
+
console.log(" expected:");
|
|
235
|
+
console.log(
|
|
236
|
+
colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
|
|
237
|
+
);
|
|
238
|
+
console.log(" actual:");
|
|
239
|
+
console.log(
|
|
240
|
+
colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
|
|
241
|
+
);
|
|
242
|
+
}
|
|
243
|
+
const suggestions = suggestFixFromDiff(parsed);
|
|
244
|
+
if (suggestions.length) {
|
|
245
|
+
console.log(
|
|
246
|
+
` ${colors2.bold}Suggested fix:${colors2.reset}`
|
|
247
|
+
);
|
|
248
|
+
for (const s of suggestions)
|
|
249
|
+
console.log(` \u2022 ${s}`);
|
|
250
|
+
}
|
|
251
|
+
} catch {
|
|
252
|
+
console.log(` ${line}`);
|
|
253
|
+
}
|
|
254
|
+
} else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
|
|
255
|
+
const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
|
|
256
|
+
try {
|
|
257
|
+
const ctx = JSON.parse(payload);
|
|
258
|
+
console.log(` ${colors2.gray}context:${colors2.reset}`);
|
|
259
|
+
if (ctx.tool_schema) {
|
|
260
|
+
console.log(
|
|
261
|
+
colors2.gray + " tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n ") + colors2.reset
|
|
262
|
+
);
|
|
263
|
+
}
|
|
264
|
+
if (ctx.last_user_query) {
|
|
265
|
+
console.log(
|
|
266
|
+
colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
|
|
267
|
+
);
|
|
268
|
+
}
|
|
269
|
+
if (ctx.raw_model_text) {
|
|
270
|
+
console.log(
|
|
271
|
+
colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
|
|
272
|
+
);
|
|
273
|
+
}
|
|
274
|
+
if (ctx.parsed_tool_calls) {
|
|
275
|
+
console.log(
|
|
276
|
+
colors2.gray + " parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n ") + colors2.reset
|
|
277
|
+
);
|
|
278
|
+
}
|
|
279
|
+
if (ctx.ground_truth) {
|
|
280
|
+
console.log(
|
|
281
|
+
colors2.gray + " ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n ") + colors2.reset
|
|
282
|
+
);
|
|
283
|
+
}
|
|
284
|
+
if (ctx.finish_reason) {
|
|
285
|
+
console.log(
|
|
286
|
+
colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
|
|
287
|
+
);
|
|
288
|
+
}
|
|
289
|
+
} catch {
|
|
290
|
+
console.log(` ${line}`);
|
|
204
291
|
}
|
|
205
|
-
} catch {
|
|
206
|
-
console.log(` ${line}`);
|
|
207
292
|
}
|
|
208
293
|
}
|
|
209
294
|
}
|
|
@@ -274,7 +359,13 @@ async function runSingleBenchmark(model, benchmark, modelKey, config) {
|
|
|
274
359
|
}
|
|
275
360
|
}
|
|
276
361
|
async function evaluate(options) {
|
|
277
|
-
const {
|
|
362
|
+
const {
|
|
363
|
+
models,
|
|
364
|
+
benchmarks,
|
|
365
|
+
reporter = "console",
|
|
366
|
+
temperature,
|
|
367
|
+
maxTokens
|
|
368
|
+
} = options;
|
|
278
369
|
const modelEntries = [];
|
|
279
370
|
if (Array.isArray(models)) {
|
|
280
371
|
for (const m of models) modelEntries.push([void 0, m]);
|
|
@@ -290,11 +381,14 @@ async function evaluate(options) {
|
|
|
290
381
|
const allResults = [];
|
|
291
382
|
for (const [modelKey, model] of modelEntries) {
|
|
292
383
|
for (const benchmark of benchmarks) {
|
|
384
|
+
const config = {};
|
|
385
|
+
if (temperature !== void 0) config.temperature = temperature;
|
|
386
|
+
if (maxTokens !== void 0) config.maxTokens = maxTokens;
|
|
293
387
|
const evaluationResult = await runSingleBenchmark(
|
|
294
388
|
model,
|
|
295
389
|
benchmark,
|
|
296
390
|
modelKey,
|
|
297
|
-
|
|
391
|
+
Object.keys(config).length > 0 ? config : void 0
|
|
298
392
|
);
|
|
299
393
|
allResults.push(evaluationResult);
|
|
300
394
|
}
|
|
@@ -694,6 +788,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
694
788
|
const { function: tools, question: messages } = testCase;
|
|
695
789
|
const temp = config?.temperature;
|
|
696
790
|
const temperature = typeof temp === "number" ? temp : void 0;
|
|
791
|
+
const maxTok = config?.maxTokens;
|
|
792
|
+
const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
|
|
697
793
|
try {
|
|
698
794
|
const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
|
|
699
795
|
const nameMap = /* @__PURE__ */ new Map();
|
|
@@ -734,24 +830,32 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
734
830
|
`[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
|
|
735
831
|
);
|
|
736
832
|
}
|
|
833
|
+
const debugSummaryRef = {};
|
|
834
|
+
const providerOptions = {
|
|
835
|
+
toolCallMiddleware: {
|
|
836
|
+
debugSummary: debugSummaryRef
|
|
837
|
+
}
|
|
838
|
+
};
|
|
737
839
|
const { toolCalls, text, finishReason } = await generateText({
|
|
738
840
|
model,
|
|
739
841
|
messages: flatMessages,
|
|
740
842
|
tools: toolsMap,
|
|
741
843
|
toolChoice: "auto",
|
|
844
|
+
providerOptions,
|
|
742
845
|
...temperature !== void 0 ? { temperature } : {},
|
|
743
|
-
|
|
744
|
-
providerOptions: {
|
|
745
|
-
toolCallMiddleware: {
|
|
746
|
-
originalToolSchemas: Object.fromEntries(
|
|
747
|
-
transformedTools.map((t) => [
|
|
748
|
-
t.name,
|
|
749
|
-
t.inputSchema
|
|
750
|
-
])
|
|
751
|
-
)
|
|
752
|
-
}
|
|
753
|
-
}
|
|
846
|
+
...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
|
|
754
847
|
});
|
|
848
|
+
const mwOriginalText = debugSummaryRef.originalText;
|
|
849
|
+
const mwParsedToolCalls = (() => {
|
|
850
|
+
const raw = debugSummaryRef.toolCalls;
|
|
851
|
+
if (!raw) return [];
|
|
852
|
+
try {
|
|
853
|
+
const arr = JSON.parse(raw);
|
|
854
|
+
return Array.isArray(arr) ? arr : [];
|
|
855
|
+
} catch {
|
|
856
|
+
return [];
|
|
857
|
+
}
|
|
858
|
+
})();
|
|
755
859
|
try {
|
|
756
860
|
caseLogs.push(
|
|
757
861
|
`[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
|
|
@@ -795,6 +899,24 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
795
899
|
} else {
|
|
796
900
|
caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
|
|
797
901
|
try {
|
|
902
|
+
let generateParamMismatchDiff2 = function(paramName, allowed, got) {
|
|
903
|
+
const diffLines = [];
|
|
904
|
+
diffLines.push(`@@ param ${paramName}`);
|
|
905
|
+
const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
|
|
906
|
+
const expectedLine = (() => {
|
|
907
|
+
if (allowedArray.length === 1) {
|
|
908
|
+
return `- expected: ${JSON.stringify(allowedArray[0])}`;
|
|
909
|
+
}
|
|
910
|
+
const formatted = allowedArray.map(
|
|
911
|
+
(v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
|
|
912
|
+
).join(", ");
|
|
913
|
+
return `- expected one of: ${formatted}`;
|
|
914
|
+
})();
|
|
915
|
+
diffLines.push(expectedLine);
|
|
916
|
+
diffLines.push(`+ got: ${JSON.stringify(got)}`);
|
|
917
|
+
return diffLines;
|
|
918
|
+
};
|
|
919
|
+
var generateParamMismatchDiff = generateParamMismatchDiff2;
|
|
798
920
|
const category = testCase.id.split("_")[0];
|
|
799
921
|
const diff = [];
|
|
800
922
|
const summarizeArgs = (args) => {
|
|
@@ -861,11 +983,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
861
983
|
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
862
984
|
});
|
|
863
985
|
if (!includes) {
|
|
864
|
-
diff.push(`@@ param ${k}`);
|
|
865
986
|
diff.push(
|
|
866
|
-
|
|
987
|
+
...generateParamMismatchDiff2(k, allowed, got)
|
|
867
988
|
);
|
|
868
|
-
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
869
989
|
}
|
|
870
990
|
}
|
|
871
991
|
}
|
|
@@ -955,11 +1075,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
955
1075
|
return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
|
|
956
1076
|
});
|
|
957
1077
|
if (!includes) {
|
|
958
|
-
diff.push(`@@ param ${k}`);
|
|
959
1078
|
diff.push(
|
|
960
|
-
|
|
1079
|
+
...generateParamMismatchDiff2(k, allowed, got)
|
|
961
1080
|
);
|
|
962
|
-
diff.push(`+ got: ${JSON.stringify(got)}`);
|
|
963
1081
|
}
|
|
964
1082
|
}
|
|
965
1083
|
}
|
|
@@ -976,6 +1094,28 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
976
1094
|
diff
|
|
977
1095
|
})}`
|
|
978
1096
|
);
|
|
1097
|
+
try {
|
|
1098
|
+
const lastUser = (() => {
|
|
1099
|
+
const reversed = [...flatMessages].reverse();
|
|
1100
|
+
const found = reversed.find(
|
|
1101
|
+
(m) => m.role === "user"
|
|
1102
|
+
);
|
|
1103
|
+
return found?.content ?? void 0;
|
|
1104
|
+
})();
|
|
1105
|
+
const contextPayload = {
|
|
1106
|
+
id: testCase.id,
|
|
1107
|
+
tool_schema: tools,
|
|
1108
|
+
last_user_query: lastUser,
|
|
1109
|
+
raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
|
|
1110
|
+
finish_reason: finishReason,
|
|
1111
|
+
parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
|
|
1112
|
+
ground_truth: possibleAnswer.ground_truth
|
|
1113
|
+
};
|
|
1114
|
+
caseLogs.push(
|
|
1115
|
+
`[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
|
|
1116
|
+
);
|
|
1117
|
+
} catch {
|
|
1118
|
+
}
|
|
979
1119
|
} catch {
|
|
980
1120
|
caseLogs.push(
|
|
981
1121
|
`[DEBUG] ${testCase.id}: failed to build debug diff`
|