@ai-sdk-tool/eval 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -142,11 +142,17 @@ function suggestFixFromDiff(parsed) {
142
142
  if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
143
143
  const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
144
144
  for (const param of targets) {
145
- const allowedLine = diff.find(
145
+ const allowedOneOfLine = diff.find(
146
146
  (d) => String(d).startsWith("- expected one of:")
147
147
  );
148
- if (allowedLine) {
149
- const allowed = allowedLine.replace("- expected one of: ", "");
148
+ const allowedSingleLine = diff.find(
149
+ (d) => String(d).startsWith("- expected:")
150
+ );
151
+ if (allowedSingleLine) {
152
+ const value = allowedSingleLine.replace("- expected: ", "");
153
+ suggestions.push(`Set '${param}' to: ${value}.`);
154
+ } else if (allowedOneOfLine) {
155
+ const allowed = allowedOneOfLine.replace("- expected one of: ", "");
150
156
  suggestions.push(`Set '${param}' to one of: ${allowed}.`);
151
157
  } else {
152
158
  suggestions.push(`Adjust '${param}' to an allowed value.`);
@@ -191,61 +197,140 @@ function consoleDebugReporter(results) {
191
197
  }
192
198
  if (result.logs && result.logs.length) {
193
199
  const failLogs = result.logs.filter(
194
- (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
200
+ (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
195
201
  );
196
202
  const hasFails = failLogs.length > 0;
197
203
  if (hasFails) {
198
- console.log(` ${colors2.bold}Failure details:${colors2.reset}`);
199
- const debugIds = /* @__PURE__ */ new Set();
200
- for (const l of failLogs) {
201
- if (l.startsWith("[DEBUG-FAIL]")) {
204
+ let getTestIdFromLogLine2 = function(line) {
205
+ if (line.startsWith("[FAIL]")) {
206
+ const m = line.match(/^\[FAIL\]\s+([^:]+):/);
207
+ return m?.[1];
208
+ }
209
+ if (line.startsWith("[DEBUG-FAIL]")) {
202
210
  try {
203
- const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
204
- if (parsed?.id) debugIds.add(String(parsed.id));
211
+ const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
212
+ return String(parsed?.id ?? "");
205
213
  } catch {
206
214
  }
207
215
  }
208
- }
209
- for (const line of failLogs) {
210
- if (line.startsWith("[FAIL]")) {
211
- const m = line.match(/^\[FAIL\]\s+([^:]+):/);
212
- const failId = m?.[1];
213
- if (failId && debugIds.has(failId)) continue;
214
- console.log(` ${colors2.red}${line}${colors2.reset}`);
215
- } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
216
- console.log(` ${colors2.yellow}${line}${colors2.reset}`);
217
- } else if (line.startsWith("[STACK]")) {
218
- console.log(` ${colors2.gray}${line}${colors2.reset}`);
219
- } else if (line.startsWith("[DEBUG-FAIL]")) {
220
- const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
216
+ if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
221
217
  try {
222
- const parsed = JSON.parse(payload);
223
- const { id, expected, actual, message, diff } = parsed;
224
- console.log(
225
- ` ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
218
+ const parsed = JSON.parse(
219
+ line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
226
220
  );
227
- if (diff && Array.isArray(diff)) {
228
- for (const dLine of diff)
229
- console.log(" " + colorizeDiffLine(dLine));
230
- } else {
231
- console.log(" expected:");
232
- console.log(
233
- colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
234
- );
235
- console.log(" actual:");
236
- console.log(
237
- colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
238
- );
221
+ return String(parsed?.id ?? "");
222
+ } catch {
223
+ }
224
+ }
225
+ return void 0;
226
+ };
227
+ var getTestIdFromLogLine = getTestIdFromLogLine2;
228
+ const byId = /* @__PURE__ */ new Map();
229
+ for (const line of failLogs) {
230
+ const id = getTestIdFromLogLine2(line);
231
+ const key = id ?? "__general__";
232
+ const arr = byId.get(key) ?? [];
233
+ arr.push(line);
234
+ byId.set(key, arr);
235
+ }
236
+ console.log(
237
+ ` ${colors2.bold}Failure details (grouped):${colors2.reset}`
238
+ );
239
+ for (const [groupId, lines] of byId) {
240
+ if (groupId !== "__general__") {
241
+ console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
242
+ }
243
+ const debugIds = /* @__PURE__ */ new Set();
244
+ for (const l of lines) {
245
+ if (l.startsWith("[DEBUG-FAIL]")) {
246
+ try {
247
+ const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
248
+ if (parsed?.id) debugIds.add(String(parsed.id));
249
+ } catch {
239
250
  }
240
- const suggestions = suggestFixFromDiff(parsed);
241
- if (suggestions.length) {
242
- console.log(
243
- ` ${colors2.bold}Suggested fix:${colors2.reset}`
244
- );
245
- for (const s of suggestions) console.log(` \u2022 ${s}`);
251
+ }
252
+ }
253
+ for (const line of lines) {
254
+ if (line.startsWith("[FAIL]")) {
255
+ const m = line.match(/^\[FAIL\]\s+([^:]+):/);
256
+ const failId = m?.[1];
257
+ if (failId && debugIds.has(failId)) continue;
258
+ console.log(` ${colors2.red}${line}${colors2.reset}`);
259
+ } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
260
+ console.log(` ${colors2.yellow}${line}${colors2.reset}`);
261
+ } else if (line.startsWith("[STACK]")) {
262
+ console.log(` ${colors2.gray}${line}${colors2.reset}`);
263
+ } else if (line.startsWith("[DEBUG-FAIL]")) {
264
+ const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
265
+ try {
266
+ const parsed = JSON.parse(payload);
267
+ const { message, diff, expected, actual } = parsed;
268
+ if (message)
269
+ console.log(
270
+ ` ${colors2.bold}${message}${colors2.reset}`
271
+ );
272
+ if (diff && Array.isArray(diff)) {
273
+ for (const dLine of diff)
274
+ console.log(" " + colorizeDiffLine(dLine));
275
+ } else {
276
+ console.log(" expected:");
277
+ console.log(
278
+ colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
279
+ );
280
+ console.log(" actual:");
281
+ console.log(
282
+ colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
283
+ );
284
+ }
285
+ const suggestions = suggestFixFromDiff(parsed);
286
+ if (suggestions.length) {
287
+ console.log(
288
+ ` ${colors2.bold}Suggested fix:${colors2.reset}`
289
+ );
290
+ for (const s of suggestions)
291
+ console.log(` \u2022 ${s}`);
292
+ }
293
+ } catch {
294
+ console.log(` ${line}`);
295
+ }
296
+ } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
297
+ const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
298
+ try {
299
+ const ctx = JSON.parse(payload);
300
+ console.log(` ${colors2.gray}context:${colors2.reset}`);
301
+ if (ctx.tool_schema) {
302
+ console.log(
303
+ colors2.gray + " tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n ") + colors2.reset
304
+ );
305
+ }
306
+ if (ctx.last_user_query) {
307
+ console.log(
308
+ colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
309
+ );
310
+ }
311
+ if (ctx.raw_model_text) {
312
+ console.log(
313
+ colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
314
+ );
315
+ }
316
+ if (ctx.parsed_tool_calls) {
317
+ console.log(
318
+ colors2.gray + " parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n ") + colors2.reset
319
+ );
320
+ }
321
+ if (ctx.ground_truth) {
322
+ console.log(
323
+ colors2.gray + " ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n ") + colors2.reset
324
+ );
325
+ }
326
+ if (ctx.finish_reason) {
327
+ console.log(
328
+ colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
329
+ );
330
+ }
331
+ } catch {
332
+ console.log(` ${line}`);
246
333
  }
247
- } catch {
248
- console.log(` ${line}`);
249
334
  }
250
335
  }
251
336
  }
@@ -316,7 +401,13 @@ async function runSingleBenchmark(model, benchmark, modelKey, config) {
316
401
  }
317
402
  }
318
403
  async function evaluate(options) {
319
- const { models, benchmarks, reporter = "console", temperature } = options;
404
+ const {
405
+ models,
406
+ benchmarks,
407
+ reporter = "console",
408
+ temperature,
409
+ maxTokens
410
+ } = options;
320
411
  const modelEntries = [];
321
412
  if (Array.isArray(models)) {
322
413
  for (const m of models) modelEntries.push([void 0, m]);
@@ -332,11 +423,14 @@ async function evaluate(options) {
332
423
  const allResults = [];
333
424
  for (const [modelKey, model] of modelEntries) {
334
425
  for (const benchmark of benchmarks) {
426
+ const config = {};
427
+ if (temperature !== void 0) config.temperature = temperature;
428
+ if (maxTokens !== void 0) config.maxTokens = maxTokens;
335
429
  const evaluationResult = await runSingleBenchmark(
336
430
  model,
337
431
  benchmark,
338
432
  modelKey,
339
- temperature !== void 0 ? { temperature } : void 0
433
+ Object.keys(config).length > 0 ? config : void 0
340
434
  );
341
435
  allResults.push(evaluationResult);
342
436
  }
@@ -736,6 +830,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
736
830
  const { function: tools, question: messages } = testCase;
737
831
  const temp = config?.temperature;
738
832
  const temperature = typeof temp === "number" ? temp : void 0;
833
+ const maxTok = config?.maxTokens;
834
+ const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
739
835
  try {
740
836
  const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
741
837
  const nameMap = /* @__PURE__ */ new Map();
@@ -776,24 +872,32 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
776
872
  `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
777
873
  );
778
874
  }
875
+ const debugSummaryRef = {};
876
+ const providerOptions = {
877
+ toolCallMiddleware: {
878
+ debugSummary: debugSummaryRef
879
+ }
880
+ };
779
881
  const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
780
882
  model,
781
883
  messages: flatMessages,
782
884
  tools: toolsMap,
783
885
  toolChoice: "auto",
886
+ providerOptions,
784
887
  ...temperature !== void 0 ? { temperature } : {},
785
- // Pass original schema information to middleware
786
- providerOptions: {
787
- toolCallMiddleware: {
788
- originalToolSchemas: Object.fromEntries(
789
- transformedTools.map((t) => [
790
- t.name,
791
- t.inputSchema
792
- ])
793
- )
794
- }
795
- }
888
+ ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
796
889
  });
890
+ const mwOriginalText = debugSummaryRef.originalText;
891
+ const mwParsedToolCalls = (() => {
892
+ const raw = debugSummaryRef.toolCalls;
893
+ if (!raw) return [];
894
+ try {
895
+ const arr = JSON.parse(raw);
896
+ return Array.isArray(arr) ? arr : [];
897
+ } catch {
898
+ return [];
899
+ }
900
+ })();
797
901
  try {
798
902
  caseLogs.push(
799
903
  `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
@@ -837,6 +941,24 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
837
941
  } else {
838
942
  caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
839
943
  try {
944
+ let generateParamMismatchDiff2 = function(paramName, allowed, got) {
945
+ const diffLines = [];
946
+ diffLines.push(`@@ param ${paramName}`);
947
+ const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
948
+ const expectedLine = (() => {
949
+ if (allowedArray.length === 1) {
950
+ return `- expected: ${JSON.stringify(allowedArray[0])}`;
951
+ }
952
+ const formatted = allowedArray.map(
953
+ (v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
954
+ ).join(", ");
955
+ return `- expected one of: ${formatted}`;
956
+ })();
957
+ diffLines.push(expectedLine);
958
+ diffLines.push(`+ got: ${JSON.stringify(got)}`);
959
+ return diffLines;
960
+ };
961
+ var generateParamMismatchDiff = generateParamMismatchDiff2;
840
962
  const category = testCase.id.split("_")[0];
841
963
  const diff = [];
842
964
  const summarizeArgs = (args) => {
@@ -903,11 +1025,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
903
1025
  return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
904
1026
  });
905
1027
  if (!includes) {
906
- diff.push(`@@ param ${k}`);
907
1028
  diff.push(
908
- `- expected one of: ${JSON.stringify(allowed)}`
1029
+ ...generateParamMismatchDiff2(k, allowed, got)
909
1030
  );
910
- diff.push(`+ got: ${JSON.stringify(got)}`);
911
1031
  }
912
1032
  }
913
1033
  }
@@ -997,11 +1117,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
997
1117
  return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
998
1118
  });
999
1119
  if (!includes) {
1000
- diff.push(`@@ param ${k}`);
1001
1120
  diff.push(
1002
- `- expected one of: ${JSON.stringify(allowed)}`
1121
+ ...generateParamMismatchDiff2(k, allowed, got)
1003
1122
  );
1004
- diff.push(`+ got: ${JSON.stringify(got)}`);
1005
1123
  }
1006
1124
  }
1007
1125
  }
@@ -1018,6 +1136,28 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1018
1136
  diff
1019
1137
  })}`
1020
1138
  );
1139
+ try {
1140
+ const lastUser = (() => {
1141
+ const reversed = [...flatMessages].reverse();
1142
+ const found = reversed.find(
1143
+ (m) => m.role === "user"
1144
+ );
1145
+ return found?.content ?? void 0;
1146
+ })();
1147
+ const contextPayload = {
1148
+ id: testCase.id,
1149
+ tool_schema: tools,
1150
+ last_user_query: lastUser,
1151
+ raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
1152
+ finish_reason: finishReason,
1153
+ parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
1154
+ ground_truth: possibleAnswer.ground_truth
1155
+ };
1156
+ caseLogs.push(
1157
+ `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
1158
+ );
1159
+ } catch {
1160
+ }
1021
1161
  } catch {
1022
1162
  caseLogs.push(
1023
1163
  `[DEBUG] ${testCase.id}: failed to build debug diff`