@ai-sdk-tool/eval 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -142,11 +142,17 @@ function suggestFixFromDiff(parsed) {
142
142
  if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
143
143
  const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
144
144
  for (const param of targets) {
145
- const allowedLine = diff.find(
145
+ const allowedOneOfLine = diff.find(
146
146
  (d) => String(d).startsWith("- expected one of:")
147
147
  );
148
- if (allowedLine) {
149
- const allowed = allowedLine.replace("- expected one of: ", "");
148
+ const allowedSingleLine = diff.find(
149
+ (d) => String(d).startsWith("- expected:")
150
+ );
151
+ if (allowedSingleLine) {
152
+ const value = allowedSingleLine.replace("- expected: ", "");
153
+ suggestions.push(`Set '${param}' to: ${value}.`);
154
+ } else if (allowedOneOfLine) {
155
+ const allowed = allowedOneOfLine.replace("- expected one of: ", "");
150
156
  suggestions.push(`Set '${param}' to one of: ${allowed}.`);
151
157
  } else {
152
158
  suggestions.push(`Adjust '${param}' to an allowed value.`);
@@ -191,61 +197,140 @@ function consoleDebugReporter(results) {
191
197
  }
192
198
  if (result.logs && result.logs.length) {
193
199
  const failLogs = result.logs.filter(
194
- (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
200
+ (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
195
201
  );
196
202
  const hasFails = failLogs.length > 0;
197
203
  if (hasFails) {
198
- console.log(` ${colors2.bold}Failure details:${colors2.reset}`);
199
- const debugIds = /* @__PURE__ */ new Set();
200
- for (const l of failLogs) {
201
- if (l.startsWith("[DEBUG-FAIL]")) {
204
+ let getTestIdFromLogLine2 = function(line) {
205
+ if (line.startsWith("[FAIL]")) {
206
+ const m = line.match(/^\[FAIL\]\s+([^:]+):/);
207
+ return m?.[1];
208
+ }
209
+ if (line.startsWith("[DEBUG-FAIL]")) {
202
210
  try {
203
- const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
204
- if (parsed?.id) debugIds.add(String(parsed.id));
211
+ const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
212
+ return String(parsed?.id ?? "");
205
213
  } catch {
206
214
  }
207
215
  }
208
- }
209
- for (const line of failLogs) {
210
- if (line.startsWith("[FAIL]")) {
211
- const m = line.match(/^\[FAIL\]\s+([^:]+):/);
212
- const failId = m?.[1];
213
- if (failId && debugIds.has(failId)) continue;
214
- console.log(` ${colors2.red}${line}${colors2.reset}`);
215
- } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
216
- console.log(` ${colors2.yellow}${line}${colors2.reset}`);
217
- } else if (line.startsWith("[STACK]")) {
218
- console.log(` ${colors2.gray}${line}${colors2.reset}`);
219
- } else if (line.startsWith("[DEBUG-FAIL]")) {
220
- const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
216
+ if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
221
217
  try {
222
- const parsed = JSON.parse(payload);
223
- const { id, expected, actual, message, diff } = parsed;
224
- console.log(
225
- ` ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
218
+ const parsed = JSON.parse(
219
+ line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
226
220
  );
227
- if (diff && Array.isArray(diff)) {
228
- for (const dLine of diff)
229
- console.log(" " + colorizeDiffLine(dLine));
230
- } else {
231
- console.log(" expected:");
232
- console.log(
233
- colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
234
- );
235
- console.log(" actual:");
236
- console.log(
237
- colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
238
- );
221
+ return String(parsed?.id ?? "");
222
+ } catch {
223
+ }
224
+ }
225
+ return void 0;
226
+ };
227
+ var getTestIdFromLogLine = getTestIdFromLogLine2;
228
+ const byId = /* @__PURE__ */ new Map();
229
+ for (const line of failLogs) {
230
+ const id = getTestIdFromLogLine2(line);
231
+ const key = id ?? "__general__";
232
+ const arr = byId.get(key) ?? [];
233
+ arr.push(line);
234
+ byId.set(key, arr);
235
+ }
236
+ console.log(
237
+ ` ${colors2.bold}Failure details (grouped):${colors2.reset}`
238
+ );
239
+ for (const [groupId, lines] of byId) {
240
+ if (groupId !== "__general__") {
241
+ console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
242
+ }
243
+ const debugIds = /* @__PURE__ */ new Set();
244
+ for (const l of lines) {
245
+ if (l.startsWith("[DEBUG-FAIL]")) {
246
+ try {
247
+ const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
248
+ if (parsed?.id) debugIds.add(String(parsed.id));
249
+ } catch {
239
250
  }
240
- const suggestions = suggestFixFromDiff(parsed);
241
- if (suggestions.length) {
242
- console.log(
243
- ` ${colors2.bold}Suggested fix:${colors2.reset}`
244
- );
245
- for (const s of suggestions) console.log(` \u2022 ${s}`);
251
+ }
252
+ }
253
+ for (const line of lines) {
254
+ if (line.startsWith("[FAIL]")) {
255
+ const m = line.match(/^\[FAIL\]\s+([^:]+):/);
256
+ const failId = m?.[1];
257
+ if (failId && debugIds.has(failId)) continue;
258
+ console.log(` ${colors2.red}${line}${colors2.reset}`);
259
+ } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
260
+ console.log(` ${colors2.yellow}${line}${colors2.reset}`);
261
+ } else if (line.startsWith("[STACK]")) {
262
+ console.log(` ${colors2.gray}${line}${colors2.reset}`);
263
+ } else if (line.startsWith("[DEBUG-FAIL]")) {
264
+ const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
265
+ try {
266
+ const parsed = JSON.parse(payload);
267
+ const { message, diff, expected, actual } = parsed;
268
+ if (message)
269
+ console.log(
270
+ ` ${colors2.bold}${message}${colors2.reset}`
271
+ );
272
+ if (diff && Array.isArray(diff)) {
273
+ for (const dLine of diff)
274
+ console.log(" " + colorizeDiffLine(dLine));
275
+ } else {
276
+ console.log(" expected:");
277
+ console.log(
278
+ colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
279
+ );
280
+ console.log(" actual:");
281
+ console.log(
282
+ colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
283
+ );
284
+ }
285
+ const suggestions = suggestFixFromDiff(parsed);
286
+ if (suggestions.length) {
287
+ console.log(
288
+ ` ${colors2.bold}Suggested fix:${colors2.reset}`
289
+ );
290
+ for (const s of suggestions)
291
+ console.log(` \u2022 ${s}`);
292
+ }
293
+ } catch {
294
+ console.log(` ${line}`);
295
+ }
296
+ } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
297
+ const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
298
+ try {
299
+ const ctx = JSON.parse(payload);
300
+ console.log(` ${colors2.gray}context:${colors2.reset}`);
301
+ if (ctx.tool_schema) {
302
+ console.log(
303
+ colors2.gray + " tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n ") + colors2.reset
304
+ );
305
+ }
306
+ if (ctx.last_user_query) {
307
+ console.log(
308
+ colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
309
+ );
310
+ }
311
+ if (ctx.raw_model_text) {
312
+ console.log(
313
+ colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
314
+ );
315
+ }
316
+ if (ctx.parsed_tool_calls) {
317
+ console.log(
318
+ colors2.gray + " parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n ") + colors2.reset
319
+ );
320
+ }
321
+ if (ctx.ground_truth) {
322
+ console.log(
323
+ colors2.gray + " ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n ") + colors2.reset
324
+ );
325
+ }
326
+ if (ctx.finish_reason) {
327
+ console.log(
328
+ colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
329
+ );
330
+ }
331
+ } catch {
332
+ console.log(` ${line}`);
246
333
  }
247
- } catch {
248
- console.log(` ${line}`);
249
334
  }
250
335
  }
251
336
  }
@@ -787,14 +872,32 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
787
872
  `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
788
873
  );
789
874
  }
875
+ const debugSummaryRef = {};
876
+ const providerOptions = {
877
+ toolCallMiddleware: {
878
+ debugSummary: debugSummaryRef
879
+ }
880
+ };
790
881
  const { toolCalls, text, finishReason } = await (0, import_ai.generateText)({
791
882
  model,
792
883
  messages: flatMessages,
793
884
  tools: toolsMap,
794
885
  toolChoice: "auto",
886
+ providerOptions,
795
887
  ...temperature !== void 0 ? { temperature } : {},
796
888
  ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
797
889
  });
890
+ const mwOriginalText = debugSummaryRef.originalText;
891
+ const mwParsedToolCalls = (() => {
892
+ const raw = debugSummaryRef.toolCalls;
893
+ if (!raw) return [];
894
+ try {
895
+ const arr = JSON.parse(raw);
896
+ return Array.isArray(arr) ? arr : [];
897
+ } catch {
898
+ return [];
899
+ }
900
+ })();
798
901
  try {
799
902
  caseLogs.push(
800
903
  `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
@@ -838,6 +941,24 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
838
941
  } else {
839
942
  caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
840
943
  try {
944
+ let generateParamMismatchDiff2 = function(paramName, allowed, got) {
945
+ const diffLines = [];
946
+ diffLines.push(`@@ param ${paramName}`);
947
+ const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
948
+ const expectedLine = (() => {
949
+ if (allowedArray.length === 1) {
950
+ return `- expected: ${JSON.stringify(allowedArray[0])}`;
951
+ }
952
+ const formatted = allowedArray.map(
953
+ (v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
954
+ ).join(", ");
955
+ return `- expected one of: ${formatted}`;
956
+ })();
957
+ diffLines.push(expectedLine);
958
+ diffLines.push(`+ got: ${JSON.stringify(got)}`);
959
+ return diffLines;
960
+ };
961
+ var generateParamMismatchDiff = generateParamMismatchDiff2;
841
962
  const category = testCase.id.split("_")[0];
842
963
  const diff = [];
843
964
  const summarizeArgs = (args) => {
@@ -904,11 +1025,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
904
1025
  return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
905
1026
  });
906
1027
  if (!includes) {
907
- diff.push(`@@ param ${k}`);
908
1028
  diff.push(
909
- `- expected one of: ${JSON.stringify(allowed)}`
1029
+ ...generateParamMismatchDiff2(k, allowed, got)
910
1030
  );
911
- diff.push(`+ got: ${JSON.stringify(got)}`);
912
1031
  }
913
1032
  }
914
1033
  }
@@ -998,11 +1117,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
998
1117
  return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
999
1118
  });
1000
1119
  if (!includes) {
1001
- diff.push(`@@ param ${k}`);
1002
1120
  diff.push(
1003
- `- expected one of: ${JSON.stringify(allowed)}`
1121
+ ...generateParamMismatchDiff2(k, allowed, got)
1004
1122
  );
1005
- diff.push(`+ got: ${JSON.stringify(got)}`);
1006
1123
  }
1007
1124
  }
1008
1125
  }
@@ -1019,6 +1136,28 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1019
1136
  diff
1020
1137
  })}`
1021
1138
  );
1139
+ try {
1140
+ const lastUser = (() => {
1141
+ const reversed = [...flatMessages].reverse();
1142
+ const found = reversed.find(
1143
+ (m) => m.role === "user"
1144
+ );
1145
+ return found?.content ?? void 0;
1146
+ })();
1147
+ const contextPayload = {
1148
+ id: testCase.id,
1149
+ tool_schema: tools,
1150
+ last_user_query: lastUser,
1151
+ raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
1152
+ finish_reason: finishReason,
1153
+ parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
1154
+ ground_truth: possibleAnswer.ground_truth
1155
+ };
1156
+ caseLogs.push(
1157
+ `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
1158
+ );
1159
+ } catch {
1160
+ }
1022
1161
  } catch {
1023
1162
  caseLogs.push(
1024
1163
  `[DEBUG] ${testCase.id}: failed to build debug diff`