@ai-sdk-tool/eval 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -100,11 +100,17 @@ function suggestFixFromDiff(parsed) {
100
100
  if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
101
101
  const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
102
102
  for (const param of targets) {
103
- const allowedLine = diff.find(
103
+ const allowedOneOfLine = diff.find(
104
104
  (d) => String(d).startsWith("- expected one of:")
105
105
  );
106
- if (allowedLine) {
107
- const allowed = allowedLine.replace("- expected one of: ", "");
106
+ const allowedSingleLine = diff.find(
107
+ (d) => String(d).startsWith("- expected:")
108
+ );
109
+ if (allowedSingleLine) {
110
+ const value = allowedSingleLine.replace("- expected: ", "");
111
+ suggestions.push(`Set '${param}' to: ${value}.`);
112
+ } else if (allowedOneOfLine) {
113
+ const allowed = allowedOneOfLine.replace("- expected one of: ", "");
108
114
  suggestions.push(`Set '${param}' to one of: ${allowed}.`);
109
115
  } else {
110
116
  suggestions.push(`Adjust '${param}' to an allowed value.`);
@@ -149,61 +155,140 @@ function consoleDebugReporter(results) {
149
155
  }
150
156
  if (result.logs && result.logs.length) {
151
157
  const failLogs = result.logs.filter(
152
- (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
158
+ (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
153
159
  );
154
160
  const hasFails = failLogs.length > 0;
155
161
  if (hasFails) {
156
- console.log(` ${colors2.bold}Failure details:${colors2.reset}`);
157
- const debugIds = /* @__PURE__ */ new Set();
158
- for (const l of failLogs) {
159
- if (l.startsWith("[DEBUG-FAIL]")) {
162
+ let getTestIdFromLogLine2 = function(line) {
163
+ if (line.startsWith("[FAIL]")) {
164
+ const m = line.match(/^\[FAIL\]\s+([^:]+):/);
165
+ return m?.[1];
166
+ }
167
+ if (line.startsWith("[DEBUG-FAIL]")) {
160
168
  try {
161
- const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
162
- if (parsed?.id) debugIds.add(String(parsed.id));
169
+ const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
170
+ return String(parsed?.id ?? "");
163
171
  } catch {
164
172
  }
165
173
  }
166
- }
167
- for (const line of failLogs) {
168
- if (line.startsWith("[FAIL]")) {
169
- const m = line.match(/^\[FAIL\]\s+([^:]+):/);
170
- const failId = m?.[1];
171
- if (failId && debugIds.has(failId)) continue;
172
- console.log(` ${colors2.red}${line}${colors2.reset}`);
173
- } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
174
- console.log(` ${colors2.yellow}${line}${colors2.reset}`);
175
- } else if (line.startsWith("[STACK]")) {
176
- console.log(` ${colors2.gray}${line}${colors2.reset}`);
177
- } else if (line.startsWith("[DEBUG-FAIL]")) {
178
- const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
174
+ if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
179
175
  try {
180
- const parsed = JSON.parse(payload);
181
- const { id, expected, actual, message, diff } = parsed;
182
- console.log(
183
- ` ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
176
+ const parsed = JSON.parse(
177
+ line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
184
178
  );
185
- if (diff && Array.isArray(diff)) {
186
- for (const dLine of diff)
187
- console.log(" " + colorizeDiffLine(dLine));
188
- } else {
189
- console.log(" expected:");
190
- console.log(
191
- colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
192
- );
193
- console.log(" actual:");
194
- console.log(
195
- colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
196
- );
179
+ return String(parsed?.id ?? "");
180
+ } catch {
181
+ }
182
+ }
183
+ return void 0;
184
+ };
185
+ var getTestIdFromLogLine = getTestIdFromLogLine2;
186
+ const byId = /* @__PURE__ */ new Map();
187
+ for (const line of failLogs) {
188
+ const id = getTestIdFromLogLine2(line);
189
+ const key = id ?? "__general__";
190
+ const arr = byId.get(key) ?? [];
191
+ arr.push(line);
192
+ byId.set(key, arr);
193
+ }
194
+ console.log(
195
+ ` ${colors2.bold}Failure details (grouped):${colors2.reset}`
196
+ );
197
+ for (const [groupId, lines] of byId) {
198
+ if (groupId !== "__general__") {
199
+ console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
200
+ }
201
+ const debugIds = /* @__PURE__ */ new Set();
202
+ for (const l of lines) {
203
+ if (l.startsWith("[DEBUG-FAIL]")) {
204
+ try {
205
+ const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
206
+ if (parsed?.id) debugIds.add(String(parsed.id));
207
+ } catch {
197
208
  }
198
- const suggestions = suggestFixFromDiff(parsed);
199
- if (suggestions.length) {
200
- console.log(
201
- ` ${colors2.bold}Suggested fix:${colors2.reset}`
202
- );
203
- for (const s of suggestions) console.log(` \u2022 ${s}`);
209
+ }
210
+ }
211
+ for (const line of lines) {
212
+ if (line.startsWith("[FAIL]")) {
213
+ const m = line.match(/^\[FAIL\]\s+([^:]+):/);
214
+ const failId = m?.[1];
215
+ if (failId && debugIds.has(failId)) continue;
216
+ console.log(` ${colors2.red}${line}${colors2.reset}`);
217
+ } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
218
+ console.log(` ${colors2.yellow}${line}${colors2.reset}`);
219
+ } else if (line.startsWith("[STACK]")) {
220
+ console.log(` ${colors2.gray}${line}${colors2.reset}`);
221
+ } else if (line.startsWith("[DEBUG-FAIL]")) {
222
+ const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
223
+ try {
224
+ const parsed = JSON.parse(payload);
225
+ const { message, diff, expected, actual } = parsed;
226
+ if (message)
227
+ console.log(
228
+ ` ${colors2.bold}${message}${colors2.reset}`
229
+ );
230
+ if (diff && Array.isArray(diff)) {
231
+ for (const dLine of diff)
232
+ console.log(" " + colorizeDiffLine(dLine));
233
+ } else {
234
+ console.log(" expected:");
235
+ console.log(
236
+ colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
237
+ );
238
+ console.log(" actual:");
239
+ console.log(
240
+ colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
241
+ );
242
+ }
243
+ const suggestions = suggestFixFromDiff(parsed);
244
+ if (suggestions.length) {
245
+ console.log(
246
+ ` ${colors2.bold}Suggested fix:${colors2.reset}`
247
+ );
248
+ for (const s of suggestions)
249
+ console.log(` \u2022 ${s}`);
250
+ }
251
+ } catch {
252
+ console.log(` ${line}`);
253
+ }
254
+ } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
255
+ const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
256
+ try {
257
+ const ctx = JSON.parse(payload);
258
+ console.log(` ${colors2.gray}context:${colors2.reset}`);
259
+ if (ctx.tool_schema) {
260
+ console.log(
261
+ colors2.gray + " tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n ") + colors2.reset
262
+ );
263
+ }
264
+ if (ctx.last_user_query) {
265
+ console.log(
266
+ colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
267
+ );
268
+ }
269
+ if (ctx.raw_model_text) {
270
+ console.log(
271
+ colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
272
+ );
273
+ }
274
+ if (ctx.parsed_tool_calls) {
275
+ console.log(
276
+ colors2.gray + " parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n ") + colors2.reset
277
+ );
278
+ }
279
+ if (ctx.ground_truth) {
280
+ console.log(
281
+ colors2.gray + " ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n ") + colors2.reset
282
+ );
283
+ }
284
+ if (ctx.finish_reason) {
285
+ console.log(
286
+ colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
287
+ );
288
+ }
289
+ } catch {
290
+ console.log(` ${line}`);
204
291
  }
205
- } catch {
206
- console.log(` ${line}`);
207
292
  }
208
293
  }
209
294
  }
@@ -745,14 +830,32 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
745
830
  `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
746
831
  );
747
832
  }
833
+ const debugSummaryRef = {};
834
+ const providerOptions = {
835
+ toolCallMiddleware: {
836
+ debugSummary: debugSummaryRef
837
+ }
838
+ };
748
839
  const { toolCalls, text, finishReason } = await generateText({
749
840
  model,
750
841
  messages: flatMessages,
751
842
  tools: toolsMap,
752
843
  toolChoice: "auto",
844
+ providerOptions,
753
845
  ...temperature !== void 0 ? { temperature } : {},
754
846
  ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
755
847
  });
848
+ const mwOriginalText = debugSummaryRef.originalText;
849
+ const mwParsedToolCalls = (() => {
850
+ const raw = debugSummaryRef.toolCalls;
851
+ if (!raw) return [];
852
+ try {
853
+ const arr = JSON.parse(raw);
854
+ return Array.isArray(arr) ? arr : [];
855
+ } catch {
856
+ return [];
857
+ }
858
+ })();
756
859
  try {
757
860
  caseLogs.push(
758
861
  `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
@@ -796,6 +899,24 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
796
899
  } else {
797
900
  caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
798
901
  try {
902
+ let generateParamMismatchDiff2 = function(paramName, allowed, got) {
903
+ const diffLines = [];
904
+ diffLines.push(`@@ param ${paramName}`);
905
+ const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
906
+ const expectedLine = (() => {
907
+ if (allowedArray.length === 1) {
908
+ return `- expected: ${JSON.stringify(allowedArray[0])}`;
909
+ }
910
+ const formatted = allowedArray.map(
911
+ (v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
912
+ ).join(", ");
913
+ return `- expected one of: ${formatted}`;
914
+ })();
915
+ diffLines.push(expectedLine);
916
+ diffLines.push(`+ got: ${JSON.stringify(got)}`);
917
+ return diffLines;
918
+ };
919
+ var generateParamMismatchDiff = generateParamMismatchDiff2;
799
920
  const category = testCase.id.split("_")[0];
800
921
  const diff = [];
801
922
  const summarizeArgs = (args) => {
@@ -862,11 +983,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
862
983
  return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
863
984
  });
864
985
  if (!includes) {
865
- diff.push(`@@ param ${k}`);
866
986
  diff.push(
867
- `- expected one of: ${JSON.stringify(allowed)}`
987
+ ...generateParamMismatchDiff2(k, allowed, got)
868
988
  );
869
- diff.push(`+ got: ${JSON.stringify(got)}`);
870
989
  }
871
990
  }
872
991
  }
@@ -956,11 +1075,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
956
1075
  return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
957
1076
  });
958
1077
  if (!includes) {
959
- diff.push(`@@ param ${k}`);
960
1078
  diff.push(
961
- `- expected one of: ${JSON.stringify(allowed)}`
1079
+ ...generateParamMismatchDiff2(k, allowed, got)
962
1080
  );
963
- diff.push(`+ got: ${JSON.stringify(got)}`);
964
1081
  }
965
1082
  }
966
1083
  }
@@ -977,6 +1094,28 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
977
1094
  diff
978
1095
  })}`
979
1096
  );
1097
+ try {
1098
+ const lastUser = (() => {
1099
+ const reversed = [...flatMessages].reverse();
1100
+ const found = reversed.find(
1101
+ (m) => m.role === "user"
1102
+ );
1103
+ return found?.content ?? void 0;
1104
+ })();
1105
+ const contextPayload = {
1106
+ id: testCase.id,
1107
+ tool_schema: tools,
1108
+ last_user_query: lastUser,
1109
+ raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
1110
+ finish_reason: finishReason,
1111
+ parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
1112
+ ground_truth: possibleAnswer.ground_truth
1113
+ };
1114
+ caseLogs.push(
1115
+ `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
1116
+ );
1117
+ } catch {
1118
+ }
980
1119
  } catch {
981
1120
  caseLogs.push(
982
1121
  `[DEBUG] ${testCase.id}: failed to build debug diff`