@ai-sdk-tool/eval 0.1.6 → 0.1.8

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -100,11 +100,17 @@ function suggestFixFromDiff(parsed) {
100
100
  if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
101
101
  const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
102
102
  for (const param of targets) {
103
- const allowedLine = diff.find(
103
+ const allowedOneOfLine = diff.find(
104
104
  (d) => String(d).startsWith("- expected one of:")
105
105
  );
106
- if (allowedLine) {
107
- const allowed = allowedLine.replace("- expected one of: ", "");
106
+ const allowedSingleLine = diff.find(
107
+ (d) => String(d).startsWith("- expected:")
108
+ );
109
+ if (allowedSingleLine) {
110
+ const value = allowedSingleLine.replace("- expected: ", "");
111
+ suggestions.push(`Set '${param}' to: ${value}.`);
112
+ } else if (allowedOneOfLine) {
113
+ const allowed = allowedOneOfLine.replace("- expected one of: ", "");
108
114
  suggestions.push(`Set '${param}' to one of: ${allowed}.`);
109
115
  } else {
110
116
  suggestions.push(`Adjust '${param}' to an allowed value.`);
@@ -149,61 +155,140 @@ function consoleDebugReporter(results) {
149
155
  }
150
156
  if (result.logs && result.logs.length) {
151
157
  const failLogs = result.logs.filter(
152
- (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]")
158
+ (l) => l.startsWith("[FAIL]") || l.startsWith("[ERROR]") || l.startsWith("[FATAL]") || l.startsWith("[STACK]") || l.startsWith("[DEBUG-FAIL]") || l.startsWith("[DEBUG-FAIL-CONTEXT]")
153
159
  );
154
160
  const hasFails = failLogs.length > 0;
155
161
  if (hasFails) {
156
- console.log(` ${colors2.bold}Failure details:${colors2.reset}`);
157
- const debugIds = /* @__PURE__ */ new Set();
158
- for (const l of failLogs) {
159
- if (l.startsWith("[DEBUG-FAIL]")) {
162
+ let getTestIdFromLogLine2 = function(line) {
163
+ if (line.startsWith("[FAIL]")) {
164
+ const m = line.match(/^\[FAIL\]\s+([^:]+):/);
165
+ return m?.[1];
166
+ }
167
+ if (line.startsWith("[DEBUG-FAIL]")) {
160
168
  try {
161
- const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
162
- if (parsed?.id) debugIds.add(String(parsed.id));
169
+ const parsed = JSON.parse(line.replace(/^\[DEBUG-FAIL\] /, ""));
170
+ return String(parsed?.id ?? "");
163
171
  } catch {
164
172
  }
165
173
  }
166
- }
167
- for (const line of failLogs) {
168
- if (line.startsWith("[FAIL]")) {
169
- const m = line.match(/^\[FAIL\]\s+([^:]+):/);
170
- const failId = m?.[1];
171
- if (failId && debugIds.has(failId)) continue;
172
- console.log(` ${colors2.red}${line}${colors2.reset}`);
173
- } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
174
- console.log(` ${colors2.yellow}${line}${colors2.reset}`);
175
- } else if (line.startsWith("[STACK]")) {
176
- console.log(` ${colors2.gray}${line}${colors2.reset}`);
177
- } else if (line.startsWith("[DEBUG-FAIL]")) {
178
- const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
174
+ if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
179
175
  try {
180
- const parsed = JSON.parse(payload);
181
- const { id, expected, actual, message, diff } = parsed;
182
- console.log(
183
- ` ${colors2.underline}${id}${colors2.reset} ${message ? "- " + message : ""}`
176
+ const parsed = JSON.parse(
177
+ line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "")
184
178
  );
185
- if (diff && Array.isArray(diff)) {
186
- for (const dLine of diff)
187
- console.log(" " + colorizeDiffLine(dLine));
188
- } else {
189
- console.log(" expected:");
190
- console.log(
191
- colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
192
- );
193
- console.log(" actual:");
194
- console.log(
195
- colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
196
- );
179
+ return String(parsed?.id ?? "");
180
+ } catch {
181
+ }
182
+ }
183
+ return void 0;
184
+ };
185
+ var getTestIdFromLogLine = getTestIdFromLogLine2;
186
+ const byId = /* @__PURE__ */ new Map();
187
+ for (const line of failLogs) {
188
+ const id = getTestIdFromLogLine2(line);
189
+ const key = id ?? "__general__";
190
+ const arr = byId.get(key) ?? [];
191
+ arr.push(line);
192
+ byId.set(key, arr);
193
+ }
194
+ console.log(
195
+ ` ${colors2.bold}Failure details (grouped):${colors2.reset}`
196
+ );
197
+ for (const [groupId, lines] of byId) {
198
+ if (groupId !== "__general__") {
199
+ console.log(` ${colors2.underline}${groupId}${colors2.reset}`);
200
+ }
201
+ const debugIds = /* @__PURE__ */ new Set();
202
+ for (const l of lines) {
203
+ if (l.startsWith("[DEBUG-FAIL]")) {
204
+ try {
205
+ const parsed = JSON.parse(l.replace(/^\[DEBUG-FAIL\] /, ""));
206
+ if (parsed?.id) debugIds.add(String(parsed.id));
207
+ } catch {
197
208
  }
198
- const suggestions = suggestFixFromDiff(parsed);
199
- if (suggestions.length) {
200
- console.log(
201
- ` ${colors2.bold}Suggested fix:${colors2.reset}`
202
- );
203
- for (const s of suggestions) console.log(` \u2022 ${s}`);
209
+ }
210
+ }
211
+ for (const line of lines) {
212
+ if (line.startsWith("[FAIL]")) {
213
+ const m = line.match(/^\[FAIL\]\s+([^:]+):/);
214
+ const failId = m?.[1];
215
+ if (failId && debugIds.has(failId)) continue;
216
+ console.log(` ${colors2.red}${line}${colors2.reset}`);
217
+ } else if (line.startsWith("[ERROR]") || line.startsWith("[FATAL]")) {
218
+ console.log(` ${colors2.yellow}${line}${colors2.reset}`);
219
+ } else if (line.startsWith("[STACK]")) {
220
+ console.log(` ${colors2.gray}${line}${colors2.reset}`);
221
+ } else if (line.startsWith("[DEBUG-FAIL]")) {
222
+ const payload = line.replace(/^\[DEBUG-FAIL\] /, "");
223
+ try {
224
+ const parsed = JSON.parse(payload);
225
+ const { message, diff, expected, actual } = parsed;
226
+ if (message)
227
+ console.log(
228
+ ` ${colors2.bold}${message}${colors2.reset}`
229
+ );
230
+ if (diff && Array.isArray(diff)) {
231
+ for (const dLine of diff)
232
+ console.log(" " + colorizeDiffLine(dLine));
233
+ } else {
234
+ console.log(" expected:");
235
+ console.log(
236
+ colors2.green + " " + JSON.stringify(expected, null, 2).split("\n").join("\n ") + colors2.reset
237
+ );
238
+ console.log(" actual:");
239
+ console.log(
240
+ colors2.red + " " + JSON.stringify(actual, null, 2).split("\n").join("\n ") + colors2.reset
241
+ );
242
+ }
243
+ const suggestions = suggestFixFromDiff(parsed);
244
+ if (suggestions.length) {
245
+ console.log(
246
+ ` ${colors2.bold}Suggested fix:${colors2.reset}`
247
+ );
248
+ for (const s of suggestions)
249
+ console.log(` \u2022 ${s}`);
250
+ }
251
+ } catch {
252
+ console.log(` ${line}`);
253
+ }
254
+ } else if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
255
+ const payload = line.replace(/^\[DEBUG-FAIL-CONTEXT\] /, "");
256
+ try {
257
+ const ctx = JSON.parse(payload);
258
+ console.log(` ${colors2.gray}context:${colors2.reset}`);
259
+ if (ctx.tool_schema) {
260
+ console.log(
261
+ colors2.gray + " tool schema: " + JSON.stringify(ctx.tool_schema, null, 2).split("\n").join("\n ") + colors2.reset
262
+ );
263
+ }
264
+ if (ctx.last_user_query) {
265
+ console.log(
266
+ colors2.gray + " last user: " + JSON.stringify(ctx.last_user_query) + colors2.reset
267
+ );
268
+ }
269
+ if (ctx.raw_model_text) {
270
+ console.log(
271
+ colors2.gray + " raw model text (middleware parsed):\n " + String(ctx.raw_model_text).split("\n").join("\n ") + colors2.reset
272
+ );
273
+ }
274
+ if (ctx.parsed_tool_calls) {
275
+ console.log(
276
+ colors2.gray + " parsed tool calls: " + JSON.stringify(ctx.parsed_tool_calls, null, 2).split("\n").join("\n ") + colors2.reset
277
+ );
278
+ }
279
+ if (ctx.ground_truth) {
280
+ console.log(
281
+ colors2.gray + " ground truth: " + JSON.stringify(ctx.ground_truth, null, 2).split("\n").join("\n ") + colors2.reset
282
+ );
283
+ }
284
+ if (ctx.finish_reason) {
285
+ console.log(
286
+ colors2.gray + " finish reason: " + JSON.stringify(ctx.finish_reason) + colors2.reset
287
+ );
288
+ }
289
+ } catch {
290
+ console.log(` ${line}`);
204
291
  }
205
- } catch {
206
- console.log(` ${line}`);
207
292
  }
208
293
  }
209
294
  }
@@ -274,7 +359,13 @@ async function runSingleBenchmark(model, benchmark, modelKey, config) {
274
359
  }
275
360
  }
276
361
  async function evaluate(options) {
277
- const { models, benchmarks, reporter = "console", temperature } = options;
362
+ const {
363
+ models,
364
+ benchmarks,
365
+ reporter = "console",
366
+ temperature,
367
+ maxTokens
368
+ } = options;
278
369
  const modelEntries = [];
279
370
  if (Array.isArray(models)) {
280
371
  for (const m of models) modelEntries.push([void 0, m]);
@@ -290,11 +381,14 @@ async function evaluate(options) {
290
381
  const allResults = [];
291
382
  for (const [modelKey, model] of modelEntries) {
292
383
  for (const benchmark of benchmarks) {
384
+ const config = {};
385
+ if (temperature !== void 0) config.temperature = temperature;
386
+ if (maxTokens !== void 0) config.maxTokens = maxTokens;
293
387
  const evaluationResult = await runSingleBenchmark(
294
388
  model,
295
389
  benchmark,
296
390
  modelKey,
297
- temperature !== void 0 ? { temperature } : void 0
391
+ Object.keys(config).length > 0 ? config : void 0
298
392
  );
299
393
  allResults.push(evaluationResult);
300
394
  }
@@ -694,6 +788,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
694
788
  const { function: tools, question: messages } = testCase;
695
789
  const temp = config?.temperature;
696
790
  const temperature = typeof temp === "number" ? temp : void 0;
791
+ const maxTok = config?.maxTokens;
792
+ const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
697
793
  try {
698
794
  const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
699
795
  const nameMap = /* @__PURE__ */ new Map();
@@ -734,24 +830,32 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
734
830
  `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
735
831
  );
736
832
  }
833
+ const debugSummaryRef = {};
834
+ const providerOptions = {
835
+ toolCallMiddleware: {
836
+ debugSummary: debugSummaryRef
837
+ }
838
+ };
737
839
  const { toolCalls, text, finishReason } = await generateText({
738
840
  model,
739
841
  messages: flatMessages,
740
842
  tools: toolsMap,
741
843
  toolChoice: "auto",
844
+ providerOptions,
742
845
  ...temperature !== void 0 ? { temperature } : {},
743
- // Pass original schema information to middleware
744
- providerOptions: {
745
- toolCallMiddleware: {
746
- originalToolSchemas: Object.fromEntries(
747
- transformedTools.map((t) => [
748
- t.name,
749
- t.inputSchema
750
- ])
751
- )
752
- }
753
- }
846
+ ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
754
847
  });
848
+ const mwOriginalText = debugSummaryRef.originalText;
849
+ const mwParsedToolCalls = (() => {
850
+ const raw = debugSummaryRef.toolCalls;
851
+ if (!raw) return [];
852
+ try {
853
+ const arr = JSON.parse(raw);
854
+ return Array.isArray(arr) ? arr : [];
855
+ } catch {
856
+ return [];
857
+ }
858
+ })();
755
859
  try {
756
860
  caseLogs.push(
757
861
  `[DEBUG] ${testCase.id}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
@@ -795,6 +899,24 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
795
899
  } else {
796
900
  caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
797
901
  try {
902
+ let generateParamMismatchDiff2 = function(paramName, allowed, got) {
903
+ const diffLines = [];
904
+ diffLines.push(`@@ param ${paramName}`);
905
+ const allowedArray = Array.isArray(allowed) ? allowed : [allowed];
906
+ const expectedLine = (() => {
907
+ if (allowedArray.length === 1) {
908
+ return `- expected: ${JSON.stringify(allowedArray[0])}`;
909
+ }
910
+ const formatted = allowedArray.map(
911
+ (v) => Array.isArray(v) || typeof v === "object" && v !== null ? JSON.stringify(v) : String(v)
912
+ ).join(", ");
913
+ return `- expected one of: ${formatted}`;
914
+ })();
915
+ diffLines.push(expectedLine);
916
+ diffLines.push(`+ got: ${JSON.stringify(got)}`);
917
+ return diffLines;
918
+ };
919
+ var generateParamMismatchDiff = generateParamMismatchDiff2;
798
920
  const category = testCase.id.split("_")[0];
799
921
  const diff = [];
800
922
  const summarizeArgs = (args) => {
@@ -861,11 +983,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
861
983
  return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
862
984
  });
863
985
  if (!includes) {
864
- diff.push(`@@ param ${k}`);
865
986
  diff.push(
866
- `- expected one of: ${JSON.stringify(allowed)}`
987
+ ...generateParamMismatchDiff2(k, allowed, got)
867
988
  );
868
- diff.push(`+ got: ${JSON.stringify(got)}`);
869
989
  }
870
990
  }
871
991
  }
@@ -955,11 +1075,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
955
1075
  return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
956
1076
  });
957
1077
  if (!includes) {
958
- diff.push(`@@ param ${k}`);
959
1078
  diff.push(
960
- `- expected one of: ${JSON.stringify(allowed)}`
1079
+ ...generateParamMismatchDiff2(k, allowed, got)
961
1080
  );
962
- diff.push(`+ got: ${JSON.stringify(got)}`);
963
1081
  }
964
1082
  }
965
1083
  }
@@ -976,6 +1094,28 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
976
1094
  diff
977
1095
  })}`
978
1096
  );
1097
+ try {
1098
+ const lastUser = (() => {
1099
+ const reversed = [...flatMessages].reverse();
1100
+ const found = reversed.find(
1101
+ (m) => m.role === "user"
1102
+ );
1103
+ return found?.content ?? void 0;
1104
+ })();
1105
+ const contextPayload = {
1106
+ id: testCase.id,
1107
+ tool_schema: tools,
1108
+ last_user_query: lastUser,
1109
+ raw_model_text: mwOriginalText && mwOriginalText.length > 0 ? mwOriginalText : typeof text === "string" ? text : "",
1110
+ finish_reason: finishReason,
1111
+ parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
1112
+ ground_truth: possibleAnswer.ground_truth
1113
+ };
1114
+ caseLogs.push(
1115
+ `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
1116
+ );
1117
+ } catch {
1118
+ }
979
1119
  } catch {
980
1120
  caseLogs.push(
981
1121
  `[DEBUG] ${testCase.id}: failed to build debug diff`