@ai-sdk-tool/eval 1.0.0-canary.0 → 1.0.0-canary.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -34,6 +34,7 @@ __export(index_exports, {
34
34
  bfclParallelBenchmark: () => bfclParallelBenchmark,
35
35
  bfclParallelMultipleBenchmark: () => bfclParallelMultipleBenchmark,
36
36
  bfclSimpleBenchmark: () => bfclSimpleBenchmark,
37
+ complexFuncBenchBenchmark: () => complexFuncBenchBenchmark,
37
38
  evaluate: () => evaluate,
38
39
  jsonGenerationBenchmark: () => jsonGenerationBenchmark,
39
40
  jsonGenerationSchemaOnlyBenchmark: () => jsonGenerationSchemaOnlyBenchmark
@@ -61,7 +62,7 @@ function tryResolveViaPackageEntry(moduleUrl) {
61
62
  if (import_node_fs.default.existsSync(dataAtRoot)) {
62
63
  return dataAtRoot;
63
64
  }
64
- } catch {
65
+ } catch (e) {
65
66
  }
66
67
  return null;
67
68
  }
@@ -75,7 +76,7 @@ function tryResolveViaPackageJson(moduleUrl) {
75
76
  if (import_node_fs.default.existsSync(dataAtPkg)) {
76
77
  return dataAtPkg;
77
78
  }
78
- } catch {
79
+ } catch (e) {
79
80
  }
80
81
  return null;
81
82
  }
@@ -83,7 +84,7 @@ function getStartDir(moduleUrl) {
83
84
  if (moduleUrl) {
84
85
  try {
85
86
  return import_node_path.default.dirname((0, import_node_url.fileURLToPath)(moduleUrl));
86
- } catch {
87
+ } catch (e) {
87
88
  return process.cwd();
88
89
  }
89
90
  }
@@ -177,7 +178,7 @@ function valuesMatch(modelValue, possibleValue) {
177
178
  const normalizedModel = normalizeObject(modelValue);
178
179
  const normalizedPossible = normalizeObject(possibleValue);
179
180
  return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
180
- } catch {
181
+ } catch (e) {
181
182
  return false;
182
183
  }
183
184
  }
@@ -306,7 +307,7 @@ function checkSingleParameter(paramName, modelValue, context) {
306
307
  return checkStringValue(
307
308
  paramName,
308
309
  modelValue,
309
- possibleValues ?? []
310
+ possibleValues != null ? possibleValues : []
310
311
  );
311
312
  }
312
313
  if (Array.isArray(modelValue)) {
@@ -406,6 +407,37 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
406
407
  // src/benchmarks/bfcl.ts
407
408
  var LINE_SPLIT_REGEX = /\r?\n/;
408
409
  var NUMERIC_STRING_REGEX = /^\d+$/;
410
+ function convertGroundTruthToXML(call) {
411
+ const keys = Object.keys(call);
412
+ if (keys.length === 0) {
413
+ return "<empty_call />";
414
+ }
415
+ const funcName = keys[0];
416
+ if (!funcName) {
417
+ return "<undefined_function />";
418
+ }
419
+ const params = call[funcName];
420
+ if (!params || typeof params !== "object") {
421
+ return `<${funcName} />`;
422
+ }
423
+ let xml = `<${funcName}>
424
+ `;
425
+ for (const [key, value] of Object.entries(params)) {
426
+ const displayValue = Array.isArray(value) ? value[0] : value;
427
+ let valueStr;
428
+ if (typeof displayValue === "string") {
429
+ valueStr = displayValue;
430
+ } else if (displayValue === null || displayValue === void 0) {
431
+ valueStr = "";
432
+ } else {
433
+ valueStr = JSON.stringify(displayValue);
434
+ }
435
+ xml += ` <${key}>${valueStr}</${key}>
436
+ `;
437
+ }
438
+ xml += `</${funcName}>`;
439
+ return xml;
440
+ }
409
441
  function check(testCase, modelOutput, possibleAnswer) {
410
442
  const category = testCase.id.split("_")[0];
411
443
  try {
@@ -486,7 +518,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
486
518
  `[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
487
519
  );
488
520
  }
489
- const fixSchemaType = (copy) => {
521
+ const fixSchemaType2 = (copy) => {
490
522
  if (!copy.type) {
491
523
  return;
492
524
  }
@@ -510,16 +542,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
510
542
  );
511
543
  }
512
544
  };
513
- const fixSchema = (schema) => {
545
+ const fixSchema2 = (schema) => {
514
546
  if (!schema || typeof schema !== "object") {
515
547
  return { type: "object", properties: {} };
516
548
  }
517
- const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
549
+ const copy = Array.isArray(schema) ? schema.map((v) => fixSchema2(v)) : { ...schema };
518
550
  if (!Array.isArray(copy)) {
519
- fixSchemaType(copy);
520
- fixSchemaProperties(copy, fixSchema);
551
+ fixSchemaType2(copy);
552
+ fixSchemaProperties(copy, fixSchema2);
521
553
  if (copy.items) {
522
- copy.items = fixSchema(copy.items);
554
+ copy.items = fixSchema2(copy.items);
523
555
  }
524
556
  return copy;
525
557
  }
@@ -554,13 +586,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
554
586
  try {
555
587
  const arr = JSON.parse(raw);
556
588
  return Array.isArray(arr) ? arr : [];
557
- } catch {
589
+ } catch (e) {
558
590
  return [];
559
591
  }
560
592
  };
561
593
  const getSanitizedName = (rawName, transformedTools) => {
594
+ var _a, _b;
562
595
  if (typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName)) {
563
- return transformedTools[Number(rawName)]?.name ?? rawName;
596
+ return (_b = (_a = transformedTools[Number(rawName)]) == null ? void 0 : _a.name) != null ? _b : rawName;
564
597
  }
565
598
  return rawName;
566
599
  };
@@ -570,25 +603,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
570
603
  }
571
604
  try {
572
605
  return JSON.parse(extractedArgs);
573
- } catch {
606
+ } catch (e) {
574
607
  return extractedArgs;
575
608
  }
576
609
  };
577
610
  const restoreToolCalls = (toolCalls, nameMap, transformedTools) => (toolCalls || []).map((c) => {
611
+ var _a, _b, _c, _d, _e, _f;
578
612
  const call = c;
579
- const rawName = call.toolName ?? call.name;
613
+ const rawName = (_a = call.toolName) != null ? _a : call.name;
580
614
  const sanitizedFromIndex = getSanitizedName(
581
615
  rawName,
582
616
  transformedTools
583
617
  );
584
- const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
585
- const extractedArgs = call.args ?? call.arguments ?? call.input ?? call.params ?? call.parameters;
618
+ const originalName = (_b = nameMap.get(sanitizedFromIndex)) != null ? _b : sanitizedFromIndex;
619
+ const extractedArgs = (_f = (_e = (_d = (_c = call.args) != null ? _c : call.arguments) != null ? _d : call.input) != null ? _e : call.params) != null ? _f : call.parameters;
586
620
  const parsedArgs = parseToolArgs(extractedArgs);
587
621
  return {
588
622
  ...call,
589
623
  toolName: originalName,
590
624
  name: originalName,
591
- args: parsedArgs ?? {}
625
+ args: parsedArgs != null ? parsedArgs : {}
592
626
  };
593
627
  });
594
628
  const summarizeArgs = (args) => {
@@ -632,7 +666,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
632
666
  if (Array.isArray(got)) {
633
667
  return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
634
668
  }
635
- } catch {
669
+ } catch (e) {
636
670
  }
637
671
  return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
638
672
  });
@@ -670,13 +704,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
670
704
  }
671
705
  };
672
706
  const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
707
+ var _a, _b, _c, _d;
673
708
  const funcDesc = tools[0];
674
- const gt = possibleAnswer.ground_truth?.[0];
675
- const expectedFuncName = funcDesc?.name;
709
+ const gt = (_a = possibleAnswer.ground_truth) == null ? void 0 : _a[0];
710
+ const expectedFuncName = funcDesc == null ? void 0 : funcDesc.name;
676
711
  const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
677
712
  const received = restoredCalls[0];
678
- const receivedName = received?.toolName ?? received?.name;
679
- const receivedArgs = summarizeArgs(received?.args);
713
+ const receivedName = (_b = received == null ? void 0 : received.toolName) != null ? _b : received == null ? void 0 : received.name;
714
+ const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
680
715
  const expected = {
681
716
  function: expectedFuncName,
682
717
  params: expectedParams
@@ -688,7 +723,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
688
723
  const diff = [];
689
724
  checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
690
725
  if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
691
- const required = funcDesc?.parameters?.required ?? [];
726
+ const required = (_d = (_c = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _c.required) != null ? _d : [];
692
727
  checkMissingParams(
693
728
  required,
694
729
  receivedArgs,
@@ -725,12 +760,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
725
760
  }
726
761
  };
727
762
  const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
763
+ var _a;
728
764
  for (let i = 0; i < restoredCalls.length; i += 1) {
729
765
  if (usedActual.has(i)) {
730
766
  continue;
731
767
  }
732
768
  const rc = restoredCalls[i];
733
- const rcName = rc?.toolName ?? rc?.name;
769
+ const rcName = (_a = rc == null ? void 0 : rc.toolName) != null ? _a : rc == null ? void 0 : rc.name;
734
770
  if (rcName === fname) {
735
771
  return i;
736
772
  }
@@ -744,6 +780,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
744
780
  checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
745
781
  };
746
782
  const processExpectedCall = (options) => {
783
+ var _a, _b;
747
784
  const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
748
785
  const fname = Object.keys(expectedObj)[0];
749
786
  const matchedIndex = findMatchingCallIndex(
@@ -756,10 +793,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
756
793
  }
757
794
  usedActual.add(matchedIndex);
758
795
  const received = restoredCalls[matchedIndex];
759
- const receivedArgs = summarizeArgs(received?.args);
796
+ const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
760
797
  const expectedParamsAllowed = expectedObj[fname];
761
798
  const funcDesc = tools.find((t) => t.name === fname);
762
- const requiredParams = funcDesc?.parameters?.required ?? [];
799
+ const requiredParams = (_b = (_a = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _a.required) != null ? _b : [];
763
800
  diff.push(`@@ function ${fname}`);
764
801
  if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
765
802
  validateFunctionParams({
@@ -771,10 +808,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
771
808
  }
772
809
  };
773
810
  const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
774
- const gtArr = possibleAnswer.ground_truth ?? [];
811
+ var _a;
812
+ const gtArr = (_a = possibleAnswer.ground_truth) != null ? _a : [];
775
813
  const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
776
814
  const actualNames = restoredCalls.map(
777
- (c) => c.toolName ?? c.name
815
+ (c) => {
816
+ var _a2;
817
+ return (_a2 = c.toolName) != null ? _a2 : c.name;
818
+ }
778
819
  );
779
820
  const expected = {
780
821
  functions: expectedNames
@@ -800,14 +841,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
800
841
  return { expected, actual, diff };
801
842
  };
802
843
  const concurrencyEnv = process.env.BFCL_CONCURRENCY;
803
- const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
844
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 16;
804
845
  logs.push(
805
846
  `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
806
847
  );
807
848
  const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
849
+ var _a, _b, _c, _d;
808
850
  try {
809
851
  const firstTool = transformedTools[0];
810
- const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
852
+ const schemaType = (_d = (_a = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _a.type) != null ? _d : (_c = (_b = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _b.jsonSchema) == null ? void 0 : _c.type;
811
853
  caseLogs.push(
812
854
  `[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
813
855
  );
@@ -823,7 +865,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
823
865
  caseLogs.push(
824
866
  `[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
825
867
  );
826
- } catch {
868
+ } catch (e) {
827
869
  caseLogs.push(
828
870
  `[DEBUG] ${testCaseId}: failed to serialize toolCalls`
829
871
  );
@@ -842,11 +884,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
842
884
  possibleAnswer
843
885
  } = options;
844
886
  const lastUser = (() => {
887
+ var _a;
845
888
  const reversed = [...flatMessages].reverse();
846
889
  const found = reversed.find(
847
890
  (m) => m.role === "user"
848
891
  );
849
- return found?.content ?? void 0;
892
+ return (_a = found == null ? void 0 : found.content) != null ? _a : void 0;
850
893
  })();
851
894
  const rawModelText = (() => {
852
895
  if (mwOriginalText && mwOriginalText.length > 0) {
@@ -917,9 +960,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
917
960
  caseLogs.push(
918
961
  `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
919
962
  );
920
- } catch {
963
+ } catch (e) {
921
964
  }
922
- } catch {
965
+ } catch (e) {
923
966
  caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
924
967
  }
925
968
  };
@@ -998,7 +1041,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
998
1041
  const flatMessages = flattenMessages(messages);
999
1042
  const { transformedTools, nameMap } = buildTransformedTools(
1000
1043
  tools,
1001
- fixSchema
1044
+ fixSchema2
1002
1045
  );
1003
1046
  const toolsMap = buildToolsMap(transformedTools);
1004
1047
  return { flatMessages, transformedTools, nameMap, toolsMap };
@@ -1020,6 +1063,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1020
1063
  const mwParsedToolCalls = parseDebugToolCalls(
1021
1064
  debugSummaryRef.toolCalls
1022
1065
  );
1066
+ const possibleAnswer = possibleAnswersMap.get(testCase.id);
1067
+ if (!possibleAnswer) {
1068
+ throw new Error(`No possible answer for id: ${testCase.id}`);
1069
+ }
1070
+ if (process.env.DEBUG_PARSER_OUTPUT === "true") {
1071
+ const groundTruth = possibleAnswer.ground_truth;
1072
+ const expectedXML = groundTruth.map((call) => convertGroundTruthToXML(call)).join("\n\n");
1073
+ console.log("\n========== BFCL CASE DEBUG ==========");
1074
+ console.log(`Test Case: ${testCase.id}`);
1075
+ console.log(`Expected count: ${groundTruth.length} call(s)`);
1076
+ console.log("\n--- EXPECTED OUTPUT (morphXML format) ---");
1077
+ console.log(expectedXML);
1078
+ console.log("\n--- ACTUAL MODEL OUTPUT (raw, with whitespace) ---");
1079
+ console.log(mwOriginalText || text || "(empty)");
1080
+ console.log(
1081
+ "\n--- PARSED TOOL CALLS (count: " + (Array.isArray(toolCalls) ? toolCalls.length : 0) + ") ---"
1082
+ );
1083
+ console.log(JSON.stringify(toolCalls, null, 2));
1084
+ console.log("======================================\n");
1085
+ }
1023
1086
  logRawToolCalls({
1024
1087
  toolCalls,
1025
1088
  finishReason,
@@ -1027,10 +1090,6 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1027
1090
  testCaseId: testCase.id,
1028
1091
  caseLogs
1029
1092
  });
1030
- const possibleAnswer = possibleAnswersMap.get(testCase.id);
1031
- if (!possibleAnswer) {
1032
- throw new Error(`No possible answer for id: ${testCase.id}`);
1033
- }
1034
1093
  const restoredCalls = restoreToolCalls(
1035
1094
  toolCalls || [],
1036
1095
  nameMap,
@@ -1051,12 +1110,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1051
1110
  caseLogs
1052
1111
  });
1053
1112
  };
1054
- const runSingleCase = async (testCase) => {
1113
+ const runSingleCase2 = async (testCase) => {
1055
1114
  const caseLogs = [];
1056
1115
  const { function: tools } = testCase;
1057
- const temp = config?.temperature;
1116
+ const temp = config == null ? void 0 : config.temperature;
1058
1117
  const temperature = typeof temp === "number" ? temp : void 0;
1059
- const maxTok = config?.maxTokens;
1118
+ const maxTok = config == null ? void 0 : config.maxTokens;
1060
1119
  const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
1061
1120
  try {
1062
1121
  const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
@@ -1082,15 +1141,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1082
1141
  });
1083
1142
  } catch (e) {
1084
1143
  caseLogs.push(
1085
- `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
1144
+ `[ERROR] ${testCase.id}: Model generation failed: ${e == null ? void 0 : e.message}`
1086
1145
  );
1087
- if (e?.stack) {
1146
+ if (e == null ? void 0 : e.stack) {
1088
1147
  caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
1089
1148
  }
1090
1149
  return { valid: false, logs: caseLogs };
1091
1150
  }
1092
1151
  };
1093
- const mapWithConcurrency = async (items, concurrencyLimit, mapper) => {
1152
+ const mapWithConcurrency2 = async (items, concurrencyLimit, mapper) => {
1094
1153
  const results = new Array(items.length);
1095
1154
  let idx = 0;
1096
1155
  const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
@@ -1106,10 +1165,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1106
1165
  await Promise.all(workers);
1107
1166
  return results;
1108
1167
  };
1109
- const resultsPerCase = await mapWithConcurrency(
1168
+ const resultsPerCase = await mapWithConcurrency2(
1110
1169
  testCases,
1111
1170
  concurrency,
1112
- async (tc) => runSingleCase(tc)
1171
+ async (tc) => runSingleCase2(tc)
1113
1172
  );
1114
1173
  correctCount = resultsPerCase.reduce(
1115
1174
  (acc, r) => acc + (r.valid ? 1 : 0),
@@ -1177,19 +1236,387 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
1177
1236
  "BFCL_v3_parallel_multiple_possible_answer.jsonl"
1178
1237
  );
1179
1238
 
1180
- // src/benchmarks/json-generation.ts
1239
+ // src/benchmarks/complex-func-bench.ts
1181
1240
  var import_node_fs3 = require("fs");
1182
1241
  var import_node_path3 = __toESM(require("path"), 1);
1183
1242
  var import_ai2 = require("ai");
1243
+ var LINE_SPLIT_REGEX2 = /\r?\n/;
1244
+ function standardizeString2(input) {
1245
+ if (typeof input !== "string") {
1246
+ return input;
1247
+ }
1248
+ return input.toLowerCase().trim();
1249
+ }
1250
+ function valuesMatch2(modelValue, expectedValue) {
1251
+ if (modelValue === expectedValue) {
1252
+ return true;
1253
+ }
1254
+ if (typeof modelValue === "string" && typeof expectedValue === "string") {
1255
+ return standardizeString2(modelValue) === standardizeString2(expectedValue);
1256
+ }
1257
+ if (typeof modelValue === "number" && typeof expectedValue === "string") {
1258
+ return modelValue.toString() === expectedValue || modelValue === Number(expectedValue);
1259
+ }
1260
+ if (typeof modelValue === "string" && typeof expectedValue === "number") {
1261
+ return modelValue === expectedValue.toString() || Number(modelValue) === expectedValue;
1262
+ }
1263
+ if (typeof modelValue === "object" && modelValue !== null && typeof expectedValue === "object" && expectedValue !== null) {
1264
+ try {
1265
+ return JSON.stringify(modelValue) === JSON.stringify(expectedValue);
1266
+ } catch (e) {
1267
+ return false;
1268
+ }
1269
+ }
1270
+ return false;
1271
+ }
1272
+ function validateFunctionName(modelFuncName, expectedFuncName) {
1273
+ if (modelFuncName !== expectedFuncName) {
1274
+ return {
1275
+ valid: false,
1276
+ error: `Function name mismatch: expected '${expectedFuncName}', got '${modelFuncName}'`,
1277
+ error_type: "function_name_mismatch"
1278
+ };
1279
+ }
1280
+ return { valid: true };
1281
+ }
1282
+ function validateRequiredParams(requiredParams, modelArgs, expectedArgs) {
1283
+ for (const param of requiredParams) {
1284
+ if (!(param in modelArgs) && param in expectedArgs) {
1285
+ return {
1286
+ valid: false,
1287
+ error: `Missing required parameter: '${param}'`,
1288
+ error_type: "missing_required_param"
1289
+ };
1290
+ }
1291
+ }
1292
+ return { valid: true };
1293
+ }
1294
+ function validateParamValues(expectedArgs, modelArgs, requiredParams) {
1295
+ for (const [paramName, expectedValue] of Object.entries(expectedArgs)) {
1296
+ if (!(paramName in modelArgs)) {
1297
+ if (!requiredParams.includes(paramName)) {
1298
+ continue;
1299
+ }
1300
+ return {
1301
+ valid: false,
1302
+ error: `Missing parameter: '${paramName}'`,
1303
+ error_type: "missing_param"
1304
+ };
1305
+ }
1306
+ const modelValue = modelArgs[paramName];
1307
+ if (!valuesMatch2(modelValue, expectedValue)) {
1308
+ return {
1309
+ valid: false,
1310
+ error: `Parameter '${paramName}' value mismatch: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(modelValue)}`,
1311
+ error_type: "value_mismatch"
1312
+ };
1313
+ }
1314
+ }
1315
+ return { valid: true };
1316
+ }
1317
+ function checkFunctionCall(modelCall, expected, toolSpecs) {
1318
+ var _a, _b, _c, _d;
1319
+ const expectedFuncName = Object.keys(expected)[0];
1320
+ const expectedArgs = expected[expectedFuncName];
1321
+ const modelFuncName = (_a = modelCall.toolName) != null ? _a : modelCall.name;
1322
+ const modelArgs = (_b = modelCall.args) != null ? _b : {};
1323
+ const nameResult = validateFunctionName(modelFuncName, expectedFuncName);
1324
+ if (!nameResult.valid) {
1325
+ return nameResult;
1326
+ }
1327
+ const toolSpec = toolSpecs.find((t) => t.name === expectedFuncName);
1328
+ const requiredParams = (_d = (_c = toolSpec == null ? void 0 : toolSpec.parameters) == null ? void 0 : _c.required) != null ? _d : [];
1329
+ const requiredResult = validateRequiredParams(
1330
+ requiredParams,
1331
+ modelArgs,
1332
+ expectedArgs
1333
+ );
1334
+ if (!requiredResult.valid) {
1335
+ return requiredResult;
1336
+ }
1337
+ return validateParamValues(expectedArgs, modelArgs, requiredParams);
1338
+ }
1339
+ function checkAllFunctionCalls(modelCalls, expectedCalls, toolSpecs) {
1340
+ if (modelCalls.length !== expectedCalls.length) {
1341
+ return {
1342
+ valid: false,
1343
+ error: `Wrong number of function calls: expected ${expectedCalls.length}, got ${modelCalls.length}`,
1344
+ error_type: "wrong_call_count"
1345
+ };
1346
+ }
1347
+ if (expectedCalls.length === 1) {
1348
+ return checkFunctionCall(modelCalls[0], expectedCalls[0], toolSpecs);
1349
+ }
1350
+ const matchedIndices = /* @__PURE__ */ new Set();
1351
+ for (const expected of expectedCalls) {
1352
+ let foundMatch = false;
1353
+ for (let i = 0; i < modelCalls.length; i++) {
1354
+ if (matchedIndices.has(i)) {
1355
+ continue;
1356
+ }
1357
+ const result = checkFunctionCall(modelCalls[i], expected, toolSpecs);
1358
+ if (result.valid) {
1359
+ matchedIndices.add(i);
1360
+ foundMatch = true;
1361
+ break;
1362
+ }
1363
+ }
1364
+ if (!foundMatch) {
1365
+ const expectedFuncName = Object.keys(expected)[0];
1366
+ return {
1367
+ valid: false,
1368
+ error: `Could not find matching call for function '${expectedFuncName}'`,
1369
+ error_type: "no_matching_call"
1370
+ };
1371
+ }
1372
+ }
1373
+ return { valid: true };
1374
+ }
1375
+ var fixSchemaType = (copy) => {
1376
+ if (!copy.type) {
1377
+ return;
1378
+ }
1379
+ if (copy.type === "dict") {
1380
+ copy.type = "object";
1381
+ }
1382
+ if (copy.type === "tuple") {
1383
+ copy.type = "array";
1384
+ }
1385
+ if (copy.type === "integer" || copy.type === "float") {
1386
+ copy.type = "number";
1387
+ }
1388
+ };
1389
+ var fixSchema = (schema) => {
1390
+ if (!schema || typeof schema !== "object") {
1391
+ return { type: "object", properties: {} };
1392
+ }
1393
+ const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
1394
+ if (!Array.isArray(copy)) {
1395
+ fixSchemaType(copy);
1396
+ if (copy.properties && typeof copy.properties === "object") {
1397
+ for (const k of Object.keys(copy.properties)) {
1398
+ copy.properties[k] = fixSchema(
1399
+ copy.properties[k]
1400
+ );
1401
+ }
1402
+ }
1403
+ if (copy.items) {
1404
+ copy.items = fixSchema(copy.items);
1405
+ }
1406
+ }
1407
+ return copy;
1408
+ };
1409
+ function buildTools(tools) {
1410
+ const nameMap = /* @__PURE__ */ new Map();
1411
+ const transformedTools = tools.map((t) => {
1412
+ const fixed = fixSchema(t.parameters);
1413
+ const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
1414
+ const sanitized = t.name.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64) || "tool";
1415
+ nameMap.set(sanitized, t.name);
1416
+ return {
1417
+ type: "function",
1418
+ name: sanitized,
1419
+ description: t.description,
1420
+ inputSchema
1421
+ };
1422
+ });
1423
+ const toolsMap = Object.fromEntries(
1424
+ transformedTools.map((t) => [
1425
+ t.name,
1426
+ (0, import_ai2.tool)({
1427
+ description: typeof t.description === "string" ? t.description : void 0,
1428
+ inputSchema: (0, import_ai2.jsonSchema)(t.inputSchema)
1429
+ })
1430
+ ])
1431
+ );
1432
+ return { nameMap, toolsMap };
1433
+ }
1434
+ async function mapWithConcurrency(items, concurrencyLimit, mapper) {
1435
+ const results = new Array(items.length);
1436
+ let idx = 0;
1437
+ const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
1438
+ while (true) {
1439
+ const current = idx;
1440
+ idx += 1;
1441
+ if (current >= items.length) {
1442
+ break;
1443
+ }
1444
+ results[current] = await mapper(items[current]);
1445
+ }
1446
+ });
1447
+ await Promise.all(workers);
1448
+ return results;
1449
+ }
1450
+ async function runSingleCase(testCase, model, possibleAnswersMap, temperature, maxTokens) {
1451
+ const caseLogs = [];
1452
+ const { function: tools, question: messages } = testCase;
1453
+ try {
1454
+ const { nameMap, toolsMap } = buildTools(tools);
1455
+ const debugSummaryRef = {};
1456
+ const providerOptions = {
1457
+ toolCallMiddleware: { debugSummary: debugSummaryRef }
1458
+ };
1459
+ const { toolCalls, finishReason } = await (0, import_ai2.generateText)({
1460
+ model,
1461
+ messages,
1462
+ tools: toolsMap,
1463
+ toolChoice: "auto",
1464
+ providerOptions,
1465
+ ...temperature !== void 0 ? { temperature } : {},
1466
+ ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
1467
+ });
1468
+ const restoredCalls = (toolCalls != null ? toolCalls : []).map((c) => {
1469
+ var _a, _b, _c, _d;
1470
+ const rawName = (_a = c.toolName) != null ? _a : c.name;
1471
+ const originalName = (_b = nameMap.get(rawName)) != null ? _b : rawName;
1472
+ return {
1473
+ toolName: originalName,
1474
+ name: originalName,
1475
+ args: (_d = (_c = c.input) != null ? _c : c.args) != null ? _d : {}
1476
+ };
1477
+ });
1478
+ caseLogs.push(
1479
+ `[DEBUG] ${testCase.id}: toolCalls=${JSON.stringify(restoredCalls)}, finishReason=${finishReason}`
1480
+ );
1481
+ const possibleAnswer = possibleAnswersMap.get(testCase.id);
1482
+ if (!possibleAnswer) {
1483
+ throw new Error(`No possible answer for id: ${testCase.id}`);
1484
+ }
1485
+ const checkerResult = checkAllFunctionCalls(
1486
+ restoredCalls,
1487
+ possibleAnswer.ground_truth,
1488
+ tools
1489
+ );
1490
+ if (checkerResult.valid) {
1491
+ caseLogs.push(`[PASS] ${testCase.id}`);
1492
+ return { valid: true, logs: caseLogs };
1493
+ }
1494
+ caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
1495
+ return { valid: false, logs: caseLogs };
1496
+ } catch (e) {
1497
+ caseLogs.push(`[ERROR] ${testCase.id}: ${e == null ? void 0 : e.message}`);
1498
+ return { valid: false, logs: caseLogs };
1499
+ }
1500
+ }
1501
+ async function loadTestData(dataPath, testDataFile) {
1502
+ const testCasesJson = await import_node_fs3.promises.readFile(
1503
+ import_node_path3.default.join(dataPath, testDataFile),
1504
+ "utf-8"
1505
+ );
1506
+ return testCasesJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1507
+ }
1508
+ async function loadAnswerData(dataPath, answerDataFile) {
1509
+ const answersJson = await import_node_fs3.promises.readFile(
1510
+ import_node_path3.default.join(dataPath, answerDataFile),
1511
+ "utf-8"
1512
+ );
1513
+ const answers = answersJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1514
+ return new Map(answers.map((ans) => [ans.id, ans]));
1515
+ }
1516
+ function getConfigValues(config) {
1517
+ const limitEnv = process.env.COMPLEXFUNCBENCH_LIMIT;
1518
+ const limit = limitEnv ? Number(limitEnv) : void 0;
1519
+ const concurrencyEnv = process.env.COMPLEXFUNCBENCH_CONCURRENCY;
1520
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
1521
+ const temperature = typeof (config == null ? void 0 : config.temperature) === "number" ? config.temperature : void 0;
1522
+ const maxTokens = typeof (config == null ? void 0 : config.maxTokens) === "number" ? config.maxTokens : void 0;
1523
+ return { limit, concurrency, temperature, maxTokens };
1524
+ }
1525
+ function aggregateResults(resultsPerCase, testCases) {
1526
+ const logs = [];
1527
+ const correctCount = resultsPerCase.reduce(
1528
+ (acc, r) => acc + (r.valid ? 1 : 0),
1529
+ 0
1530
+ );
1531
+ for (const r of resultsPerCase) {
1532
+ logs.push(...r.logs);
1533
+ }
1534
+ if (testCases.length === 0) {
1535
+ return {
1536
+ score: 0,
1537
+ success: false,
1538
+ metrics: {},
1539
+ logs: ["No test cases found."]
1540
+ };
1541
+ }
1542
+ const score = correctCount / testCases.length;
1543
+ return {
1544
+ score,
1545
+ success: score > 0.5,
1546
+ metrics: {
1547
+ correct_count: correctCount,
1548
+ total_cases: testCases.length,
1549
+ accuracy: score
1550
+ },
1551
+ logs
1552
+ };
1553
+ }
1554
+ function createComplexFuncBenchBenchmark(name, description, testDataFile, answerDataFile) {
1555
+ return {
1556
+ name,
1557
+ version: "1.0.0",
1558
+ description,
1559
+ async run(model, config) {
1560
+ var _a;
1561
+ const logs = [];
1562
+ try {
1563
+ const dataPath = resolveDataDir();
1564
+ logs.push(`[INFO] Using data dir: ${dataPath}`);
1565
+ let testCases = await loadTestData(dataPath, testDataFile);
1566
+ const possibleAnswersMap = await loadAnswerData(
1567
+ dataPath,
1568
+ answerDataFile
1569
+ );
1570
+ const { limit, concurrency, temperature, maxTokens } = getConfigValues(config);
1571
+ if (limit && Number.isFinite(limit) && limit > 0) {
1572
+ testCases = testCases.slice(0, limit);
1573
+ logs.push(`[INFO] Limiting test cases to ${limit}`);
1574
+ }
1575
+ logs.push(
1576
+ `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
1577
+ );
1578
+ const resultsPerCase = await mapWithConcurrency(
1579
+ testCases,
1580
+ concurrency,
1581
+ (tc) => runSingleCase(tc, model, possibleAnswersMap, temperature, maxTokens)
1582
+ );
1583
+ const result = aggregateResults(resultsPerCase, testCases);
1584
+ result.logs = [...logs, ...(_a = result.logs) != null ? _a : []];
1585
+ return result;
1586
+ } catch (e) {
1587
+ return {
1588
+ score: 0,
1589
+ success: false,
1590
+ metrics: {},
1591
+ error: e,
1592
+ logs: [
1593
+ `[FATAL] Failed to run benchmark ${name}: ${e.message}`
1594
+ ]
1595
+ };
1596
+ }
1597
+ }
1598
+ };
1599
+ }
1600
+ var complexFuncBenchBenchmark = createComplexFuncBenchBenchmark(
1601
+ "complex-func-bench",
1602
+ "ComplexFuncBench - Complex Function Calling (multi-step, constraints, long params)",
1603
+ "ComplexFuncBench.jsonl",
1604
+ "ComplexFuncBench_possible_answer.jsonl"
1605
+ );
1606
+
1607
+ // src/benchmarks/json-generation.ts
1608
+ var import_node_fs4 = require("fs");
1609
+ var import_node_path4 = __toESM(require("path"), 1);
1610
+ var import_ai3 = require("ai");
1184
1611
  var import_ajv = __toESM(require("ajv"), 1);
1185
1612
  var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
1186
1613
  var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
1187
1614
  var NEWLINE_REGEX = /\r?\n/;
1188
- var LINE_SPLIT_REGEX2 = /\r?\n/;
1615
+ var LINE_SPLIT_REGEX3 = /\r?\n/;
1189
1616
  function tryDirectParse(text) {
1190
1617
  try {
1191
1618
  return JSON.parse(text);
1192
- } catch {
1619
+ } catch (e) {
1193
1620
  return;
1194
1621
  }
1195
1622
  }
@@ -1201,7 +1628,7 @@ function tryCodeFenceParse(text) {
1201
1628
  const inner = fenceMatch[1].trim();
1202
1629
  try {
1203
1630
  return JSON.parse(inner);
1204
- } catch {
1631
+ } catch (e) {
1205
1632
  return;
1206
1633
  }
1207
1634
  }
@@ -1226,7 +1653,7 @@ function tryBracketScan(text) {
1226
1653
  const candidate = text.slice(start, i + 1);
1227
1654
  try {
1228
1655
  return JSON.parse(candidate);
1229
- } catch {
1656
+ } catch (e) {
1230
1657
  return;
1231
1658
  }
1232
1659
  }
@@ -1274,12 +1701,12 @@ function subsetMatch(expected, actual) {
1274
1701
  async function loadDatasets() {
1275
1702
  try {
1276
1703
  const dataDir = resolveDataDir();
1277
- const testsJsonl = await import_node_fs3.promises.readFile(
1278
- import_node_path3.default.join(dataDir, "json_generation_tests.jsonl"),
1704
+ const testsJsonl = await import_node_fs4.promises.readFile(
1705
+ import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
1279
1706
  "utf-8"
1280
1707
  );
1281
- const expectedJsonl = await import_node_fs3.promises.readFile(
1282
- import_node_path3.default.join(dataDir, "json_generation_expected.jsonl"),
1708
+ const expectedJsonl = await import_node_fs4.promises.readFile(
1709
+ import_node_path4.default.join(dataDir, "json_generation_expected.jsonl"),
1283
1710
  "utf-8"
1284
1711
  );
1285
1712
  const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -1335,10 +1762,11 @@ function validateTestCase(tc, parsed, context) {
1335
1762
  return { valid, valuesOk, parsed };
1336
1763
  }
1337
1764
  async function processTestCase(tc, context) {
1765
+ var _a;
1338
1766
  const messages = buildMessages(tc);
1339
- const temp = context.config?.temperature;
1767
+ const temp = (_a = context.config) == null ? void 0 : _a.temperature;
1340
1768
  const temperature = typeof temp === "number" ? temp : void 0;
1341
- const { text } = await (0, import_ai2.generateText)({
1769
+ const { text } = await (0, import_ai3.generateText)({
1342
1770
  model: context.model,
1343
1771
  messages,
1344
1772
  ...temperature !== void 0 ? { temperature } : {}
@@ -1346,7 +1774,7 @@ async function processTestCase(tc, context) {
1346
1774
  let parsed;
1347
1775
  try {
1348
1776
  parsed = extractFirstJsonBlock(text);
1349
- } catch {
1777
+ } catch (e) {
1350
1778
  }
1351
1779
  if (parsed === void 0) {
1352
1780
  context.validation.logs.push(
@@ -1440,21 +1868,22 @@ function buildBenchmarkResult(total, counts, logs) {
1440
1868
  async function loadSchemaOnlyTests() {
1441
1869
  try {
1442
1870
  const dataDir = resolveDataDir();
1443
- const testsJsonl = await import_node_fs3.promises.readFile(
1444
- import_node_path3.default.join(dataDir, "json_generation_tests.jsonl"),
1871
+ const testsJsonl = await import_node_fs4.promises.readFile(
1872
+ import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
1445
1873
  "utf-8"
1446
1874
  );
1447
- const tests = testsJsonl.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1875
+ const tests = testsJsonl.split(LINE_SPLIT_REGEX3).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1448
1876
  return { tests };
1449
1877
  } catch (e) {
1450
1878
  return { tests: [], error: e };
1451
1879
  }
1452
1880
  }
1453
1881
  async function processSchemaOnlyTestCase(tc, context) {
1882
+ var _a;
1454
1883
  const messages = buildMessages(tc);
1455
- const temp = context.config?.temperature;
1884
+ const temp = (_a = context.config) == null ? void 0 : _a.temperature;
1456
1885
  const temperature = typeof temp === "number" ? temp : void 0;
1457
- const { text } = await (0, import_ai2.generateText)({
1886
+ const { text } = await (0, import_ai3.generateText)({
1458
1887
  model: context.model,
1459
1888
  messages,
1460
1889
  ...temperature !== void 0 ? { temperature } : {}
@@ -1462,7 +1891,7 @@ async function processSchemaOnlyTestCase(tc, context) {
1462
1891
  let parsed;
1463
1892
  try {
1464
1893
  parsed = extractFirstJsonBlock(text);
1465
- } catch {
1894
+ } catch (e) {
1466
1895
  }
1467
1896
  if (parsed === void 0) {
1468
1897
  context.logs.push(
@@ -1539,8 +1968,56 @@ var colors = {
1539
1968
  yellow: "\x1B[33m",
1540
1969
  cyan: "\x1B[36m",
1541
1970
  magenta: "\x1B[35m",
1542
- gray: "\x1B[90m"
1971
+ gray: "\x1B[90m",
1972
+ white: "\x1B[37m",
1973
+ bgRed: "\x1B[41m"
1543
1974
  };
1975
+ function formatDiff(diff) {
1976
+ if (!diff || diff.length === 0) {
1977
+ return "";
1978
+ }
1979
+ return diff.map((line) => {
1980
+ if (line.startsWith("-")) {
1981
+ return `${colors.red}${line}${colors.reset}`;
1982
+ }
1983
+ if (line.startsWith("+")) {
1984
+ return `${colors.green}${line}${colors.reset}`;
1985
+ }
1986
+ if (line.startsWith("@@")) {
1987
+ return `${colors.cyan}${line}${colors.reset}`;
1988
+ }
1989
+ return line;
1990
+ }).join("\n ");
1991
+ }
1992
+ function printFailLogs(logs) {
1993
+ const failLogs = logs.filter((l) => l.startsWith("[DEBUG-FAIL]"));
1994
+ for (const log of failLogs) {
1995
+ try {
1996
+ const jsonStr = log.replace("[DEBUG-FAIL] ", "");
1997
+ const data = JSON.parse(jsonStr);
1998
+ console.log(`
1999
+ ${colors.red}FAILED CASE: ${data.id}${colors.reset}`);
2000
+ console.log(
2001
+ ` Error Type: ${colors.yellow}${data.error_type || "unknown"}${colors.reset}`
2002
+ );
2003
+ console.log(` Message: ${data.message}`);
2004
+ if (data.diff && Array.isArray(data.diff)) {
2005
+ console.log(` Diff:
2006
+ ${formatDiff(data.diff)}`);
2007
+ }
2008
+ if (data.expected && data.actual) {
2009
+ const expStr = JSON.stringify(data.expected);
2010
+ const actStr = JSON.stringify(data.actual);
2011
+ if (expStr.length < 100 && actStr.length < 100) {
2012
+ console.log(` Expected: ${colors.gray}${expStr}${colors.reset}`);
2013
+ console.log(` Actual: ${colors.gray}${actStr}${colors.reset}`);
2014
+ }
2015
+ }
2016
+ } catch (_e) {
2017
+ console.log(` Raw Log: ${log}`);
2018
+ }
2019
+ }
2020
+ }
1544
2021
  function printResult(result) {
1545
2022
  const { model, modelKey, benchmark, result: benchmarkResult } = result;
1546
2023
  const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
@@ -1563,6 +2040,18 @@ function printResult(result) {
1563
2040
  ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
1564
2041
  );
1565
2042
  }
2043
+ if (!benchmarkResult.success && benchmarkResult.logs) {
2044
+ printFailLogs(benchmarkResult.logs);
2045
+ const failLogs = benchmarkResult.logs.filter(
2046
+ (l) => l.startsWith("[DEBUG-FAIL]")
2047
+ );
2048
+ if (failLogs.length === 0 && benchmarkResult.logs.length > 0) {
2049
+ console.log(" Raw Logs (Sample):");
2050
+ for (const l of benchmarkResult.logs.slice(0, 10)) {
2051
+ console.log(` ${l}`);
2052
+ }
2053
+ }
2054
+ }
1566
2055
  }
1567
2056
  function consoleReporter(results) {
1568
2057
  console.log("\n--- \u{1F4CA} Evaluation Report ---");
@@ -1617,14 +2106,14 @@ function hasFunctionNameIssue(diff) {
1617
2106
  );
1618
2107
  }
1619
2108
  function suggestFunctionNameFix(expected, actual, suggestions) {
1620
- const expectedName = expected?.function;
1621
- const actualName = actual?.function;
2109
+ const expectedName = expected == null ? void 0 : expected.function;
2110
+ const actualName = actual == null ? void 0 : actual.function;
1622
2111
  if (expectedName && actualName && expectedName !== actualName) {
1623
2112
  suggestions.push(
1624
2113
  `Call the function '${expectedName}' instead of '${actualName}'.`
1625
2114
  );
1626
2115
  }
1627
- if (Array.isArray(expected?.functions)) {
2116
+ if (Array.isArray(expected == null ? void 0 : expected.functions)) {
1628
2117
  suggestions.push(
1629
2118
  `Ensure tool calls include: ${expected.functions.join(", ")}.`
1630
2119
  );
@@ -1679,7 +2168,7 @@ function suggestFromErrorType(error_type, suggestions) {
1679
2168
  }
1680
2169
  function suggestFixFromDiff(parsed) {
1681
2170
  const suggestions = [];
1682
- const { error_type, expected, actual, diff } = parsed ?? {};
2171
+ const { error_type, expected, actual, diff } = parsed != null ? parsed : {};
1683
2172
  if (!Array.isArray(diff)) {
1684
2173
  if (suggestions.length === 0 && typeof error_type === "string") {
1685
2174
  suggestFromErrorType(error_type, suggestions);
@@ -1704,15 +2193,16 @@ function suggestFixFromDiff(parsed) {
1704
2193
  return uniqueLines(suggestions);
1705
2194
  }
1706
2195
  function getTestIdFromLogLine(line) {
2196
+ var _a, _b;
1707
2197
  if (line.startsWith("[FAIL]")) {
1708
2198
  const m = line.match(FAIL_ID_REGEX);
1709
- return m?.[1];
2199
+ return m == null ? void 0 : m[1];
1710
2200
  }
1711
2201
  if (line.startsWith("[DEBUG-FAIL]")) {
1712
2202
  try {
1713
2203
  const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
1714
- return String(parsed?.id ?? "");
1715
- } catch {
2204
+ return String((_a = parsed == null ? void 0 : parsed.id) != null ? _a : "");
2205
+ } catch (e) {
1716
2206
  }
1717
2207
  }
1718
2208
  if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
@@ -1720,18 +2210,19 @@ function getTestIdFromLogLine(line) {
1720
2210
  const parsed = JSON.parse(
1721
2211
  line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "")
1722
2212
  );
1723
- return String(parsed?.id ?? "");
1724
- } catch {
2213
+ return String((_b = parsed == null ? void 0 : parsed.id) != null ? _b : "");
2214
+ } catch (e) {
1725
2215
  }
1726
2216
  }
1727
2217
  return;
1728
2218
  }
1729
2219
  function groupLogsByTestId(failLogs) {
2220
+ var _a;
1730
2221
  const byId = /* @__PURE__ */ new Map();
1731
2222
  for (const line of failLogs) {
1732
2223
  const id = getTestIdFromLogLine(line);
1733
- const key = id ?? "__general__";
1734
- const arr = byId.get(key) ?? [];
2224
+ const key = id != null ? id : "__general__";
2225
+ const arr = (_a = byId.get(key)) != null ? _a : [];
1735
2226
  arr.push(line);
1736
2227
  byId.set(key, arr);
1737
2228
  }
@@ -1743,10 +2234,10 @@ function collectDebugIds(lines) {
1743
2234
  if (l.startsWith("[DEBUG-FAIL]")) {
1744
2235
  try {
1745
2236
  const parsed = JSON.parse(l.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
1746
- if (parsed?.id) {
2237
+ if (parsed == null ? void 0 : parsed.id) {
1747
2238
  debugIds.add(String(parsed.id));
1748
2239
  }
1749
- } catch {
2240
+ } catch (e) {
1750
2241
  }
1751
2242
  }
1752
2243
  }
@@ -1782,7 +2273,7 @@ function displayDebugFailLine(line) {
1782
2273
  console.log(` \u2022 ${s}`);
1783
2274
  }
1784
2275
  }
1785
- } catch {
2276
+ } catch (e) {
1786
2277
  console.log(` ${line}`);
1787
2278
  }
1788
2279
  }
@@ -1826,14 +2317,14 @@ function displayDebugFailContextLine(line) {
1826
2317
  const ctx = JSON.parse(payload);
1827
2318
  console.log(` ${colors2.gray}context:${colors2.reset}`);
1828
2319
  displayContextInfo(ctx);
1829
- } catch {
2320
+ } catch (e) {
1830
2321
  console.log(` ${line}`);
1831
2322
  }
1832
2323
  }
1833
2324
  function displayLogLine(line, debugIds) {
1834
2325
  if (line.startsWith("[FAIL]")) {
1835
2326
  const m = line.match(FAIL_ID_REGEX);
1836
- const failId = m?.[1];
2327
+ const failId = m == null ? void 0 : m[1];
1837
2328
  if (failId && debugIds.has(failId)) {
1838
2329
  return;
1839
2330
  }
@@ -1903,11 +2394,12 @@ function displayResultHeader(r) {
1903
2394
  );
1904
2395
  }
1905
2396
  function consoleDebugReporter(results) {
2397
+ var _a;
1906
2398
  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
1907
2399
  for (const r of results) {
1908
2400
  displayResultHeader(r);
1909
2401
  displayMetrics(Object.entries(r.result.metrics));
1910
- if (r.result.logs?.length) {
2402
+ if ((_a = r.result.logs) == null ? void 0 : _a.length) {
1911
2403
  displayResultLogs(r.result.logs);
1912
2404
  }
1913
2405
  }
@@ -1916,13 +2408,16 @@ function consoleDebugReporter(results) {
1916
2408
 
1917
2409
  // src/reporters/json.ts
1918
2410
  function jsonReporter(results) {
1919
- const serializableResults = results.map((r) => ({
1920
- ...r,
1921
- result: {
1922
- ...r.result,
1923
- error: r.result.error?.message
1924
- }
1925
- }));
2411
+ const serializableResults = results.map((r) => {
2412
+ var _a;
2413
+ return {
2414
+ ...r,
2415
+ result: {
2416
+ ...r.result,
2417
+ error: (_a = r.result.error) == null ? void 0 : _a.message
2418
+ }
2419
+ };
2420
+ });
1926
2421
  console.log(JSON.stringify(serializableResults, null, 2));
1927
2422
  }
1928
2423
 
@@ -2035,6 +2530,7 @@ async function evaluate(options) {
2035
2530
  bfclParallelBenchmark,
2036
2531
  bfclParallelMultipleBenchmark,
2037
2532
  bfclSimpleBenchmark,
2533
+ complexFuncBenchBenchmark,
2038
2534
  evaluate,
2039
2535
  jsonGenerationBenchmark,
2040
2536
  jsonGenerationSchemaOnlyBenchmark