@ai-sdk-tool/eval 1.0.0-canary.0 → 1.0.0-canary.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -23,7 +23,7 @@ function tryResolveViaPackageEntry(moduleUrl) {
23
23
  if (fs.existsSync(dataAtRoot)) {
24
24
  return dataAtRoot;
25
25
  }
26
- } catch {
26
+ } catch (e) {
27
27
  }
28
28
  return null;
29
29
  }
@@ -37,7 +37,7 @@ function tryResolveViaPackageJson(moduleUrl) {
37
37
  if (fs.existsSync(dataAtPkg)) {
38
38
  return dataAtPkg;
39
39
  }
40
- } catch {
40
+ } catch (e) {
41
41
  }
42
42
  return null;
43
43
  }
@@ -45,7 +45,7 @@ function getStartDir(moduleUrl) {
45
45
  if (moduleUrl) {
46
46
  try {
47
47
  return path.dirname(fileURLToPath(moduleUrl));
48
- } catch {
48
+ } catch (e) {
49
49
  return process.cwd();
50
50
  }
51
51
  }
@@ -139,7 +139,7 @@ function valuesMatch(modelValue, possibleValue) {
139
139
  const normalizedModel = normalizeObject(modelValue);
140
140
  const normalizedPossible = normalizeObject(possibleValue);
141
141
  return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
142
- } catch {
142
+ } catch (e) {
143
143
  return false;
144
144
  }
145
145
  }
@@ -268,7 +268,7 @@ function checkSingleParameter(paramName, modelValue, context) {
268
268
  return checkStringValue(
269
269
  paramName,
270
270
  modelValue,
271
- possibleValues ?? []
271
+ possibleValues != null ? possibleValues : []
272
272
  );
273
273
  }
274
274
  if (Array.isArray(modelValue)) {
@@ -368,6 +368,37 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
368
368
  // src/benchmarks/bfcl.ts
369
369
  var LINE_SPLIT_REGEX = /\r?\n/;
370
370
  var NUMERIC_STRING_REGEX = /^\d+$/;
371
+ function convertGroundTruthToXML(call) {
372
+ const keys = Object.keys(call);
373
+ if (keys.length === 0) {
374
+ return "<empty_call />";
375
+ }
376
+ const funcName = keys[0];
377
+ if (!funcName) {
378
+ return "<undefined_function />";
379
+ }
380
+ const params = call[funcName];
381
+ if (!params || typeof params !== "object") {
382
+ return `<${funcName} />`;
383
+ }
384
+ let xml = `<${funcName}>
385
+ `;
386
+ for (const [key, value] of Object.entries(params)) {
387
+ const displayValue = Array.isArray(value) ? value[0] : value;
388
+ let valueStr;
389
+ if (typeof displayValue === "string") {
390
+ valueStr = displayValue;
391
+ } else if (displayValue === null || displayValue === void 0) {
392
+ valueStr = "";
393
+ } else {
394
+ valueStr = JSON.stringify(displayValue);
395
+ }
396
+ xml += ` <${key}>${valueStr}</${key}>
397
+ `;
398
+ }
399
+ xml += `</${funcName}>`;
400
+ return xml;
401
+ }
371
402
  function check(testCase, modelOutput, possibleAnswer) {
372
403
  const category = testCase.id.split("_")[0];
373
404
  try {
@@ -448,7 +479,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
448
479
  `[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
449
480
  );
450
481
  }
451
- const fixSchemaType = (copy) => {
482
+ const fixSchemaType2 = (copy) => {
452
483
  if (!copy.type) {
453
484
  return;
454
485
  }
@@ -472,16 +503,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
472
503
  );
473
504
  }
474
505
  };
475
- const fixSchema = (schema) => {
506
+ const fixSchema2 = (schema) => {
476
507
  if (!schema || typeof schema !== "object") {
477
508
  return { type: "object", properties: {} };
478
509
  }
479
- const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
510
+ const copy = Array.isArray(schema) ? schema.map((v) => fixSchema2(v)) : { ...schema };
480
511
  if (!Array.isArray(copy)) {
481
- fixSchemaType(copy);
482
- fixSchemaProperties(copy, fixSchema);
512
+ fixSchemaType2(copy);
513
+ fixSchemaProperties(copy, fixSchema2);
483
514
  if (copy.items) {
484
- copy.items = fixSchema(copy.items);
515
+ copy.items = fixSchema2(copy.items);
485
516
  }
486
517
  return copy;
487
518
  }
@@ -516,13 +547,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
516
547
  try {
517
548
  const arr = JSON.parse(raw);
518
549
  return Array.isArray(arr) ? arr : [];
519
- } catch {
550
+ } catch (e) {
520
551
  return [];
521
552
  }
522
553
  };
523
554
  const getSanitizedName = (rawName, transformedTools) => {
555
+ var _a, _b;
524
556
  if (typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName)) {
525
- return transformedTools[Number(rawName)]?.name ?? rawName;
557
+ return (_b = (_a = transformedTools[Number(rawName)]) == null ? void 0 : _a.name) != null ? _b : rawName;
526
558
  }
527
559
  return rawName;
528
560
  };
@@ -532,25 +564,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
532
564
  }
533
565
  try {
534
566
  return JSON.parse(extractedArgs);
535
- } catch {
567
+ } catch (e) {
536
568
  return extractedArgs;
537
569
  }
538
570
  };
539
571
  const restoreToolCalls = (toolCalls, nameMap, transformedTools) => (toolCalls || []).map((c) => {
572
+ var _a, _b, _c, _d, _e, _f;
540
573
  const call = c;
541
- const rawName = call.toolName ?? call.name;
574
+ const rawName = (_a = call.toolName) != null ? _a : call.name;
542
575
  const sanitizedFromIndex = getSanitizedName(
543
576
  rawName,
544
577
  transformedTools
545
578
  );
546
- const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
547
- const extractedArgs = call.args ?? call.arguments ?? call.input ?? call.params ?? call.parameters;
579
+ const originalName = (_b = nameMap.get(sanitizedFromIndex)) != null ? _b : sanitizedFromIndex;
580
+ const extractedArgs = (_f = (_e = (_d = (_c = call.args) != null ? _c : call.arguments) != null ? _d : call.input) != null ? _e : call.params) != null ? _f : call.parameters;
548
581
  const parsedArgs = parseToolArgs(extractedArgs);
549
582
  return {
550
583
  ...call,
551
584
  toolName: originalName,
552
585
  name: originalName,
553
- args: parsedArgs ?? {}
586
+ args: parsedArgs != null ? parsedArgs : {}
554
587
  };
555
588
  });
556
589
  const summarizeArgs = (args) => {
@@ -594,7 +627,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
594
627
  if (Array.isArray(got)) {
595
628
  return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
596
629
  }
597
- } catch {
630
+ } catch (e) {
598
631
  }
599
632
  return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
600
633
  });
@@ -632,13 +665,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
632
665
  }
633
666
  };
634
667
  const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
668
+ var _a, _b, _c, _d;
635
669
  const funcDesc = tools[0];
636
- const gt = possibleAnswer.ground_truth?.[0];
637
- const expectedFuncName = funcDesc?.name;
670
+ const gt = (_a = possibleAnswer.ground_truth) == null ? void 0 : _a[0];
671
+ const expectedFuncName = funcDesc == null ? void 0 : funcDesc.name;
638
672
  const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
639
673
  const received = restoredCalls[0];
640
- const receivedName = received?.toolName ?? received?.name;
641
- const receivedArgs = summarizeArgs(received?.args);
674
+ const receivedName = (_b = received == null ? void 0 : received.toolName) != null ? _b : received == null ? void 0 : received.name;
675
+ const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
642
676
  const expected = {
643
677
  function: expectedFuncName,
644
678
  params: expectedParams
@@ -650,7 +684,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
650
684
  const diff = [];
651
685
  checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
652
686
  if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
653
- const required = funcDesc?.parameters?.required ?? [];
687
+ const required = (_d = (_c = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _c.required) != null ? _d : [];
654
688
  checkMissingParams(
655
689
  required,
656
690
  receivedArgs,
@@ -687,12 +721,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
687
721
  }
688
722
  };
689
723
  const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
724
+ var _a;
690
725
  for (let i = 0; i < restoredCalls.length; i += 1) {
691
726
  if (usedActual.has(i)) {
692
727
  continue;
693
728
  }
694
729
  const rc = restoredCalls[i];
695
- const rcName = rc?.toolName ?? rc?.name;
730
+ const rcName = (_a = rc == null ? void 0 : rc.toolName) != null ? _a : rc == null ? void 0 : rc.name;
696
731
  if (rcName === fname) {
697
732
  return i;
698
733
  }
@@ -706,6 +741,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
706
741
  checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
707
742
  };
708
743
  const processExpectedCall = (options) => {
744
+ var _a, _b;
709
745
  const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
710
746
  const fname = Object.keys(expectedObj)[0];
711
747
  const matchedIndex = findMatchingCallIndex(
@@ -718,10 +754,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
718
754
  }
719
755
  usedActual.add(matchedIndex);
720
756
  const received = restoredCalls[matchedIndex];
721
- const receivedArgs = summarizeArgs(received?.args);
757
+ const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
722
758
  const expectedParamsAllowed = expectedObj[fname];
723
759
  const funcDesc = tools.find((t) => t.name === fname);
724
- const requiredParams = funcDesc?.parameters?.required ?? [];
760
+ const requiredParams = (_b = (_a = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _a.required) != null ? _b : [];
725
761
  diff.push(`@@ function ${fname}`);
726
762
  if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
727
763
  validateFunctionParams({
@@ -733,10 +769,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
733
769
  }
734
770
  };
735
771
  const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
736
- const gtArr = possibleAnswer.ground_truth ?? [];
772
+ var _a;
773
+ const gtArr = (_a = possibleAnswer.ground_truth) != null ? _a : [];
737
774
  const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
738
775
  const actualNames = restoredCalls.map(
739
- (c) => c.toolName ?? c.name
776
+ (c) => {
777
+ var _a2;
778
+ return (_a2 = c.toolName) != null ? _a2 : c.name;
779
+ }
740
780
  );
741
781
  const expected = {
742
782
  functions: expectedNames
@@ -762,14 +802,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
762
802
  return { expected, actual, diff };
763
803
  };
764
804
  const concurrencyEnv = process.env.BFCL_CONCURRENCY;
765
- const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
805
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 16;
766
806
  logs.push(
767
807
  `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
768
808
  );
769
809
  const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
810
+ var _a, _b, _c, _d;
770
811
  try {
771
812
  const firstTool = transformedTools[0];
772
- const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
813
+ const schemaType = (_d = (_a = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _a.type) != null ? _d : (_c = (_b = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _b.jsonSchema) == null ? void 0 : _c.type;
773
814
  caseLogs.push(
774
815
  `[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
775
816
  );
@@ -785,7 +826,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
785
826
  caseLogs.push(
786
827
  `[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
787
828
  );
788
- } catch {
829
+ } catch (e) {
789
830
  caseLogs.push(
790
831
  `[DEBUG] ${testCaseId}: failed to serialize toolCalls`
791
832
  );
@@ -804,11 +845,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
804
845
  possibleAnswer
805
846
  } = options;
806
847
  const lastUser = (() => {
848
+ var _a;
807
849
  const reversed = [...flatMessages].reverse();
808
850
  const found = reversed.find(
809
851
  (m) => m.role === "user"
810
852
  );
811
- return found?.content ?? void 0;
853
+ return (_a = found == null ? void 0 : found.content) != null ? _a : void 0;
812
854
  })();
813
855
  const rawModelText = (() => {
814
856
  if (mwOriginalText && mwOriginalText.length > 0) {
@@ -879,9 +921,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
879
921
  caseLogs.push(
880
922
  `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
881
923
  );
882
- } catch {
924
+ } catch (e) {
883
925
  }
884
- } catch {
926
+ } catch (e) {
885
927
  caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
886
928
  }
887
929
  };
@@ -960,7 +1002,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
960
1002
  const flatMessages = flattenMessages(messages);
961
1003
  const { transformedTools, nameMap } = buildTransformedTools(
962
1004
  tools,
963
- fixSchema
1005
+ fixSchema2
964
1006
  );
965
1007
  const toolsMap = buildToolsMap(transformedTools);
966
1008
  return { flatMessages, transformedTools, nameMap, toolsMap };
@@ -982,6 +1024,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
982
1024
  const mwParsedToolCalls = parseDebugToolCalls(
983
1025
  debugSummaryRef.toolCalls
984
1026
  );
1027
+ const possibleAnswer = possibleAnswersMap.get(testCase.id);
1028
+ if (!possibleAnswer) {
1029
+ throw new Error(`No possible answer for id: ${testCase.id}`);
1030
+ }
1031
+ if (process.env.DEBUG_PARSER_OUTPUT === "true") {
1032
+ const groundTruth = possibleAnswer.ground_truth;
1033
+ const expectedXML = groundTruth.map((call) => convertGroundTruthToXML(call)).join("\n\n");
1034
+ console.log("\n========== BFCL CASE DEBUG ==========");
1035
+ console.log(`Test Case: ${testCase.id}`);
1036
+ console.log(`Expected count: ${groundTruth.length} call(s)`);
1037
+ console.log("\n--- EXPECTED OUTPUT (morphXML format) ---");
1038
+ console.log(expectedXML);
1039
+ console.log("\n--- ACTUAL MODEL OUTPUT (raw, with whitespace) ---");
1040
+ console.log(mwOriginalText || text || "(empty)");
1041
+ console.log(
1042
+ "\n--- PARSED TOOL CALLS (count: " + (Array.isArray(toolCalls) ? toolCalls.length : 0) + ") ---"
1043
+ );
1044
+ console.log(JSON.stringify(toolCalls, null, 2));
1045
+ console.log("======================================\n");
1046
+ }
985
1047
  logRawToolCalls({
986
1048
  toolCalls,
987
1049
  finishReason,
@@ -989,10 +1051,6 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
989
1051
  testCaseId: testCase.id,
990
1052
  caseLogs
991
1053
  });
992
- const possibleAnswer = possibleAnswersMap.get(testCase.id);
993
- if (!possibleAnswer) {
994
- throw new Error(`No possible answer for id: ${testCase.id}`);
995
- }
996
1054
  const restoredCalls = restoreToolCalls(
997
1055
  toolCalls || [],
998
1056
  nameMap,
@@ -1013,12 +1071,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1013
1071
  caseLogs
1014
1072
  });
1015
1073
  };
1016
- const runSingleCase = async (testCase) => {
1074
+ const runSingleCase2 = async (testCase) => {
1017
1075
  const caseLogs = [];
1018
1076
  const { function: tools } = testCase;
1019
- const temp = config?.temperature;
1077
+ const temp = config == null ? void 0 : config.temperature;
1020
1078
  const temperature = typeof temp === "number" ? temp : void 0;
1021
- const maxTok = config?.maxTokens;
1079
+ const maxTok = config == null ? void 0 : config.maxTokens;
1022
1080
  const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
1023
1081
  try {
1024
1082
  const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
@@ -1044,15 +1102,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1044
1102
  });
1045
1103
  } catch (e) {
1046
1104
  caseLogs.push(
1047
- `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
1105
+ `[ERROR] ${testCase.id}: Model generation failed: ${e == null ? void 0 : e.message}`
1048
1106
  );
1049
- if (e?.stack) {
1107
+ if (e == null ? void 0 : e.stack) {
1050
1108
  caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
1051
1109
  }
1052
1110
  return { valid: false, logs: caseLogs };
1053
1111
  }
1054
1112
  };
1055
- const mapWithConcurrency = async (items, concurrencyLimit, mapper) => {
1113
+ const mapWithConcurrency2 = async (items, concurrencyLimit, mapper) => {
1056
1114
  const results = new Array(items.length);
1057
1115
  let idx = 0;
1058
1116
  const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
@@ -1068,10 +1126,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1068
1126
  await Promise.all(workers);
1069
1127
  return results;
1070
1128
  };
1071
- const resultsPerCase = await mapWithConcurrency(
1129
+ const resultsPerCase = await mapWithConcurrency2(
1072
1130
  testCases,
1073
1131
  concurrency,
1074
- async (tc) => runSingleCase(tc)
1132
+ async (tc) => runSingleCase2(tc)
1075
1133
  );
1076
1134
  correctCount = resultsPerCase.reduce(
1077
1135
  (acc, r) => acc + (r.valid ? 1 : 0),
@@ -1139,19 +1197,391 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
1139
1197
  "BFCL_v3_parallel_multiple_possible_answer.jsonl"
1140
1198
  );
1141
1199
 
1142
- // src/benchmarks/json-generation.ts
1200
+ // src/benchmarks/complex-func-bench.ts
1143
1201
  import { promises as fs3 } from "fs";
1144
1202
  import path3 from "path";
1145
- import { generateText as generateText2 } from "ai";
1203
+ import {
1204
+ generateText as generateText2,
1205
+ jsonSchema as jsonSchema2,
1206
+ tool as tool2
1207
+ } from "ai";
1208
+ var LINE_SPLIT_REGEX2 = /\r?\n/;
1209
+ function standardizeString2(input) {
1210
+ if (typeof input !== "string") {
1211
+ return input;
1212
+ }
1213
+ return input.toLowerCase().trim();
1214
+ }
1215
+ function valuesMatch2(modelValue, expectedValue) {
1216
+ if (modelValue === expectedValue) {
1217
+ return true;
1218
+ }
1219
+ if (typeof modelValue === "string" && typeof expectedValue === "string") {
1220
+ return standardizeString2(modelValue) === standardizeString2(expectedValue);
1221
+ }
1222
+ if (typeof modelValue === "number" && typeof expectedValue === "string") {
1223
+ return modelValue.toString() === expectedValue || modelValue === Number(expectedValue);
1224
+ }
1225
+ if (typeof modelValue === "string" && typeof expectedValue === "number") {
1226
+ return modelValue === expectedValue.toString() || Number(modelValue) === expectedValue;
1227
+ }
1228
+ if (typeof modelValue === "object" && modelValue !== null && typeof expectedValue === "object" && expectedValue !== null) {
1229
+ try {
1230
+ return JSON.stringify(modelValue) === JSON.stringify(expectedValue);
1231
+ } catch (e) {
1232
+ return false;
1233
+ }
1234
+ }
1235
+ return false;
1236
+ }
1237
+ function validateFunctionName(modelFuncName, expectedFuncName) {
1238
+ if (modelFuncName !== expectedFuncName) {
1239
+ return {
1240
+ valid: false,
1241
+ error: `Function name mismatch: expected '${expectedFuncName}', got '${modelFuncName}'`,
1242
+ error_type: "function_name_mismatch"
1243
+ };
1244
+ }
1245
+ return { valid: true };
1246
+ }
1247
+ function validateRequiredParams(requiredParams, modelArgs, expectedArgs) {
1248
+ for (const param of requiredParams) {
1249
+ if (!(param in modelArgs) && param in expectedArgs) {
1250
+ return {
1251
+ valid: false,
1252
+ error: `Missing required parameter: '${param}'`,
1253
+ error_type: "missing_required_param"
1254
+ };
1255
+ }
1256
+ }
1257
+ return { valid: true };
1258
+ }
1259
+ function validateParamValues(expectedArgs, modelArgs, requiredParams) {
1260
+ for (const [paramName, expectedValue] of Object.entries(expectedArgs)) {
1261
+ if (!(paramName in modelArgs)) {
1262
+ if (!requiredParams.includes(paramName)) {
1263
+ continue;
1264
+ }
1265
+ return {
1266
+ valid: false,
1267
+ error: `Missing parameter: '${paramName}'`,
1268
+ error_type: "missing_param"
1269
+ };
1270
+ }
1271
+ const modelValue = modelArgs[paramName];
1272
+ if (!valuesMatch2(modelValue, expectedValue)) {
1273
+ return {
1274
+ valid: false,
1275
+ error: `Parameter '${paramName}' value mismatch: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(modelValue)}`,
1276
+ error_type: "value_mismatch"
1277
+ };
1278
+ }
1279
+ }
1280
+ return { valid: true };
1281
+ }
1282
+ function checkFunctionCall(modelCall, expected, toolSpecs) {
1283
+ var _a, _b, _c, _d;
1284
+ const expectedFuncName = Object.keys(expected)[0];
1285
+ const expectedArgs = expected[expectedFuncName];
1286
+ const modelFuncName = (_a = modelCall.toolName) != null ? _a : modelCall.name;
1287
+ const modelArgs = (_b = modelCall.args) != null ? _b : {};
1288
+ const nameResult = validateFunctionName(modelFuncName, expectedFuncName);
1289
+ if (!nameResult.valid) {
1290
+ return nameResult;
1291
+ }
1292
+ const toolSpec = toolSpecs.find((t) => t.name === expectedFuncName);
1293
+ const requiredParams = (_d = (_c = toolSpec == null ? void 0 : toolSpec.parameters) == null ? void 0 : _c.required) != null ? _d : [];
1294
+ const requiredResult = validateRequiredParams(
1295
+ requiredParams,
1296
+ modelArgs,
1297
+ expectedArgs
1298
+ );
1299
+ if (!requiredResult.valid) {
1300
+ return requiredResult;
1301
+ }
1302
+ return validateParamValues(expectedArgs, modelArgs, requiredParams);
1303
+ }
1304
+ function checkAllFunctionCalls(modelCalls, expectedCalls, toolSpecs) {
1305
+ if (modelCalls.length !== expectedCalls.length) {
1306
+ return {
1307
+ valid: false,
1308
+ error: `Wrong number of function calls: expected ${expectedCalls.length}, got ${modelCalls.length}`,
1309
+ error_type: "wrong_call_count"
1310
+ };
1311
+ }
1312
+ if (expectedCalls.length === 1) {
1313
+ return checkFunctionCall(modelCalls[0], expectedCalls[0], toolSpecs);
1314
+ }
1315
+ const matchedIndices = /* @__PURE__ */ new Set();
1316
+ for (const expected of expectedCalls) {
1317
+ let foundMatch = false;
1318
+ for (let i = 0; i < modelCalls.length; i++) {
1319
+ if (matchedIndices.has(i)) {
1320
+ continue;
1321
+ }
1322
+ const result = checkFunctionCall(modelCalls[i], expected, toolSpecs);
1323
+ if (result.valid) {
1324
+ matchedIndices.add(i);
1325
+ foundMatch = true;
1326
+ break;
1327
+ }
1328
+ }
1329
+ if (!foundMatch) {
1330
+ const expectedFuncName = Object.keys(expected)[0];
1331
+ return {
1332
+ valid: false,
1333
+ error: `Could not find matching call for function '${expectedFuncName}'`,
1334
+ error_type: "no_matching_call"
1335
+ };
1336
+ }
1337
+ }
1338
+ return { valid: true };
1339
+ }
1340
+ var fixSchemaType = (copy) => {
1341
+ if (!copy.type) {
1342
+ return;
1343
+ }
1344
+ if (copy.type === "dict") {
1345
+ copy.type = "object";
1346
+ }
1347
+ if (copy.type === "tuple") {
1348
+ copy.type = "array";
1349
+ }
1350
+ if (copy.type === "integer" || copy.type === "float") {
1351
+ copy.type = "number";
1352
+ }
1353
+ };
1354
+ var fixSchema = (schema) => {
1355
+ if (!schema || typeof schema !== "object") {
1356
+ return { type: "object", properties: {} };
1357
+ }
1358
+ const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
1359
+ if (!Array.isArray(copy)) {
1360
+ fixSchemaType(copy);
1361
+ if (copy.properties && typeof copy.properties === "object") {
1362
+ for (const k of Object.keys(copy.properties)) {
1363
+ copy.properties[k] = fixSchema(
1364
+ copy.properties[k]
1365
+ );
1366
+ }
1367
+ }
1368
+ if (copy.items) {
1369
+ copy.items = fixSchema(copy.items);
1370
+ }
1371
+ }
1372
+ return copy;
1373
+ };
1374
+ function buildTools(tools) {
1375
+ const nameMap = /* @__PURE__ */ new Map();
1376
+ const transformedTools = tools.map((t) => {
1377
+ const fixed = fixSchema(t.parameters);
1378
+ const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
1379
+ const sanitized = t.name.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64) || "tool";
1380
+ nameMap.set(sanitized, t.name);
1381
+ return {
1382
+ type: "function",
1383
+ name: sanitized,
1384
+ description: t.description,
1385
+ inputSchema
1386
+ };
1387
+ });
1388
+ const toolsMap = Object.fromEntries(
1389
+ transformedTools.map((t) => [
1390
+ t.name,
1391
+ tool2({
1392
+ description: typeof t.description === "string" ? t.description : void 0,
1393
+ inputSchema: jsonSchema2(t.inputSchema)
1394
+ })
1395
+ ])
1396
+ );
1397
+ return { nameMap, toolsMap };
1398
+ }
1399
+ async function mapWithConcurrency(items, concurrencyLimit, mapper) {
1400
+ const results = new Array(items.length);
1401
+ let idx = 0;
1402
+ const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
1403
+ while (true) {
1404
+ const current = idx;
1405
+ idx += 1;
1406
+ if (current >= items.length) {
1407
+ break;
1408
+ }
1409
+ results[current] = await mapper(items[current]);
1410
+ }
1411
+ });
1412
+ await Promise.all(workers);
1413
+ return results;
1414
+ }
1415
+ async function runSingleCase(testCase, model, possibleAnswersMap, temperature, maxTokens) {
1416
+ const caseLogs = [];
1417
+ const { function: tools, question: messages } = testCase;
1418
+ try {
1419
+ const { nameMap, toolsMap } = buildTools(tools);
1420
+ const debugSummaryRef = {};
1421
+ const providerOptions = {
1422
+ toolCallMiddleware: { debugSummary: debugSummaryRef }
1423
+ };
1424
+ const { toolCalls, finishReason } = await generateText2({
1425
+ model,
1426
+ messages,
1427
+ tools: toolsMap,
1428
+ toolChoice: "auto",
1429
+ providerOptions,
1430
+ ...temperature !== void 0 ? { temperature } : {},
1431
+ ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
1432
+ });
1433
+ const restoredCalls = (toolCalls != null ? toolCalls : []).map((c) => {
1434
+ var _a, _b, _c, _d;
1435
+ const rawName = (_a = c.toolName) != null ? _a : c.name;
1436
+ const originalName = (_b = nameMap.get(rawName)) != null ? _b : rawName;
1437
+ return {
1438
+ toolName: originalName,
1439
+ name: originalName,
1440
+ args: (_d = (_c = c.input) != null ? _c : c.args) != null ? _d : {}
1441
+ };
1442
+ });
1443
+ caseLogs.push(
1444
+ `[DEBUG] ${testCase.id}: toolCalls=${JSON.stringify(restoredCalls)}, finishReason=${finishReason}`
1445
+ );
1446
+ const possibleAnswer = possibleAnswersMap.get(testCase.id);
1447
+ if (!possibleAnswer) {
1448
+ throw new Error(`No possible answer for id: ${testCase.id}`);
1449
+ }
1450
+ const checkerResult = checkAllFunctionCalls(
1451
+ restoredCalls,
1452
+ possibleAnswer.ground_truth,
1453
+ tools
1454
+ );
1455
+ if (checkerResult.valid) {
1456
+ caseLogs.push(`[PASS] ${testCase.id}`);
1457
+ return { valid: true, logs: caseLogs };
1458
+ }
1459
+ caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
1460
+ return { valid: false, logs: caseLogs };
1461
+ } catch (e) {
1462
+ caseLogs.push(`[ERROR] ${testCase.id}: ${e == null ? void 0 : e.message}`);
1463
+ return { valid: false, logs: caseLogs };
1464
+ }
1465
+ }
1466
+ async function loadTestData(dataPath, testDataFile) {
1467
+ const testCasesJson = await fs3.readFile(
1468
+ path3.join(dataPath, testDataFile),
1469
+ "utf-8"
1470
+ );
1471
+ return testCasesJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1472
+ }
1473
+ async function loadAnswerData(dataPath, answerDataFile) {
1474
+ const answersJson = await fs3.readFile(
1475
+ path3.join(dataPath, answerDataFile),
1476
+ "utf-8"
1477
+ );
1478
+ const answers = answersJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1479
+ return new Map(answers.map((ans) => [ans.id, ans]));
1480
+ }
1481
+ function getConfigValues(config) {
1482
+ const limitEnv = process.env.COMPLEXFUNCBENCH_LIMIT;
1483
+ const limit = limitEnv ? Number(limitEnv) : void 0;
1484
+ const concurrencyEnv = process.env.COMPLEXFUNCBENCH_CONCURRENCY;
1485
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
1486
+ const temperature = typeof (config == null ? void 0 : config.temperature) === "number" ? config.temperature : void 0;
1487
+ const maxTokens = typeof (config == null ? void 0 : config.maxTokens) === "number" ? config.maxTokens : void 0;
1488
+ return { limit, concurrency, temperature, maxTokens };
1489
+ }
1490
+ function aggregateResults(resultsPerCase, testCases) {
1491
+ const logs = [];
1492
+ const correctCount = resultsPerCase.reduce(
1493
+ (acc, r) => acc + (r.valid ? 1 : 0),
1494
+ 0
1495
+ );
1496
+ for (const r of resultsPerCase) {
1497
+ logs.push(...r.logs);
1498
+ }
1499
+ if (testCases.length === 0) {
1500
+ return {
1501
+ score: 0,
1502
+ success: false,
1503
+ metrics: {},
1504
+ logs: ["No test cases found."]
1505
+ };
1506
+ }
1507
+ const score = correctCount / testCases.length;
1508
+ return {
1509
+ score,
1510
+ success: score > 0.5,
1511
+ metrics: {
1512
+ correct_count: correctCount,
1513
+ total_cases: testCases.length,
1514
+ accuracy: score
1515
+ },
1516
+ logs
1517
+ };
1518
+ }
1519
+ function createComplexFuncBenchBenchmark(name, description, testDataFile, answerDataFile) {
1520
+ return {
1521
+ name,
1522
+ version: "1.0.0",
1523
+ description,
1524
+ async run(model, config) {
1525
+ var _a;
1526
+ const logs = [];
1527
+ try {
1528
+ const dataPath = resolveDataDir();
1529
+ logs.push(`[INFO] Using data dir: ${dataPath}`);
1530
+ let testCases = await loadTestData(dataPath, testDataFile);
1531
+ const possibleAnswersMap = await loadAnswerData(
1532
+ dataPath,
1533
+ answerDataFile
1534
+ );
1535
+ const { limit, concurrency, temperature, maxTokens } = getConfigValues(config);
1536
+ if (limit && Number.isFinite(limit) && limit > 0) {
1537
+ testCases = testCases.slice(0, limit);
1538
+ logs.push(`[INFO] Limiting test cases to ${limit}`);
1539
+ }
1540
+ logs.push(
1541
+ `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
1542
+ );
1543
+ const resultsPerCase = await mapWithConcurrency(
1544
+ testCases,
1545
+ concurrency,
1546
+ (tc) => runSingleCase(tc, model, possibleAnswersMap, temperature, maxTokens)
1547
+ );
1548
+ const result = aggregateResults(resultsPerCase, testCases);
1549
+ result.logs = [...logs, ...(_a = result.logs) != null ? _a : []];
1550
+ return result;
1551
+ } catch (e) {
1552
+ return {
1553
+ score: 0,
1554
+ success: false,
1555
+ metrics: {},
1556
+ error: e,
1557
+ logs: [
1558
+ `[FATAL] Failed to run benchmark ${name}: ${e.message}`
1559
+ ]
1560
+ };
1561
+ }
1562
+ }
1563
+ };
1564
+ }
1565
+ var complexFuncBenchBenchmark = createComplexFuncBenchBenchmark(
1566
+ "complex-func-bench",
1567
+ "ComplexFuncBench - Complex Function Calling (multi-step, constraints, long params)",
1568
+ "ComplexFuncBench.jsonl",
1569
+ "ComplexFuncBench_possible_answer.jsonl"
1570
+ );
1571
+
1572
+ // src/benchmarks/json-generation.ts
1573
+ import { promises as fs4 } from "fs";
1574
+ import path4 from "path";
1575
+ import { generateText as generateText3 } from "ai";
1146
1576
  import Ajv from "ajv";
1147
1577
  var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
1148
1578
  var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
1149
1579
  var NEWLINE_REGEX = /\r?\n/;
1150
- var LINE_SPLIT_REGEX2 = /\r?\n/;
1580
+ var LINE_SPLIT_REGEX3 = /\r?\n/;
1151
1581
  function tryDirectParse(text) {
1152
1582
  try {
1153
1583
  return JSON.parse(text);
1154
- } catch {
1584
+ } catch (e) {
1155
1585
  return;
1156
1586
  }
1157
1587
  }
@@ -1163,7 +1593,7 @@ function tryCodeFenceParse(text) {
1163
1593
  const inner = fenceMatch[1].trim();
1164
1594
  try {
1165
1595
  return JSON.parse(inner);
1166
- } catch {
1596
+ } catch (e) {
1167
1597
  return;
1168
1598
  }
1169
1599
  }
@@ -1188,7 +1618,7 @@ function tryBracketScan(text) {
1188
1618
  const candidate = text.slice(start, i + 1);
1189
1619
  try {
1190
1620
  return JSON.parse(candidate);
1191
- } catch {
1621
+ } catch (e) {
1192
1622
  return;
1193
1623
  }
1194
1624
  }
@@ -1236,12 +1666,12 @@ function subsetMatch(expected, actual) {
1236
1666
  async function loadDatasets() {
1237
1667
  try {
1238
1668
  const dataDir = resolveDataDir();
1239
- const testsJsonl = await fs3.readFile(
1240
- path3.join(dataDir, "json_generation_tests.jsonl"),
1669
+ const testsJsonl = await fs4.readFile(
1670
+ path4.join(dataDir, "json_generation_tests.jsonl"),
1241
1671
  "utf-8"
1242
1672
  );
1243
- const expectedJsonl = await fs3.readFile(
1244
- path3.join(dataDir, "json_generation_expected.jsonl"),
1673
+ const expectedJsonl = await fs4.readFile(
1674
+ path4.join(dataDir, "json_generation_expected.jsonl"),
1245
1675
  "utf-8"
1246
1676
  );
1247
1677
  const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -1297,10 +1727,11 @@ function validateTestCase(tc, parsed, context) {
1297
1727
  return { valid, valuesOk, parsed };
1298
1728
  }
1299
1729
  async function processTestCase(tc, context) {
1730
+ var _a;
1300
1731
  const messages = buildMessages(tc);
1301
- const temp = context.config?.temperature;
1732
+ const temp = (_a = context.config) == null ? void 0 : _a.temperature;
1302
1733
  const temperature = typeof temp === "number" ? temp : void 0;
1303
- const { text } = await generateText2({
1734
+ const { text } = await generateText3({
1304
1735
  model: context.model,
1305
1736
  messages,
1306
1737
  ...temperature !== void 0 ? { temperature } : {}
@@ -1308,7 +1739,7 @@ async function processTestCase(tc, context) {
1308
1739
  let parsed;
1309
1740
  try {
1310
1741
  parsed = extractFirstJsonBlock(text);
1311
- } catch {
1742
+ } catch (e) {
1312
1743
  }
1313
1744
  if (parsed === void 0) {
1314
1745
  context.validation.logs.push(
@@ -1402,21 +1833,22 @@ function buildBenchmarkResult(total, counts, logs) {
1402
1833
  async function loadSchemaOnlyTests() {
1403
1834
  try {
1404
1835
  const dataDir = resolveDataDir();
1405
- const testsJsonl = await fs3.readFile(
1406
- path3.join(dataDir, "json_generation_tests.jsonl"),
1836
+ const testsJsonl = await fs4.readFile(
1837
+ path4.join(dataDir, "json_generation_tests.jsonl"),
1407
1838
  "utf-8"
1408
1839
  );
1409
- const tests = testsJsonl.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1840
+ const tests = testsJsonl.split(LINE_SPLIT_REGEX3).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1410
1841
  return { tests };
1411
1842
  } catch (e) {
1412
1843
  return { tests: [], error: e };
1413
1844
  }
1414
1845
  }
1415
1846
  async function processSchemaOnlyTestCase(tc, context) {
1847
+ var _a;
1416
1848
  const messages = buildMessages(tc);
1417
- const temp = context.config?.temperature;
1849
+ const temp = (_a = context.config) == null ? void 0 : _a.temperature;
1418
1850
  const temperature = typeof temp === "number" ? temp : void 0;
1419
- const { text } = await generateText2({
1851
+ const { text } = await generateText3({
1420
1852
  model: context.model,
1421
1853
  messages,
1422
1854
  ...temperature !== void 0 ? { temperature } : {}
@@ -1424,7 +1856,7 @@ async function processSchemaOnlyTestCase(tc, context) {
1424
1856
  let parsed;
1425
1857
  try {
1426
1858
  parsed = extractFirstJsonBlock(text);
1427
- } catch {
1859
+ } catch (e) {
1428
1860
  }
1429
1861
  if (parsed === void 0) {
1430
1862
  context.logs.push(
@@ -1501,8 +1933,56 @@ var colors = {
1501
1933
  yellow: "\x1B[33m",
1502
1934
  cyan: "\x1B[36m",
1503
1935
  magenta: "\x1B[35m",
1504
- gray: "\x1B[90m"
1936
+ gray: "\x1B[90m",
1937
+ white: "\x1B[37m",
1938
+ bgRed: "\x1B[41m"
1505
1939
  };
1940
/**
 * Colorize diff lines for terminal output.
 *
 * Lines starting with "-" are wrapped in red, "+" in green and "@@" in cyan;
 * all other lines pass through untouched.
 *
 * @param {string[]} diff - Diff lines; may be empty or missing.
 * @returns {string} The lines joined with "\n " or "" when there is nothing to show.
 */
function formatDiff(diff) {
  if (!diff || diff.length === 0) {
    return "";
  }
  // Checked in order; the first matching prefix decides the color.
  const prefixColors = [
    ["-", colors.red],
    ["+", colors.green],
    ["@@", colors.cyan]
  ];
  const paint = (entry) => {
    const hit = prefixColors.find(([prefix]) => entry.startsWith(prefix));
    return hit === void 0 ? entry : `${hit[1]}${entry}${colors.reset}`;
  };
  return diff.map(paint).join("\n ");
}
1957
/**
 * Print a readable summary for every "[DEBUG-FAIL]" entry in a benchmark's logs.
 *
 * Each such entry is expected to carry a JSON payload after the marker. For a
 * payload that parses, the case id, error type, message, optional diff and
 * (when both are short) the expected/actual values are printed. An entry whose
 * payload cannot be parsed or rendered is printed verbatim as a raw log line.
 *
 * @param {string[]} logs - All log lines of one benchmark result.
 * @returns {void}
 */
function printFailLogs(logs) {
  const marker = "[DEBUG-FAIL]";
  for (const log of logs) {
    // Non-string entries cannot carry the marker; skip instead of throwing.
    if (typeof log !== "string" || !log.startsWith(marker)) {
      continue;
    }
    // Build the full record before printing anything, so a failure midway
    // yields only the raw-log fallback rather than a half-printed record.
    let output;
    try {
      // JSON.parse ignores leading whitespace, so the separator after the
      // marker may be a space, several spaces, or missing altogether.
      const data = JSON.parse(log.slice(marker.length));
      output = [
        `\n${colors.red}FAILED CASE: ${data.id}${colors.reset}`,
        ` Error Type: ${colors.yellow}${data.error_type || "unknown"}${colors.reset}`,
        ` Message: ${data.message}`
      ];
      if (Array.isArray(data.diff)) {
        output.push(` Diff:\n${formatDiff(data.diff)}`);
      }
      // Compare against undefined rather than truthiness: 0, "", false and
      // null are legitimate expected/actual values and must still be shown.
      if (data.expected !== void 0 && data.actual !== void 0) {
        const expStr = JSON.stringify(data.expected);
        const actStr = JSON.stringify(data.actual);
        // Long values are omitted to keep the summary compact.
        if (expStr.length < 100 && actStr.length < 100) {
          output.push(` Expected: ${colors.gray}${expStr}${colors.reset}`);
          output.push(` Actual: ${colors.gray}${actStr}${colors.reset}`);
        }
      }
    } catch (_e) {
      output = [` Raw Log: ${log}`];
    }
    for (const text of output) {
      console.log(text);
    }
  }
}
1506
1986
  function printResult(result) {
1507
1987
  const { model, modelKey, benchmark, result: benchmarkResult } = result;
1508
1988
  const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
@@ -1525,6 +2005,18 @@ function printResult(result) {
1525
2005
  ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
1526
2006
  );
1527
2007
  }
2008
+ if (!benchmarkResult.success && benchmarkResult.logs) {
2009
+ printFailLogs(benchmarkResult.logs);
2010
+ const failLogs = benchmarkResult.logs.filter(
2011
+ (l) => l.startsWith("[DEBUG-FAIL]")
2012
+ );
2013
+ if (failLogs.length === 0 && benchmarkResult.logs.length > 0) {
2014
+ console.log(" Raw Logs (Sample):");
2015
+ for (const l of benchmarkResult.logs.slice(0, 10)) {
2016
+ console.log(` ${l}`);
2017
+ }
2018
+ }
2019
+ }
1528
2020
  }
1529
2021
  function consoleReporter(results) {
1530
2022
  console.log("\n--- \u{1F4CA} Evaluation Report ---");
@@ -1579,14 +2071,14 @@ function hasFunctionNameIssue(diff) {
1579
2071
  );
1580
2072
  }
1581
2073
  function suggestFunctionNameFix(expected, actual, suggestions) {
1582
- const expectedName = expected?.function;
1583
- const actualName = actual?.function;
2074
+ const expectedName = expected == null ? void 0 : expected.function;
2075
+ const actualName = actual == null ? void 0 : actual.function;
1584
2076
  if (expectedName && actualName && expectedName !== actualName) {
1585
2077
  suggestions.push(
1586
2078
  `Call the function '${expectedName}' instead of '${actualName}'.`
1587
2079
  );
1588
2080
  }
1589
- if (Array.isArray(expected?.functions)) {
2081
+ if (Array.isArray(expected == null ? void 0 : expected.functions)) {
1590
2082
  suggestions.push(
1591
2083
  `Ensure tool calls include: ${expected.functions.join(", ")}.`
1592
2084
  );
@@ -1641,7 +2133,7 @@ function suggestFromErrorType(error_type, suggestions) {
1641
2133
  }
1642
2134
  function suggestFixFromDiff(parsed) {
1643
2135
  const suggestions = [];
1644
- const { error_type, expected, actual, diff } = parsed ?? {};
2136
+ const { error_type, expected, actual, diff } = parsed != null ? parsed : {};
1645
2137
  if (!Array.isArray(diff)) {
1646
2138
  if (suggestions.length === 0 && typeof error_type === "string") {
1647
2139
  suggestFromErrorType(error_type, suggestions);
@@ -1666,15 +2158,16 @@ function suggestFixFromDiff(parsed) {
1666
2158
  return uniqueLines(suggestions);
1667
2159
  }
1668
2160
  function getTestIdFromLogLine(line) {
2161
+ var _a, _b;
1669
2162
  if (line.startsWith("[FAIL]")) {
1670
2163
  const m = line.match(FAIL_ID_REGEX);
1671
- return m?.[1];
2164
+ return m == null ? void 0 : m[1];
1672
2165
  }
1673
2166
  if (line.startsWith("[DEBUG-FAIL]")) {
1674
2167
  try {
1675
2168
  const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
1676
- return String(parsed?.id ?? "");
1677
- } catch {
2169
+ return String((_a = parsed == null ? void 0 : parsed.id) != null ? _a : "");
2170
+ } catch (e) {
1678
2171
  }
1679
2172
  }
1680
2173
  if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
@@ -1682,18 +2175,19 @@ function getTestIdFromLogLine(line) {
1682
2175
  const parsed = JSON.parse(
1683
2176
  line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "")
1684
2177
  );
1685
- return String(parsed?.id ?? "");
1686
- } catch {
2178
+ return String((_b = parsed == null ? void 0 : parsed.id) != null ? _b : "");
2179
+ } catch (e) {
1687
2180
  }
1688
2181
  }
1689
2182
  return;
1690
2183
  }
1691
2184
  function groupLogsByTestId(failLogs) {
2185
+ var _a;
1692
2186
  const byId = /* @__PURE__ */ new Map();
1693
2187
  for (const line of failLogs) {
1694
2188
  const id = getTestIdFromLogLine(line);
1695
- const key = id ?? "__general__";
1696
- const arr = byId.get(key) ?? [];
2189
+ const key = id != null ? id : "__general__";
2190
+ const arr = (_a = byId.get(key)) != null ? _a : [];
1697
2191
  arr.push(line);
1698
2192
  byId.set(key, arr);
1699
2193
  }
@@ -1705,10 +2199,10 @@ function collectDebugIds(lines) {
1705
2199
  if (l.startsWith("[DEBUG-FAIL]")) {
1706
2200
  try {
1707
2201
  const parsed = JSON.parse(l.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
1708
- if (parsed?.id) {
2202
+ if (parsed == null ? void 0 : parsed.id) {
1709
2203
  debugIds.add(String(parsed.id));
1710
2204
  }
1711
- } catch {
2205
+ } catch (e) {
1712
2206
  }
1713
2207
  }
1714
2208
  }
@@ -1744,7 +2238,7 @@ function displayDebugFailLine(line) {
1744
2238
  console.log(` \u2022 ${s}`);
1745
2239
  }
1746
2240
  }
1747
- } catch {
2241
+ } catch (e) {
1748
2242
  console.log(` ${line}`);
1749
2243
  }
1750
2244
  }
@@ -1788,14 +2282,14 @@ function displayDebugFailContextLine(line) {
1788
2282
  const ctx = JSON.parse(payload);
1789
2283
  console.log(` ${colors2.gray}context:${colors2.reset}`);
1790
2284
  displayContextInfo(ctx);
1791
- } catch {
2285
+ } catch (e) {
1792
2286
  console.log(` ${line}`);
1793
2287
  }
1794
2288
  }
1795
2289
  function displayLogLine(line, debugIds) {
1796
2290
  if (line.startsWith("[FAIL]")) {
1797
2291
  const m = line.match(FAIL_ID_REGEX);
1798
- const failId = m?.[1];
2292
+ const failId = m == null ? void 0 : m[1];
1799
2293
  if (failId && debugIds.has(failId)) {
1800
2294
  return;
1801
2295
  }
@@ -1865,11 +2359,12 @@ function displayResultHeader(r) {
1865
2359
  );
1866
2360
  }
1867
2361
  function consoleDebugReporter(results) {
2362
+ var _a;
1868
2363
  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
1869
2364
  for (const r of results) {
1870
2365
  displayResultHeader(r);
1871
2366
  displayMetrics(Object.entries(r.result.metrics));
1872
- if (r.result.logs?.length) {
2367
+ if ((_a = r.result.logs) == null ? void 0 : _a.length) {
1873
2368
  displayResultLogs(r.result.logs);
1874
2369
  }
1875
2370
  }
@@ -1878,13 +2373,16 @@ function consoleDebugReporter(results) {
1878
2373
 
1879
2374
  // src/reporters/json.ts
1880
2375
  function jsonReporter(results) {
1881
- const serializableResults = results.map((r) => ({
1882
- ...r,
1883
- result: {
1884
- ...r.result,
1885
- error: r.result.error?.message
1886
- }
1887
- }));
2376
+ const serializableResults = results.map((r) => {
2377
+ var _a;
2378
+ return {
2379
+ ...r,
2380
+ result: {
2381
+ ...r.result,
2382
+ error: (_a = r.result.error) == null ? void 0 : _a.message
2383
+ }
2384
+ };
2385
+ });
1888
2386
  console.log(JSON.stringify(serializableResults, null, 2));
1889
2387
  }
1890
2388
 
@@ -1996,6 +2494,7 @@ export {
1996
2494
  bfclParallelBenchmark,
1997
2495
  bfclParallelMultipleBenchmark,
1998
2496
  bfclSimpleBenchmark,
2497
+ complexFuncBenchBenchmark,
1999
2498
  evaluate,
2000
2499
  jsonGenerationBenchmark,
2001
2500
  jsonGenerationSchemaOnlyBenchmark