@ai-sdk-tool/eval 1.0.0-canary.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -34,6 +34,7 @@ __export(index_exports, {
34
34
  bfclParallelBenchmark: () => bfclParallelBenchmark,
35
35
  bfclParallelMultipleBenchmark: () => bfclParallelMultipleBenchmark,
36
36
  bfclSimpleBenchmark: () => bfclSimpleBenchmark,
37
+ complexFuncBenchBenchmark: () => complexFuncBenchBenchmark,
37
38
  evaluate: () => evaluate,
38
39
  jsonGenerationBenchmark: () => jsonGenerationBenchmark,
39
40
  jsonGenerationSchemaOnlyBenchmark: () => jsonGenerationSchemaOnlyBenchmark
@@ -61,7 +62,7 @@ function tryResolveViaPackageEntry(moduleUrl) {
61
62
  if (import_node_fs.default.existsSync(dataAtRoot)) {
62
63
  return dataAtRoot;
63
64
  }
64
- } catch {
65
+ } catch (e) {
65
66
  }
66
67
  return null;
67
68
  }
@@ -75,7 +76,7 @@ function tryResolveViaPackageJson(moduleUrl) {
75
76
  if (import_node_fs.default.existsSync(dataAtPkg)) {
76
77
  return dataAtPkg;
77
78
  }
78
- } catch {
79
+ } catch (e) {
79
80
  }
80
81
  return null;
81
82
  }
@@ -83,7 +84,7 @@ function getStartDir(moduleUrl) {
83
84
  if (moduleUrl) {
84
85
  try {
85
86
  return import_node_path.default.dirname((0, import_node_url.fileURLToPath)(moduleUrl));
86
- } catch {
87
+ } catch (e) {
87
88
  return process.cwd();
88
89
  }
89
90
  }
@@ -177,7 +178,7 @@ function valuesMatch(modelValue, possibleValue) {
177
178
  const normalizedModel = normalizeObject(modelValue);
178
179
  const normalizedPossible = normalizeObject(possibleValue);
179
180
  return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
180
- } catch {
181
+ } catch (e) {
181
182
  return false;
182
183
  }
183
184
  }
@@ -306,7 +307,7 @@ function checkSingleParameter(paramName, modelValue, context) {
306
307
  return checkStringValue(
307
308
  paramName,
308
309
  modelValue,
309
- possibleValues ?? []
310
+ possibleValues != null ? possibleValues : []
310
311
  );
311
312
  }
312
313
  if (Array.isArray(modelValue)) {
@@ -406,45 +407,99 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
406
407
  // src/benchmarks/bfcl.ts
407
408
  var LINE_SPLIT_REGEX = /\r?\n/;
408
409
  var NUMERIC_STRING_REGEX = /^\d+$/;
410
+ var DIFF_NUMERIC_EXTRACT_REGEX = /:\s*([\d.]+)/;
411
+ function convertGroundTruthToXML(call) {
412
+ const keys = Object.keys(call);
413
+ if (keys.length === 0) {
414
+ return "<empty_call />";
415
+ }
416
+ const funcName = keys[0];
417
+ if (!funcName) {
418
+ return "<undefined_function />";
419
+ }
420
+ const params = call[funcName];
421
+ if (!params || typeof params !== "object") {
422
+ return `<${funcName} />`;
423
+ }
424
+ let xml = `<${funcName}>
425
+ `;
426
+ for (const [key, value] of Object.entries(params)) {
427
+ const displayValue = Array.isArray(value) ? value[0] : value;
428
+ let valueStr;
429
+ if (typeof displayValue === "string") {
430
+ valueStr = displayValue;
431
+ } else if (displayValue === null || displayValue === void 0) {
432
+ valueStr = "";
433
+ } else {
434
+ valueStr = JSON.stringify(displayValue);
435
+ }
436
+ xml += ` <${key}>${valueStr}</${key}>
437
+ `;
438
+ }
439
+ xml += `</${funcName}>`;
440
+ return xml;
441
+ }
442
+ function extractCategory(id) {
443
+ if (id.startsWith("parallel_multiple")) {
444
+ return "parallel_multiple";
445
+ }
446
+ if (id.startsWith("simple_python")) {
447
+ return "simple";
448
+ }
449
+ if (id.startsWith("simple_java")) {
450
+ return "simple";
451
+ }
452
+ if (id.startsWith("simple_javascript")) {
453
+ return "simple";
454
+ }
455
+ if (id.startsWith("parallel")) {
456
+ return "parallel";
457
+ }
458
+ if (id.startsWith("multiple")) {
459
+ return "multiple";
460
+ }
461
+ if (id.startsWith("simple")) {
462
+ return "simple";
463
+ }
464
+ return id.split("_")[0];
465
+ }
409
466
  function check(testCase, modelOutput, possibleAnswer) {
410
- const category = testCase.id.split("_")[0];
467
+ const category = extractCategory(testCase.id);
411
468
  try {
412
- if (category === "simple") {
413
- if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
414
- return {
415
- valid: false,
416
- error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
417
- error_type: "simple:wrong_count"
418
- };
469
+ switch (category) {
470
+ case "simple": {
471
+ if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
472
+ return {
473
+ valid: false,
474
+ error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
475
+ error_type: "simple:wrong_count"
476
+ };
477
+ }
478
+ return simpleFunctionChecker(
479
+ testCase.function[0],
480
+ modelOutput[0],
481
+ possibleAnswer.ground_truth[0]
482
+ );
483
+ }
484
+ case "multiple": {
485
+ return multipleFunctionChecker(
486
+ testCase.function,
487
+ modelOutput,
488
+ possibleAnswer.ground_truth
489
+ );
490
+ }
491
+ case "parallel":
492
+ case "parallel_multiple": {
493
+ return parallelFunctionCheckerNoOrder(
494
+ testCase.function,
495
+ modelOutput,
496
+ possibleAnswer.ground_truth
497
+ );
498
+ }
499
+ default: {
500
+ return { valid: true };
419
501
  }
420
- return simpleFunctionChecker(
421
- testCase.function[0],
422
- modelOutput[0],
423
- possibleAnswer.ground_truth[0]
424
- );
425
- }
426
- if (category === "parallel") {
427
- return parallelFunctionCheckerNoOrder(
428
- testCase.function,
429
- modelOutput,
430
- possibleAnswer.ground_truth
431
- );
432
- }
433
- if (category === "multiple") {
434
- return multipleFunctionChecker(
435
- testCase.function,
436
- modelOutput,
437
- possibleAnswer.ground_truth
438
- );
439
- }
440
- if (category.includes("parallel-multiple")) {
441
- return parallelFunctionCheckerNoOrder(
442
- testCase.function,
443
- modelOutput,
444
- possibleAnswer.ground_truth
445
- );
446
502
  }
447
- return { valid: true };
448
503
  } catch (e) {
449
504
  return {
450
505
  valid: false,
@@ -486,7 +541,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
486
541
  `[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
487
542
  );
488
543
  }
489
- const fixSchemaType = (copy) => {
544
+ const fixSchemaType2 = (copy) => {
490
545
  if (!copy.type) {
491
546
  return;
492
547
  }
@@ -510,16 +565,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
510
565
  );
511
566
  }
512
567
  };
513
- const fixSchema = (schema) => {
568
+ const fixSchema2 = (schema) => {
514
569
  if (!schema || typeof schema !== "object") {
515
570
  return { type: "object", properties: {} };
516
571
  }
517
- const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
572
+ const copy = Array.isArray(schema) ? schema.map((v) => fixSchema2(v)) : { ...schema };
518
573
  if (!Array.isArray(copy)) {
519
- fixSchemaType(copy);
520
- fixSchemaProperties(copy, fixSchema);
574
+ fixSchemaType2(copy);
575
+ fixSchemaProperties(copy, fixSchema2);
521
576
  if (copy.items) {
522
- copy.items = fixSchema(copy.items);
577
+ copy.items = fixSchema2(copy.items);
523
578
  }
524
579
  return copy;
525
580
  }
@@ -554,13 +609,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
554
609
  try {
555
610
  const arr = JSON.parse(raw);
556
611
  return Array.isArray(arr) ? arr : [];
557
- } catch {
612
+ } catch (e) {
558
613
  return [];
559
614
  }
560
615
  };
561
616
  const getSanitizedName = (rawName, transformedTools) => {
617
+ var _a, _b;
562
618
  if (typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName)) {
563
- return transformedTools[Number(rawName)]?.name ?? rawName;
619
+ return (_b = (_a = transformedTools[Number(rawName)]) == null ? void 0 : _a.name) != null ? _b : rawName;
564
620
  }
565
621
  return rawName;
566
622
  };
@@ -570,25 +626,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
570
626
  }
571
627
  try {
572
628
  return JSON.parse(extractedArgs);
573
- } catch {
629
+ } catch (e) {
574
630
  return extractedArgs;
575
631
  }
576
632
  };
577
633
  const restoreToolCalls = (toolCalls, nameMap, transformedTools) => (toolCalls || []).map((c) => {
634
+ var _a, _b, _c, _d, _e, _f;
578
635
  const call = c;
579
- const rawName = call.toolName ?? call.name;
636
+ const rawName = (_a = call.toolName) != null ? _a : call.name;
580
637
  const sanitizedFromIndex = getSanitizedName(
581
638
  rawName,
582
639
  transformedTools
583
640
  );
584
- const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
585
- const extractedArgs = call.args ?? call.arguments ?? call.input ?? call.params ?? call.parameters;
641
+ const originalName = (_b = nameMap.get(sanitizedFromIndex)) != null ? _b : sanitizedFromIndex;
642
+ const extractedArgs = (_f = (_e = (_d = (_c = call.args) != null ? _c : call.arguments) != null ? _d : call.input) != null ? _e : call.params) != null ? _f : call.parameters;
586
643
  const parsedArgs = parseToolArgs(extractedArgs);
587
644
  return {
588
645
  ...call,
589
646
  toolName: originalName,
590
647
  name: originalName,
591
- args: parsedArgs ?? {}
648
+ args: parsedArgs != null ? parsedArgs : {}
592
649
  };
593
650
  });
594
651
  const summarizeArgs = (args) => {
@@ -620,7 +677,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
620
677
  return `- expected one of: ${formatted}`;
621
678
  })();
622
679
  diffLines.push(expectedLine);
623
- diffLines.push(`+ got: ${JSON.stringify(got)}`);
680
+ diffLines.push(`+ got: ${JSON.stringify(got)}`);
624
681
  return diffLines;
625
682
  };
626
683
  const paramValueMatches = (allowed, got) => {
@@ -632,7 +689,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
632
689
  if (Array.isArray(got)) {
633
690
  return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
634
691
  }
635
- } catch {
692
+ } catch (e) {
636
693
  }
637
694
  return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
638
695
  });
@@ -670,13 +727,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
670
727
  }
671
728
  };
672
729
  const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
730
+ var _a, _b, _c, _d;
673
731
  const funcDesc = tools[0];
674
- const gt = possibleAnswer.ground_truth?.[0];
675
- const expectedFuncName = funcDesc?.name;
732
+ const gt = (_a = possibleAnswer.ground_truth) == null ? void 0 : _a[0];
733
+ const expectedFuncName = funcDesc == null ? void 0 : funcDesc.name;
676
734
  const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
677
735
  const received = restoredCalls[0];
678
- const receivedName = received?.toolName ?? received?.name;
679
- const receivedArgs = summarizeArgs(received?.args);
736
+ const receivedName = (_b = received == null ? void 0 : received.toolName) != null ? _b : received == null ? void 0 : received.name;
737
+ const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
680
738
  const expected = {
681
739
  function: expectedFuncName,
682
740
  params: expectedParams
@@ -688,7 +746,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
688
746
  const diff = [];
689
747
  checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
690
748
  if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
691
- const required = funcDesc?.parameters?.required ?? [];
749
+ const required = (_d = (_c = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _c.required) != null ? _d : [];
692
750
  checkMissingParams(
693
751
  required,
694
752
  receivedArgs,
@@ -725,12 +783,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
725
783
  }
726
784
  };
727
785
  const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
786
+ var _a;
728
787
  for (let i = 0; i < restoredCalls.length; i += 1) {
729
788
  if (usedActual.has(i)) {
730
789
  continue;
731
790
  }
732
791
  const rc = restoredCalls[i];
733
- const rcName = rc?.toolName ?? rc?.name;
792
+ const rcName = (_a = rc == null ? void 0 : rc.toolName) != null ? _a : rc == null ? void 0 : rc.name;
734
793
  if (rcName === fname) {
735
794
  return i;
736
795
  }
@@ -744,6 +803,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
744
803
  checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
745
804
  };
746
805
  const processExpectedCall = (options) => {
806
+ var _a, _b;
747
807
  const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
748
808
  const fname = Object.keys(expectedObj)[0];
749
809
  const matchedIndex = findMatchingCallIndex(
@@ -756,10 +816,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
756
816
  }
757
817
  usedActual.add(matchedIndex);
758
818
  const received = restoredCalls[matchedIndex];
759
- const receivedArgs = summarizeArgs(received?.args);
819
+ const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
760
820
  const expectedParamsAllowed = expectedObj[fname];
761
821
  const funcDesc = tools.find((t) => t.name === fname);
762
- const requiredParams = funcDesc?.parameters?.required ?? [];
822
+ const requiredParams = (_b = (_a = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _a.required) != null ? _b : [];
763
823
  diff.push(`@@ function ${fname}`);
764
824
  if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
765
825
  validateFunctionParams({
@@ -771,10 +831,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
771
831
  }
772
832
  };
773
833
  const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
774
- const gtArr = possibleAnswer.ground_truth ?? [];
834
+ var _a;
835
+ const gtArr = (_a = possibleAnswer.ground_truth) != null ? _a : [];
775
836
  const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
776
837
  const actualNames = restoredCalls.map(
777
- (c) => c.toolName ?? c.name
838
+ (c) => {
839
+ var _a2;
840
+ return (_a2 = c.toolName) != null ? _a2 : c.name;
841
+ }
778
842
  );
779
843
  const expected = {
780
844
  functions: expectedNames
@@ -800,14 +864,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
800
864
  return { expected, actual, diff };
801
865
  };
802
866
  const concurrencyEnv = process.env.BFCL_CONCURRENCY;
803
- const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
867
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 16;
804
868
  logs.push(
805
869
  `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
806
870
  );
807
871
  const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
872
+ var _a, _b, _c, _d;
808
873
  try {
809
874
  const firstTool = transformedTools[0];
810
- const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
875
+ const schemaType = (_d = (_a = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _a.type) != null ? _d : (_c = (_b = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _b.jsonSchema) == null ? void 0 : _c.type;
811
876
  caseLogs.push(
812
877
  `[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
813
878
  );
@@ -823,49 +888,103 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
823
888
  caseLogs.push(
824
889
  `[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
825
890
  );
826
- } catch {
891
+ } catch (e) {
827
892
  caseLogs.push(
828
893
  `[DEBUG] ${testCaseId}: failed to serialize toolCalls`
829
894
  );
830
895
  }
831
896
  };
832
- const buildFailureContext = (options) => {
833
- const {
834
- testCase,
835
- tools,
836
- flatMessages,
837
- mwOriginalText,
838
- text,
839
- finishReason,
840
- mwParsedToolCalls,
841
- restoredCalls,
842
- possibleAnswer
843
- } = options;
844
- const lastUser = (() => {
845
- const reversed = [...flatMessages].reverse();
846
- const found = reversed.find(
847
- (m) => m.role === "user"
848
- );
849
- return found?.content ?? void 0;
850
- })();
851
- const rawModelText = (() => {
852
- if (mwOriginalText && mwOriginalText.length > 0) {
853
- return mwOriginalText;
897
+ const hasPercentPattern = (diff) => {
898
+ return diff.some((d) => {
899
+ if (!(d.startsWith("+ got:") || d.startsWith("- expected:"))) {
900
+ return false;
854
901
  }
855
- if (typeof text === "string") {
856
- return text;
902
+ const numMatch = d.match(DIFF_NUMERIC_EXTRACT_REGEX);
903
+ if (!numMatch) {
904
+ return false;
857
905
  }
858
- return "";
859
- })();
860
- return {
861
- id: testCase.id,
862
- tool_schema: tools,
863
- last_user_query: lastUser,
864
- raw_model_text: rawModelText,
865
- finish_reason: finishReason,
866
- parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
867
- ground_truth: possibleAnswer.ground_truth
868
- };
906
+ const num = Number.parseFloat(numMatch[1]);
907
+ return num >= 1 && num <= 100;
908
+ });
909
+ };
910
+ const isValueError = (errorType, diff) => {
911
+ return !!(errorType == null ? void 0 : errorType.includes("value_error")) || diff.some((d) => d.startsWith("@@ param"));
912
+ };
913
+ const isFunctionNameError = (errorType, diff) => {
914
+ return !!(errorType == null ? void 0 : errorType.includes("wrong_func_name")) || diff.some((d) => d.includes("function name"));
915
+ };
916
+ const isMissingParamError = (errorType, diff) => {
917
+ return !!(errorType == null ? void 0 : errorType.includes("missing_required")) || diff.some((d) => d.includes("missing required param"));
918
+ };
919
+ const isUnexpectedParamError = (errorType, diff) => {
920
+ return !!(errorType == null ? void 0 : errorType.includes("unexpected_param")) || diff.some((d) => d.includes("unexpected param"));
921
+ };
922
+ const classifyByErrorPatterns = (errorType, diff) => {
923
+ const patterns = [
924
+ [
925
+ isValueError,
926
+ hasPercentPattern(diff) ? "PARAM_VALUE_PERCENT" : "PARAM_VALUE_MISMATCH"
927
+ ],
928
+ [isFunctionNameError, "WRONG_FUNCTION"],
929
+ [isMissingParamError, "MISSING_PARAMS"],
930
+ [isUnexpectedParamError, "UNEXPECTED_PARAMS"]
931
+ ];
932
+ for (const [classifier, result] of patterns) {
933
+ if (classifier(errorType, diff)) {
934
+ return result;
935
+ }
936
+ }
937
+ if (errorType == null ? void 0 : errorType.includes("cannot_find_match")) {
938
+ return "NO_MATCH";
939
+ }
940
+ return null;
941
+ };
942
+ const classifyByCallCount = (actualCount, expectedCount) => {
943
+ if (actualCount === 0 && expectedCount > 0) {
944
+ return "PARSE_FAILURE";
945
+ }
946
+ if (actualCount > 0 && actualCount < expectedCount) {
947
+ return "PARTIAL_CALLS";
948
+ }
949
+ if (actualCount > expectedCount) {
950
+ return "EXTRA_CALLS";
951
+ }
952
+ return null;
953
+ };
954
+ const classifyFailureType = (options) => {
955
+ const { errorType, restoredCalls, expectedCount, diff } = options;
956
+ const actualCount = Array.isArray(restoredCalls) ? restoredCalls.length : 0;
957
+ const countBasedResult = classifyByCallCount(
958
+ actualCount,
959
+ expectedCount
960
+ );
961
+ if (countBasedResult) {
962
+ return countBasedResult;
963
+ }
964
+ const patternBasedResult = classifyByErrorPatterns(errorType, diff);
965
+ if (patternBasedResult) {
966
+ return patternBasedResult;
967
+ }
968
+ return "OTHER";
969
+ };
970
+ const extractRawModelText = (mwOriginalText, text) => {
971
+ if (mwOriginalText && mwOriginalText.length > 0) {
972
+ return mwOriginalText;
973
+ }
974
+ if (typeof text === "string") {
975
+ return text;
976
+ }
977
+ return "";
978
+ };
979
+ const extractLastUserQuery = (flatMessages) => {
980
+ var _a;
981
+ const reversed = [...flatMessages].reverse();
982
+ const found = reversed.find((m) => m.role === "user");
983
+ const content = (_a = found == null ? void 0 : found.content) != null ? _a : "";
984
+ return content.length > 200 ? `${content.slice(0, 200)}...` : content;
985
+ };
986
+ const truncateText = (text, maxLen) => {
987
+ return text.length > maxLen ? `${text.slice(0, maxLen)}...` : text;
869
988
  };
870
989
  const logFailureDetails = (options) => {
871
990
  const {
@@ -883,43 +1002,37 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
883
1002
  } = options;
884
1003
  try {
885
1004
  const category = testCase.id.split("_")[0];
886
- const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
887
- tools,
888
- possibleAnswer,
889
- restoredCalls
890
- ) : buildParallelDiff(
891
- tools,
892
- possibleAnswer,
893
- restoredCalls
894
- );
895
- caseLogs.push(
896
- `[DEBUG-FAIL] ${JSON.stringify({
897
- id: testCase.id,
898
- message: checkerResult.error,
899
- error_type: checkerResult.error_type,
900
- expected,
901
- actual,
902
- diff
903
- })}`
904
- );
905
- try {
906
- const contextPayload = buildFailureContext({
907
- testCase,
908
- tools,
909
- flatMessages,
910
- mwOriginalText,
911
- text,
912
- finishReason,
913
- mwParsedToolCalls,
1005
+ const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(tools, possibleAnswer, restoredCalls) : buildParallelDiff(tools, possibleAnswer, restoredCalls);
1006
+ const gtArr = possibleAnswer.ground_truth;
1007
+ const expectedCount = Array.isArray(gtArr) ? gtArr.length : 1;
1008
+ const rawModelText = extractRawModelText(mwOriginalText, text);
1009
+ const lastUserQuery = extractLastUserQuery(flatMessages);
1010
+ const failurePayload = {
1011
+ id: testCase.id,
1012
+ category: classifyFailureType({
1013
+ errorType: checkerResult.error_type,
914
1014
  restoredCalls,
915
- possibleAnswer
916
- });
917
- caseLogs.push(
918
- `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
919
- );
920
- } catch {
921
- }
922
- } catch {
1015
+ expectedCount,
1016
+ diff
1017
+ }),
1018
+ message: checkerResult.error,
1019
+ error_type: checkerResult.error_type,
1020
+ expected,
1021
+ actual,
1022
+ diff,
1023
+ context: {
1024
+ raw_model_text: truncateText(rawModelText, 500),
1025
+ raw_model_text_full: rawModelText.length > 500 ? rawModelText : void 0,
1026
+ parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
1027
+ expected_count: expectedCount,
1028
+ actual_count: Array.isArray(restoredCalls) ? restoredCalls.length : 0,
1029
+ finish_reason: finishReason,
1030
+ last_user_query: lastUserQuery,
1031
+ tool_names: tools.map((t) => t.name)
1032
+ }
1033
+ };
1034
+ caseLogs.push(`[DEBUG-FAIL] ${JSON.stringify(failurePayload)}`);
1035
+ } catch (e) {
923
1036
  caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
924
1037
  }
925
1038
  };
@@ -998,7 +1111,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
998
1111
  const flatMessages = flattenMessages(messages);
999
1112
  const { transformedTools, nameMap } = buildTransformedTools(
1000
1113
  tools,
1001
- fixSchema
1114
+ fixSchema2
1002
1115
  );
1003
1116
  const toolsMap = buildToolsMap(transformedTools);
1004
1117
  return { flatMessages, transformedTools, nameMap, toolsMap };
@@ -1020,6 +1133,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1020
1133
  const mwParsedToolCalls = parseDebugToolCalls(
1021
1134
  debugSummaryRef.toolCalls
1022
1135
  );
1136
+ const possibleAnswer = possibleAnswersMap.get(testCase.id);
1137
+ if (!possibleAnswer) {
1138
+ throw new Error(`No possible answer for id: ${testCase.id}`);
1139
+ }
1140
+ if (process.env.DEBUG_PARSER_OUTPUT === "true") {
1141
+ const groundTruth = possibleAnswer.ground_truth;
1142
+ const expectedXML = groundTruth.map((call) => convertGroundTruthToXML(call)).join("\n\n");
1143
+ console.log("\n========== BFCL CASE DEBUG ==========");
1144
+ console.log(`Test Case: ${testCase.id}`);
1145
+ console.log(`Expected count: ${groundTruth.length} call(s)`);
1146
+ console.log("\n--- EXPECTED OUTPUT (morphXML format) ---");
1147
+ console.log(expectedXML);
1148
+ console.log("\n--- ACTUAL MODEL OUTPUT (raw, with whitespace) ---");
1149
+ console.log(mwOriginalText || text || "(empty)");
1150
+ console.log(
1151
+ "\n--- PARSED TOOL CALLS (count: " + (Array.isArray(toolCalls) ? toolCalls.length : 0) + ") ---"
1152
+ );
1153
+ console.log(JSON.stringify(toolCalls, null, 2));
1154
+ console.log("======================================\n");
1155
+ }
1023
1156
  logRawToolCalls({
1024
1157
  toolCalls,
1025
1158
  finishReason,
@@ -1027,10 +1160,6 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1027
1160
  testCaseId: testCase.id,
1028
1161
  caseLogs
1029
1162
  });
1030
- const possibleAnswer = possibleAnswersMap.get(testCase.id);
1031
- if (!possibleAnswer) {
1032
- throw new Error(`No possible answer for id: ${testCase.id}`);
1033
- }
1034
1163
  const restoredCalls = restoreToolCalls(
1035
1164
  toolCalls || [],
1036
1165
  nameMap,
@@ -1051,12 +1180,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1051
1180
  caseLogs
1052
1181
  });
1053
1182
  };
1054
- const runSingleCase = async (testCase) => {
1183
+ const runSingleCase2 = async (testCase) => {
1055
1184
  const caseLogs = [];
1056
1185
  const { function: tools } = testCase;
1057
- const temp = config?.temperature;
1186
+ const temp = config == null ? void 0 : config.temperature;
1058
1187
  const temperature = typeof temp === "number" ? temp : void 0;
1059
- const maxTok = config?.maxTokens;
1188
+ const maxTok = config == null ? void 0 : config.maxTokens;
1060
1189
  const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
1061
1190
  try {
1062
1191
  const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
@@ -1082,15 +1211,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1082
1211
  });
1083
1212
  } catch (e) {
1084
1213
  caseLogs.push(
1085
- `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
1214
+ `[ERROR] ${testCase.id}: Model generation failed: ${e == null ? void 0 : e.message}`
1086
1215
  );
1087
- if (e?.stack) {
1216
+ if (e == null ? void 0 : e.stack) {
1088
1217
  caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
1089
1218
  }
1090
1219
  return { valid: false, logs: caseLogs };
1091
1220
  }
1092
1221
  };
1093
- const mapWithConcurrency = async (items, concurrencyLimit, mapper) => {
1222
+ const mapWithConcurrency2 = async (items, concurrencyLimit, mapper) => {
1094
1223
  const results = new Array(items.length);
1095
1224
  let idx = 0;
1096
1225
  const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
@@ -1106,10 +1235,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1106
1235
  await Promise.all(workers);
1107
1236
  return results;
1108
1237
  };
1109
- const resultsPerCase = await mapWithConcurrency(
1238
+ const resultsPerCase = await mapWithConcurrency2(
1110
1239
  testCases,
1111
1240
  concurrency,
1112
- async (tc) => runSingleCase(tc)
1241
+ async (tc) => runSingleCase2(tc)
1113
1242
  );
1114
1243
  correctCount = resultsPerCase.reduce(
1115
1244
  (acc, r) => acc + (r.valid ? 1 : 0),
@@ -1127,14 +1256,18 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1127
1256
  };
1128
1257
  }
1129
1258
  const score = correctCount / testCases.length;
1259
+ const caseResults = resultsPerCase.map((r, i) => ({
1260
+ id: testCases[i].id,
1261
+ valid: r.valid
1262
+ }));
1130
1263
  return {
1131
1264
  score,
1132
1265
  success: score > 0.95,
1133
- // High success threshold as requested
1134
1266
  metrics: {
1135
1267
  correct_count: correctCount,
1136
1268
  total_cases: testCases.length,
1137
- accuracy: score
1269
+ accuracy: score,
1270
+ case_results: JSON.stringify(caseResults)
1138
1271
  },
1139
1272
  logs
1140
1273
  };
@@ -1154,42 +1287,410 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1154
1287
  }
1155
1288
  var bfclSimpleBenchmark = createBfclBenchmark(
1156
1289
  "bfcl-simple",
1157
- "BFCL Simple Function Calling",
1158
- "BFCL_v3_simple.jsonl",
1159
- "BFCL_v3_simple_possible_answer.jsonl"
1290
+ "BFCL v4 Simple Function Calling",
1291
+ "BFCL_v4_simple.jsonl",
1292
+ "BFCL_v4_simple_possible_answer.jsonl"
1160
1293
  );
1161
1294
  var bfclParallelBenchmark = createBfclBenchmark(
1162
1295
  "bfcl-parallel",
1163
- "BFCL Parallel Function Calling",
1164
- "BFCL_v3_parallel.jsonl",
1165
- "BFCL_v3_parallel_possible_answer.jsonl"
1296
+ "BFCL v4 Parallel Function Calling",
1297
+ "BFCL_v4_parallel.jsonl",
1298
+ "BFCL_v4_parallel_possible_answer.jsonl"
1166
1299
  );
1167
1300
  var bfclMultipleBenchmark = createBfclBenchmark(
1168
1301
  "bfcl-multiple",
1169
- "BFCL Multiple Function Calling",
1170
- "BFCL_v3_multiple.jsonl",
1171
- "BFCL_v3_multiple_possible_answer.jsonl"
1302
+ "BFCL v4 Multiple Function Calling",
1303
+ "BFCL_v4_multiple.jsonl",
1304
+ "BFCL_v4_multiple_possible_answer.jsonl"
1172
1305
  );
1173
1306
  var bfclParallelMultipleBenchmark = createBfclBenchmark(
1174
1307
  "bfcl-parallel-multiple",
1175
- "BFCL Parallel & Multiple Function Calling",
1176
- "BFCL_v3_parallel_multiple.jsonl",
1177
- "BFCL_v3_parallel_multiple_possible_answer.jsonl"
1308
+ "BFCL v4 Parallel & Multiple Function Calling",
1309
+ "BFCL_v4_parallel_multiple.jsonl",
1310
+ "BFCL_v4_parallel_multiple_possible_answer.jsonl"
1178
1311
  );
1179
1312
 
1180
- // src/benchmarks/json-generation.ts
1313
+ // src/benchmarks/complex-func-bench.ts
1181
1314
  var import_node_fs3 = require("fs");
1182
1315
  var import_node_path3 = __toESM(require("path"), 1);
1183
1316
  var import_ai2 = require("ai");
1317
+ var LINE_SPLIT_REGEX2 = /\r?\n/;
1318
+ function standardizeString2(input) {
1319
+ if (typeof input !== "string") {
1320
+ return input;
1321
+ }
1322
+ return input.toLowerCase().trim();
1323
+ }
1324
+ function valuesMatch2(modelValue, expectedValue) {
1325
+ if (modelValue === expectedValue) {
1326
+ return true;
1327
+ }
1328
+ if (typeof modelValue === "string" && typeof expectedValue === "string") {
1329
+ return standardizeString2(modelValue) === standardizeString2(expectedValue);
1330
+ }
1331
+ if (typeof modelValue === "number" && typeof expectedValue === "string") {
1332
+ return modelValue.toString() === expectedValue || modelValue === Number(expectedValue);
1333
+ }
1334
+ if (typeof modelValue === "string" && typeof expectedValue === "number") {
1335
+ return modelValue === expectedValue.toString() || Number(modelValue) === expectedValue;
1336
+ }
1337
+ if (typeof modelValue === "object" && modelValue !== null && typeof expectedValue === "object" && expectedValue !== null) {
1338
+ try {
1339
+ return JSON.stringify(modelValue) === JSON.stringify(expectedValue);
1340
+ } catch (e) {
1341
+ return false;
1342
+ }
1343
+ }
1344
+ return false;
1345
+ }
1346
+ function validateFunctionName(modelFuncName, expectedFuncName) {
1347
+ if (modelFuncName !== expectedFuncName) {
1348
+ return {
1349
+ valid: false,
1350
+ error: `Function name mismatch: expected '${expectedFuncName}', got '${modelFuncName}'`,
1351
+ error_type: "function_name_mismatch"
1352
+ };
1353
+ }
1354
+ return { valid: true };
1355
+ }
1356
+ function validateRequiredParams(requiredParams, modelArgs, expectedArgs) {
1357
+ for (const param of requiredParams) {
1358
+ if (!(param in modelArgs) && param in expectedArgs) {
1359
+ return {
1360
+ valid: false,
1361
+ error: `Missing required parameter: '${param}'`,
1362
+ error_type: "missing_required_param"
1363
+ };
1364
+ }
1365
+ }
1366
+ return { valid: true };
1367
+ }
1368
+ function validateParamValues(expectedArgs, modelArgs, requiredParams) {
1369
+ for (const [paramName, expectedValue] of Object.entries(expectedArgs)) {
1370
+ if (!(paramName in modelArgs)) {
1371
+ if (!requiredParams.includes(paramName)) {
1372
+ continue;
1373
+ }
1374
+ return {
1375
+ valid: false,
1376
+ error: `Missing parameter: '${paramName}'`,
1377
+ error_type: "missing_param"
1378
+ };
1379
+ }
1380
+ const modelValue = modelArgs[paramName];
1381
+ if (!valuesMatch2(modelValue, expectedValue)) {
1382
+ return {
1383
+ valid: false,
1384
+ error: `Parameter '${paramName}' value mismatch: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(modelValue)}`,
1385
+ error_type: "value_mismatch"
1386
+ };
1387
+ }
1388
+ }
1389
+ return { valid: true };
1390
+ }
1391
+ function checkFunctionCall(modelCall, expected, toolSpecs) {
1392
+ var _a, _b, _c, _d;
1393
+ const expectedFuncName = Object.keys(expected)[0];
1394
+ const expectedArgs = expected[expectedFuncName];
1395
+ const modelFuncName = (_a = modelCall.toolName) != null ? _a : modelCall.name;
1396
+ const modelArgs = (_b = modelCall.args) != null ? _b : {};
1397
+ const nameResult = validateFunctionName(modelFuncName, expectedFuncName);
1398
+ if (!nameResult.valid) {
1399
+ return nameResult;
1400
+ }
1401
+ const toolSpec = toolSpecs.find((t) => t.name === expectedFuncName);
1402
+ const requiredParams = (_d = (_c = toolSpec == null ? void 0 : toolSpec.parameters) == null ? void 0 : _c.required) != null ? _d : [];
1403
+ const requiredResult = validateRequiredParams(
1404
+ requiredParams,
1405
+ modelArgs,
1406
+ expectedArgs
1407
+ );
1408
+ if (!requiredResult.valid) {
1409
+ return requiredResult;
1410
+ }
1411
+ return validateParamValues(expectedArgs, modelArgs, requiredParams);
1412
+ }
1413
+ function checkAllFunctionCalls(modelCalls, expectedCalls, toolSpecs) {
1414
+ if (modelCalls.length !== expectedCalls.length) {
1415
+ return {
1416
+ valid: false,
1417
+ error: `Wrong number of function calls: expected ${expectedCalls.length}, got ${modelCalls.length}`,
1418
+ error_type: "wrong_call_count"
1419
+ };
1420
+ }
1421
+ if (expectedCalls.length === 1) {
1422
+ return checkFunctionCall(modelCalls[0], expectedCalls[0], toolSpecs);
1423
+ }
1424
+ const matchedIndices = /* @__PURE__ */ new Set();
1425
+ for (const expected of expectedCalls) {
1426
+ let foundMatch = false;
1427
+ for (let i = 0; i < modelCalls.length; i++) {
1428
+ if (matchedIndices.has(i)) {
1429
+ continue;
1430
+ }
1431
+ const result = checkFunctionCall(modelCalls[i], expected, toolSpecs);
1432
+ if (result.valid) {
1433
+ matchedIndices.add(i);
1434
+ foundMatch = true;
1435
+ break;
1436
+ }
1437
+ }
1438
+ if (!foundMatch) {
1439
+ const expectedFuncName = Object.keys(expected)[0];
1440
+ return {
1441
+ valid: false,
1442
+ error: `Could not find matching call for function '${expectedFuncName}'`,
1443
+ error_type: "no_matching_call"
1444
+ };
1445
+ }
1446
+ }
1447
+ return { valid: true };
1448
+ }
1449
+ var fixSchemaType = (copy) => {
1450
+ if (!copy.type) {
1451
+ return;
1452
+ }
1453
+ if (copy.type === "dict") {
1454
+ copy.type = "object";
1455
+ }
1456
+ if (copy.type === "tuple") {
1457
+ copy.type = "array";
1458
+ }
1459
+ if (copy.type === "integer" || copy.type === "float") {
1460
+ copy.type = "number";
1461
+ }
1462
+ };
1463
+ var fixSchema = (schema) => {
1464
+ if (!schema || typeof schema !== "object") {
1465
+ return { type: "object", properties: {} };
1466
+ }
1467
+ const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
1468
+ if (!Array.isArray(copy)) {
1469
+ fixSchemaType(copy);
1470
+ if (copy.properties && typeof copy.properties === "object") {
1471
+ for (const k of Object.keys(copy.properties)) {
1472
+ copy.properties[k] = fixSchema(
1473
+ copy.properties[k]
1474
+ );
1475
+ }
1476
+ }
1477
+ if (copy.items) {
1478
+ copy.items = fixSchema(copy.items);
1479
+ }
1480
+ }
1481
+ return copy;
1482
+ };
1483
+ function buildTools(tools) {
1484
+ const nameMap = /* @__PURE__ */ new Map();
1485
+ const transformedTools = tools.map((t) => {
1486
+ const fixed = fixSchema(t.parameters);
1487
+ const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
1488
+ const sanitized = t.name.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64) || "tool";
1489
+ nameMap.set(sanitized, t.name);
1490
+ return {
1491
+ type: "function",
1492
+ name: sanitized,
1493
+ description: t.description,
1494
+ inputSchema
1495
+ };
1496
+ });
1497
+ const toolsMap = Object.fromEntries(
1498
+ transformedTools.map((t) => [
1499
+ t.name,
1500
+ (0, import_ai2.tool)({
1501
+ description: typeof t.description === "string" ? t.description : void 0,
1502
+ inputSchema: (0, import_ai2.jsonSchema)(t.inputSchema)
1503
+ })
1504
+ ])
1505
+ );
1506
+ return { nameMap, toolsMap };
1507
+ }
1508
+ async function mapWithConcurrency(items, concurrencyLimit, mapper) {
1509
+ const results = new Array(items.length);
1510
+ let idx = 0;
1511
+ const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
1512
+ while (true) {
1513
+ const current = idx;
1514
+ idx += 1;
1515
+ if (current >= items.length) {
1516
+ break;
1517
+ }
1518
+ results[current] = await mapper(items[current]);
1519
+ }
1520
+ });
1521
+ await Promise.all(workers);
1522
+ return results;
1523
+ }
1524
+ async function runSingleCase(testCase, model, possibleAnswersMap, temperature, maxTokens) {
1525
+ const caseLogs = [];
1526
+ const { function: tools, question: messages } = testCase;
1527
+ try {
1528
+ const { nameMap, toolsMap } = buildTools(tools);
1529
+ const debugSummaryRef = {};
1530
+ const providerOptions = {
1531
+ toolCallMiddleware: { debugSummary: debugSummaryRef }
1532
+ };
1533
+ const { toolCalls, finishReason } = await (0, import_ai2.generateText)({
1534
+ model,
1535
+ messages,
1536
+ tools: toolsMap,
1537
+ toolChoice: "auto",
1538
+ providerOptions,
1539
+ ...temperature !== void 0 ? { temperature } : {},
1540
+ ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
1541
+ });
1542
+ const restoredCalls = (toolCalls != null ? toolCalls : []).map((c) => {
1543
+ var _a, _b, _c, _d;
1544
+ const rawName = (_a = c.toolName) != null ? _a : c.name;
1545
+ const originalName = (_b = nameMap.get(rawName)) != null ? _b : rawName;
1546
+ return {
1547
+ toolName: originalName,
1548
+ name: originalName,
1549
+ args: (_d = (_c = c.input) != null ? _c : c.args) != null ? _d : {}
1550
+ };
1551
+ });
1552
+ caseLogs.push(
1553
+ `[DEBUG] ${testCase.id}: toolCalls=${JSON.stringify(restoredCalls)}, finishReason=${finishReason}`
1554
+ );
1555
+ const possibleAnswer = possibleAnswersMap.get(testCase.id);
1556
+ if (!possibleAnswer) {
1557
+ throw new Error(`No possible answer for id: ${testCase.id}`);
1558
+ }
1559
+ const checkerResult = checkAllFunctionCalls(
1560
+ restoredCalls,
1561
+ possibleAnswer.ground_truth,
1562
+ tools
1563
+ );
1564
+ if (checkerResult.valid) {
1565
+ caseLogs.push(`[PASS] ${testCase.id}`);
1566
+ return { valid: true, logs: caseLogs };
1567
+ }
1568
+ caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
1569
+ return { valid: false, logs: caseLogs };
1570
+ } catch (e) {
1571
+ caseLogs.push(`[ERROR] ${testCase.id}: ${e == null ? void 0 : e.message}`);
1572
+ return { valid: false, logs: caseLogs };
1573
+ }
1574
+ }
1575
+ async function loadTestData(dataPath, testDataFile) {
1576
+ const testCasesJson = await import_node_fs3.promises.readFile(
1577
+ import_node_path3.default.join(dataPath, testDataFile),
1578
+ "utf-8"
1579
+ );
1580
+ return testCasesJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1581
+ }
1582
+ async function loadAnswerData(dataPath, answerDataFile) {
1583
+ const answersJson = await import_node_fs3.promises.readFile(
1584
+ import_node_path3.default.join(dataPath, answerDataFile),
1585
+ "utf-8"
1586
+ );
1587
+ const answers = answersJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1588
+ return new Map(answers.map((ans) => [ans.id, ans]));
1589
+ }
1590
+ function getConfigValues(config) {
1591
+ const limitEnv = process.env.COMPLEXFUNCBENCH_LIMIT;
1592
+ const limit = limitEnv ? Number(limitEnv) : void 0;
1593
+ const concurrencyEnv = process.env.COMPLEXFUNCBENCH_CONCURRENCY;
1594
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
1595
+ const temperature = typeof (config == null ? void 0 : config.temperature) === "number" ? config.temperature : void 0;
1596
+ const maxTokens = typeof (config == null ? void 0 : config.maxTokens) === "number" ? config.maxTokens : void 0;
1597
+ return { limit, concurrency, temperature, maxTokens };
1598
+ }
1599
+ function aggregateResults(resultsPerCase, testCases) {
1600
+ const logs = [];
1601
+ const correctCount = resultsPerCase.reduce(
1602
+ (acc, r) => acc + (r.valid ? 1 : 0),
1603
+ 0
1604
+ );
1605
+ for (const r of resultsPerCase) {
1606
+ logs.push(...r.logs);
1607
+ }
1608
+ if (testCases.length === 0) {
1609
+ return {
1610
+ score: 0,
1611
+ success: false,
1612
+ metrics: {},
1613
+ logs: ["No test cases found."]
1614
+ };
1615
+ }
1616
+ const score = correctCount / testCases.length;
1617
+ return {
1618
+ score,
1619
+ success: score > 0.5,
1620
+ metrics: {
1621
+ correct_count: correctCount,
1622
+ total_cases: testCases.length,
1623
+ accuracy: score
1624
+ },
1625
+ logs
1626
+ };
1627
+ }
1628
+ function createComplexFuncBenchBenchmark(name, description, testDataFile, answerDataFile) {
1629
+ return {
1630
+ name,
1631
+ version: "1.0.0",
1632
+ description,
1633
+ async run(model, config) {
1634
+ var _a;
1635
+ const logs = [];
1636
+ try {
1637
+ const dataPath = resolveDataDir();
1638
+ logs.push(`[INFO] Using data dir: ${dataPath}`);
1639
+ let testCases = await loadTestData(dataPath, testDataFile);
1640
+ const possibleAnswersMap = await loadAnswerData(
1641
+ dataPath,
1642
+ answerDataFile
1643
+ );
1644
+ const { limit, concurrency, temperature, maxTokens } = getConfigValues(config);
1645
+ if (limit && Number.isFinite(limit) && limit > 0) {
1646
+ testCases = testCases.slice(0, limit);
1647
+ logs.push(`[INFO] Limiting test cases to ${limit}`);
1648
+ }
1649
+ logs.push(
1650
+ `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
1651
+ );
1652
+ const resultsPerCase = await mapWithConcurrency(
1653
+ testCases,
1654
+ concurrency,
1655
+ (tc) => runSingleCase(tc, model, possibleAnswersMap, temperature, maxTokens)
1656
+ );
1657
+ const result = aggregateResults(resultsPerCase, testCases);
1658
+ result.logs = [...logs, ...(_a = result.logs) != null ? _a : []];
1659
+ return result;
1660
+ } catch (e) {
1661
+ return {
1662
+ score: 0,
1663
+ success: false,
1664
+ metrics: {},
1665
+ error: e,
1666
+ logs: [
1667
+ `[FATAL] Failed to run benchmark ${name}: ${e.message}`
1668
+ ]
1669
+ };
1670
+ }
1671
+ }
1672
+ };
1673
+ }
1674
+ var complexFuncBenchBenchmark = createComplexFuncBenchBenchmark(
1675
+ "complex-func-bench",
1676
+ "ComplexFuncBench - Complex Function Calling (multi-step, constraints, long params)",
1677
+ "ComplexFuncBench.jsonl",
1678
+ "ComplexFuncBench_possible_answer.jsonl"
1679
+ );
1680
+
1681
+ // src/benchmarks/json-generation.ts
1682
+ var import_node_fs4 = require("fs");
1683
+ var import_node_path4 = __toESM(require("path"), 1);
1684
+ var import_ai3 = require("ai");
1184
1685
  var import_ajv = __toESM(require("ajv"), 1);
1185
1686
  var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
1186
1687
  var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
1187
1688
  var NEWLINE_REGEX = /\r?\n/;
1188
- var LINE_SPLIT_REGEX2 = /\r?\n/;
1689
+ var LINE_SPLIT_REGEX3 = /\r?\n/;
1189
1690
  function tryDirectParse(text) {
1190
1691
  try {
1191
1692
  return JSON.parse(text);
1192
- } catch {
1693
+ } catch (e) {
1193
1694
  return;
1194
1695
  }
1195
1696
  }
@@ -1201,7 +1702,7 @@ function tryCodeFenceParse(text) {
1201
1702
  const inner = fenceMatch[1].trim();
1202
1703
  try {
1203
1704
  return JSON.parse(inner);
1204
- } catch {
1705
+ } catch (e) {
1205
1706
  return;
1206
1707
  }
1207
1708
  }
@@ -1226,7 +1727,7 @@ function tryBracketScan(text) {
1226
1727
  const candidate = text.slice(start, i + 1);
1227
1728
  try {
1228
1729
  return JSON.parse(candidate);
1229
- } catch {
1730
+ } catch (e) {
1230
1731
  return;
1231
1732
  }
1232
1733
  }
@@ -1274,12 +1775,12 @@ function subsetMatch(expected, actual) {
1274
1775
  async function loadDatasets() {
1275
1776
  try {
1276
1777
  const dataDir = resolveDataDir();
1277
- const testsJsonl = await import_node_fs3.promises.readFile(
1278
- import_node_path3.default.join(dataDir, "json_generation_tests.jsonl"),
1778
+ const testsJsonl = await import_node_fs4.promises.readFile(
1779
+ import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
1279
1780
  "utf-8"
1280
1781
  );
1281
- const expectedJsonl = await import_node_fs3.promises.readFile(
1282
- import_node_path3.default.join(dataDir, "json_generation_expected.jsonl"),
1782
+ const expectedJsonl = await import_node_fs4.promises.readFile(
1783
+ import_node_path4.default.join(dataDir, "json_generation_expected.jsonl"),
1283
1784
  "utf-8"
1284
1785
  );
1285
1786
  const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -1335,10 +1836,11 @@ function validateTestCase(tc, parsed, context) {
1335
1836
  return { valid, valuesOk, parsed };
1336
1837
  }
1337
1838
  async function processTestCase(tc, context) {
1839
+ var _a;
1338
1840
  const messages = buildMessages(tc);
1339
- const temp = context.config?.temperature;
1841
+ const temp = (_a = context.config) == null ? void 0 : _a.temperature;
1340
1842
  const temperature = typeof temp === "number" ? temp : void 0;
1341
- const { text } = await (0, import_ai2.generateText)({
1843
+ const { text } = await (0, import_ai3.generateText)({
1342
1844
  model: context.model,
1343
1845
  messages,
1344
1846
  ...temperature !== void 0 ? { temperature } : {}
@@ -1346,7 +1848,7 @@ async function processTestCase(tc, context) {
1346
1848
  let parsed;
1347
1849
  try {
1348
1850
  parsed = extractFirstJsonBlock(text);
1349
- } catch {
1851
+ } catch (e) {
1350
1852
  }
1351
1853
  if (parsed === void 0) {
1352
1854
  context.validation.logs.push(
@@ -1440,21 +1942,22 @@ function buildBenchmarkResult(total, counts, logs) {
1440
1942
  async function loadSchemaOnlyTests() {
1441
1943
  try {
1442
1944
  const dataDir = resolveDataDir();
1443
- const testsJsonl = await import_node_fs3.promises.readFile(
1444
- import_node_path3.default.join(dataDir, "json_generation_tests.jsonl"),
1945
+ const testsJsonl = await import_node_fs4.promises.readFile(
1946
+ import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
1445
1947
  "utf-8"
1446
1948
  );
1447
- const tests = testsJsonl.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1949
+ const tests = testsJsonl.split(LINE_SPLIT_REGEX3).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1448
1950
  return { tests };
1449
1951
  } catch (e) {
1450
1952
  return { tests: [], error: e };
1451
1953
  }
1452
1954
  }
1453
1955
  async function processSchemaOnlyTestCase(tc, context) {
1956
+ var _a;
1454
1957
  const messages = buildMessages(tc);
1455
- const temp = context.config?.temperature;
1958
+ const temp = (_a = context.config) == null ? void 0 : _a.temperature;
1456
1959
  const temperature = typeof temp === "number" ? temp : void 0;
1457
- const { text } = await (0, import_ai2.generateText)({
1960
+ const { text } = await (0, import_ai3.generateText)({
1458
1961
  model: context.model,
1459
1962
  messages,
1460
1963
  ...temperature !== void 0 ? { temperature } : {}
@@ -1462,7 +1965,7 @@ async function processSchemaOnlyTestCase(tc, context) {
1462
1965
  let parsed;
1463
1966
  try {
1464
1967
  parsed = extractFirstJsonBlock(text);
1465
- } catch {
1968
+ } catch (e) {
1466
1969
  }
1467
1970
  if (parsed === void 0) {
1468
1971
  context.logs.push(
@@ -1531,38 +2034,144 @@ var jsonGenerationSchemaOnlyBenchmark = {
1531
2034
  }
1532
2035
  };
1533
2036
 
2037
+ // src/evaluate.ts
2038
+ var import_middleware = require("@ai-sdk-tool/middleware");
2039
+ var import_ai4 = require("ai");
2040
+
1534
2041
  // src/reporters/console.ts
1535
2042
  var colors = {
1536
2043
  reset: "\x1B[0m",
2044
+ bold: "\x1B[1m",
1537
2045
  green: "\x1B[32m",
1538
2046
  red: "\x1B[31m",
1539
2047
  yellow: "\x1B[33m",
1540
2048
  cyan: "\x1B[36m",
1541
2049
  magenta: "\x1B[35m",
1542
- gray: "\x1B[90m"
2050
+ gray: "\x1B[90m",
2051
+ white: "\x1B[37m"
1543
2052
  };
2053
+ var DEBUG_FAIL_REGEX = /^\[DEBUG-FAIL\] /;
2054
+ function formatDiff(diff) {
2055
+ if (!diff || diff.length === 0) {
2056
+ return "";
2057
+ }
2058
+ return diff.slice(0, 8).map((line) => {
2059
+ if (line.startsWith("-")) {
2060
+ return `${colors.red}${line}${colors.reset}`;
2061
+ }
2062
+ if (line.startsWith("+")) {
2063
+ return `${colors.green}${line}${colors.reset}`;
2064
+ }
2065
+ if (line.startsWith("@@")) {
2066
+ return `${colors.cyan}${line}${colors.reset}`;
2067
+ }
2068
+ return line;
2069
+ }).join("\n ");
2070
+ }
2071
+ function parseFailures(logs) {
2072
+ const failures = [];
2073
+ for (const log of logs) {
2074
+ if (!DEBUG_FAIL_REGEX.test(log)) {
2075
+ continue;
2076
+ }
2077
+ try {
2078
+ const jsonStr = log.replace(DEBUG_FAIL_REGEX, "");
2079
+ const parsed = JSON.parse(jsonStr);
2080
+ failures.push(parsed);
2081
+ } catch (e) {
2082
+ }
2083
+ }
2084
+ return failures;
2085
+ }
2086
+ function groupFailuresByCategory(failures) {
2087
+ const groups = /* @__PURE__ */ new Map();
2088
+ for (const failure of failures) {
2089
+ const category = failure.category || "OTHER";
2090
+ const existing = groups.get(category);
2091
+ if (existing) {
2092
+ existing.push(failure);
2093
+ } else {
2094
+ groups.set(category, [failure]);
2095
+ }
2096
+ }
2097
+ return groups;
2098
+ }
2099
+ function printCompactFailure(failure) {
2100
+ var _a;
2101
+ console.log(
2102
+ `
2103
+ ${colors.red}${failure.id}${colors.reset} [${colors.yellow}${failure.category || "OTHER"}${colors.reset}]`
2104
+ );
2105
+ if (failure.message) {
2106
+ console.log(` ${failure.message}`);
2107
+ }
2108
+ if (failure.diff && failure.diff.length > 0) {
2109
+ console.log(` ${formatDiff(failure.diff)}`);
2110
+ }
2111
+ if (((_a = failure.context) == null ? void 0 : _a.raw_model_text) && failure.category === "PARSE_FAILURE") {
2112
+ const text = failure.context.raw_model_text;
2113
+ const truncated = text.length > 80 ? `${text.slice(0, 80)}...` : text;
2114
+ console.log(` ${colors.gray}Model: "${truncated}"${colors.reset}`);
2115
+ }
2116
+ }
2117
+ function printFailureSummary(failures) {
2118
+ const groups = groupFailuresByCategory(failures);
2119
+ const sorted = [...groups.entries()].sort(
2120
+ (a, b) => b[1].length - a[1].length
2121
+ );
2122
+ console.log(`
2123
+ ${colors.bold}Failures by category:${colors.reset}`);
2124
+ for (const [category, categoryFailures] of sorted) {
2125
+ console.log(
2126
+ ` ${colors.yellow}${category}${colors.reset}: ${categoryFailures.length}`
2127
+ );
2128
+ }
2129
+ const maxToShow = 5;
2130
+ const shown = failures.slice(0, maxToShow);
2131
+ for (const failure of shown) {
2132
+ printCompactFailure(failure);
2133
+ }
2134
+ if (failures.length > maxToShow) {
2135
+ const remaining = failures.length - maxToShow;
2136
+ const remainingIds = failures.slice(maxToShow).map((f) => f.id);
2137
+ const idPreview = remainingIds.slice(0, 5).join(", ");
2138
+ const more = remainingIds.length > 5 ? "..." : "";
2139
+ console.log(
2140
+ `
2141
+ ${colors.gray}+${remaining} more: ${idPreview}${more}${colors.reset}`
2142
+ );
2143
+ }
2144
+ }
1544
2145
  function printResult(result) {
1545
2146
  const { model, modelKey, benchmark, result: benchmarkResult } = result;
1546
- const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
2147
+ const passed = benchmarkResult.metrics.correct_count;
2148
+ const total = benchmarkResult.metrics.total_cases;
2149
+ const scorePercent = (benchmarkResult.score * 100).toFixed(1);
2150
+ const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
2151
+ const statusColor = benchmarkResult.success ? colors.green : colors.red;
1547
2152
  console.log(
1548
2153
  `
1549
2154
  ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
1550
2155
  );
1551
2156
  console.log(
1552
- ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
2157
+ ` \u2514 ${statusColor}${statusIcon} ${scorePercent}%${colors.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`
1553
2158
  );
1554
- const metrics = Object.entries(benchmarkResult.metrics);
1555
- if (metrics.length > 0) {
1556
- console.log(" Metrics:");
1557
- for (const [key, value] of metrics) {
1558
- console.log(` - ${key}: ${value}`);
1559
- }
1560
- }
1561
2159
  if (benchmarkResult.error) {
1562
2160
  console.log(
1563
2161
  ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
1564
2162
  );
1565
2163
  }
2164
+ if (!benchmarkResult.success && benchmarkResult.logs) {
2165
+ const failures = parseFailures(benchmarkResult.logs);
2166
+ if (failures.length > 0) {
2167
+ printFailureSummary(failures);
2168
+ } else if (benchmarkResult.logs.length > 0) {
2169
+ console.log(` ${colors.gray}Raw Logs (Sample):${colors.reset}`);
2170
+ for (const l of benchmarkResult.logs.slice(0, 5)) {
2171
+ console.log(` ${l}`);
2172
+ }
2173
+ }
2174
+ }
1566
2175
  }
1567
2176
  function consoleReporter(results) {
1568
2177
  console.log("\n--- \u{1F4CA} Evaluation Report ---");
@@ -1617,14 +2226,14 @@ function hasFunctionNameIssue(diff) {
1617
2226
  );
1618
2227
  }
1619
2228
  function suggestFunctionNameFix(expected, actual, suggestions) {
1620
- const expectedName = expected?.function;
1621
- const actualName = actual?.function;
2229
+ const expectedName = expected == null ? void 0 : expected.function;
2230
+ const actualName = actual == null ? void 0 : actual.function;
1622
2231
  if (expectedName && actualName && expectedName !== actualName) {
1623
2232
  suggestions.push(
1624
2233
  `Call the function '${expectedName}' instead of '${actualName}'.`
1625
2234
  );
1626
2235
  }
1627
- if (Array.isArray(expected?.functions)) {
2236
+ if (Array.isArray(expected == null ? void 0 : expected.functions)) {
1628
2237
  suggestions.push(
1629
2238
  `Ensure tool calls include: ${expected.functions.join(", ")}.`
1630
2239
  );
@@ -1679,7 +2288,7 @@ function suggestFromErrorType(error_type, suggestions) {
1679
2288
  }
1680
2289
  function suggestFixFromDiff(parsed) {
1681
2290
  const suggestions = [];
1682
- const { error_type, expected, actual, diff } = parsed ?? {};
2291
+ const { error_type, expected, actual, diff } = parsed != null ? parsed : {};
1683
2292
  if (!Array.isArray(diff)) {
1684
2293
  if (suggestions.length === 0 && typeof error_type === "string") {
1685
2294
  suggestFromErrorType(error_type, suggestions);
@@ -1704,15 +2313,16 @@ function suggestFixFromDiff(parsed) {
1704
2313
  return uniqueLines(suggestions);
1705
2314
  }
1706
2315
  function getTestIdFromLogLine(line) {
2316
+ var _a, _b;
1707
2317
  if (line.startsWith("[FAIL]")) {
1708
2318
  const m = line.match(FAIL_ID_REGEX);
1709
- return m?.[1];
2319
+ return m == null ? void 0 : m[1];
1710
2320
  }
1711
2321
  if (line.startsWith("[DEBUG-FAIL]")) {
1712
2322
  try {
1713
2323
  const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
1714
- return String(parsed?.id ?? "");
1715
- } catch {
2324
+ return String((_a = parsed == null ? void 0 : parsed.id) != null ? _a : "");
2325
+ } catch (e) {
1716
2326
  }
1717
2327
  }
1718
2328
  if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
@@ -1720,18 +2330,19 @@ function getTestIdFromLogLine(line) {
1720
2330
  const parsed = JSON.parse(
1721
2331
  line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "")
1722
2332
  );
1723
- return String(parsed?.id ?? "");
1724
- } catch {
2333
+ return String((_b = parsed == null ? void 0 : parsed.id) != null ? _b : "");
2334
+ } catch (e) {
1725
2335
  }
1726
2336
  }
1727
2337
  return;
1728
2338
  }
1729
2339
  function groupLogsByTestId(failLogs) {
2340
+ var _a;
1730
2341
  const byId = /* @__PURE__ */ new Map();
1731
2342
  for (const line of failLogs) {
1732
2343
  const id = getTestIdFromLogLine(line);
1733
- const key = id ?? "__general__";
1734
- const arr = byId.get(key) ?? [];
2344
+ const key = id != null ? id : "__general__";
2345
+ const arr = (_a = byId.get(key)) != null ? _a : [];
1735
2346
  arr.push(line);
1736
2347
  byId.set(key, arr);
1737
2348
  }
@@ -1743,10 +2354,10 @@ function collectDebugIds(lines) {
1743
2354
  if (l.startsWith("[DEBUG-FAIL]")) {
1744
2355
  try {
1745
2356
  const parsed = JSON.parse(l.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
1746
- if (parsed?.id) {
2357
+ if (parsed == null ? void 0 : parsed.id) {
1747
2358
  debugIds.add(String(parsed.id));
1748
2359
  }
1749
- } catch {
2360
+ } catch (e) {
1750
2361
  }
1751
2362
  }
1752
2363
  }
@@ -1782,7 +2393,7 @@ function displayDebugFailLine(line) {
1782
2393
  console.log(` \u2022 ${s}`);
1783
2394
  }
1784
2395
  }
1785
- } catch {
2396
+ } catch (e) {
1786
2397
  console.log(` ${line}`);
1787
2398
  }
1788
2399
  }
@@ -1826,14 +2437,14 @@ function displayDebugFailContextLine(line) {
1826
2437
  const ctx = JSON.parse(payload);
1827
2438
  console.log(` ${colors2.gray}context:${colors2.reset}`);
1828
2439
  displayContextInfo(ctx);
1829
- } catch {
2440
+ } catch (e) {
1830
2441
  console.log(` ${line}`);
1831
2442
  }
1832
2443
  }
1833
2444
  function displayLogLine(line, debugIds) {
1834
2445
  if (line.startsWith("[FAIL]")) {
1835
2446
  const m = line.match(FAIL_ID_REGEX);
1836
- const failId = m?.[1];
2447
+ const failId = m == null ? void 0 : m[1];
1837
2448
  if (failId && debugIds.has(failId)) {
1838
2449
  return;
1839
2450
  }
@@ -1903,26 +2514,350 @@ function displayResultHeader(r) {
1903
2514
  );
1904
2515
  }
1905
2516
  function consoleDebugReporter(results) {
2517
+ var _a;
1906
2518
  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
1907
2519
  for (const r of results) {
1908
2520
  displayResultHeader(r);
1909
2521
  displayMetrics(Object.entries(r.result.metrics));
1910
- if (r.result.logs?.length) {
2522
+ if ((_a = r.result.logs) == null ? void 0 : _a.length) {
1911
2523
  displayResultLogs(r.result.logs);
1912
2524
  }
1913
2525
  }
1914
2526
  console.log("\n------------------------------------\n");
1915
2527
  }
1916
2528
 
2529
+ // src/reporters/console.summary.ts
2530
// ANSI escape sequences used to colour the summary reporter's console output.
var colors3 = {
  reset: "\x1B[0m",
  bold: "\x1B[1m",
  dim: "\x1B[2m",
  green: "\x1B[32m",
  red: "\x1B[31m",
  yellow: "\x1B[33m",
  cyan: "\x1B[36m",
  magenta: "\x1B[35m",
  gray: "\x1B[90m",
  white: "\x1B[37m"
};

// Prefix that marks a log line carrying a JSON failure payload.
var DEBUG_FAIL_REGEX2 = /^\[DEBUG-FAIL\] /;

// Captures the trailing numeric suffix of an id such as "simple_12".
var ID_NUM_REGEX = /_(\d+)$/;

// Tag name whose <tag>...</tag> sections are stripped from model text before display.
var REASONING_TAG = "think";

// Upper bound applied by printDiff to the number of diff lines it prints.
var MAX_FAILURES_TO_DISPLAY = 5;

// Display metadata per failure category: a label, a description and, for some, a hint.
var CATEGORY_DESCRIPTIONS = {
  PARSE_FAILURE: {
    label: "Parse Failure",
    description: "No tool calls extracted from model output",
    hint: "Model may have responded in text instead of tool format"
  },
  PARTIAL_CALLS: {
    label: "Partial Calls",
    description: "Some expected tool calls missing",
    hint: "Model stopped early or missed some tools"
  },
  EXTRA_CALLS: {
    label: "Extra Calls",
    description: "More tool calls than expected",
    hint: "Model called tools that weren't needed"
  },
  PARAM_VALUE_PERCENT: {
    label: "Param Value (Percent)",
    description: "Percentage sent as integer instead of decimal",
    hint: "e.g., 5 instead of 0.05 for 5%"
  },
  PARAM_VALUE_MISMATCH: {
    label: "Param Value Mismatch",
    description: "Parameter values don't match expected"
  },
  WRONG_FUNCTION: {
    label: "Wrong Function",
    description: "Called wrong function name"
  },
  MISSING_PARAMS: {
    label: "Missing Params",
    description: "Required parameters not provided"
  },
  UNEXPECTED_PARAMS: {
    label: "Unexpected Params",
    description: "Extra parameters that shouldn't be there"
  },
  NO_MATCH: {
    label: "No Match",
    description: "Function called but couldn't match to expected",
    hint: "Parameters may be correct but don't match any expected combination"
  },
  // Fallback entry used when a category has no dedicated description.
  OTHER: {
    label: "Other",
    description: "Uncategorized failure"
  }
};
2593
/**
 * Extracts structured failure records from raw log lines.
 *
 * Only lines starting with the `[DEBUG-FAIL] ` prefix are considered; the
 * remainder of each such line is parsed as JSON. Lines whose payload is not
 * valid JSON are skipped on purpose (best-effort reporting).
 *
 * Payloads that parse to something other than a plain object (a number, a
 * string, a boolean, an array or `null`) are dropped as well: downstream code
 * reads properties such as `id`, `category` and `diff` from every record, so
 * letting a primitive through would only surface later as a confusing error.
 *
 * @param {string[]} logs - Raw log lines of a benchmark result.
 * @returns {object[]} The parsed failure records, in log order.
 */
function parseFailureLogs(logs) {
  const failures = [];
  for (const log of logs) {
    if (!DEBUG_FAIL_REGEX2.test(log)) {
      continue;
    }
    let payload;
    try {
      payload = JSON.parse(log.replace(DEBUG_FAIL_REGEX2, ""));
    } catch (e) {
      // Malformed payload: ignore this line and keep the rest of the report.
      continue;
    }
    const isRecord = typeof payload === "object" && payload !== null && !Array.isArray(payload);
    if (isRecord) {
      failures.push(payload);
    }
  }
  return failures;
}
2603
/**
 * Buckets failure records by their `category`.
 *
 * Records with a falsy `category` are placed under "OTHER". Categories keep
 * the order in which they were first encountered, and each bucket is an
 * object of the form `{ failures: [...] }`.
 *
 * @param {Iterable<object>} failures - Failure records.
 * @returns {Map<string, {failures: object[]}>} Buckets keyed by category.
 */
function groupByCategory(failures) {
  const buckets = /* @__PURE__ */ new Map();
  for (const record of failures) {
    const key = record.category || "OTHER";
    const bucket = buckets.get(key);
    if (!bucket) {
      buckets.set(key, { failures: [record] });
      continue;
    }
    bucket.failures.push(record);
  }
  return buckets;
}
2616
/**
 * Collects the parameter names mentioned in the failures' diff lines.
 *
 * A diff line contributes when it starts with "@@ param "; the text after
 * that marker is taken as the parameter name. Failures without a `diff` are
 * ignored.
 *
 * @param {Iterable<object>} failures - Failure records, each optionally carrying `diff` lines.
 * @returns {Set<string>} Unique parameter names in first-seen order.
 */
function extractParamNames(failures) {
  const marker = "@@ param ";
  const names = /* @__PURE__ */ new Set();
  for (const failure of failures) {
    const lines = failure.diff;
    if (lines) {
      for (const entry of lines) {
        if (entry.startsWith(marker)) {
          // The marker is known to sit at index 0, so slicing it off equals removing it.
          names.add(entry.slice(marker.length));
        }
      }
    }
  }
  return names;
}
2630
/**
 * Collects the distinct `context.finish_reason` values found on the failures.
 *
 * Failures without a context, or with a falsy finish reason, contribute
 * nothing. Values are stringified before being stored.
 *
 * @param {Iterable<object>} failures - Failure records.
 * @returns {Set<string>} Unique finish reasons in first-seen order.
 */
function extractFinishReasons(failures) {
  const reasons = /* @__PURE__ */ new Set();
  for (const failure of failures) {
    const context = failure.context;
    const reason = context == null ? void 0 : context.finish_reason;
    if (!reason) {
      continue;
    }
    reasons.add(String(reason));
  }
  return reasons;
}
2640
/**
 * Annotates a failure group with a `pattern` summary when one can be derived.
 *
 * Groups with fewer than two failures are left untouched. The category of the
 * first failure decides which rule applies:
 * - "PARAM_VALUE_PERCENT": lists the affected parameter names, if any.
 * - "PARSE_FAILURE": reports the finish reason when exactly one distinct
 *   reason was collected.
 *
 * @param {{failures: object[], pattern?: string}} group - Group to annotate in place.
 * @returns {void}
 */
function detectPatterns(group) {
  const { failures } = group;
  if (failures.length < 2) {
    return;
  }
  switch (failures[0].category) {
    case "PARAM_VALUE_PERCENT": {
      const affected = extractParamNames(failures);
      if (affected.size > 0) {
        group.pattern = `Affected params: ${[...affected].join(", ")}`;
      }
      break;
    }
    case "PARSE_FAILURE": {
      const reasons = extractFinishReasons(failures);
      if (reasons.size === 1) {
        group.pattern = `All finished with: ${[...reasons][0]}`;
      }
      break;
    }
    default:
      break;
  }
}
2659
/**
 * Picks the colour escape for a diff line based on its prefix.
 *
 * "+" maps to green, "-" to red and "@@" to cyan; every other line is white.
 *
 * @param {string} line - A single diff line.
 * @returns {string} The matching entry of `colors3`.
 */
function getLineColor(line) {
  // Checked in order; the first matching prefix wins.
  const prefixToColor = [
    ["+", "green"],
    ["-", "red"],
    ["@@", "cyan"]
  ];
  const match = prefixToColor.find(([prefix]) => line.startsWith(prefix));
  return colors3[match ? match[1] : "white"];
}
2671
/**
 * Renders one function name, or a list of them, as display text.
 *
 * @param {*} funcs - An array of names (joined with ", ") or any other value (stringified).
 * @returns {string} The text to display.
 */
function formatFunctions(funcs) {
  return Array.isArray(funcs) ? funcs.join(", ") : String(funcs);
}
2677
/**
 * Prints the expected and the actual function name(s) of a failure.
 *
 * For each side, `functions` is preferred over `function`; a side is skipped
 * when it is absent or names nothing. An empty array on the actual side is
 * rendered as "(none)" in red.
 *
 * @param {object} failure - Failure record with optional `expected` / `actual` objects.
 * @returns {void}
 */
function printExpectedActual(failure) {
  const namesOf = (side) => side ? side.functions || side.function : void 0;

  const expectedNames = namesOf(failure.expected);
  if (expectedNames) {
    console.log(
      ` ${colors3.gray}Expected:${colors3.reset} ${formatFunctions(expectedNames)}`
    );
  }

  const actualNames = namesOf(failure.actual);
  if (!actualNames) {
    return;
  }
  let color = colors3.white;
  let text;
  if (Array.isArray(actualNames) && actualNames.length === 0) {
    color = colors3.red;
    text = "(none)";
  } else {
    text = formatFunctions(actualNames);
  }
  console.log(
    ` ${colors3.gray}Actual:${colors3.reset} ${color}${text}${colors3.reset}`
  );
}
2698
/**
 * Prints a "Diff:" heading followed by the leading lines of a diff, each
 * coloured according to its prefix.
 *
 * At most `MAX_FAILURES_TO_DISPLAY` lines are printed; any further lines are
 * omitted without notice.
 *
 * @param {string[]} diff - Diff lines of a failure.
 * @returns {void}
 */
function printDiff(diff) {
  console.log(` ${colors3.gray}Diff:${colors3.reset}`);
  const shown = diff.slice(0, MAX_FAILURES_TO_DISPLAY);
  for (const entry of shown) {
    console.log(` ${getLineColor(entry)}${entry}${colors3.reset}`);
  }
}
2705
/**
 * Strips reasoning sections from model text.
 *
 * First every closed `<TAG>...</TAG>` section is removed (non-greedy), then
 * anything from a remaining unclosed `<TAG>` to the end of the text. The tag
 * name comes from `REASONING_TAG`. The result is trimmed.
 *
 * @param {string} text - Raw model text.
 * @returns {string} The text without reasoning sections.
 */
function removeReasoningTags(text) {
  // Escapes regex metacharacters so the tag is matched literally.
  const escapeForRegExp = (literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const open = escapeForRegExp(`<${REASONING_TAG}>`);
  const close = escapeForRegExp(`</${REASONING_TAG}>`);

  const closedSection = new RegExp(`${open}[\\s\\S]*?${close}`, "g");
  const danglingSection = new RegExp(`${open}[\\s\\S]*`, "g");

  return text.replace(closedSection, "").replace(danglingSection, "").trim();
}
2720
/**
 * Prints what the model said for a parse failure.
 *
 * Does nothing for any category other than "PARSE_FAILURE". The text is taken
 * from `context.raw_model_text_full`, falling back to `context.raw_model_text`
 * and then to an empty string; reasoning sections are removed before display.
 * When nothing remains, a placeholder line is printed instead.
 *
 * @param {object} failure - Failure record with an optional `context`.
 * @param {string} category - Category the failure is being reported under.
 * @returns {void}
 */
function printModelOutput(failure, category) {
  if (category !== "PARSE_FAILURE") {
    return;
  }
  const context = failure.context;
  let rawText = "";
  if (context != null) {
    rawText = context.raw_model_text_full || context.raw_model_text || "";
  }
  const visibleText = removeReasoningTags(rawText);
  const label = ` ${colors3.gray}Model said:${colors3.reset}`;
  if (!visibleText) {
    console.log(
      `${label} ${colors3.dim}(only reasoning, no tool call output)${colors3.reset}`
    );
    return;
  }
  console.log(
    `${label} "${colors3.dim}${visibleText}${colors3.reset}"`
  );
}
2737
/**
 * Tells whether failures of a category are shown diff-first.
 *
 * @param {string} category - Failure category.
 * @returns {boolean} True for "PARAM_VALUE_MISMATCH" and "PARAM_VALUE_PERCENT" only.
 */
function shouldShowDiffByDefault(category) {
  switch (category) {
    case "PARAM_VALUE_MISMATCH":
    case "PARAM_VALUE_PERCENT":
      return true;
    default:
      return false;
  }
}
2740
/**
 * Prints one failure: its id, then either the diff or the expected/actual
 * functions, then the model output.
 *
 * Diff-first categories with a non-empty diff print only the diff. All other
 * cases print expected/actual, followed by the diff when one exists and
 * `verbose` is set.
 *
 * @param {object} failure - Failure record (`id`, optional `diff`, ...).
 * @param {string} category - Category the failure is reported under.
 * @param {boolean} verbose - Whether the diff is also shown for non diff-first categories.
 * @returns {void}
 */
function printSingleFailure(failure, category, verbose) {
  console.log(`\n${colors3.bold}${failure.id}${colors3.reset}`);
  const hasDiff = failure.diff && failure.diff.length > 0;
  const diffFirst = shouldShowDiffByDefault(category) && hasDiff;
  if (!diffFirst) {
    printExpectedActual(failure);
  }
  if (diffFirst || (hasDiff && verbose)) {
    printDiff(failure.diff);
  }
  printModelOutput(failure, category);
}
2755
// Number of failures per category that are printed in full outside verbose mode.
var MAX_SAMPLE_FAILURES = 2;

/**
 * Prints a one-line "+N more: ..." summary for the failures that were not
 * shown in full (everything after the first `MAX_SAMPLE_FAILURES` entries).
 *
 * Each remaining id is shortened to its trailing numeric suffix when it ends
 * in "_<digits>"; otherwise the whole id is shown.
 *
 * Ids originate from JSON-parsed log payloads, so they are not guaranteed to
 * be strings. They are stringified before matching; calling `.match` on a
 * numeric or missing id would otherwise throw a TypeError.
 *
 * @param {object[]} failures - All failures of one category.
 * @returns {void}
 */
function printRemainingIds(failures) {
  const shortIds = failures.slice(MAX_SAMPLE_FAILURES).map((failure) => {
    const id = String(failure.id);
    const match = id.match(ID_NUM_REGEX);
    return match ? match[1] : id;
  });
  console.log(
    `\n${colors3.dim}+${failures.length - MAX_SAMPLE_FAILURES} more: ${shortIds.join(", ")}${colors3.reset}`
  );
}
2767
/**
 * Prints the heading of a failure category: a ruled title line containing the
 * label and failure count, followed by the dimmed description.
 *
 * @param {{label: string, description: string}} info - Display metadata of the category.
 * @param {number} count - Number of failures in the category.
 * @returns {void}
 */
function printCategoryHeader(info, count) {
  const { cyan, dim, reset } = colors3;
  const title = `${info.label} (${count})`;
  console.log(
    `\n${cyan}\u2500\u2500\u2500\u2500\u2500 ${title} \u2500\u2500\u2500\u2500\u2500${reset}`
  );
  console.log(`${dim}${info.description}${reset}`);
}
2774
/**
 * Prints everything known about one failure category: header, detected
 * pattern, hint, and the failures themselves.
 *
 * In verbose mode every failure is printed. Otherwise only the first
 * `MAX_SAMPLE_FAILURES` are printed in full and the rest are summarised by
 * `printRemainingIds`. The sample size is read from the shared constant
 * rather than repeated as a literal, because `printRemainingIds` derives its
 * "+N more" count from the same constant and the two must stay in step.
 *
 * @param {string} category - Category key; unknown keys use the "OTHER" metadata.
 * @param {{failures: object[], pattern?: string}} group - Failures of the category.
 * @param {boolean} verbose - Whether to print every failure in full.
 * @returns {void}
 */
function printCategoryDetails(category, group, verbose) {
  const info = CATEGORY_DESCRIPTIONS[category] || CATEGORY_DESCRIPTIONS.OTHER;
  const { failures } = group;
  printCategoryHeader(info, failures.length);
  if (group.pattern) {
    console.log(`${colors3.yellow}Pattern: ${group.pattern}${colors3.reset}`);
  }
  if (info.hint) {
    console.log(`${colors3.magenta}Hint: ${info.hint}${colors3.reset}`);
  }
  const samplesToShow = verbose ? failures : failures.slice(0, MAX_SAMPLE_FAILURES);
  for (const failure of samplesToShow) {
    printSingleFailure(failure, category, verbose);
  }
  if (!verbose && failures.length > MAX_SAMPLE_FAILURES) {
    printRemainingIds(failures);
  }
}
2792
+ function printResultHeader(result) {
2793
+ const { model, modelKey, benchmark, result: benchmarkResult } = result;
2794
+ const passed = benchmarkResult.metrics.correct_count;
2795
+ const total = benchmarkResult.metrics.total_cases;
2796
+ const scorePercent = (benchmarkResult.score * 100).toFixed(1);
2797
+ const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
2798
+ const statusColor = benchmarkResult.success ? colors3.green : colors3.red;
2799
+ const modelPart = `${colors3.cyan}${model}${colors3.reset}${modelKey ? ` ${colors3.dim}(${modelKey})${colors3.reset}` : ""}`;
2800
+ const benchmarkPart = `${colors3.magenta}${benchmark}${colors3.reset}`;
2801
+ const scorePart = `${statusColor}${statusIcon} ${scorePercent}%${colors3.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`;
2802
+ console.log(
2803
+ `
2804
+ ${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}`
2805
+ );
2806
+ console.log(`${modelPart} \u2502 ${benchmarkPart} \u2502 ${scorePart}`);
2807
+ }
2808
/**
 * Prints the summary of one evaluation result.
 *
 * The header is always printed. When the result has logs, the structured
 * failures are parsed from them, grouped by category, annotated with detected
 * patterns, and printed from the largest category to the smallest. If logs
 * exist but contain no structured failures, a notice is printed for
 * unsuccessful results only.
 *
 * @param {object} result - Evaluation entry whose `result` holds `logs` and `success`.
 * @param {boolean} verbose - Forwarded to the per-category printer.
 * @returns {void}
 */
function printResultSummary(result, verbose) {
  const benchmarkResult = result.result;
  printResultHeader(result);

  const hasLogs = Boolean(benchmarkResult.logs) && benchmarkResult.logs.length !== 0;
  if (!hasLogs) {
    return;
  }

  const failures = parseFailureLogs(benchmarkResult.logs);
  if (failures.length === 0) {
    if (!benchmarkResult.success) {
      console.log(
        `${colors3.yellow}No structured failure data available${colors3.reset}`
      );
    }
    return;
  }

  const groups = groupByCategory(failures);
  for (const group of groups.values()) {
    detectPatterns(group);
  }

  // Largest categories first.
  const ranked = [...groups.entries()];
  ranked.sort(([, left], [, right]) => right.failures.length - left.failures.length);
  for (const [category, group] of ranked) {
    printCategoryDetails(category, group, verbose);
  }
}
2834
+ function consoleSummaryReporter(results) {
2835
+ const verbose = process.env.VERBOSE === "true";
2836
+ console.log(`
2837
+ ${colors3.bold}Evaluation Report (Summary)${colors3.reset}`);
2838
+ console.log(`${colors3.dim}Use VERBOSE=true for full details${colors3.reset}`);
2839
+ for (const result of results) {
2840
+ printResultSummary(result, verbose);
2841
+ }
2842
+ console.log(
2843
+ `
2844
+ ${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}
2845
+ `
2846
+ );
2847
+ }
2848
+
1917
2849
  // src/reporters/json.ts
1918
2850
  function jsonReporter(results) {
1919
- const serializableResults = results.map((r) => ({
1920
- ...r,
1921
- result: {
1922
- ...r.result,
1923
- error: r.result.error?.message
1924
- }
1925
- }));
2851
+ const serializableResults = results.map((r) => {
2852
+ var _a;
2853
+ return {
2854
+ ...r,
2855
+ result: {
2856
+ ...r.result,
2857
+ error: (_a = r.result.error) == null ? void 0 : _a.message
2858
+ }
2859
+ };
2860
+ });
1926
2861
  console.log(JSON.stringify(serializableResults, null, 2));
1927
2862
  }
1928
2863
 
@@ -1930,60 +2865,56 @@ function jsonReporter(results) {
1930
2865
  var reporters = {
1931
2866
  console: consoleReporter,
1932
2867
  json: jsonReporter,
1933
- "console.debug": consoleDebugReporter
2868
+ "console.debug": consoleDebugReporter,
2869
+ "console.summary": consoleSummaryReporter
1934
2870
  };
1935
2871
 
1936
2872
  // src/evaluate.ts
1937
- async function runSingleBenchmark(model, benchmark, modelKey, config) {
1938
- const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
1939
- try {
1940
- console.log(
1941
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
1942
- );
1943
- const result = await benchmark.run(model, config);
1944
- console.log(
1945
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
1946
- );
1947
- return {
1948
- model: modelId,
1949
- modelKey,
1950
- benchmark: benchmark.name,
1951
- result
1952
- };
1953
- } catch (error) {
1954
- console.error(
1955
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
1956
- error
1957
- );
1958
- return {
1959
- model: modelId,
1960
- modelKey,
1961
- benchmark: benchmark.name,
1962
- result: {
1963
- score: 0,
1964
- success: false,
1965
- metrics: {},
1966
- error: error instanceof Error ? error : new Error(String(error))
1967
- }
1968
- };
2873
/**
 * Tells whether a value looks like a model configuration, i.e. a non-null
 * object whose `model` property is itself a non-null object that has a
 * `modelId` property (of any type).
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} True when the value has the model-config shape.
 */
function isModelConfig(value) {
  const isNonNullObject = (candidate) => typeof candidate === "object" && candidate !== null;
  if (!isNonNullObject(value) || !("model" in value)) {
    return false;
  }
  const inner = value.model;
  return isNonNullObject(inner) && "modelId" in inner;
}
2887
/**
 * Tells whether a value looks like a language model, i.e. a non-null object
 * with a string `modelId` property.
 *
 * @param {*} value - Value to inspect.
 * @returns {boolean} True when the value has a string `modelId`.
 */
function isLanguageModel(value) {
  const isNonNullObject = typeof value === "object" && value !== null;
  return isNonNullObject && "modelId" in value && typeof value.modelId === "string";
}
2894
/**
 * Splits a model input into its model and middleware parts.
 *
 * A model configuration yields its `model` and `middleware`; anything else is
 * treated as the model itself, with no middleware.
 *
 * @param {*} input - A model configuration or a bare model.
 * @returns {Array} A two-element array `[model, middleware]`.
 */
function extractModelAndMiddleware(input) {
  return isModelConfig(input) ? [input.model, input.middleware] : [input, void 0];
}
1971
2900
  function normalizeModels(models) {
1972
- const modelEntries = [];
2901
+ const entries = [];
1973
2902
  if (Array.isArray(models)) {
1974
2903
  for (const m of models) {
1975
- modelEntries.push([void 0, m]);
2904
+ const [model, middleware] = extractModelAndMiddleware(m);
2905
+ entries.push([void 0, model, middleware]);
1976
2906
  }
1977
- } else if (typeof models === "object" && models !== null && "modelId" in models) {
1978
- modelEntries.push([void 0, models]);
2907
+ } else if (isModelConfig(models)) {
2908
+ entries.push([void 0, models.model, models.middleware]);
2909
+ } else if (isLanguageModel(models)) {
2910
+ entries.push([void 0, models, void 0]);
1979
2911
  } else {
1980
- for (const [key, m] of Object.entries(
1981
- models
1982
- )) {
1983
- modelEntries.push([key, m]);
2912
+ for (const [key, m] of Object.entries(models)) {
2913
+ const [model, middleware] = extractModelAndMiddleware(m);
2914
+ entries.push([key, model, middleware]);
1984
2915
  }
1985
2916
  }
1986
- return modelEntries;
2917
+ return entries;
1987
2918
  }
1988
2919
  function buildConfig(temperature, maxTokens) {
1989
2920
  const config = {};
@@ -2004,21 +2935,90 @@ function executeReporter(reporter, results) {
2004
2935
  reporters.console(results);
2005
2936
  }
2006
2937
  }
2938
/**
 * Wraps a model with the user's middleware and/or a disk-cache middleware.
 *
 * Caching applies only when `cacheOptions.enabled` is strictly `true`; the
 * cache directory defaults to ".ai-cache" and `debug` to `false`. User
 * middleware (a single item or an array) comes first, the cache middleware
 * last. When there is nothing to apply, the base model is returned unchanged.
 * A single middleware is passed on its own, several as an array.
 *
 * @param {object} baseModel - The model to wrap.
 * @param {*} userMiddleware - Optional middleware or array of middlewares.
 * @param {{enabled?: boolean, cacheDir?: string, debug?: boolean}} [cacheOptions] - Cache settings.
 * @returns {object} The base model or the wrapped model.
 */
function buildEffectiveModel(baseModel, userMiddleware, cacheOptions) {
  const cacheEnabled = cacheOptions != null && cacheOptions.enabled === true;
  if (!cacheEnabled && !userMiddleware) {
    return baseModel;
  }

  let cacheMiddleware = null;
  if (cacheEnabled) {
    const { cacheDir, debug } = cacheOptions;
    cacheMiddleware = (0, import_middleware.createDiskCacheMiddleware)({
      cacheDir: cacheDir != null ? cacheDir : ".ai-cache",
      enabled: true,
      debug: debug != null ? debug : false
    });
  }

  let chain = [];
  if (userMiddleware) {
    chain = Array.isArray(userMiddleware) ? [...userMiddleware] : [userMiddleware];
  }
  if (cacheMiddleware) {
    chain.push(cacheMiddleware);
  }
  // Reached when an empty middleware array was supplied and caching is off.
  if (chain.length === 0) {
    return baseModel;
  }

  return (0, import_ai4.wrapLanguageModel)({
    model: baseModel,
    middleware: chain.length === 1 ? chain[0] : chain
  });
}
2969
/**
 * Runs one benchmark against one model and reports progress on stdout.
 *
 * A progress line is written before the run and rewritten (via a carriage
 * return) with the score, formatted to two decimals, afterwards. If the run,
 * or the formatting of its score, throws, the line shows "ERROR", the error
 * is sent to `console.error`, and a failed result (score 0) carrying the
 * error is returned instead of rethrowing.
 *
 * @param {*} model - Model to evaluate; its string `modelId` is used for
 *   display, falling back to "unknown-model".
 * @param {{name: string, run: Function}} benchmark - Benchmark to execute.
 * @param {string|undefined} modelKey - Optional key shown next to the model id.
 * @param {object} config - Passed through to `benchmark.run`.
 * @returns {Promise<object>} `{ model, modelKey, benchmark, result }`.
 */
async function runSingleBenchmark(model, benchmark, modelKey, config) {
  let modelId = "unknown-model";
  if (typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string") {
    modelId = model.modelId;
  }
  const keyLabel = modelKey ? ` (${modelKey})` : "";
  const prefix = `[${modelId}]${keyLabel} ${benchmark.name}`;
  const toOutcome = (result) => ({
    model: modelId,
    modelKey,
    benchmark: benchmark.name,
    result
  });

  try {
    process.stdout.write(`${prefix}: ...`);
    const result = await benchmark.run(model, config);
    process.stdout.write(`\r${prefix}: .... Score: ${result.score.toFixed(2)}\n`);
    return toOutcome(result);
  } catch (error) {
    process.stdout.write(`\r${prefix}: .... Score: ERROR\n`);
    console.error(error);
    return toOutcome({
      score: 0,
      success: false,
      metrics: {},
      error: error instanceof Error ? error : new Error(String(error))
    });
  }
}
2007
3001
  async function evaluate(options) {
2008
3002
  const {
2009
3003
  models,
2010
3004
  benchmarks,
2011
3005
  reporter = "console",
2012
3006
  temperature,
2013
- maxTokens
3007
+ maxTokens,
3008
+ cache
2014
3009
  } = options;
2015
3010
  const modelEntries = normalizeModels(models);
2016
3011
  const config = buildConfig(temperature, maxTokens);
2017
3012
  const allResults = [];
2018
- for (const [modelKey, model] of modelEntries) {
3013
+ for (const [modelKey, baseModel, userMiddleware] of modelEntries) {
3014
+ const effectiveModel = buildEffectiveModel(
3015
+ baseModel,
3016
+ userMiddleware,
3017
+ cache
3018
+ );
2019
3019
  for (const benchmark of benchmarks) {
2020
3020
  const evaluationResult = await runSingleBenchmark(
2021
- model,
3021
+ effectiveModel,
2022
3022
  benchmark,
2023
3023
  modelKey,
2024
3024
  config
@@ -2035,6 +3035,7 @@ async function evaluate(options) {
2035
3035
  bfclParallelBenchmark,
2036
3036
  bfclParallelMultipleBenchmark,
2037
3037
  bfclSimpleBenchmark,
3038
+ complexFuncBenchBenchmark,
2038
3039
  evaluate,
2039
3040
  jsonGenerationBenchmark,
2040
3041
  jsonGenerationSchemaOnlyBenchmark