@ai-sdk-tool/eval 1.0.0-canary.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -23,7 +23,7 @@ function tryResolveViaPackageEntry(moduleUrl) {
23
23
  if (fs.existsSync(dataAtRoot)) {
24
24
  return dataAtRoot;
25
25
  }
26
- } catch {
26
+ } catch (e) {
27
27
  }
28
28
  return null;
29
29
  }
@@ -37,7 +37,7 @@ function tryResolveViaPackageJson(moduleUrl) {
37
37
  if (fs.existsSync(dataAtPkg)) {
38
38
  return dataAtPkg;
39
39
  }
40
- } catch {
40
+ } catch (e) {
41
41
  }
42
42
  return null;
43
43
  }
@@ -45,7 +45,7 @@ function getStartDir(moduleUrl) {
45
45
  if (moduleUrl) {
46
46
  try {
47
47
  return path.dirname(fileURLToPath(moduleUrl));
48
- } catch {
48
+ } catch (e) {
49
49
  return process.cwd();
50
50
  }
51
51
  }
@@ -139,7 +139,7 @@ function valuesMatch(modelValue, possibleValue) {
139
139
  const normalizedModel = normalizeObject(modelValue);
140
140
  const normalizedPossible = normalizeObject(possibleValue);
141
141
  return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
142
- } catch {
142
+ } catch (e) {
143
143
  return false;
144
144
  }
145
145
  }
@@ -268,7 +268,7 @@ function checkSingleParameter(paramName, modelValue, context) {
268
268
  return checkStringValue(
269
269
  paramName,
270
270
  modelValue,
271
- possibleValues ?? []
271
+ possibleValues != null ? possibleValues : []
272
272
  );
273
273
  }
274
274
  if (Array.isArray(modelValue)) {
@@ -368,45 +368,99 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
368
368
  // src/benchmarks/bfcl.ts
369
369
  var LINE_SPLIT_REGEX = /\r?\n/;
370
370
  var NUMERIC_STRING_REGEX = /^\d+$/;
371
+ var DIFF_NUMERIC_EXTRACT_REGEX = /:\s*([\d.]+)/;
372
+ function convertGroundTruthToXML(call) {
373
+ const keys = Object.keys(call);
374
+ if (keys.length === 0) {
375
+ return "<empty_call />";
376
+ }
377
+ const funcName = keys[0];
378
+ if (!funcName) {
379
+ return "<undefined_function />";
380
+ }
381
+ const params = call[funcName];
382
+ if (!params || typeof params !== "object") {
383
+ return `<${funcName} />`;
384
+ }
385
+ let xml = `<${funcName}>
386
+ `;
387
+ for (const [key, value] of Object.entries(params)) {
388
+ const displayValue = Array.isArray(value) ? value[0] : value;
389
+ let valueStr;
390
+ if (typeof displayValue === "string") {
391
+ valueStr = displayValue;
392
+ } else if (displayValue === null || displayValue === void 0) {
393
+ valueStr = "";
394
+ } else {
395
+ valueStr = JSON.stringify(displayValue);
396
+ }
397
+ xml += ` <${key}>${valueStr}</${key}>
398
+ `;
399
+ }
400
+ xml += `</${funcName}>`;
401
+ return xml;
402
+ }
403
+ function extractCategory(id) {
404
+ if (id.startsWith("parallel_multiple")) {
405
+ return "parallel_multiple";
406
+ }
407
+ if (id.startsWith("simple_python")) {
408
+ return "simple";
409
+ }
410
+ if (id.startsWith("simple_java")) {
411
+ return "simple";
412
+ }
413
+ if (id.startsWith("simple_javascript")) {
414
+ return "simple";
415
+ }
416
+ if (id.startsWith("parallel")) {
417
+ return "parallel";
418
+ }
419
+ if (id.startsWith("multiple")) {
420
+ return "multiple";
421
+ }
422
+ if (id.startsWith("simple")) {
423
+ return "simple";
424
+ }
425
+ return id.split("_")[0];
426
+ }
371
427
  function check(testCase, modelOutput, possibleAnswer) {
372
- const category = testCase.id.split("_")[0];
428
+ const category = extractCategory(testCase.id);
373
429
  try {
374
- if (category === "simple") {
375
- if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
376
- return {
377
- valid: false,
378
- error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
379
- error_type: "simple:wrong_count"
380
- };
430
+ switch (category) {
431
+ case "simple": {
432
+ if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
433
+ return {
434
+ valid: false,
435
+ error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
436
+ error_type: "simple:wrong_count"
437
+ };
438
+ }
439
+ return simpleFunctionChecker(
440
+ testCase.function[0],
441
+ modelOutput[0],
442
+ possibleAnswer.ground_truth[0]
443
+ );
444
+ }
445
+ case "multiple": {
446
+ return multipleFunctionChecker(
447
+ testCase.function,
448
+ modelOutput,
449
+ possibleAnswer.ground_truth
450
+ );
451
+ }
452
+ case "parallel":
453
+ case "parallel_multiple": {
454
+ return parallelFunctionCheckerNoOrder(
455
+ testCase.function,
456
+ modelOutput,
457
+ possibleAnswer.ground_truth
458
+ );
459
+ }
460
+ default: {
461
+ return { valid: true };
381
462
  }
382
- return simpleFunctionChecker(
383
- testCase.function[0],
384
- modelOutput[0],
385
- possibleAnswer.ground_truth[0]
386
- );
387
- }
388
- if (category === "parallel") {
389
- return parallelFunctionCheckerNoOrder(
390
- testCase.function,
391
- modelOutput,
392
- possibleAnswer.ground_truth
393
- );
394
- }
395
- if (category === "multiple") {
396
- return multipleFunctionChecker(
397
- testCase.function,
398
- modelOutput,
399
- possibleAnswer.ground_truth
400
- );
401
- }
402
- if (category.includes("parallel-multiple")) {
403
- return parallelFunctionCheckerNoOrder(
404
- testCase.function,
405
- modelOutput,
406
- possibleAnswer.ground_truth
407
- );
408
463
  }
409
- return { valid: true };
410
464
  } catch (e) {
411
465
  return {
412
466
  valid: false,
@@ -448,7 +502,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
448
502
  `[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
449
503
  );
450
504
  }
451
- const fixSchemaType = (copy) => {
505
+ const fixSchemaType2 = (copy) => {
452
506
  if (!copy.type) {
453
507
  return;
454
508
  }
@@ -472,16 +526,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
472
526
  );
473
527
  }
474
528
  };
475
- const fixSchema = (schema) => {
529
+ const fixSchema2 = (schema) => {
476
530
  if (!schema || typeof schema !== "object") {
477
531
  return { type: "object", properties: {} };
478
532
  }
479
- const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
533
+ const copy = Array.isArray(schema) ? schema.map((v) => fixSchema2(v)) : { ...schema };
480
534
  if (!Array.isArray(copy)) {
481
- fixSchemaType(copy);
482
- fixSchemaProperties(copy, fixSchema);
535
+ fixSchemaType2(copy);
536
+ fixSchemaProperties(copy, fixSchema2);
483
537
  if (copy.items) {
484
- copy.items = fixSchema(copy.items);
538
+ copy.items = fixSchema2(copy.items);
485
539
  }
486
540
  return copy;
487
541
  }
@@ -516,13 +570,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
516
570
  try {
517
571
  const arr = JSON.parse(raw);
518
572
  return Array.isArray(arr) ? arr : [];
519
- } catch {
573
+ } catch (e) {
520
574
  return [];
521
575
  }
522
576
  };
523
577
  const getSanitizedName = (rawName, transformedTools) => {
578
+ var _a, _b;
524
579
  if (typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName)) {
525
- return transformedTools[Number(rawName)]?.name ?? rawName;
580
+ return (_b = (_a = transformedTools[Number(rawName)]) == null ? void 0 : _a.name) != null ? _b : rawName;
526
581
  }
527
582
  return rawName;
528
583
  };
@@ -532,25 +587,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
532
587
  }
533
588
  try {
534
589
  return JSON.parse(extractedArgs);
535
- } catch {
590
+ } catch (e) {
536
591
  return extractedArgs;
537
592
  }
538
593
  };
539
594
  const restoreToolCalls = (toolCalls, nameMap, transformedTools) => (toolCalls || []).map((c) => {
595
+ var _a, _b, _c, _d, _e, _f;
540
596
  const call = c;
541
- const rawName = call.toolName ?? call.name;
597
+ const rawName = (_a = call.toolName) != null ? _a : call.name;
542
598
  const sanitizedFromIndex = getSanitizedName(
543
599
  rawName,
544
600
  transformedTools
545
601
  );
546
- const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
547
- const extractedArgs = call.args ?? call.arguments ?? call.input ?? call.params ?? call.parameters;
602
+ const originalName = (_b = nameMap.get(sanitizedFromIndex)) != null ? _b : sanitizedFromIndex;
603
+ const extractedArgs = (_f = (_e = (_d = (_c = call.args) != null ? _c : call.arguments) != null ? _d : call.input) != null ? _e : call.params) != null ? _f : call.parameters;
548
604
  const parsedArgs = parseToolArgs(extractedArgs);
549
605
  return {
550
606
  ...call,
551
607
  toolName: originalName,
552
608
  name: originalName,
553
- args: parsedArgs ?? {}
609
+ args: parsedArgs != null ? parsedArgs : {}
554
610
  };
555
611
  });
556
612
  const summarizeArgs = (args) => {
@@ -582,7 +638,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
582
638
  return `- expected one of: ${formatted}`;
583
639
  })();
584
640
  diffLines.push(expectedLine);
585
- diffLines.push(`+ got: ${JSON.stringify(got)}`);
641
+ diffLines.push(`+ got: ${JSON.stringify(got)}`);
586
642
  return diffLines;
587
643
  };
588
644
  const paramValueMatches = (allowed, got) => {
@@ -594,7 +650,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
594
650
  if (Array.isArray(got)) {
595
651
  return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
596
652
  }
597
- } catch {
653
+ } catch (e) {
598
654
  }
599
655
  return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
600
656
  });
@@ -632,13 +688,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
632
688
  }
633
689
  };
634
690
  const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
691
+ var _a, _b, _c, _d;
635
692
  const funcDesc = tools[0];
636
- const gt = possibleAnswer.ground_truth?.[0];
637
- const expectedFuncName = funcDesc?.name;
693
+ const gt = (_a = possibleAnswer.ground_truth) == null ? void 0 : _a[0];
694
+ const expectedFuncName = funcDesc == null ? void 0 : funcDesc.name;
638
695
  const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
639
696
  const received = restoredCalls[0];
640
- const receivedName = received?.toolName ?? received?.name;
641
- const receivedArgs = summarizeArgs(received?.args);
697
+ const receivedName = (_b = received == null ? void 0 : received.toolName) != null ? _b : received == null ? void 0 : received.name;
698
+ const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
642
699
  const expected = {
643
700
  function: expectedFuncName,
644
701
  params: expectedParams
@@ -650,7 +707,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
650
707
  const diff = [];
651
708
  checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
652
709
  if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
653
- const required = funcDesc?.parameters?.required ?? [];
710
+ const required = (_d = (_c = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _c.required) != null ? _d : [];
654
711
  checkMissingParams(
655
712
  required,
656
713
  receivedArgs,
@@ -687,12 +744,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
687
744
  }
688
745
  };
689
746
  const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
747
+ var _a;
690
748
  for (let i = 0; i < restoredCalls.length; i += 1) {
691
749
  if (usedActual.has(i)) {
692
750
  continue;
693
751
  }
694
752
  const rc = restoredCalls[i];
695
- const rcName = rc?.toolName ?? rc?.name;
753
+ const rcName = (_a = rc == null ? void 0 : rc.toolName) != null ? _a : rc == null ? void 0 : rc.name;
696
754
  if (rcName === fname) {
697
755
  return i;
698
756
  }
@@ -706,6 +764,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
706
764
  checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
707
765
  };
708
766
  const processExpectedCall = (options) => {
767
+ var _a, _b;
709
768
  const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
710
769
  const fname = Object.keys(expectedObj)[0];
711
770
  const matchedIndex = findMatchingCallIndex(
@@ -718,10 +777,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
718
777
  }
719
778
  usedActual.add(matchedIndex);
720
779
  const received = restoredCalls[matchedIndex];
721
- const receivedArgs = summarizeArgs(received?.args);
780
+ const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
722
781
  const expectedParamsAllowed = expectedObj[fname];
723
782
  const funcDesc = tools.find((t) => t.name === fname);
724
- const requiredParams = funcDesc?.parameters?.required ?? [];
783
+ const requiredParams = (_b = (_a = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _a.required) != null ? _b : [];
725
784
  diff.push(`@@ function ${fname}`);
726
785
  if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
727
786
  validateFunctionParams({
@@ -733,10 +792,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
733
792
  }
734
793
  };
735
794
  const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
736
- const gtArr = possibleAnswer.ground_truth ?? [];
795
+ var _a;
796
+ const gtArr = (_a = possibleAnswer.ground_truth) != null ? _a : [];
737
797
  const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
738
798
  const actualNames = restoredCalls.map(
739
- (c) => c.toolName ?? c.name
799
+ (c) => {
800
+ var _a2;
801
+ return (_a2 = c.toolName) != null ? _a2 : c.name;
802
+ }
740
803
  );
741
804
  const expected = {
742
805
  functions: expectedNames
@@ -762,14 +825,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
762
825
  return { expected, actual, diff };
763
826
  };
764
827
  const concurrencyEnv = process.env.BFCL_CONCURRENCY;
765
- const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
828
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 16;
766
829
  logs.push(
767
830
  `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
768
831
  );
769
832
  const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
833
+ var _a, _b, _c, _d;
770
834
  try {
771
835
  const firstTool = transformedTools[0];
772
- const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
836
+ const schemaType = (_d = (_a = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _a.type) != null ? _d : (_c = (_b = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _b.jsonSchema) == null ? void 0 : _c.type;
773
837
  caseLogs.push(
774
838
  `[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
775
839
  );
@@ -785,49 +849,103 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
785
849
  caseLogs.push(
786
850
  `[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
787
851
  );
788
- } catch {
852
+ } catch (e) {
789
853
  caseLogs.push(
790
854
  `[DEBUG] ${testCaseId}: failed to serialize toolCalls`
791
855
  );
792
856
  }
793
857
  };
794
- const buildFailureContext = (options) => {
795
- const {
796
- testCase,
797
- tools,
798
- flatMessages,
799
- mwOriginalText,
800
- text,
801
- finishReason,
802
- mwParsedToolCalls,
803
- restoredCalls,
804
- possibleAnswer
805
- } = options;
806
- const lastUser = (() => {
807
- const reversed = [...flatMessages].reverse();
808
- const found = reversed.find(
809
- (m) => m.role === "user"
810
- );
811
- return found?.content ?? void 0;
812
- })();
813
- const rawModelText = (() => {
814
- if (mwOriginalText && mwOriginalText.length > 0) {
815
- return mwOriginalText;
858
+ const hasPercentPattern = (diff) => {
859
+ return diff.some((d) => {
860
+ if (!(d.startsWith("+ got:") || d.startsWith("- expected:"))) {
861
+ return false;
816
862
  }
817
- if (typeof text === "string") {
818
- return text;
863
+ const numMatch = d.match(DIFF_NUMERIC_EXTRACT_REGEX);
864
+ if (!numMatch) {
865
+ return false;
819
866
  }
820
- return "";
821
- })();
822
- return {
823
- id: testCase.id,
824
- tool_schema: tools,
825
- last_user_query: lastUser,
826
- raw_model_text: rawModelText,
827
- finish_reason: finishReason,
828
- parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
829
- ground_truth: possibleAnswer.ground_truth
830
- };
867
+ const num = Number.parseFloat(numMatch[1]);
868
+ return num >= 1 && num <= 100;
869
+ });
870
+ };
871
+ const isValueError = (errorType, diff) => {
872
+ return !!(errorType == null ? void 0 : errorType.includes("value_error")) || diff.some((d) => d.startsWith("@@ param"));
873
+ };
874
+ const isFunctionNameError = (errorType, diff) => {
875
+ return !!(errorType == null ? void 0 : errorType.includes("wrong_func_name")) || diff.some((d) => d.includes("function name"));
876
+ };
877
+ const isMissingParamError = (errorType, diff) => {
878
+ return !!(errorType == null ? void 0 : errorType.includes("missing_required")) || diff.some((d) => d.includes("missing required param"));
879
+ };
880
+ const isUnexpectedParamError = (errorType, diff) => {
881
+ return !!(errorType == null ? void 0 : errorType.includes("unexpected_param")) || diff.some((d) => d.includes("unexpected param"));
882
+ };
883
+ const classifyByErrorPatterns = (errorType, diff) => {
884
+ const patterns = [
885
+ [
886
+ isValueError,
887
+ hasPercentPattern(diff) ? "PARAM_VALUE_PERCENT" : "PARAM_VALUE_MISMATCH"
888
+ ],
889
+ [isFunctionNameError, "WRONG_FUNCTION"],
890
+ [isMissingParamError, "MISSING_PARAMS"],
891
+ [isUnexpectedParamError, "UNEXPECTED_PARAMS"]
892
+ ];
893
+ for (const [classifier, result] of patterns) {
894
+ if (classifier(errorType, diff)) {
895
+ return result;
896
+ }
897
+ }
898
+ if (errorType == null ? void 0 : errorType.includes("cannot_find_match")) {
899
+ return "NO_MATCH";
900
+ }
901
+ return null;
902
+ };
903
+ const classifyByCallCount = (actualCount, expectedCount) => {
904
+ if (actualCount === 0 && expectedCount > 0) {
905
+ return "PARSE_FAILURE";
906
+ }
907
+ if (actualCount > 0 && actualCount < expectedCount) {
908
+ return "PARTIAL_CALLS";
909
+ }
910
+ if (actualCount > expectedCount) {
911
+ return "EXTRA_CALLS";
912
+ }
913
+ return null;
914
+ };
915
+ const classifyFailureType = (options) => {
916
+ const { errorType, restoredCalls, expectedCount, diff } = options;
917
+ const actualCount = Array.isArray(restoredCalls) ? restoredCalls.length : 0;
918
+ const countBasedResult = classifyByCallCount(
919
+ actualCount,
920
+ expectedCount
921
+ );
922
+ if (countBasedResult) {
923
+ return countBasedResult;
924
+ }
925
+ const patternBasedResult = classifyByErrorPatterns(errorType, diff);
926
+ if (patternBasedResult) {
927
+ return patternBasedResult;
928
+ }
929
+ return "OTHER";
930
+ };
931
+ const extractRawModelText = (mwOriginalText, text) => {
932
+ if (mwOriginalText && mwOriginalText.length > 0) {
933
+ return mwOriginalText;
934
+ }
935
+ if (typeof text === "string") {
936
+ return text;
937
+ }
938
+ return "";
939
+ };
940
+ const extractLastUserQuery = (flatMessages) => {
941
+ var _a;
942
+ const reversed = [...flatMessages].reverse();
943
+ const found = reversed.find((m) => m.role === "user");
944
+ const content = (_a = found == null ? void 0 : found.content) != null ? _a : "";
945
+ return content.length > 200 ? `${content.slice(0, 200)}...` : content;
946
+ };
947
+ const truncateText = (text, maxLen) => {
948
+ return text.length > maxLen ? `${text.slice(0, maxLen)}...` : text;
831
949
  };
832
950
  const logFailureDetails = (options) => {
833
951
  const {
@@ -845,43 +963,37 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
845
963
  } = options;
846
964
  try {
847
965
  const category = testCase.id.split("_")[0];
848
- const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(
849
- tools,
850
- possibleAnswer,
851
- restoredCalls
852
- ) : buildParallelDiff(
853
- tools,
854
- possibleAnswer,
855
- restoredCalls
856
- );
857
- caseLogs.push(
858
- `[DEBUG-FAIL] ${JSON.stringify({
859
- id: testCase.id,
860
- message: checkerResult.error,
861
- error_type: checkerResult.error_type,
862
- expected,
863
- actual,
864
- diff
865
- })}`
866
- );
867
- try {
868
- const contextPayload = buildFailureContext({
869
- testCase,
870
- tools,
871
- flatMessages,
872
- mwOriginalText,
873
- text,
874
- finishReason,
875
- mwParsedToolCalls,
966
+ const { expected, actual, diff } = category === "simple" ? buildSimpleDiff(tools, possibleAnswer, restoredCalls) : buildParallelDiff(tools, possibleAnswer, restoredCalls);
967
+ const gtArr = possibleAnswer.ground_truth;
968
+ const expectedCount = Array.isArray(gtArr) ? gtArr.length : 1;
969
+ const rawModelText = extractRawModelText(mwOriginalText, text);
970
+ const lastUserQuery = extractLastUserQuery(flatMessages);
971
+ const failurePayload = {
972
+ id: testCase.id,
973
+ category: classifyFailureType({
974
+ errorType: checkerResult.error_type,
876
975
  restoredCalls,
877
- possibleAnswer
878
- });
879
- caseLogs.push(
880
- `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
881
- );
882
- } catch {
883
- }
884
- } catch {
976
+ expectedCount,
977
+ diff
978
+ }),
979
+ message: checkerResult.error,
980
+ error_type: checkerResult.error_type,
981
+ expected,
982
+ actual,
983
+ diff,
984
+ context: {
985
+ raw_model_text: truncateText(rawModelText, 500),
986
+ raw_model_text_full: rawModelText.length > 500 ? rawModelText : void 0,
987
+ parsed_tool_calls: mwParsedToolCalls.length ? mwParsedToolCalls : restoredCalls,
988
+ expected_count: expectedCount,
989
+ actual_count: Array.isArray(restoredCalls) ? restoredCalls.length : 0,
990
+ finish_reason: finishReason,
991
+ last_user_query: lastUserQuery,
992
+ tool_names: tools.map((t) => t.name)
993
+ }
994
+ };
995
+ caseLogs.push(`[DEBUG-FAIL] ${JSON.stringify(failurePayload)}`);
996
+ } catch (e) {
885
997
  caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
886
998
  }
887
999
  };
@@ -960,7 +1072,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
960
1072
  const flatMessages = flattenMessages(messages);
961
1073
  const { transformedTools, nameMap } = buildTransformedTools(
962
1074
  tools,
963
- fixSchema
1075
+ fixSchema2
964
1076
  );
965
1077
  const toolsMap = buildToolsMap(transformedTools);
966
1078
  return { flatMessages, transformedTools, nameMap, toolsMap };
@@ -982,6 +1094,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
982
1094
  const mwParsedToolCalls = parseDebugToolCalls(
983
1095
  debugSummaryRef.toolCalls
984
1096
  );
1097
+ const possibleAnswer = possibleAnswersMap.get(testCase.id);
1098
+ if (!possibleAnswer) {
1099
+ throw new Error(`No possible answer for id: ${testCase.id}`);
1100
+ }
1101
+ if (process.env.DEBUG_PARSER_OUTPUT === "true") {
1102
+ const groundTruth = possibleAnswer.ground_truth;
1103
+ const expectedXML = groundTruth.map((call) => convertGroundTruthToXML(call)).join("\n\n");
1104
+ console.log("\n========== BFCL CASE DEBUG ==========");
1105
+ console.log(`Test Case: ${testCase.id}`);
1106
+ console.log(`Expected count: ${groundTruth.length} call(s)`);
1107
+ console.log("\n--- EXPECTED OUTPUT (morphXML format) ---");
1108
+ console.log(expectedXML);
1109
+ console.log("\n--- ACTUAL MODEL OUTPUT (raw, with whitespace) ---");
1110
+ console.log(mwOriginalText || text || "(empty)");
1111
+ console.log(
1112
+ "\n--- PARSED TOOL CALLS (count: " + (Array.isArray(toolCalls) ? toolCalls.length : 0) + ") ---"
1113
+ );
1114
+ console.log(JSON.stringify(toolCalls, null, 2));
1115
+ console.log("======================================\n");
1116
+ }
985
1117
  logRawToolCalls({
986
1118
  toolCalls,
987
1119
  finishReason,
@@ -989,10 +1121,6 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
989
1121
  testCaseId: testCase.id,
990
1122
  caseLogs
991
1123
  });
992
- const possibleAnswer = possibleAnswersMap.get(testCase.id);
993
- if (!possibleAnswer) {
994
- throw new Error(`No possible answer for id: ${testCase.id}`);
995
- }
996
1124
  const restoredCalls = restoreToolCalls(
997
1125
  toolCalls || [],
998
1126
  nameMap,
@@ -1013,12 +1141,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1013
1141
  caseLogs
1014
1142
  });
1015
1143
  };
1016
- const runSingleCase = async (testCase) => {
1144
+ const runSingleCase2 = async (testCase) => {
1017
1145
  const caseLogs = [];
1018
1146
  const { function: tools } = testCase;
1019
- const temp = config?.temperature;
1147
+ const temp = config == null ? void 0 : config.temperature;
1020
1148
  const temperature = typeof temp === "number" ? temp : void 0;
1021
- const maxTok = config?.maxTokens;
1149
+ const maxTok = config == null ? void 0 : config.maxTokens;
1022
1150
  const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
1023
1151
  try {
1024
1152
  const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
@@ -1044,15 +1172,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1044
1172
  });
1045
1173
  } catch (e) {
1046
1174
  caseLogs.push(
1047
- `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
1175
+ `[ERROR] ${testCase.id}: Model generation failed: ${e == null ? void 0 : e.message}`
1048
1176
  );
1049
- if (e?.stack) {
1177
+ if (e == null ? void 0 : e.stack) {
1050
1178
  caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
1051
1179
  }
1052
1180
  return { valid: false, logs: caseLogs };
1053
1181
  }
1054
1182
  };
1055
- const mapWithConcurrency = async (items, concurrencyLimit, mapper) => {
1183
+ const mapWithConcurrency2 = async (items, concurrencyLimit, mapper) => {
1056
1184
  const results = new Array(items.length);
1057
1185
  let idx = 0;
1058
1186
  const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
@@ -1068,10 +1196,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1068
1196
  await Promise.all(workers);
1069
1197
  return results;
1070
1198
  };
1071
- const resultsPerCase = await mapWithConcurrency(
1199
+ const resultsPerCase = await mapWithConcurrency2(
1072
1200
  testCases,
1073
1201
  concurrency,
1074
- async (tc) => runSingleCase(tc)
1202
+ async (tc) => runSingleCase2(tc)
1075
1203
  );
1076
1204
  correctCount = resultsPerCase.reduce(
1077
1205
  (acc, r) => acc + (r.valid ? 1 : 0),
@@ -1089,14 +1217,18 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1089
1217
  };
1090
1218
  }
1091
1219
  const score = correctCount / testCases.length;
1220
+ const caseResults = resultsPerCase.map((r, i) => ({
1221
+ id: testCases[i].id,
1222
+ valid: r.valid
1223
+ }));
1092
1224
  return {
1093
1225
  score,
1094
1226
  success: score > 0.95,
1095
- // High success threshold as requested
1096
1227
  metrics: {
1097
1228
  correct_count: correctCount,
1098
1229
  total_cases: testCases.length,
1099
- accuracy: score
1230
+ accuracy: score,
1231
+ case_results: JSON.stringify(caseResults)
1100
1232
  },
1101
1233
  logs
1102
1234
  };
@@ -1116,42 +1248,414 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
1116
1248
  }
1117
1249
  var bfclSimpleBenchmark = createBfclBenchmark(
1118
1250
  "bfcl-simple",
1119
- "BFCL Simple Function Calling",
1120
- "BFCL_v3_simple.jsonl",
1121
- "BFCL_v3_simple_possible_answer.jsonl"
1251
+ "BFCL v4 Simple Function Calling",
1252
+ "BFCL_v4_simple.jsonl",
1253
+ "BFCL_v4_simple_possible_answer.jsonl"
1122
1254
  );
1123
1255
  var bfclParallelBenchmark = createBfclBenchmark(
1124
1256
  "bfcl-parallel",
1125
- "BFCL Parallel Function Calling",
1126
- "BFCL_v3_parallel.jsonl",
1127
- "BFCL_v3_parallel_possible_answer.jsonl"
1257
+ "BFCL v4 Parallel Function Calling",
1258
+ "BFCL_v4_parallel.jsonl",
1259
+ "BFCL_v4_parallel_possible_answer.jsonl"
1128
1260
  );
1129
1261
  var bfclMultipleBenchmark = createBfclBenchmark(
1130
1262
  "bfcl-multiple",
1131
- "BFCL Multiple Function Calling",
1132
- "BFCL_v3_multiple.jsonl",
1133
- "BFCL_v3_multiple_possible_answer.jsonl"
1263
+ "BFCL v4 Multiple Function Calling",
1264
+ "BFCL_v4_multiple.jsonl",
1265
+ "BFCL_v4_multiple_possible_answer.jsonl"
1134
1266
  );
1135
1267
  var bfclParallelMultipleBenchmark = createBfclBenchmark(
1136
1268
  "bfcl-parallel-multiple",
1137
- "BFCL Parallel & Multiple Function Calling",
1138
- "BFCL_v3_parallel_multiple.jsonl",
1139
- "BFCL_v3_parallel_multiple_possible_answer.jsonl"
1269
+ "BFCL v4 Parallel & Multiple Function Calling",
1270
+ "BFCL_v4_parallel_multiple.jsonl",
1271
+ "BFCL_v4_parallel_multiple_possible_answer.jsonl"
1140
1272
  );
1141
1273
 
1142
- // src/benchmarks/json-generation.ts
1274
+ // src/benchmarks/complex-func-bench.ts
1143
1275
  import { promises as fs3 } from "fs";
1144
1276
  import path3 from "path";
1145
- import { generateText as generateText2 } from "ai";
1277
+ import {
1278
+ generateText as generateText2,
1279
+ jsonSchema as jsonSchema2,
1280
+ tool as tool2
1281
+ } from "ai";
1282
+ var LINE_SPLIT_REGEX2 = /\r?\n/;
1283
+ function standardizeString2(input) {
1284
+ if (typeof input !== "string") {
1285
+ return input;
1286
+ }
1287
+ return input.toLowerCase().trim();
1288
+ }
1289
+ function valuesMatch2(modelValue, expectedValue) {
1290
+ if (modelValue === expectedValue) {
1291
+ return true;
1292
+ }
1293
+ if (typeof modelValue === "string" && typeof expectedValue === "string") {
1294
+ return standardizeString2(modelValue) === standardizeString2(expectedValue);
1295
+ }
1296
+ if (typeof modelValue === "number" && typeof expectedValue === "string") {
1297
+ return modelValue.toString() === expectedValue || modelValue === Number(expectedValue);
1298
+ }
1299
+ if (typeof modelValue === "string" && typeof expectedValue === "number") {
1300
+ return modelValue === expectedValue.toString() || Number(modelValue) === expectedValue;
1301
+ }
1302
+ if (typeof modelValue === "object" && modelValue !== null && typeof expectedValue === "object" && expectedValue !== null) {
1303
+ try {
1304
+ return JSON.stringify(modelValue) === JSON.stringify(expectedValue);
1305
+ } catch (e) {
1306
+ return false;
1307
+ }
1308
+ }
1309
+ return false;
1310
+ }
1311
+ function validateFunctionName(modelFuncName, expectedFuncName) {
1312
+ if (modelFuncName !== expectedFuncName) {
1313
+ return {
1314
+ valid: false,
1315
+ error: `Function name mismatch: expected '${expectedFuncName}', got '${modelFuncName}'`,
1316
+ error_type: "function_name_mismatch"
1317
+ };
1318
+ }
1319
+ return { valid: true };
1320
+ }
1321
+ function validateRequiredParams(requiredParams, modelArgs, expectedArgs) {
1322
+ for (const param of requiredParams) {
1323
+ if (!(param in modelArgs) && param in expectedArgs) {
1324
+ return {
1325
+ valid: false,
1326
+ error: `Missing required parameter: '${param}'`,
1327
+ error_type: "missing_required_param"
1328
+ };
1329
+ }
1330
+ }
1331
+ return { valid: true };
1332
+ }
1333
+ function validateParamValues(expectedArgs, modelArgs, requiredParams) {
1334
+ for (const [paramName, expectedValue] of Object.entries(expectedArgs)) {
1335
+ if (!(paramName in modelArgs)) {
1336
+ if (!requiredParams.includes(paramName)) {
1337
+ continue;
1338
+ }
1339
+ return {
1340
+ valid: false,
1341
+ error: `Missing parameter: '${paramName}'`,
1342
+ error_type: "missing_param"
1343
+ };
1344
+ }
1345
+ const modelValue = modelArgs[paramName];
1346
+ if (!valuesMatch2(modelValue, expectedValue)) {
1347
+ return {
1348
+ valid: false,
1349
+ error: `Parameter '${paramName}' value mismatch: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(modelValue)}`,
1350
+ error_type: "value_mismatch"
1351
+ };
1352
+ }
1353
+ }
1354
+ return { valid: true };
1355
+ }
1356
+ function checkFunctionCall(modelCall, expected, toolSpecs) {
1357
+ var _a, _b, _c, _d;
1358
+ const expectedFuncName = Object.keys(expected)[0];
1359
+ const expectedArgs = expected[expectedFuncName];
1360
+ const modelFuncName = (_a = modelCall.toolName) != null ? _a : modelCall.name;
1361
+ const modelArgs = (_b = modelCall.args) != null ? _b : {};
1362
+ const nameResult = validateFunctionName(modelFuncName, expectedFuncName);
1363
+ if (!nameResult.valid) {
1364
+ return nameResult;
1365
+ }
1366
+ const toolSpec = toolSpecs.find((t) => t.name === expectedFuncName);
1367
+ const requiredParams = (_d = (_c = toolSpec == null ? void 0 : toolSpec.parameters) == null ? void 0 : _c.required) != null ? _d : [];
1368
+ const requiredResult = validateRequiredParams(
1369
+ requiredParams,
1370
+ modelArgs,
1371
+ expectedArgs
1372
+ );
1373
+ if (!requiredResult.valid) {
1374
+ return requiredResult;
1375
+ }
1376
+ return validateParamValues(expectedArgs, modelArgs, requiredParams);
1377
+ }
1378
+ function checkAllFunctionCalls(modelCalls, expectedCalls, toolSpecs) {
1379
+ if (modelCalls.length !== expectedCalls.length) {
1380
+ return {
1381
+ valid: false,
1382
+ error: `Wrong number of function calls: expected ${expectedCalls.length}, got ${modelCalls.length}`,
1383
+ error_type: "wrong_call_count"
1384
+ };
1385
+ }
1386
+ if (expectedCalls.length === 1) {
1387
+ return checkFunctionCall(modelCalls[0], expectedCalls[0], toolSpecs);
1388
+ }
1389
+ const matchedIndices = /* @__PURE__ */ new Set();
1390
+ for (const expected of expectedCalls) {
1391
+ let foundMatch = false;
1392
+ for (let i = 0; i < modelCalls.length; i++) {
1393
+ if (matchedIndices.has(i)) {
1394
+ continue;
1395
+ }
1396
+ const result = checkFunctionCall(modelCalls[i], expected, toolSpecs);
1397
+ if (result.valid) {
1398
+ matchedIndices.add(i);
1399
+ foundMatch = true;
1400
+ break;
1401
+ }
1402
+ }
1403
+ if (!foundMatch) {
1404
+ const expectedFuncName = Object.keys(expected)[0];
1405
+ return {
1406
+ valid: false,
1407
+ error: `Could not find matching call for function '${expectedFuncName}'`,
1408
+ error_type: "no_matching_call"
1409
+ };
1410
+ }
1411
+ }
1412
+ return { valid: true };
1413
+ }
1414
+ var fixSchemaType = (copy) => {
1415
+ if (!copy.type) {
1416
+ return;
1417
+ }
1418
+ if (copy.type === "dict") {
1419
+ copy.type = "object";
1420
+ }
1421
+ if (copy.type === "tuple") {
1422
+ copy.type = "array";
1423
+ }
1424
+ if (copy.type === "integer" || copy.type === "float") {
1425
+ copy.type = "number";
1426
+ }
1427
+ };
1428
+ var fixSchema = (schema) => {
1429
+ if (!schema || typeof schema !== "object") {
1430
+ return { type: "object", properties: {} };
1431
+ }
1432
+ const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
1433
+ if (!Array.isArray(copy)) {
1434
+ fixSchemaType(copy);
1435
+ if (copy.properties && typeof copy.properties === "object") {
1436
+ for (const k of Object.keys(copy.properties)) {
1437
+ copy.properties[k] = fixSchema(
1438
+ copy.properties[k]
1439
+ );
1440
+ }
1441
+ }
1442
+ if (copy.items) {
1443
+ copy.items = fixSchema(copy.items);
1444
+ }
1445
+ }
1446
+ return copy;
1447
+ };
1448
+ function buildTools(tools) {
1449
+ const nameMap = /* @__PURE__ */ new Map();
1450
+ const transformedTools = tools.map((t) => {
1451
+ const fixed = fixSchema(t.parameters);
1452
+ const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
1453
+ const sanitized = t.name.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64) || "tool";
1454
+ nameMap.set(sanitized, t.name);
1455
+ return {
1456
+ type: "function",
1457
+ name: sanitized,
1458
+ description: t.description,
1459
+ inputSchema
1460
+ };
1461
+ });
1462
+ const toolsMap = Object.fromEntries(
1463
+ transformedTools.map((t) => [
1464
+ t.name,
1465
+ tool2({
1466
+ description: typeof t.description === "string" ? t.description : void 0,
1467
+ inputSchema: jsonSchema2(t.inputSchema)
1468
+ })
1469
+ ])
1470
+ );
1471
+ return { nameMap, toolsMap };
1472
+ }
1473
+ async function mapWithConcurrency(items, concurrencyLimit, mapper) {
1474
+ const results = new Array(items.length);
1475
+ let idx = 0;
1476
+ const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
1477
+ while (true) {
1478
+ const current = idx;
1479
+ idx += 1;
1480
+ if (current >= items.length) {
1481
+ break;
1482
+ }
1483
+ results[current] = await mapper(items[current]);
1484
+ }
1485
+ });
1486
+ await Promise.all(workers);
1487
+ return results;
1488
+ }
1489
+ async function runSingleCase(testCase, model, possibleAnswersMap, temperature, maxTokens) {
1490
+ const caseLogs = [];
1491
+ const { function: tools, question: messages } = testCase;
1492
+ try {
1493
+ const { nameMap, toolsMap } = buildTools(tools);
1494
+ const debugSummaryRef = {};
1495
+ const providerOptions = {
1496
+ toolCallMiddleware: { debugSummary: debugSummaryRef }
1497
+ };
1498
+ const { toolCalls, finishReason } = await generateText2({
1499
+ model,
1500
+ messages,
1501
+ tools: toolsMap,
1502
+ toolChoice: "auto",
1503
+ providerOptions,
1504
+ ...temperature !== void 0 ? { temperature } : {},
1505
+ ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
1506
+ });
1507
+ const restoredCalls = (toolCalls != null ? toolCalls : []).map((c) => {
1508
+ var _a, _b, _c, _d;
1509
+ const rawName = (_a = c.toolName) != null ? _a : c.name;
1510
+ const originalName = (_b = nameMap.get(rawName)) != null ? _b : rawName;
1511
+ return {
1512
+ toolName: originalName,
1513
+ name: originalName,
1514
+ args: (_d = (_c = c.input) != null ? _c : c.args) != null ? _d : {}
1515
+ };
1516
+ });
1517
+ caseLogs.push(
1518
+ `[DEBUG] ${testCase.id}: toolCalls=${JSON.stringify(restoredCalls)}, finishReason=${finishReason}`
1519
+ );
1520
+ const possibleAnswer = possibleAnswersMap.get(testCase.id);
1521
+ if (!possibleAnswer) {
1522
+ throw new Error(`No possible answer for id: ${testCase.id}`);
1523
+ }
1524
+ const checkerResult = checkAllFunctionCalls(
1525
+ restoredCalls,
1526
+ possibleAnswer.ground_truth,
1527
+ tools
1528
+ );
1529
+ if (checkerResult.valid) {
1530
+ caseLogs.push(`[PASS] ${testCase.id}`);
1531
+ return { valid: true, logs: caseLogs };
1532
+ }
1533
+ caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
1534
+ return { valid: false, logs: caseLogs };
1535
+ } catch (e) {
1536
+ caseLogs.push(`[ERROR] ${testCase.id}: ${e == null ? void 0 : e.message}`);
1537
+ return { valid: false, logs: caseLogs };
1538
+ }
1539
+ }
1540
+ async function loadTestData(dataPath, testDataFile) {
1541
+ const testCasesJson = await fs3.readFile(
1542
+ path3.join(dataPath, testDataFile),
1543
+ "utf-8"
1544
+ );
1545
+ return testCasesJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1546
+ }
1547
+ async function loadAnswerData(dataPath, answerDataFile) {
1548
+ const answersJson = await fs3.readFile(
1549
+ path3.join(dataPath, answerDataFile),
1550
+ "utf-8"
1551
+ );
1552
+ const answers = answersJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1553
+ return new Map(answers.map((ans) => [ans.id, ans]));
1554
+ }
1555
+ function getConfigValues(config) {
1556
+ const limitEnv = process.env.COMPLEXFUNCBENCH_LIMIT;
1557
+ const limit = limitEnv ? Number(limitEnv) : void 0;
1558
+ const concurrencyEnv = process.env.COMPLEXFUNCBENCH_CONCURRENCY;
1559
+ const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
1560
+ const temperature = typeof (config == null ? void 0 : config.temperature) === "number" ? config.temperature : void 0;
1561
+ const maxTokens = typeof (config == null ? void 0 : config.maxTokens) === "number" ? config.maxTokens : void 0;
1562
+ return { limit, concurrency, temperature, maxTokens };
1563
+ }
1564
+ function aggregateResults(resultsPerCase, testCases) {
1565
+ const logs = [];
1566
+ const correctCount = resultsPerCase.reduce(
1567
+ (acc, r) => acc + (r.valid ? 1 : 0),
1568
+ 0
1569
+ );
1570
+ for (const r of resultsPerCase) {
1571
+ logs.push(...r.logs);
1572
+ }
1573
+ if (testCases.length === 0) {
1574
+ return {
1575
+ score: 0,
1576
+ success: false,
1577
+ metrics: {},
1578
+ logs: ["No test cases found."]
1579
+ };
1580
+ }
1581
+ const score = correctCount / testCases.length;
1582
+ return {
1583
+ score,
1584
+ success: score > 0.5,
1585
+ metrics: {
1586
+ correct_count: correctCount,
1587
+ total_cases: testCases.length,
1588
+ accuracy: score
1589
+ },
1590
+ logs
1591
+ };
1592
+ }
1593
+ function createComplexFuncBenchBenchmark(name, description, testDataFile, answerDataFile) {
1594
+ return {
1595
+ name,
1596
+ version: "1.0.0",
1597
+ description,
1598
+ async run(model, config) {
1599
+ var _a;
1600
+ const logs = [];
1601
+ try {
1602
+ const dataPath = resolveDataDir();
1603
+ logs.push(`[INFO] Using data dir: ${dataPath}`);
1604
+ let testCases = await loadTestData(dataPath, testDataFile);
1605
+ const possibleAnswersMap = await loadAnswerData(
1606
+ dataPath,
1607
+ answerDataFile
1608
+ );
1609
+ const { limit, concurrency, temperature, maxTokens } = getConfigValues(config);
1610
+ if (limit && Number.isFinite(limit) && limit > 0) {
1611
+ testCases = testCases.slice(0, limit);
1612
+ logs.push(`[INFO] Limiting test cases to ${limit}`);
1613
+ }
1614
+ logs.push(
1615
+ `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
1616
+ );
1617
+ const resultsPerCase = await mapWithConcurrency(
1618
+ testCases,
1619
+ concurrency,
1620
+ (tc) => runSingleCase(tc, model, possibleAnswersMap, temperature, maxTokens)
1621
+ );
1622
+ const result = aggregateResults(resultsPerCase, testCases);
1623
+ result.logs = [...logs, ...(_a = result.logs) != null ? _a : []];
1624
+ return result;
1625
+ } catch (e) {
1626
+ return {
1627
+ score: 0,
1628
+ success: false,
1629
+ metrics: {},
1630
+ error: e,
1631
+ logs: [
1632
+ `[FATAL] Failed to run benchmark ${name}: ${e.message}`
1633
+ ]
1634
+ };
1635
+ }
1636
+ }
1637
+ };
1638
+ }
1639
+ var complexFuncBenchBenchmark = createComplexFuncBenchBenchmark(
1640
+ "complex-func-bench",
1641
+ "ComplexFuncBench - Complex Function Calling (multi-step, constraints, long params)",
1642
+ "ComplexFuncBench.jsonl",
1643
+ "ComplexFuncBench_possible_answer.jsonl"
1644
+ );
1645
+
1646
+ // src/benchmarks/json-generation.ts
1647
+ import { promises as fs4 } from "fs";
1648
+ import path4 from "path";
1649
+ import { generateText as generateText3 } from "ai";
1146
1650
  import Ajv from "ajv";
1147
1651
  var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
1148
1652
  var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
1149
1653
  var NEWLINE_REGEX = /\r?\n/;
1150
- var LINE_SPLIT_REGEX2 = /\r?\n/;
1654
+ var LINE_SPLIT_REGEX3 = /\r?\n/;
1151
1655
  function tryDirectParse(text) {
1152
1656
  try {
1153
1657
  return JSON.parse(text);
1154
- } catch {
1658
+ } catch (e) {
1155
1659
  return;
1156
1660
  }
1157
1661
  }
@@ -1163,7 +1667,7 @@ function tryCodeFenceParse(text) {
1163
1667
  const inner = fenceMatch[1].trim();
1164
1668
  try {
1165
1669
  return JSON.parse(inner);
1166
- } catch {
1670
+ } catch (e) {
1167
1671
  return;
1168
1672
  }
1169
1673
  }
@@ -1188,7 +1692,7 @@ function tryBracketScan(text) {
1188
1692
  const candidate = text.slice(start, i + 1);
1189
1693
  try {
1190
1694
  return JSON.parse(candidate);
1191
- } catch {
1695
+ } catch (e) {
1192
1696
  return;
1193
1697
  }
1194
1698
  }
@@ -1236,12 +1740,12 @@ function subsetMatch(expected, actual) {
1236
1740
  async function loadDatasets() {
1237
1741
  try {
1238
1742
  const dataDir = resolveDataDir();
1239
- const testsJsonl = await fs3.readFile(
1240
- path3.join(dataDir, "json_generation_tests.jsonl"),
1743
+ const testsJsonl = await fs4.readFile(
1744
+ path4.join(dataDir, "json_generation_tests.jsonl"),
1241
1745
  "utf-8"
1242
1746
  );
1243
- const expectedJsonl = await fs3.readFile(
1244
- path3.join(dataDir, "json_generation_expected.jsonl"),
1747
+ const expectedJsonl = await fs4.readFile(
1748
+ path4.join(dataDir, "json_generation_expected.jsonl"),
1245
1749
  "utf-8"
1246
1750
  );
1247
1751
  const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -1297,10 +1801,11 @@ function validateTestCase(tc, parsed, context) {
1297
1801
  return { valid, valuesOk, parsed };
1298
1802
  }
1299
1803
  async function processTestCase(tc, context) {
1804
+ var _a;
1300
1805
  const messages = buildMessages(tc);
1301
- const temp = context.config?.temperature;
1806
+ const temp = (_a = context.config) == null ? void 0 : _a.temperature;
1302
1807
  const temperature = typeof temp === "number" ? temp : void 0;
1303
- const { text } = await generateText2({
1808
+ const { text } = await generateText3({
1304
1809
  model: context.model,
1305
1810
  messages,
1306
1811
  ...temperature !== void 0 ? { temperature } : {}
@@ -1308,7 +1813,7 @@ async function processTestCase(tc, context) {
1308
1813
  let parsed;
1309
1814
  try {
1310
1815
  parsed = extractFirstJsonBlock(text);
1311
- } catch {
1816
+ } catch (e) {
1312
1817
  }
1313
1818
  if (parsed === void 0) {
1314
1819
  context.validation.logs.push(
@@ -1402,21 +1907,22 @@ function buildBenchmarkResult(total, counts, logs) {
1402
1907
  async function loadSchemaOnlyTests() {
1403
1908
  try {
1404
1909
  const dataDir = resolveDataDir();
1405
- const testsJsonl = await fs3.readFile(
1406
- path3.join(dataDir, "json_generation_tests.jsonl"),
1910
+ const testsJsonl = await fs4.readFile(
1911
+ path4.join(dataDir, "json_generation_tests.jsonl"),
1407
1912
  "utf-8"
1408
1913
  );
1409
- const tests = testsJsonl.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1914
+ const tests = testsJsonl.split(LINE_SPLIT_REGEX3).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
1410
1915
  return { tests };
1411
1916
  } catch (e) {
1412
1917
  return { tests: [], error: e };
1413
1918
  }
1414
1919
  }
1415
1920
  async function processSchemaOnlyTestCase(tc, context) {
1921
+ var _a;
1416
1922
  const messages = buildMessages(tc);
1417
- const temp = context.config?.temperature;
1923
+ const temp = (_a = context.config) == null ? void 0 : _a.temperature;
1418
1924
  const temperature = typeof temp === "number" ? temp : void 0;
1419
- const { text } = await generateText2({
1925
+ const { text } = await generateText3({
1420
1926
  model: context.model,
1421
1927
  messages,
1422
1928
  ...temperature !== void 0 ? { temperature } : {}
@@ -1424,7 +1930,7 @@ async function processSchemaOnlyTestCase(tc, context) {
1424
1930
  let parsed;
1425
1931
  try {
1426
1932
  parsed = extractFirstJsonBlock(text);
1427
- } catch {
1933
+ } catch (e) {
1428
1934
  }
1429
1935
  if (parsed === void 0) {
1430
1936
  context.logs.push(
@@ -1493,38 +1999,144 @@ var jsonGenerationSchemaOnlyBenchmark = {
1493
1999
  }
1494
2000
  };
1495
2001
 
2002
+ // src/evaluate.ts
2003
+ import { createDiskCacheMiddleware } from "@ai-sdk-tool/middleware";
2004
+ import { wrapLanguageModel } from "ai";
2005
+
1496
2006
  // src/reporters/console.ts
1497
2007
  var colors = {
1498
2008
  reset: "\x1B[0m",
2009
+ bold: "\x1B[1m",
1499
2010
  green: "\x1B[32m",
1500
2011
  red: "\x1B[31m",
1501
2012
  yellow: "\x1B[33m",
1502
2013
  cyan: "\x1B[36m",
1503
2014
  magenta: "\x1B[35m",
1504
- gray: "\x1B[90m"
2015
+ gray: "\x1B[90m",
2016
+ white: "\x1B[37m"
1505
2017
  };
2018
+ var DEBUG_FAIL_REGEX = /^\[DEBUG-FAIL\] /;
2019
+ function formatDiff(diff) {
2020
+ if (!diff || diff.length === 0) {
2021
+ return "";
2022
+ }
2023
+ return diff.slice(0, 8).map((line) => {
2024
+ if (line.startsWith("-")) {
2025
+ return `${colors.red}${line}${colors.reset}`;
2026
+ }
2027
+ if (line.startsWith("+")) {
2028
+ return `${colors.green}${line}${colors.reset}`;
2029
+ }
2030
+ if (line.startsWith("@@")) {
2031
+ return `${colors.cyan}${line}${colors.reset}`;
2032
+ }
2033
+ return line;
2034
+ }).join("\n ");
2035
+ }
2036
+ function parseFailures(logs) {
2037
+ const failures = [];
2038
+ for (const log of logs) {
2039
+ if (!DEBUG_FAIL_REGEX.test(log)) {
2040
+ continue;
2041
+ }
2042
+ try {
2043
+ const jsonStr = log.replace(DEBUG_FAIL_REGEX, "");
2044
+ const parsed = JSON.parse(jsonStr);
2045
+ failures.push(parsed);
2046
+ } catch (e) {
2047
+ }
2048
+ }
2049
+ return failures;
2050
+ }
2051
+ function groupFailuresByCategory(failures) {
2052
+ const groups = /* @__PURE__ */ new Map();
2053
+ for (const failure of failures) {
2054
+ const category = failure.category || "OTHER";
2055
+ const existing = groups.get(category);
2056
+ if (existing) {
2057
+ existing.push(failure);
2058
+ } else {
2059
+ groups.set(category, [failure]);
2060
+ }
2061
+ }
2062
+ return groups;
2063
+ }
2064
+ function printCompactFailure(failure) {
2065
+ var _a;
2066
+ console.log(
2067
+ `
2068
+ ${colors.red}${failure.id}${colors.reset} [${colors.yellow}${failure.category || "OTHER"}${colors.reset}]`
2069
+ );
2070
+ if (failure.message) {
2071
+ console.log(` ${failure.message}`);
2072
+ }
2073
+ if (failure.diff && failure.diff.length > 0) {
2074
+ console.log(` ${formatDiff(failure.diff)}`);
2075
+ }
2076
+ if (((_a = failure.context) == null ? void 0 : _a.raw_model_text) && failure.category === "PARSE_FAILURE") {
2077
+ const text = failure.context.raw_model_text;
2078
+ const truncated = text.length > 80 ? `${text.slice(0, 80)}...` : text;
2079
+ console.log(` ${colors.gray}Model: "${truncated}"${colors.reset}`);
2080
+ }
2081
+ }
2082
+ function printFailureSummary(failures) {
2083
+ const groups = groupFailuresByCategory(failures);
2084
+ const sorted = [...groups.entries()].sort(
2085
+ (a, b) => b[1].length - a[1].length
2086
+ );
2087
+ console.log(`
2088
+ ${colors.bold}Failures by category:${colors.reset}`);
2089
+ for (const [category, categoryFailures] of sorted) {
2090
+ console.log(
2091
+ ` ${colors.yellow}${category}${colors.reset}: ${categoryFailures.length}`
2092
+ );
2093
+ }
2094
+ const maxToShow = 5;
2095
+ const shown = failures.slice(0, maxToShow);
2096
+ for (const failure of shown) {
2097
+ printCompactFailure(failure);
2098
+ }
2099
+ if (failures.length > maxToShow) {
2100
+ const remaining = failures.length - maxToShow;
2101
+ const remainingIds = failures.slice(maxToShow).map((f) => f.id);
2102
+ const idPreview = remainingIds.slice(0, 5).join(", ");
2103
+ const more = remainingIds.length > 5 ? "..." : "";
2104
+ console.log(
2105
+ `
2106
+ ${colors.gray}+${remaining} more: ${idPreview}${more}${colors.reset}`
2107
+ );
2108
+ }
2109
+ }
1506
2110
  function printResult(result) {
1507
2111
  const { model, modelKey, benchmark, result: benchmarkResult } = result;
1508
- const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
2112
+ const passed = benchmarkResult.metrics.correct_count;
2113
+ const total = benchmarkResult.metrics.total_cases;
2114
+ const scorePercent = (benchmarkResult.score * 100).toFixed(1);
2115
+ const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
2116
+ const statusColor = benchmarkResult.success ? colors.green : colors.red;
1509
2117
  console.log(
1510
2118
  `
1511
2119
  ${colors.cyan}[${model}]${colors.reset}${modelKey ? ` ${colors.gray}(${modelKey})${colors.reset}` : ""} - ${colors.magenta}${benchmark}${colors.reset}`
1512
2120
  );
1513
2121
  console.log(
1514
- ` \u2514 ${status} | Score: ${colors.yellow}${benchmarkResult.score.toFixed(2)}${colors.reset}`
2122
+ ` \u2514 ${statusColor}${statusIcon} ${scorePercent}%${colors.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`
1515
2123
  );
1516
- const metrics = Object.entries(benchmarkResult.metrics);
1517
- if (metrics.length > 0) {
1518
- console.log(" Metrics:");
1519
- for (const [key, value] of metrics) {
1520
- console.log(` - ${key}: ${value}`);
1521
- }
1522
- }
1523
2124
  if (benchmarkResult.error) {
1524
2125
  console.log(
1525
2126
  ` ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
1526
2127
  );
1527
2128
  }
2129
+ if (!benchmarkResult.success && benchmarkResult.logs) {
2130
+ const failures = parseFailures(benchmarkResult.logs);
2131
+ if (failures.length > 0) {
2132
+ printFailureSummary(failures);
2133
+ } else if (benchmarkResult.logs.length > 0) {
2134
+ console.log(` ${colors.gray}Raw Logs (Sample):${colors.reset}`);
2135
+ for (const l of benchmarkResult.logs.slice(0, 5)) {
2136
+ console.log(` ${l}`);
2137
+ }
2138
+ }
2139
+ }
1528
2140
  }
1529
2141
  function consoleReporter(results) {
1530
2142
  console.log("\n--- \u{1F4CA} Evaluation Report ---");
@@ -1579,14 +2191,14 @@ function hasFunctionNameIssue(diff) {
1579
2191
  );
1580
2192
  }
1581
2193
  function suggestFunctionNameFix(expected, actual, suggestions) {
1582
- const expectedName = expected?.function;
1583
- const actualName = actual?.function;
2194
+ const expectedName = expected == null ? void 0 : expected.function;
2195
+ const actualName = actual == null ? void 0 : actual.function;
1584
2196
  if (expectedName && actualName && expectedName !== actualName) {
1585
2197
  suggestions.push(
1586
2198
  `Call the function '${expectedName}' instead of '${actualName}'.`
1587
2199
  );
1588
2200
  }
1589
- if (Array.isArray(expected?.functions)) {
2201
+ if (Array.isArray(expected == null ? void 0 : expected.functions)) {
1590
2202
  suggestions.push(
1591
2203
  `Ensure tool calls include: ${expected.functions.join(", ")}.`
1592
2204
  );
@@ -1641,7 +2253,7 @@ function suggestFromErrorType(error_type, suggestions) {
1641
2253
  }
1642
2254
  function suggestFixFromDiff(parsed) {
1643
2255
  const suggestions = [];
1644
- const { error_type, expected, actual, diff } = parsed ?? {};
2256
+ const { error_type, expected, actual, diff } = parsed != null ? parsed : {};
1645
2257
  if (!Array.isArray(diff)) {
1646
2258
  if (suggestions.length === 0 && typeof error_type === "string") {
1647
2259
  suggestFromErrorType(error_type, suggestions);
@@ -1666,15 +2278,16 @@ function suggestFixFromDiff(parsed) {
1666
2278
  return uniqueLines(suggestions);
1667
2279
  }
1668
2280
  function getTestIdFromLogLine(line) {
2281
+ var _a, _b;
1669
2282
  if (line.startsWith("[FAIL]")) {
1670
2283
  const m = line.match(FAIL_ID_REGEX);
1671
- return m?.[1];
2284
+ return m == null ? void 0 : m[1];
1672
2285
  }
1673
2286
  if (line.startsWith("[DEBUG-FAIL]")) {
1674
2287
  try {
1675
2288
  const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
1676
- return String(parsed?.id ?? "");
1677
- } catch {
2289
+ return String((_a = parsed == null ? void 0 : parsed.id) != null ? _a : "");
2290
+ } catch (e) {
1678
2291
  }
1679
2292
  }
1680
2293
  if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
@@ -1682,18 +2295,19 @@ function getTestIdFromLogLine(line) {
1682
2295
  const parsed = JSON.parse(
1683
2296
  line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "")
1684
2297
  );
1685
- return String(parsed?.id ?? "");
1686
- } catch {
2298
+ return String((_b = parsed == null ? void 0 : parsed.id) != null ? _b : "");
2299
+ } catch (e) {
1687
2300
  }
1688
2301
  }
1689
2302
  return;
1690
2303
  }
1691
2304
  function groupLogsByTestId(failLogs) {
2305
+ var _a;
1692
2306
  const byId = /* @__PURE__ */ new Map();
1693
2307
  for (const line of failLogs) {
1694
2308
  const id = getTestIdFromLogLine(line);
1695
- const key = id ?? "__general__";
1696
- const arr = byId.get(key) ?? [];
2309
+ const key = id != null ? id : "__general__";
2310
+ const arr = (_a = byId.get(key)) != null ? _a : [];
1697
2311
  arr.push(line);
1698
2312
  byId.set(key, arr);
1699
2313
  }
@@ -1705,10 +2319,10 @@ function collectDebugIds(lines) {
1705
2319
  if (l.startsWith("[DEBUG-FAIL]")) {
1706
2320
  try {
1707
2321
  const parsed = JSON.parse(l.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
1708
- if (parsed?.id) {
2322
+ if (parsed == null ? void 0 : parsed.id) {
1709
2323
  debugIds.add(String(parsed.id));
1710
2324
  }
1711
- } catch {
2325
+ } catch (e) {
1712
2326
  }
1713
2327
  }
1714
2328
  }
@@ -1744,7 +2358,7 @@ function displayDebugFailLine(line) {
1744
2358
  console.log(` \u2022 ${s}`);
1745
2359
  }
1746
2360
  }
1747
- } catch {
2361
+ } catch (e) {
1748
2362
  console.log(` ${line}`);
1749
2363
  }
1750
2364
  }
@@ -1788,14 +2402,14 @@ function displayDebugFailContextLine(line) {
1788
2402
  const ctx = JSON.parse(payload);
1789
2403
  console.log(` ${colors2.gray}context:${colors2.reset}`);
1790
2404
  displayContextInfo(ctx);
1791
- } catch {
2405
+ } catch (e) {
1792
2406
  console.log(` ${line}`);
1793
2407
  }
1794
2408
  }
1795
2409
  function displayLogLine(line, debugIds) {
1796
2410
  if (line.startsWith("[FAIL]")) {
1797
2411
  const m = line.match(FAIL_ID_REGEX);
1798
- const failId = m?.[1];
2412
+ const failId = m == null ? void 0 : m[1];
1799
2413
  if (failId && debugIds.has(failId)) {
1800
2414
  return;
1801
2415
  }
@@ -1865,26 +2479,350 @@ function displayResultHeader(r) {
1865
2479
  );
1866
2480
  }
1867
2481
  function consoleDebugReporter(results) {
2482
+ var _a;
1868
2483
  console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
1869
2484
  for (const r of results) {
1870
2485
  displayResultHeader(r);
1871
2486
  displayMetrics(Object.entries(r.result.metrics));
1872
- if (r.result.logs?.length) {
2487
+ if ((_a = r.result.logs) == null ? void 0 : _a.length) {
1873
2488
  displayResultLogs(r.result.logs);
1874
2489
  }
1875
2490
  }
1876
2491
  console.log("\n------------------------------------\n");
1877
2492
  }
1878
2493
 
2494
+ // src/reporters/console.summary.ts
2495
// ANSI SGR escape sequences used by the summary reporter.
var colors3 = (() => {
  const sgr = (code) => `\x1B[${code}m`;
  return {
    reset: sgr(0),
    bold: sgr(1),
    dim: sgr(2),
    green: sgr(32),
    red: sgr(31),
    yellow: sgr(33),
    cyan: sgr(36),
    magenta: sgr(35),
    gray: sgr(90),
    white: sgr(37)
  };
})();
// Prefix that marks a structured failure log line (JSON payload follows).
var DEBUG_FAIL_REGEX2 = /^\[DEBUG-FAIL\] /;
// Captures the trailing numeric suffix of a test id such as "simple_12".
var ID_NUM_REGEX = /_(\d+)$/;
// Tag name whose <tag>...</tag> sections are stripped from model output.
var REASONING_TAG = "think";
var MAX_FAILURES_TO_DISPLAY = 5;
// Human-readable label/description (and optional hint) per failure category.
var CATEGORY_DESCRIPTIONS = (() => {
  // Entries without a hint must not carry a `hint` key at all.
  const describe = (label, description, hint) =>
    hint === void 0 ? { label, description } : { label, description, hint };
  return {
    PARSE_FAILURE: describe(
      "Parse Failure",
      "No tool calls extracted from model output",
      "Model may have responded in text instead of tool format"
    ),
    PARTIAL_CALLS: describe(
      "Partial Calls",
      "Some expected tool calls missing",
      "Model stopped early or missed some tools"
    ),
    EXTRA_CALLS: describe(
      "Extra Calls",
      "More tool calls than expected",
      "Model called tools that weren't needed"
    ),
    PARAM_VALUE_PERCENT: describe(
      "Param Value (Percent)",
      "Percentage sent as integer instead of decimal",
      "e.g., 5 instead of 0.05 for 5%"
    ),
    PARAM_VALUE_MISMATCH: describe(
      "Param Value Mismatch",
      "Parameter values don't match expected"
    ),
    WRONG_FUNCTION: describe("Wrong Function", "Called wrong function name"),
    MISSING_PARAMS: describe(
      "Missing Params",
      "Required parameters not provided"
    ),
    UNEXPECTED_PARAMS: describe(
      "Unexpected Params",
      "Extra parameters that shouldn't be there"
    ),
    NO_MATCH: describe(
      "No Match",
      "Function called but couldn't match to expected",
      "Parameters may be correct but don't match any expected combination"
    ),
    OTHER: describe("Other", "Uncategorized failure")
  };
})();
2558
/**
 * Extracts structured failure records from raw benchmark log lines.
 *
 * Only string lines matching DEBUG_FAIL_REGEX2 are considered; the text after
 * the prefix is parsed as JSON. Lines whose payload is not valid JSON, or
 * whose payload is not a plain object (null, arrays, numbers, strings), are
 * dropped, because every consumer reads properties off the record.
 *
 * @param {Iterable<unknown>} logs - Raw log entries.
 * @returns {object[]} Parsed failure records, in log order.
 */
function parseFailureLogs(logs) {
  const failures = [];
  for (const log of logs) {
    if (typeof log !== "string" || !DEBUG_FAIL_REGEX2.test(log)) {
      continue;
    }
    let parsed;
    try {
      parsed = JSON.parse(log.replace(DEBUG_FAIL_REGEX2, ""));
    } catch (e) {
      // Best effort: a malformed payload is skipped, not fatal to the report.
      continue;
    }
    if (typeof parsed === "object" && parsed !== null && !Array.isArray(parsed)) {
      failures.push(parsed);
    }
  }
  return failures;
}
2568
/**
 * Buckets failure records by their `category` property.
 *
 * Records with a falsy category land in the "OTHER" bucket. Buckets keep the
 * order in which categories were first seen, and each bucket keeps its
 * failures in input order.
 *
 * @param {Iterable<object>} failures - Failure records.
 * @returns {Map<string, {failures: object[]}>} Category name to bucket.
 */
function groupByCategory(failures) {
  const buckets = /* @__PURE__ */ new Map();
  for (const record of failures) {
    const key = record.category ? record.category : "OTHER";
    const bucket = buckets.get(key);
    if (!bucket) {
      buckets.set(key, { failures: [record] });
      continue;
    }
    bucket.failures.push(record);
  }
  return buckets;
}
2581
/**
 * Collects the parameter names mentioned in the failures' diffs.
 *
 * A diff line of the form "@@ param <name>" contributes "<name>". Failure
 * records come from parsed log JSON, so a `diff` that is not an array and
 * diff entries that are not strings are ignored instead of throwing.
 *
 * @param {Iterable<object>} failures - Failure records, each optionally
 *   carrying a `diff` array of strings.
 * @returns {Set<string>} Distinct parameter names in first-seen order.
 */
function extractParamNames(failures) {
  const PARAM_PREFIX = "@@ param ";
  const paramNames = /* @__PURE__ */ new Set();
  for (const failure of failures) {
    if (!Array.isArray(failure.diff)) {
      continue;
    }
    for (const line of failure.diff) {
      if (typeof line === "string" && line.startsWith(PARAM_PREFIX)) {
        paramNames.add(line.slice(PARAM_PREFIX.length));
      }
    }
  }
  return paramNames;
}
2595
/**
 * Collects the distinct `context.finish_reason` values present on the failures.
 *
 * Failures without a context, or with a falsy finish reason, contribute
 * nothing. Values are stringified before being stored.
 *
 * @param {Iterable<object>} failures - Failure records.
 * @returns {Set<string>} Distinct finish reasons in first-seen order.
 */
function extractFinishReasons(failures) {
  const reasons = /* @__PURE__ */ new Set();
  for (const { context } of failures) {
    if (context == null) {
      continue;
    }
    const reason = context.finish_reason;
    if (reason) {
      reasons.add(String(reason));
    }
  }
  return reasons;
}
2605
/**
 * Annotates a category group with a `pattern` string when its failures share
 * a recognizable trait. Groups with fewer than two failures are left untouched.
 *
 * - PARAM_VALUE_PERCENT: lists the parameter names found in the diffs.
 * - PARSE_FAILURE: reports the finish reason, but only when every failure in
 *   the group recorded one and they are all the same. Otherwise the
 *   "All finished with" claim would be made from a subset of the failures.
 *
 * The category is taken from the first failure in the group.
 *
 * @param {{failures: object[], pattern?: string}} group - Mutated in place.
 * @returns {void}
 */
function detectPatterns(group) {
  const { failures } = group;
  if (failures.length < 2) {
    return;
  }
  const firstCategory = failures[0].category;
  if (firstCategory === "PARAM_VALUE_PERCENT") {
    const paramNames = extractParamNames(failures);
    if (paramNames.size > 0) {
      group.pattern = `Affected params: ${[...paramNames].join(", ")}`;
    }
  }
  if (firstCategory === "PARSE_FAILURE") {
    const finishReasons = extractFinishReasons(failures);
    const everyFailureHasReason = failures.every(
      (f) => f.context != null && Boolean(f.context.finish_reason)
    );
    if (finishReasons.size === 1 && everyFailureHasReason) {
      group.pattern = `All finished with: ${[...finishReasons][0]}`;
    }
  }
}
2624
/**
 * Picks the ANSI color for one diff line based on its leading marker:
 * "+" is green, "-" is red, "@@" is cyan, anything else is white.
 *
 * @param {string} line - A single diff line.
 * @returns {string} The escape sequence from `colors3`.
 */
function getLineColor(line) {
  const markerColors = [
    ["+", colors3.green],
    ["-", colors3.red],
    ["@@", colors3.cyan]
  ];
  for (const [marker, color] of markerColors) {
    if (line.startsWith(marker)) {
      return color;
    }
  }
  return colors3.white;
}
2636
/**
 * Renders a function name or a list of function names for display.
 *
 * @param {unknown} funcs - An array (joined with ", ") or any other value
 *   (converted with `String`).
 * @returns {string} Display text.
 */
function formatFunctions(funcs) {
  return Array.isArray(funcs) ? funcs.join(", ") : String(funcs);
}
2642
/**
 * Prints the expected and actual function names of a failure, when present.
 *
 * Each side is read from `functions`, falling back to `function`. An empty
 * array on the actual side is shown in red as "(none)".
 *
 * @param {object} failure - Failure record with optional `expected`/`actual`.
 * @returns {void}
 */
function printExpectedActual(failure) {
  const namesOf = (side) => side.functions || side.function;
  if (failure.expected) {
    const expectedNames = namesOf(failure.expected);
    if (expectedNames) {
      console.log(
        ` ${colors3.gray}Expected:${colors3.reset} ${formatFunctions(expectedNames)}`
      );
    }
  }
  if (!failure.actual) {
    return;
  }
  const actualNames = namesOf(failure.actual);
  if (!actualNames) {
    return;
  }
  const nothingCalled = Array.isArray(actualNames) && actualNames.length === 0;
  let color = colors3.white;
  let text;
  if (nothingCalled) {
    color = colors3.red;
    text = "(none)";
  } else {
    text = formatFunctions(actualNames);
  }
  console.log(
    ` ${colors3.gray}Actual:${colors3.reset} ${color}${text}${colors3.reset}`
  );
}
2663
/**
 * Prints a "Diff:" heading followed by the first `limit` diff lines, each
 * colored by its leading marker. When lines are left out, a final dimmed
 * line reports how many, so the truncation is visible to the reader.
 *
 * @param {string[]} diff - Diff lines to print.
 * @param {number} [limit=MAX_FAILURES_TO_DISPLAY] - Maximum number of diff
 *   lines to print.
 * @returns {void}
 */
function printDiff(diff, limit = MAX_FAILURES_TO_DISPLAY) {
  console.log(` ${colors3.gray}Diff:${colors3.reset}`);
  const shown = diff.slice(0, limit);
  for (const line of shown) {
    const lineColor = getLineColor(line);
    console.log(` ${lineColor}${line}${colors3.reset}`);
  }
  const hiddenCount = diff.length - shown.length;
  if (hiddenCount > 0) {
    console.log(
      ` ${colors3.dim}... ${hiddenCount} more diff line(s) not shown${colors3.reset}`
    );
  }
}
2670
/**
 * Strips reasoning sections from model output.
 *
 * Closed `<TAG>...</TAG>` sections are removed first (non-greedy), then an
 * opening tag that was never closed removes everything from it to the end of
 * the text. TAG is REASONING_TAG. The result is trimmed.
 *
 * @param {string} text - Raw model output.
 * @returns {string} Text without reasoning sections.
 */
function removeReasoningTags(text) {
  const escapeForRegex = (literal) =>
    literal.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const openPattern = escapeForRegex(`<${REASONING_TAG}>`);
  const closePattern = escapeForRegex(`</${REASONING_TAG}>`);
  const closedSections = new RegExp(
    `${openPattern}[\\s\\S]*?${closePattern}`,
    "g"
  );
  const danglingSection = new RegExp(`${openPattern}[\\s\\S]*`, "g");
  return text.replace(closedSections, "").replace(danglingSection, "").trim();
}
2685
/**
 * For PARSE_FAILURE failures, prints what the model said once reasoning
 * sections are removed. Other categories print nothing.
 *
 * The text comes from `context.raw_model_text_full`, falling back to
 * `context.raw_model_text`, then to an empty string.
 *
 * @param {object} failure - Failure record with an optional `context`.
 * @param {string} category - Category the failure is being printed under.
 * @returns {void}
 */
function printModelOutput(failure, category) {
  if (category !== "PARSE_FAILURE") {
    return;
  }
  const ctx = failure.context;
  const rawText = (ctx != null && (ctx.raw_model_text_full || ctx.raw_model_text)) || "";
  const cleanedText = removeReasoningTags(rawText);
  const message = cleanedText
    ? ` ${colors3.gray}Model said:${colors3.reset} "${colors3.dim}${cleanedText}${colors3.reset}"`
    : ` ${colors3.gray}Model said:${colors3.reset} ${colors3.dim}(only reasoning, no tool call output)${colors3.reset}`;
  console.log(message);
}
2702
/**
 * Tells whether a category's failures are shown primarily through their diff.
 *
 * @param {string} category - Failure category name.
 * @returns {boolean} True for the two parameter-value categories.
 */
function shouldShowDiffByDefault(category) {
  const diffFirstCategories = ["PARAM_VALUE_MISMATCH", "PARAM_VALUE_PERCENT"];
  return diffFirstCategories.includes(category);
}
2705
/**
 * Prints one failure: its id, then either its diff (for categories that are
 * diff-first and have a non-empty diff) or its expected/actual functions
 * (followed by the diff in verbose mode), then the model output.
 *
 * @param {object} failure - Failure record.
 * @param {string} category - Category the failure is printed under.
 * @param {boolean} verbose - Whether to also show the diff for categories
 *   that are not diff-first.
 * @returns {void}
 */
function printSingleFailure(failure, category, verbose) {
  console.log(`\n${colors3.bold}${failure.id}${colors3.reset}`);
  const diffLines = failure.diff;
  const hasDiff = diffLines && diffLines.length > 0;
  const diffIsPrimary = shouldShowDiffByDefault(category) && hasDiff;
  if (!diffIsPrimary) {
    printExpectedActual(failure);
  }
  if (diffIsPrimary || (hasDiff && verbose)) {
    printDiff(diffLines);
  }
  printModelOutput(failure, category);
}
2720
// Number of failures per category printed in full when not in verbose mode.
var MAX_SAMPLE_FAILURES = 2;
/**
 * Prints a compact "+N more: ..." line for the failures beyond the first
 * MAX_SAMPLE_FAILURES. Each id is shortened to its trailing number when it
 * matches ID_NUM_REGEX, otherwise shown in full.
 *
 * Ids are stringified first: they come from parsed log JSON and may be
 * missing or numeric, in which case calling `.match` on them would throw.
 * N is the number of ids actually listed.
 *
 * @param {object[]} failures - All failures of one category.
 * @returns {void}
 */
function printRemainingIds(failures) {
  const labels = failures.slice(MAX_SAMPLE_FAILURES).map((f) => {
    const id = String(f.id);
    const match = id.match(ID_NUM_REGEX);
    return match ? match[1] : id;
  });
  console.log(
    `\n${colors3.dim}+${labels.length} more: ${labels.join(", ")}${colors3.reset}`
  );
}
2732
/**
 * Prints the heading of a failure category: a ruled title line with the
 * category label and failure count, then the dimmed description.
 *
 * @param {{label: string, description: string}} info - Category description.
 * @param {number} count - Number of failures in the category.
 * @returns {void}
 */
function printCategoryHeader(info, count) {
  const { label, description } = info;
  const title = `\n${colors3.cyan}\u2500\u2500\u2500\u2500\u2500 ${label} (${count}) \u2500\u2500\u2500\u2500\u2500${colors3.reset}`;
  const subtitle = `${colors3.dim}${description}${colors3.reset}`;
  console.log(title);
  console.log(subtitle);
}
2739
/**
 * Prints one category section: heading, detected pattern and hint (when
 * present), then the failures themselves.
 *
 * In verbose mode every failure is printed. Otherwise only the first
 * MAX_SAMPLE_FAILURES are printed and the rest are summarized by id. The same
 * constant drives the slice, the threshold and `printRemainingIds`, so the
 * three cannot drift apart.
 *
 * The category name comes from parsed log JSON, so it is looked up as an own
 * property only; names such as "constructor" fall back to OTHER instead of
 * resolving to an inherited member.
 *
 * @param {string} category - Category name.
 * @param {{failures: object[], pattern?: string}} group - Category group.
 * @param {boolean} verbose - Whether to print every failure.
 * @returns {void}
 */
function printCategoryDetails(category, group, verbose) {
  const isKnownCategory = Object.prototype.hasOwnProperty.call(
    CATEGORY_DESCRIPTIONS,
    category
  );
  const info = isKnownCategory ? CATEGORY_DESCRIPTIONS[category] : CATEGORY_DESCRIPTIONS.OTHER;
  const { failures } = group;
  printCategoryHeader(info, failures.length);
  if (group.pattern) {
    console.log(`${colors3.yellow}Pattern: ${group.pattern}${colors3.reset}`);
  }
  if (info.hint) {
    console.log(`${colors3.magenta}Hint: ${info.hint}${colors3.reset}`);
  }
  const samplesToShow = verbose ? failures : failures.slice(0, MAX_SAMPLE_FAILURES);
  for (const failure of samplesToShow) {
    printSingleFailure(failure, category, verbose);
  }
  if (!verbose && failures.length > MAX_SAMPLE_FAILURES) {
    printRemainingIds(failures);
  }
}
2757
+ function printResultHeader(result) {
2758
+ const { model, modelKey, benchmark, result: benchmarkResult } = result;
2759
+ const passed = benchmarkResult.metrics.correct_count;
2760
+ const total = benchmarkResult.metrics.total_cases;
2761
+ const scorePercent = (benchmarkResult.score * 100).toFixed(1);
2762
+ const statusIcon = benchmarkResult.success ? "\u2714" : "\u2716";
2763
+ const statusColor = benchmarkResult.success ? colors3.green : colors3.red;
2764
+ const modelPart = `${colors3.cyan}${model}${colors3.reset}${modelKey ? ` ${colors3.dim}(${modelKey})${colors3.reset}` : ""}`;
2765
+ const benchmarkPart = `${colors3.magenta}${benchmark}${colors3.reset}`;
2766
+ const scorePart = `${statusColor}${statusIcon} ${scorePercent}%${colors3.reset} (${passed != null ? passed : "?"}/${total != null ? total : "?"} passed)`;
2767
+ console.log(
2768
+ `
2769
+ ${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}`
2770
+ );
2771
+ console.log(`${modelPart} \u2502 ${benchmarkPart} \u2502 ${scorePart}`);
2772
+ }
2773
/**
 * Prints the summary for one evaluation result: the header, then the
 * failures grouped by category, largest category first.
 *
 * When no structured failures can be extracted (no logs, or logs without
 * parsable failure lines) and the run was unsuccessful, the result's error
 * message (if any) and a "no structured failure data" notice are printed.
 * This covers runs that failed before producing any logs, which would
 * otherwise show a failing header with no explanation.
 *
 * @param {object} result - Evaluation result ({model, benchmark, result, ...}).
 * @param {boolean} verbose - Whether to print every failure in each category.
 * @returns {void}
 */
function printResultSummary(result, verbose) {
  const { result: benchmarkResult } = result;
  printResultHeader(result);
  const logs = Array.isArray(benchmarkResult.logs) ? benchmarkResult.logs : [];
  const failures = logs.length > 0 ? parseFailureLogs(logs) : [];
  if (failures.length === 0) {
    if (!benchmarkResult.success) {
      const { error } = benchmarkResult;
      if (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.log(`${colors3.red}Error: ${message}${colors3.reset}`);
      }
      console.log(
        `${colors3.yellow}No structured failure data available${colors3.reset}`
      );
    }
    return;
  }
  const groups = groupByCategory(failures);
  for (const group of groups.values()) {
    detectPatterns(group);
  }
  const sortedCategories = [...groups.entries()].sort(
    (a, b) => b[1].failures.length - a[1].failures.length
  );
  for (const [cat, group] of sortedCategories) {
    printCategoryDetails(cat, group, verbose);
  }
}
2799
+ function consoleSummaryReporter(results) {
2800
+ const verbose = process.env.VERBOSE === "true";
2801
+ console.log(`
2802
+ ${colors3.bold}Evaluation Report (Summary)${colors3.reset}`);
2803
+ console.log(`${colors3.dim}Use VERBOSE=true for full details${colors3.reset}`);
2804
+ for (const result of results) {
2805
+ printResultSummary(result, verbose);
2806
+ }
2807
+ console.log(
2808
+ `
2809
+ ${colors3.bold}\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501\u2501${colors3.reset}
2810
+ `
2811
+ );
2812
+ }
2813
+
1879
2814
  // src/reporters/json.ts
1880
2815
  function jsonReporter(results) {
1881
- const serializableResults = results.map((r) => ({
1882
- ...r,
1883
- result: {
1884
- ...r.result,
1885
- error: r.result.error?.message
1886
- }
1887
- }));
2816
+ const serializableResults = results.map((r) => {
2817
+ var _a;
2818
+ return {
2819
+ ...r,
2820
+ result: {
2821
+ ...r.result,
2822
+ error: (_a = r.result.error) == null ? void 0 : _a.message
2823
+ }
2824
+ };
2825
+ });
1888
2826
  console.log(JSON.stringify(serializableResults, null, 2));
1889
2827
  }
1890
2828
 
@@ -1892,60 +2830,56 @@ function jsonReporter(results) {
1892
2830
  var reporters = {
1893
2831
  console: consoleReporter,
1894
2832
  json: jsonReporter,
1895
- "console.debug": consoleDebugReporter
2833
+ "console.debug": consoleDebugReporter,
2834
+ "console.summary": consoleSummaryReporter
1896
2835
  };
1897
2836
 
1898
2837
  // src/evaluate.ts
1899
- async function runSingleBenchmark(model, benchmark, modelKey, config) {
1900
- const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
1901
- try {
1902
- console.log(
1903
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
1904
- );
1905
- const result = await benchmark.run(model, config);
1906
- console.log(
1907
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
1908
- );
1909
- return {
1910
- model: modelId,
1911
- modelKey,
1912
- benchmark: benchmark.name,
1913
- result
1914
- };
1915
- } catch (error) {
1916
- console.error(
1917
- `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Error running benchmark: ${benchmark.name}`,
1918
- error
1919
- );
1920
- return {
1921
- model: modelId,
1922
- modelKey,
1923
- benchmark: benchmark.name,
1924
- result: {
1925
- score: 0,
1926
- success: false,
1927
- metrics: {},
1928
- error: error instanceof Error ? error : new Error(String(error))
1929
- }
1930
- };
2838
/**
 * Tells whether a value is a model configuration: a non-null object whose
 * `model` property is itself a non-null object that has a `modelId` key.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when the shape matches.
 */
function isModelConfig(value) {
  const isNonNullObject = (candidate) =>
    typeof candidate === "object" && candidate !== null;
  return isNonNullObject(value) && "model" in value && isNonNullObject(value.model) && "modelId" in value.model;
}
2852
/**
 * Tells whether a value looks like a language model: a non-null object with
 * a string `modelId` property.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when the shape matches.
 */
function isLanguageModel(value) {
  const isNonNullObject = typeof value === "object" && value !== null;
  return isNonNullObject && "modelId" in value && typeof value.modelId === "string";
}
2859
/**
 * Splits a model input into a `[model, middleware]` pair.
 *
 * A model configuration yields its `model` and `middleware` properties; any
 * other input is treated as the model itself, with no middleware.
 *
 * @param {unknown} input - A model or a model configuration.
 * @returns {[unknown, unknown]} The model and its middleware (or undefined).
 */
function extractModelAndMiddleware(input) {
  return isModelConfig(input) ? [input.model, input.middleware] : [input, void 0];
}
1933
2865
  function normalizeModels(models) {
1934
- const modelEntries = [];
2866
+ const entries = [];
1935
2867
  if (Array.isArray(models)) {
1936
2868
  for (const m of models) {
1937
- modelEntries.push([void 0, m]);
2869
+ const [model, middleware] = extractModelAndMiddleware(m);
2870
+ entries.push([void 0, model, middleware]);
1938
2871
  }
1939
- } else if (typeof models === "object" && models !== null && "modelId" in models) {
1940
- modelEntries.push([void 0, models]);
2872
+ } else if (isModelConfig(models)) {
2873
+ entries.push([void 0, models.model, models.middleware]);
2874
+ } else if (isLanguageModel(models)) {
2875
+ entries.push([void 0, models, void 0]);
1941
2876
  } else {
1942
- for (const [key, m] of Object.entries(
1943
- models
1944
- )) {
1945
- modelEntries.push([key, m]);
2877
+ for (const [key, m] of Object.entries(models)) {
2878
+ const [model, middleware] = extractModelAndMiddleware(m);
2879
+ entries.push([key, model, middleware]);
1946
2880
  }
1947
2881
  }
1948
- return modelEntries;
2882
+ return entries;
1949
2883
  }
1950
2884
  function buildConfig(temperature, maxTokens) {
1951
2885
  const config = {};
@@ -1966,21 +2900,90 @@ function executeReporter(reporter, results) {
1966
2900
  reporters.console(results);
1967
2901
  }
1968
2902
  }
2903
/**
 * Wraps a base model with the user's middleware and, when caching is
 * enabled, a disk cache middleware appended after the user's middleware.
 *
 * The base model is returned unchanged when there is nothing to apply. A
 * single middleware is passed to `wrapLanguageModel` as is, several as an
 * array.
 *
 * @param {object} baseModel - The model to wrap.
 * @param {object|object[]|undefined} userMiddleware - Middleware supplied
 *   with the model, if any.
 * @param {{enabled?: boolean, cacheDir?: string, debug?: boolean}|undefined} cacheOptions
 *   Cache settings. Caching is on only when `enabled` is exactly true;
 *   `cacheDir` defaults to ".ai-cache" and `debug` to false.
 * @returns {object} The base model or the wrapped model.
 */
function buildEffectiveModel(baseModel, userMiddleware, cacheOptions) {
  const cacheEnabled = cacheOptions != null && cacheOptions.enabled === true;
  if (!cacheEnabled && !userMiddleware) {
    return baseModel;
  }
  let cacheMiddleware = null;
  if (cacheEnabled) {
    const { cacheDir, debug } = cacheOptions;
    cacheMiddleware = createDiskCacheMiddleware({
      cacheDir: cacheDir != null ? cacheDir : ".ai-cache",
      enabled: true,
      debug: debug != null ? debug : false
    });
  }
  const userList = !userMiddleware ? [] : Array.isArray(userMiddleware) ? userMiddleware : [userMiddleware];
  const middlewares = [...userList];
  if (cacheMiddleware) {
    middlewares.push(cacheMiddleware);
  }
  // Reachable when the user passed an empty middleware array and caching is off.
  if (middlewares.length === 0) {
    return baseModel;
  }
  const middleware = middlewares.length === 1 ? middlewares[0] : middlewares;
  return wrapLanguageModel({ model: baseModel, middleware });
}
2934
/**
 * Runs one benchmark against one model and reports progress on stdout.
 *
 * A progress line is written before the run and overwritten (via "\r") with
 * the score, or with "ERROR", afterwards. The function never rejects for a
 * failing benchmark: a thrown value is logged with `console.error` and
 * converted into a result with `score: 0`, `success: false`, and the error
 * wrapped in an `Error` if it was not one already.
 *
 * The score is formatted defensively. A result whose `score` is not a number
 * is still returned as is, rather than having the formatting error be
 * mistaken for a benchmark failure. A run that yields no result object is
 * reported as an error.
 *
 * @param {unknown} model - The model; its string `modelId` is used for
 *   display, falling back to "unknown-model".
 * @param {{name: string, run: Function}} benchmark - The benchmark to run.
 * @param {string|undefined} modelKey - Optional key shown next to the model id.
 * @param {object} config - Configuration passed through to `benchmark.run`.
 * @returns {Promise<{model: string, modelKey: (string|undefined), benchmark: string, result: object}>}
 */
async function runSingleBenchmark(model, benchmark, modelKey, config) {
  const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
  const prefix = `[${modelId}]${modelKey ? ` (${modelKey})` : ""} ${benchmark.name}`;
  try {
    process.stdout.write(`${prefix}: ...`);
    const result = await benchmark.run(model, config);
    if (typeof result !== "object" || result === null) {
      throw new TypeError(
        `Benchmark "${benchmark.name}" did not return a result object`
      );
    }
    const scoreDisplay = typeof result.score === "number" ? result.score.toFixed(2) : String(result.score);
    process.stdout.write(`\r${prefix}: .... Score: ${scoreDisplay}\n`);
    return {
      model: modelId,
      modelKey,
      benchmark: benchmark.name,
      result
    };
  } catch (error) {
    process.stdout.write(`\r${prefix}: .... Score: ERROR\n`);
    console.error(error);
    return {
      model: modelId,
      modelKey,
      benchmark: benchmark.name,
      result: {
        score: 0,
        success: false,
        metrics: {},
        error: error instanceof Error ? error : new Error(String(error))
      }
    };
  }
}
1969
2966
  async function evaluate(options) {
1970
2967
  const {
1971
2968
  models,
1972
2969
  benchmarks,
1973
2970
  reporter = "console",
1974
2971
  temperature,
1975
- maxTokens
2972
+ maxTokens,
2973
+ cache
1976
2974
  } = options;
1977
2975
  const modelEntries = normalizeModels(models);
1978
2976
  const config = buildConfig(temperature, maxTokens);
1979
2977
  const allResults = [];
1980
- for (const [modelKey, model] of modelEntries) {
2978
+ for (const [modelKey, baseModel, userMiddleware] of modelEntries) {
2979
+ const effectiveModel = buildEffectiveModel(
2980
+ baseModel,
2981
+ userMiddleware,
2982
+ cache
2983
+ );
1981
2984
  for (const benchmark of benchmarks) {
1982
2985
  const evaluationResult = await runSingleBenchmark(
1983
- model,
2986
+ effectiveModel,
1984
2987
  benchmark,
1985
2988
  modelKey,
1986
2989
  config
@@ -1996,6 +2999,7 @@ export {
1996
2999
  bfclParallelBenchmark,
1997
3000
  bfclParallelMultipleBenchmark,
1998
3001
  bfclSimpleBenchmark,
3002
+ complexFuncBenchBenchmark,
1999
3003
  evaluate,
2000
3004
  jsonGenerationBenchmark,
2001
3005
  jsonGenerationSchemaOnlyBenchmark