npm - @ai-sdk-tool/eval - Versions diffs - 1.0.0-canary.0 → 1.0.0-canary.1 - Mend

@ai-sdk-tool/eval 1.0.0-canary.0 → 1.0.0-canary.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/data/ComplexFuncBench.jsonl +1000 -0
package/data/ComplexFuncBench_possible_answer.jsonl +1000 -0
package/dist/index.cjs +587 -91
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +29 -9
package/dist/index.d.ts +29 -9
package/dist/index.js +591 -92
package/dist/index.js.map +1 -1
package/package.json +17 -11

package/dist/index.cjs CHANGED Viewed

@@ -34,6 +34,7 @@ __export(index_exports, {
   bfclParallelBenchmark: () => bfclParallelBenchmark,
   bfclParallelMultipleBenchmark: () => bfclParallelMultipleBenchmark,
   bfclSimpleBenchmark: () => bfclSimpleBenchmark,
+  complexFuncBenchBenchmark: () => complexFuncBenchBenchmark,
   evaluate: () => evaluate,
   jsonGenerationBenchmark: () => jsonGenerationBenchmark,
   jsonGenerationSchemaOnlyBenchmark: () => jsonGenerationSchemaOnlyBenchmark
@@ -61,7 +62,7 @@ function tryResolveViaPackageEntry(moduleUrl) {
     if (import_node_fs.default.existsSync(dataAtRoot)) {
       return dataAtRoot;
     }
-  } catch {
+  } catch (e) {
   }
   return null;
 }
@@ -75,7 +76,7 @@ function tryResolveViaPackageJson(moduleUrl) {
     if (import_node_fs.default.existsSync(dataAtPkg)) {
       return dataAtPkg;
     }
-  } catch {
+  } catch (e) {
   }
   return null;
 }
@@ -83,7 +84,7 @@ function getStartDir(moduleUrl) {
   if (moduleUrl) {
     try {
       return import_node_path.default.dirname((0, import_node_url.fileURLToPath)(moduleUrl));
-    } catch {
+    } catch (e) {
       return process.cwd();
     }
   }
@@ -177,7 +178,7 @@ function valuesMatch(modelValue, possibleValue) {
       const normalizedModel = normalizeObject(modelValue);
       const normalizedPossible = normalizeObject(possibleValue);
       return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
-    } catch {
+    } catch (e) {
       return false;
     }
   }
@@ -306,7 +307,7 @@ function checkSingleParameter(paramName, modelValue, context) {
     return checkStringValue(
       paramName,
       modelValue,
-      possibleValues ?? []
+      possibleValues != null ? possibleValues : []
     );
   }
   if (Array.isArray(modelValue)) {
@@ -406,6 +407,37 @@ function multipleFunctionChecker(funcDescriptions, modelToolCalls, possibleAnswe
 // src/benchmarks/bfcl.ts
 var LINE_SPLIT_REGEX = /\r?\n/;
 var NUMERIC_STRING_REGEX = /^\d+$/;
+function convertGroundTruthToXML(call) {
+  const keys = Object.keys(call);
+  if (keys.length === 0) {
+    return "<empty_call />";
+  }
+  const funcName = keys[0];
+  if (!funcName) {
+    return "<undefined_function />";
+  }
+  const params = call[funcName];
+  if (!params || typeof params !== "object") {
+    return `<${funcName} />`;
+  }
+  let xml = `<${funcName}>
+`;
+  for (const [key, value] of Object.entries(params)) {
+    const displayValue = Array.isArray(value) ? value[0] : value;
+    let valueStr;
+    if (typeof displayValue === "string") {
+      valueStr = displayValue;
+    } else if (displayValue === null || displayValue === void 0) {
+      valueStr = "";
+    } else {
+      valueStr = JSON.stringify(displayValue);
+    }
+    xml += `  <${key}>${valueStr}</${key}>
+`;
+  }
+  xml += `</${funcName}>`;
+  return xml;
+}
 function check(testCase, modelOutput, possibleAnswer) {
   const category = testCase.id.split("_")[0];
   try {
@@ -486,7 +518,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             `[INFO] Limiting test cases to ${limit} due to BFCL_LIMIT.`
           );
         }
-        const fixSchemaType = (copy) => {
+        const fixSchemaType2 = (copy) => {
           if (!copy.type) {
             return;
           }
@@ -510,16 +542,16 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             );
           }
         };
-        const fixSchema = (schema) => {
+        const fixSchema2 = (schema) => {
           if (!schema || typeof schema !== "object") {
             return { type: "object", properties: {} };
           }
-          const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
+          const copy = Array.isArray(schema) ? schema.map((v) => fixSchema2(v)) : { ...schema };
           if (!Array.isArray(copy)) {
-            fixSchemaType(copy);
-            fixSchemaProperties(copy, fixSchema);
+            fixSchemaType2(copy);
+            fixSchemaProperties(copy, fixSchema2);
             if (copy.items) {
-              copy.items = fixSchema(copy.items);
+              copy.items = fixSchema2(copy.items);
             }
             return copy;
           }
@@ -554,13 +586,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           try {
             const arr = JSON.parse(raw);
             return Array.isArray(arr) ? arr : [];
-          } catch {
+          } catch (e) {
             return [];
           }
         };
         const getSanitizedName = (rawName, transformedTools) => {
+          var _a, _b;
           if (typeof rawName === "string" && NUMERIC_STRING_REGEX.test(rawName)) {
-            return transformedTools[Number(rawName)]?.name ?? rawName;
+            return (_b = (_a = transformedTools[Number(rawName)]) == null ? void 0 : _a.name) != null ? _b : rawName;
           }
           return rawName;
         };
@@ -570,25 +603,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           }
           try {
             return JSON.parse(extractedArgs);
-          } catch {
+          } catch (e) {
             return extractedArgs;
           }
         };
         const restoreToolCalls = (toolCalls, nameMap, transformedTools) => (toolCalls || []).map((c) => {
+          var _a, _b, _c, _d, _e, _f;
           const call = c;
-          const rawName = call.toolName ?? call.name;
+          const rawName = (_a = call.toolName) != null ? _a : call.name;
           const sanitizedFromIndex = getSanitizedName(
             rawName,
             transformedTools
           );
-          const originalName = nameMap.get(sanitizedFromIndex) ?? sanitizedFromIndex;
-          const extractedArgs = call.args ?? call.arguments ?? call.input ?? call.params ?? call.parameters;
+          const originalName = (_b = nameMap.get(sanitizedFromIndex)) != null ? _b : sanitizedFromIndex;
+          const extractedArgs = (_f = (_e = (_d = (_c = call.args) != null ? _c : call.arguments) != null ? _d : call.input) != null ? _e : call.params) != null ? _f : call.parameters;
           const parsedArgs = parseToolArgs(extractedArgs);
           return {
             ...call,
             toolName: originalName,
             name: originalName,
-            args: parsedArgs ?? {}
+            args: parsedArgs != null ? parsedArgs : {}
           };
         });
         const summarizeArgs = (args) => {
@@ -632,7 +666,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
               if (Array.isArray(got)) {
                 return JSON.stringify(got.map((x) => String(x)).sort()) === JSON.stringify(v.map((x) => String(x)).sort());
               }
-            } catch {
+            } catch (e) {
             }
             return String(v).toLowerCase().replace(/\s+/g, "") === String(got).toLowerCase().replace(/\s+/g, "");
           });
@@ -670,13 +704,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           }
         };
         const buildSimpleDiff = (tools, possibleAnswer, restoredCalls) => {
+          var _a, _b, _c, _d;
           const funcDesc = tools[0];
-          const gt = possibleAnswer.ground_truth?.[0];
-          const expectedFuncName = funcDesc?.name;
+          const gt = (_a = possibleAnswer.ground_truth) == null ? void 0 : _a[0];
+          const expectedFuncName = funcDesc == null ? void 0 : funcDesc.name;
           const expectedParams = gt ? gt[Object.keys(gt)[0]] : void 0;
           const received = restoredCalls[0];
-          const receivedName = received?.toolName ?? received?.name;
-          const receivedArgs = summarizeArgs(received?.args);
+          const receivedName = (_b = received == null ? void 0 : received.toolName) != null ? _b : received == null ? void 0 : received.name;
+          const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
           const expected = {
             function: expectedFuncName,
             params: expectedParams
@@ -688,7 +723,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           const diff = [];
           checkFunctionNameMismatch(expectedFuncName, receivedName, diff);
           if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
-            const required = funcDesc?.parameters?.required ?? [];
+            const required = (_d = (_c = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _c.required) != null ? _d : [];
             checkMissingParams(
               required,
               receivedArgs,
@@ -725,12 +760,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           }
         };
         const findMatchingCallIndex = (fname, restoredCalls, usedActual) => {
+          var _a;
           for (let i = 0; i < restoredCalls.length; i += 1) {
             if (usedActual.has(i)) {
               continue;
             }
             const rc = restoredCalls[i];
-            const rcName = rc?.toolName ?? rc?.name;
+            const rcName = (_a = rc == null ? void 0 : rc.toolName) != null ? _a : rc == null ? void 0 : rc.name;
             if (rcName === fname) {
               return i;
             }
@@ -744,6 +780,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           checkParamValueMismatches(expectedParamsAllowed, receivedArgs, diff);
         };
         const processExpectedCall = (options) => {
+          var _a, _b;
           const { expectedObj, restoredCalls, tools, usedActual, diff } = options;
           const fname = Object.keys(expectedObj)[0];
           const matchedIndex = findMatchingCallIndex(
@@ -756,10 +793,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           }
           usedActual.add(matchedIndex);
           const received = restoredCalls[matchedIndex];
-          const receivedArgs = summarizeArgs(received?.args);
+          const receivedArgs = summarizeArgs(received == null ? void 0 : received.args);
           const expectedParamsAllowed = expectedObj[fname];
           const funcDesc = tools.find((t) => t.name === fname);
-          const requiredParams = funcDesc?.parameters?.required ?? [];
+          const requiredParams = (_b = (_a = funcDesc == null ? void 0 : funcDesc.parameters) == null ? void 0 : _a.required) != null ? _b : [];
           diff.push(`@@ function ${fname}`);
           if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
             validateFunctionParams({
@@ -771,10 +808,14 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           }
         };
         const buildParallelDiff = (tools, possibleAnswer, restoredCalls) => {
-          const gtArr = possibleAnswer.ground_truth ?? [];
+          var _a;
+          const gtArr = (_a = possibleAnswer.ground_truth) != null ? _a : [];
           const expectedNames = gtArr.map((g) => Object.keys(g)[0]);
           const actualNames = restoredCalls.map(
-            (c) => c.toolName ?? c.name
+            (c) => {
+              var _a2;
+              return (_a2 = c.toolName) != null ? _a2 : c.name;
+            }
           );
           const expected = {
             functions: expectedNames
@@ -800,14 +841,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           return { expected, actual, diff };
         };
         const concurrencyEnv = process.env.BFCL_CONCURRENCY;
-        const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
+        const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 16;
         logs.push(
           `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
         );
         const logFirstToolDebug = (transformedTools, testCaseId, caseLogs) => {
+          var _a, _b, _c, _d;
           try {
             const firstTool = transformedTools[0];
-            const schemaType = firstTool?.inputSchema?.type ?? firstTool?.inputSchema?.jsonSchema?.type;
+            const schemaType = (_d = (_a = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _a.type) != null ? _d : (_c = (_b = firstTool == null ? void 0 : firstTool.inputSchema) == null ? void 0 : _b.jsonSchema) == null ? void 0 : _c.type;
             caseLogs.push(
               `[DEBUG] ${testCaseId}: firstTool=${JSON.stringify(firstTool)}, schemaType=${schemaType}`
             );
@@ -823,7 +865,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             caseLogs.push(
               `[DEBUG] ${testCaseId}: rawToolCalls=${JSON.stringify(toolCalls)}, finishReason=${finishReason}, text=${JSON.stringify(text)}`
             );
-          } catch {
+          } catch (e) {
             caseLogs.push(
               `[DEBUG] ${testCaseId}: failed to serialize toolCalls`
             );
@@ -842,11 +884,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             possibleAnswer
           } = options;
           const lastUser = (() => {
+            var _a;
             const reversed = [...flatMessages].reverse();
             const found = reversed.find(
               (m) => m.role === "user"
             );
-            return found?.content ?? void 0;
+            return (_a = found == null ? void 0 : found.content) != null ? _a : void 0;
           })();
           const rawModelText = (() => {
             if (mwOriginalText && mwOriginalText.length > 0) {
@@ -917,9 +960,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
               caseLogs.push(
                 `[DEBUG-FAIL-CONTEXT] ${JSON.stringify(contextPayload)}`
               );
-            } catch {
+            } catch (e) {
             }
-          } catch {
+          } catch (e) {
             caseLogs.push(`[DEBUG] ${testCase.id}: failed to build debug diff`);
           }
         };
@@ -998,7 +1041,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           const flatMessages = flattenMessages(messages);
           const { transformedTools, nameMap } = buildTransformedTools(
             tools,
-            fixSchema
+            fixSchema2
           );
           const toolsMap = buildToolsMap(transformedTools);
           return { flatMessages, transformedTools, nameMap, toolsMap };
@@ -1020,6 +1063,26 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           const mwParsedToolCalls = parseDebugToolCalls(
             debugSummaryRef.toolCalls
           );
+          const possibleAnswer = possibleAnswersMap.get(testCase.id);
+          if (!possibleAnswer) {
+            throw new Error(`No possible answer for id: ${testCase.id}`);
+          }
+          if (process.env.DEBUG_PARSER_OUTPUT === "true") {
+            const groundTruth = possibleAnswer.ground_truth;
+            const expectedXML = groundTruth.map((call) => convertGroundTruthToXML(call)).join("\n\n");
+            console.log("\n========== BFCL CASE DEBUG ==========");
+            console.log(`Test Case: ${testCase.id}`);
+            console.log(`Expected count: ${groundTruth.length} call(s)`);
+            console.log("\n--- EXPECTED OUTPUT (morphXML format) ---");
+            console.log(expectedXML);
+            console.log("\n--- ACTUAL MODEL OUTPUT (raw, with whitespace) ---");
+            console.log(mwOriginalText || text || "(empty)");
+            console.log(
+              "\n--- PARSED TOOL CALLS (count: " + (Array.isArray(toolCalls) ? toolCalls.length : 0) + ") ---"
+            );
+            console.log(JSON.stringify(toolCalls, null, 2));
+            console.log("======================================\n");
+          }
           logRawToolCalls({
             toolCalls,
             finishReason,
@@ -1027,10 +1090,6 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             testCaseId: testCase.id,
             caseLogs
           });
-          const possibleAnswer = possibleAnswersMap.get(testCase.id);
-          if (!possibleAnswer) {
-            throw new Error(`No possible answer for id: ${testCase.id}`);
-          }
           const restoredCalls = restoreToolCalls(
             toolCalls || [],
             nameMap,
@@ -1051,12 +1110,12 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             caseLogs
           });
         };
-        const runSingleCase = async (testCase) => {
+        const runSingleCase2 = async (testCase) => {
           const caseLogs = [];
           const { function: tools } = testCase;
-          const temp = config?.temperature;
+          const temp = config == null ? void 0 : config.temperature;
           const temperature = typeof temp === "number" ? temp : void 0;
-          const maxTok = config?.maxTokens;
+          const maxTok = config == null ? void 0 : config.maxTokens;
           const maxTokens = typeof maxTok === "number" ? maxTok : void 0;
           try {
             const { flatMessages, transformedTools, nameMap, toolsMap } = prepareTestCaseData(testCase);
@@ -1082,15 +1141,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             });
           } catch (e) {
             caseLogs.push(
-              `[ERROR] ${testCase.id}: Model generation failed: ${e?.message}`
+              `[ERROR] ${testCase.id}: Model generation failed: ${e == null ? void 0 : e.message}`
             );
-            if (e?.stack) {
+            if (e == null ? void 0 : e.stack) {
               caseLogs.push(`[STACK] ${testCase.id}: ${e.stack}`);
             }
             return { valid: false, logs: caseLogs };
           }
         };
-        const mapWithConcurrency = async (items, concurrencyLimit, mapper) => {
+        const mapWithConcurrency2 = async (items, concurrencyLimit, mapper) => {
           const results = new Array(items.length);
           let idx = 0;
           const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
@@ -1106,10 +1165,10 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           await Promise.all(workers);
           return results;
         };
-        const resultsPerCase = await mapWithConcurrency(
+        const resultsPerCase = await mapWithConcurrency2(
           testCases,
           concurrency,
-          async (tc) => runSingleCase(tc)
+          async (tc) => runSingleCase2(tc)
         );
         correctCount = resultsPerCase.reduce(
           (acc, r) => acc + (r.valid ? 1 : 0),
@@ -1177,19 +1236,387 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
   "BFCL_v3_parallel_multiple_possible_answer.jsonl"
 );
-// src/benchmarks/json-generation.ts
+// src/benchmarks/complex-func-bench.ts
 var import_node_fs3 = require("fs");
 var import_node_path3 = __toESM(require("path"), 1);
 var import_ai2 = require("ai");
+var LINE_SPLIT_REGEX2 = /\r?\n/;
+function standardizeString2(input) {
+  if (typeof input !== "string") {
+    return input;
+  }
+  return input.toLowerCase().trim();
+}
+function valuesMatch2(modelValue, expectedValue) {
+  if (modelValue === expectedValue) {
+    return true;
+  }
+  if (typeof modelValue === "string" && typeof expectedValue === "string") {
+    return standardizeString2(modelValue) === standardizeString2(expectedValue);
+  }
+  if (typeof modelValue === "number" && typeof expectedValue === "string") {
+    return modelValue.toString() === expectedValue || modelValue === Number(expectedValue);
+  }
+  if (typeof modelValue === "string" && typeof expectedValue === "number") {
+    return modelValue === expectedValue.toString() || Number(modelValue) === expectedValue;
+  }
+  if (typeof modelValue === "object" && modelValue !== null && typeof expectedValue === "object" && expectedValue !== null) {
+    try {
+      return JSON.stringify(modelValue) === JSON.stringify(expectedValue);
+    } catch (e) {
+      return false;
+    }
+  }
+  return false;
+}
+function validateFunctionName(modelFuncName, expectedFuncName) {
+  if (modelFuncName !== expectedFuncName) {
+    return {
+      valid: false,
+      error: `Function name mismatch: expected '${expectedFuncName}', got '${modelFuncName}'`,
+      error_type: "function_name_mismatch"
+    };
+  }
+  return { valid: true };
+}
+function validateRequiredParams(requiredParams, modelArgs, expectedArgs) {
+  for (const param of requiredParams) {
+    if (!(param in modelArgs) && param in expectedArgs) {
+      return {
+        valid: false,
+        error: `Missing required parameter: '${param}'`,
+        error_type: "missing_required_param"
+      };
+    }
+  }
+  return { valid: true };
+}
+function validateParamValues(expectedArgs, modelArgs, requiredParams) {
+  for (const [paramName, expectedValue] of Object.entries(expectedArgs)) {
+    if (!(paramName in modelArgs)) {
+      if (!requiredParams.includes(paramName)) {
+        continue;
+      }
+      return {
+        valid: false,
+        error: `Missing parameter: '${paramName}'`,
+        error_type: "missing_param"
+      };
+    }
+    const modelValue = modelArgs[paramName];
+    if (!valuesMatch2(modelValue, expectedValue)) {
+      return {
+        valid: false,
+        error: `Parameter '${paramName}' value mismatch: expected ${JSON.stringify(expectedValue)}, got ${JSON.stringify(modelValue)}`,
+        error_type: "value_mismatch"
+      };
+    }
+  }
+  return { valid: true };
+}
+function checkFunctionCall(modelCall, expected, toolSpecs) {
+  var _a, _b, _c, _d;
+  const expectedFuncName = Object.keys(expected)[0];
+  const expectedArgs = expected[expectedFuncName];
+  const modelFuncName = (_a = modelCall.toolName) != null ? _a : modelCall.name;
+  const modelArgs = (_b = modelCall.args) != null ? _b : {};
+  const nameResult = validateFunctionName(modelFuncName, expectedFuncName);
+  if (!nameResult.valid) {
+    return nameResult;
+  }
+  const toolSpec = toolSpecs.find((t) => t.name === expectedFuncName);
+  const requiredParams = (_d = (_c = toolSpec == null ? void 0 : toolSpec.parameters) == null ? void 0 : _c.required) != null ? _d : [];
+  const requiredResult = validateRequiredParams(
+    requiredParams,
+    modelArgs,
+    expectedArgs
+  );
+  if (!requiredResult.valid) {
+    return requiredResult;
+  }
+  return validateParamValues(expectedArgs, modelArgs, requiredParams);
+}
+function checkAllFunctionCalls(modelCalls, expectedCalls, toolSpecs) {
+  if (modelCalls.length !== expectedCalls.length) {
+    return {
+      valid: false,
+      error: `Wrong number of function calls: expected ${expectedCalls.length}, got ${modelCalls.length}`,
+      error_type: "wrong_call_count"
+    };
+  }
+  if (expectedCalls.length === 1) {
+    return checkFunctionCall(modelCalls[0], expectedCalls[0], toolSpecs);
+  }
+  const matchedIndices = /* @__PURE__ */ new Set();
+  for (const expected of expectedCalls) {
+    let foundMatch = false;
+    for (let i = 0; i < modelCalls.length; i++) {
+      if (matchedIndices.has(i)) {
+        continue;
+      }
+      const result = checkFunctionCall(modelCalls[i], expected, toolSpecs);
+      if (result.valid) {
+        matchedIndices.add(i);
+        foundMatch = true;
+        break;
+      }
+    }
+    if (!foundMatch) {
+      const expectedFuncName = Object.keys(expected)[0];
+      return {
+        valid: false,
+        error: `Could not find matching call for function '${expectedFuncName}'`,
+        error_type: "no_matching_call"
+      };
+    }
+  }
+  return { valid: true };
+}
+var fixSchemaType = (copy) => {
+  if (!copy.type) {
+    return;
+  }
+  if (copy.type === "dict") {
+    copy.type = "object";
+  }
+  if (copy.type === "tuple") {
+    copy.type = "array";
+  }
+  if (copy.type === "integer" || copy.type === "float") {
+    copy.type = "number";
+  }
+};
+var fixSchema = (schema) => {
+  if (!schema || typeof schema !== "object") {
+    return { type: "object", properties: {} };
+  }
+  const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
+  if (!Array.isArray(copy)) {
+    fixSchemaType(copy);
+    if (copy.properties && typeof copy.properties === "object") {
+      for (const k of Object.keys(copy.properties)) {
+        copy.properties[k] = fixSchema(
+          copy.properties[k]
+        );
+      }
+    }
+    if (copy.items) {
+      copy.items = fixSchema(copy.items);
+    }
+  }
+  return copy;
+};
+function buildTools(tools) {
+  const nameMap = /* @__PURE__ */ new Map();
+  const transformedTools = tools.map((t) => {
+    const fixed = fixSchema(t.parameters);
+    const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
+    const sanitized = t.name.replace(/[^a-zA-Z0-9_-]/g, "_").slice(0, 64) || "tool";
+    nameMap.set(sanitized, t.name);
+    return {
+      type: "function",
+      name: sanitized,
+      description: t.description,
+      inputSchema
+    };
+  });
+  const toolsMap = Object.fromEntries(
+    transformedTools.map((t) => [
+      t.name,
+      (0, import_ai2.tool)({
+        description: typeof t.description === "string" ? t.description : void 0,
+        inputSchema: (0, import_ai2.jsonSchema)(t.inputSchema)
+      })
+    ])
+  );
+  return { nameMap, toolsMap };
+}
+async function mapWithConcurrency(items, concurrencyLimit, mapper) {
+  const results = new Array(items.length);
+  let idx = 0;
+  const workers = new Array(Math.min(concurrencyLimit, items.length)).fill(0).map(async () => {
+    while (true) {
+      const current = idx;
+      idx += 1;
+      if (current >= items.length) {
+        break;
+      }
+      results[current] = await mapper(items[current]);
+    }
+  });
+  await Promise.all(workers);
+  return results;
+}
+async function runSingleCase(testCase, model, possibleAnswersMap, temperature, maxTokens) {
+  const caseLogs = [];
+  const { function: tools, question: messages } = testCase;
+  try {
+    const { nameMap, toolsMap } = buildTools(tools);
+    const debugSummaryRef = {};
+    const providerOptions = {
+      toolCallMiddleware: { debugSummary: debugSummaryRef }
+    };
+    const { toolCalls, finishReason } = await (0, import_ai2.generateText)({
+      model,
+      messages,
+      tools: toolsMap,
+      toolChoice: "auto",
+      providerOptions,
+      ...temperature !== void 0 ? { temperature } : {},
+      ...maxTokens !== void 0 ? { maxOutputTokens: maxTokens } : {}
+    });
+    const restoredCalls = (toolCalls != null ? toolCalls : []).map((c) => {
+      var _a, _b, _c, _d;
+      const rawName = (_a = c.toolName) != null ? _a : c.name;
+      const originalName = (_b = nameMap.get(rawName)) != null ? _b : rawName;
+      return {
+        toolName: originalName,
+        name: originalName,
+        args: (_d = (_c = c.input) != null ? _c : c.args) != null ? _d : {}
+      };
+    });
+    caseLogs.push(
+      `[DEBUG] ${testCase.id}: toolCalls=${JSON.stringify(restoredCalls)}, finishReason=${finishReason}`
+    );
+    const possibleAnswer = possibleAnswersMap.get(testCase.id);
+    if (!possibleAnswer) {
+      throw new Error(`No possible answer for id: ${testCase.id}`);
+    }
+    const checkerResult = checkAllFunctionCalls(
+      restoredCalls,
+      possibleAnswer.ground_truth,
+      tools
+    );
+    if (checkerResult.valid) {
+      caseLogs.push(`[PASS] ${testCase.id}`);
+      return { valid: true, logs: caseLogs };
+    }
+    caseLogs.push(`[FAIL] ${testCase.id}: ${checkerResult.error}`);
+    return { valid: false, logs: caseLogs };
+  } catch (e) {
+    caseLogs.push(`[ERROR] ${testCase.id}: ${e == null ? void 0 : e.message}`);
+    return { valid: false, logs: caseLogs };
+  }
+}
+async function loadTestData(dataPath, testDataFile) {
+  const testCasesJson = await import_node_fs3.promises.readFile(
+    import_node_path3.default.join(dataPath, testDataFile),
+    "utf-8"
+  );
+  return testCasesJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+}
+async function loadAnswerData(dataPath, answerDataFile) {
+  const answersJson = await import_node_fs3.promises.readFile(
+    import_node_path3.default.join(dataPath, answerDataFile),
+    "utf-8"
+  );
+  const answers = answersJson.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+  return new Map(answers.map((ans) => [ans.id, ans]));
+}
+function getConfigValues(config) {
+  const limitEnv = process.env.COMPLEXFUNCBENCH_LIMIT;
+  const limit = limitEnv ? Number(limitEnv) : void 0;
+  const concurrencyEnv = process.env.COMPLEXFUNCBENCH_CONCURRENCY;
+  const concurrency = concurrencyEnv && Number.isFinite(Number(concurrencyEnv)) ? Math.max(1, Number(concurrencyEnv)) : 4;
+  const temperature = typeof (config == null ? void 0 : config.temperature) === "number" ? config.temperature : void 0;
+  const maxTokens = typeof (config == null ? void 0 : config.maxTokens) === "number" ? config.maxTokens : void 0;
+  return { limit, concurrency, temperature, maxTokens };
+}
+function aggregateResults(resultsPerCase, testCases) {
+  const logs = [];
+  const correctCount = resultsPerCase.reduce(
+    (acc, r) => acc + (r.valid ? 1 : 0),
+    0
+  );
+  for (const r of resultsPerCase) {
+    logs.push(...r.logs);
+  }
+  if (testCases.length === 0) {
+    return {
+      score: 0,
+      success: false,
+      metrics: {},
+      logs: ["No test cases found."]
+    };
+  }
+  const score = correctCount / testCases.length;
+  return {
+    score,
+    success: score > 0.5,
+    metrics: {
+      correct_count: correctCount,
+      total_cases: testCases.length,
+      accuracy: score
+    },
+    logs
+  };
+}
+function createComplexFuncBenchBenchmark(name, description, testDataFile, answerDataFile) {
+  return {
+    name,
+    version: "1.0.0",
+    description,
+    async run(model, config) {
+      var _a;
+      const logs = [];
+      try {
+        const dataPath = resolveDataDir();
+        logs.push(`[INFO] Using data dir: ${dataPath}`);
+        let testCases = await loadTestData(dataPath, testDataFile);
+        const possibleAnswersMap = await loadAnswerData(
+          dataPath,
+          answerDataFile
+        );
+        const { limit, concurrency, temperature, maxTokens } = getConfigValues(config);
+        if (limit && Number.isFinite(limit) && limit > 0) {
+          testCases = testCases.slice(0, limit);
+          logs.push(`[INFO] Limiting test cases to ${limit}`);
+        }
+        logs.push(
+          `[INFO] Running ${testCases.length} test cases with concurrency=${concurrency}`
+        );
+        const resultsPerCase = await mapWithConcurrency(
+          testCases,
+          concurrency,
+          (tc) => runSingleCase(tc, model, possibleAnswersMap, temperature, maxTokens)
+        );
+        const result = aggregateResults(resultsPerCase, testCases);
+        result.logs = [...logs, ...(_a = result.logs) != null ? _a : []];
+        return result;
+      } catch (e) {
+        return {
+          score: 0,
+          success: false,
+          metrics: {},
+          error: e,
+          logs: [
+            `[FATAL] Failed to run benchmark ${name}: ${e.message}`
+          ]
+        };
+      }
+    }
+  };
+}
+var complexFuncBenchBenchmark = createComplexFuncBenchBenchmark(
+  "complex-func-bench",
+  "ComplexFuncBench - Complex Function Calling (multi-step, constraints, long params)",
+  "ComplexFuncBench.jsonl",
+  "ComplexFuncBench_possible_answer.jsonl"
+);
+// src/benchmarks/json-generation.ts
+var import_node_fs4 = require("fs");
+var import_node_path4 = __toESM(require("path"), 1);
+var import_ai3 = require("ai");
 var import_ajv = __toESM(require("ajv"), 1);
 var JSON_FENCE_REGEX = /```json\s*([\s\S]*?)```/i;
 var CODE_FENCE_REGEX = /```\s*([\s\S]*?)```/i;
 var NEWLINE_REGEX = /\r?\n/;
-var LINE_SPLIT_REGEX2 = /\r?\n/;
+var LINE_SPLIT_REGEX3 = /\r?\n/;
 function tryDirectParse(text) {
   try {
     return JSON.parse(text);
-  } catch {
+  } catch (e) {
     return;
   }
 }
@@ -1201,7 +1628,7 @@ function tryCodeFenceParse(text) {
   const inner = fenceMatch[1].trim();
   try {
     return JSON.parse(inner);
-  } catch {
+  } catch (e) {
     return;
   }
 }
@@ -1226,7 +1653,7 @@ function tryBracketScan(text) {
       const candidate = text.slice(start, i + 1);
       try {
         return JSON.parse(candidate);
-      } catch {
+      } catch (e) {
         return;
       }
     }
@@ -1274,12 +1701,12 @@ function subsetMatch(expected, actual) {
 async function loadDatasets() {
   try {
     const dataDir = resolveDataDir();
-    const testsJsonl = await import_node_fs3.promises.readFile(
-      import_node_path3.default.join(dataDir, "json_generation_tests.jsonl"),
+    const testsJsonl = await import_node_fs4.promises.readFile(
+      import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
       "utf-8"
     );
-    const expectedJsonl = await import_node_fs3.promises.readFile(
-      import_node_path3.default.join(dataDir, "json_generation_expected.jsonl"),
+    const expectedJsonl = await import_node_fs4.promises.readFile(
+      import_node_path4.default.join(dataDir, "json_generation_expected.jsonl"),
       "utf-8"
     );
     const tests = testsJsonl.split(NEWLINE_REGEX).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -1335,10 +1762,11 @@ function validateTestCase(tc, parsed, context) {
   return { valid, valuesOk, parsed };
 }
 async function processTestCase(tc, context) {
+  var _a;
   const messages = buildMessages(tc);
-  const temp = context.config?.temperature;
+  const temp = (_a = context.config) == null ? void 0 : _a.temperature;
   const temperature = typeof temp === "number" ? temp : void 0;
-  const { text } = await (0, import_ai2.generateText)({
+  const { text } = await (0, import_ai3.generateText)({
     model: context.model,
     messages,
     ...temperature !== void 0 ? { temperature } : {}
@@ -1346,7 +1774,7 @@ async function processTestCase(tc, context) {
   let parsed;
   try {
     parsed = extractFirstJsonBlock(text);
-  } catch {
+  } catch (e) {
   }
   if (parsed === void 0) {
     context.validation.logs.push(
@@ -1440,21 +1868,22 @@ function buildBenchmarkResult(total, counts, logs) {
 async function loadSchemaOnlyTests() {
   try {
     const dataDir = resolveDataDir();
-    const testsJsonl = await import_node_fs3.promises.readFile(
-      import_node_path3.default.join(dataDir, "json_generation_tests.jsonl"),
+    const testsJsonl = await import_node_fs4.promises.readFile(
+      import_node_path4.default.join(dataDir, "json_generation_tests.jsonl"),
       "utf-8"
     );
-    const tests = testsJsonl.split(LINE_SPLIT_REGEX2).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+    const tests = testsJsonl.split(LINE_SPLIT_REGEX3).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
     return { tests };
   } catch (e) {
     return { tests: [], error: e };
   }
 }
 async function processSchemaOnlyTestCase(tc, context) {
+  var _a;
   const messages = buildMessages(tc);
-  const temp = context.config?.temperature;
+  const temp = (_a = context.config) == null ? void 0 : _a.temperature;
   const temperature = typeof temp === "number" ? temp : void 0;
-  const { text } = await (0, import_ai2.generateText)({
+  const { text } = await (0, import_ai3.generateText)({
     model: context.model,
     messages,
     ...temperature !== void 0 ? { temperature } : {}
@@ -1462,7 +1891,7 @@ async function processSchemaOnlyTestCase(tc, context) {
   let parsed;
   try {
     parsed = extractFirstJsonBlock(text);
-  } catch {
+  } catch (e) {
   }
   if (parsed === void 0) {
     context.logs.push(
@@ -1539,8 +1968,56 @@ var colors = {
   yellow: "\x1B[33m",
   cyan: "\x1B[36m",
   magenta: "\x1B[35m",
-  gray: "\x1B[90m"
+  gray: "\x1B[90m",
+  white: "\x1B[37m",
+  bgRed: "\x1B[41m"
 };
+function formatDiff(diff) {
+  if (!diff || diff.length === 0) {
+    return "";
+  }
+  return diff.map((line) => {
+    if (line.startsWith("-")) {
+      return `${colors.red}${line}${colors.reset}`;
+    }
+    if (line.startsWith("+")) {
+      return `${colors.green}${line}${colors.reset}`;
+    }
+    if (line.startsWith("@@")) {
+      return `${colors.cyan}${line}${colors.reset}`;
+    }
+    return line;
+  }).join("\n      ");
+}
+function printFailLogs(logs) {
+  const failLogs = logs.filter((l) => l.startsWith("[DEBUG-FAIL]"));
+  for (const log of failLogs) {
+    try {
+      const jsonStr = log.replace("[DEBUG-FAIL] ", "");
+      const data = JSON.parse(jsonStr);
+      console.log(`
+    ${colors.red}FAILED CASE: ${data.id}${colors.reset}`);
+      console.log(
+        `    Error Type: ${colors.yellow}${data.error_type || "unknown"}${colors.reset}`
+      );
+      console.log(`    Message: ${data.message}`);
+      if (data.diff && Array.isArray(data.diff)) {
+        console.log(`    Diff:
+      ${formatDiff(data.diff)}`);
+      }
+      if (data.expected && data.actual) {
+        const expStr = JSON.stringify(data.expected);
+        const actStr = JSON.stringify(data.actual);
+        if (expStr.length < 100 && actStr.length < 100) {
+          console.log(`    Expected: ${colors.gray}${expStr}${colors.reset}`);
+          console.log(`    Actual:   ${colors.gray}${actStr}${colors.reset}`);
+        }
+      }
+    } catch (_e) {
+      console.log(`    Raw Log: ${log}`);
+    }
+  }
+}
 function printResult(result) {
   const { model, modelKey, benchmark, result: benchmarkResult } = result;
   const status = benchmarkResult.success ? `${colors.green}\u2714 SUCCESS${colors.reset}` : `${colors.red}\u2716 FAILURE${colors.reset}`;
@@ -1563,6 +2040,18 @@ function printResult(result) {
       `    ${colors.red}Error: ${benchmarkResult.error.message}${colors.reset}`
     );
   }
+  if (!benchmarkResult.success && benchmarkResult.logs) {
+    printFailLogs(benchmarkResult.logs);
+    const failLogs = benchmarkResult.logs.filter(
+      (l) => l.startsWith("[DEBUG-FAIL]")
+    );
+    if (failLogs.length === 0 && benchmarkResult.logs.length > 0) {
+      console.log("    Raw Logs (Sample):");
+      for (const l of benchmarkResult.logs.slice(0, 10)) {
+        console.log(`      ${l}`);
+      }
+    }
+  }
 }
 function consoleReporter(results) {
   console.log("\n--- \u{1F4CA} Evaluation Report ---");
@@ -1617,14 +2106,14 @@ function hasFunctionNameIssue(diff) {
   );
 }
 function suggestFunctionNameFix(expected, actual, suggestions) {
-  const expectedName = expected?.function;
-  const actualName = actual?.function;
+  const expectedName = expected == null ? void 0 : expected.function;
+  const actualName = actual == null ? void 0 : actual.function;
   if (expectedName && actualName && expectedName !== actualName) {
     suggestions.push(
       `Call the function '${expectedName}' instead of '${actualName}'.`
     );
   }
-  if (Array.isArray(expected?.functions)) {
+  if (Array.isArray(expected == null ? void 0 : expected.functions)) {
     suggestions.push(
       `Ensure tool calls include: ${expected.functions.join(", ")}.`
     );
@@ -1679,7 +2168,7 @@ function suggestFromErrorType(error_type, suggestions) {
 }
 function suggestFixFromDiff(parsed) {
   const suggestions = [];
-  const { error_type, expected, actual, diff } = parsed ?? {};
+  const { error_type, expected, actual, diff } = parsed != null ? parsed : {};
   if (!Array.isArray(diff)) {
     if (suggestions.length === 0 && typeof error_type === "string") {
       suggestFromErrorType(error_type, suggestions);
@@ -1704,15 +2193,16 @@ function suggestFixFromDiff(parsed) {
   return uniqueLines(suggestions);
 }
 function getTestIdFromLogLine(line) {
+  var _a, _b;
   if (line.startsWith("[FAIL]")) {
     const m = line.match(FAIL_ID_REGEX);
-    return m?.[1];
+    return m == null ? void 0 : m[1];
   }
   if (line.startsWith("[DEBUG-FAIL]")) {
     try {
       const parsed = JSON.parse(line.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
-      return String(parsed?.id ?? "");
-    } catch {
+      return String((_a = parsed == null ? void 0 : parsed.id) != null ? _a : "");
+    } catch (e) {
     }
   }
   if (line.startsWith("[DEBUG-FAIL-CONTEXT]")) {
@@ -1720,18 +2210,19 @@ function getTestIdFromLogLine(line) {
       const parsed = JSON.parse(
         line.replace(DEBUG_FAIL_CONTEXT_PREFIX_REGEX, "")
       );
-      return String(parsed?.id ?? "");
-    } catch {
+      return String((_b = parsed == null ? void 0 : parsed.id) != null ? _b : "");
+    } catch (e) {
     }
   }
   return;
 }
 function groupLogsByTestId(failLogs) {
+  var _a;
   const byId = /* @__PURE__ */ new Map();
   for (const line of failLogs) {
     const id = getTestIdFromLogLine(line);
-    const key = id ?? "__general__";
-    const arr = byId.get(key) ?? [];
+    const key = id != null ? id : "__general__";
+    const arr = (_a = byId.get(key)) != null ? _a : [];
     arr.push(line);
     byId.set(key, arr);
   }
@@ -1743,10 +2234,10 @@ function collectDebugIds(lines) {
     if (l.startsWith("[DEBUG-FAIL]")) {
       try {
         const parsed = JSON.parse(l.replace(DEBUG_FAIL_PREFIX_REGEX, ""));
-        if (parsed?.id) {
+        if (parsed == null ? void 0 : parsed.id) {
           debugIds.add(String(parsed.id));
         }
-      } catch {
+      } catch (e) {
       }
     }
   }
@@ -1782,7 +2273,7 @@ function displayDebugFailLine(line) {
         console.log(`            \u2022 ${s}`);
       }
     }
-  } catch {
+  } catch (e) {
     console.log(`        ${line}`);
   }
 }
@@ -1826,14 +2317,14 @@ function displayDebugFailContextLine(line) {
     const ctx = JSON.parse(payload);
     console.log(`        ${colors2.gray}context:${colors2.reset}`);
     displayContextInfo(ctx);
-  } catch {
+  } catch (e) {
     console.log(`        ${line}`);
   }
 }
 function displayLogLine(line, debugIds) {
   if (line.startsWith("[FAIL]")) {
     const m = line.match(FAIL_ID_REGEX);
-    const failId = m?.[1];
+    const failId = m == null ? void 0 : m[1];
     if (failId && debugIds.has(failId)) {
       return;
     }
@@ -1903,11 +2394,12 @@ function displayResultHeader(r) {
   );
 }
 function consoleDebugReporter(results) {
+  var _a;
   console.log("\n--- \u{1F4CA} Evaluation Report (debug) ---");
   for (const r of results) {
     displayResultHeader(r);
     displayMetrics(Object.entries(r.result.metrics));
-    if (r.result.logs?.length) {
+    if ((_a = r.result.logs) == null ? void 0 : _a.length) {
       displayResultLogs(r.result.logs);
     }
   }
@@ -1916,13 +2408,16 @@ function consoleDebugReporter(results) {
 // src/reporters/json.ts
 function jsonReporter(results) {
-  const serializableResults = results.map((r) => ({
-    ...r,
-    result: {
-      ...r.result,
-      error: r.result.error?.message
-    }
-  }));
+  const serializableResults = results.map((r) => {
+    var _a;
+    return {
+      ...r,
+      result: {
+        ...r.result,
+        error: (_a = r.result.error) == null ? void 0 : _a.message
+      }
+    };
+  });
   console.log(JSON.stringify(serializableResults, null, 2));
 }
@@ -2035,6 +2530,7 @@ async function evaluate(options) {
   bfclParallelBenchmark,
   bfclParallelMultipleBenchmark,
   bfclSimpleBenchmark,
+  complexFuncBenchBenchmark,
   evaluate,
   jsonGenerationBenchmark,
   jsonGenerationSchemaOnlyBenchmark