npm - @ai-sdk-tool/eval - Versions diffs - 0.1.3 → 0.1.5 - Mend

@ai-sdk-tool/eval 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) [hide / show]

package/dist/index.js — CHANGED (Viewed)

@@ -71,7 +71,7 @@ function uniqueLines(lines) {
 function suggestFixFromDiff(parsed) {
   const suggestions = [];
   const { error_type, expected, actual, diff } = parsed ?? {};
-  if (diff && diff.some((d) => d.includes("function name")) || diff && diff.some((d) => d.includes("missing function:"))) {
+  if (Array.isArray(diff) && diff.some((d) => String(d).includes("function name")) || Array.isArray(diff) && diff.some((d) => String(d).includes("missing function:"))) {
     const expectedName = expected?.function;
     const actualName = actual?.function;
     if (expectedName && actualName && expectedName !== actualName) {
@@ -85,23 +85,23 @@ function suggestFixFromDiff(parsed) {
       );
     }
   }
-  if (diff && diff.some((d) => d.startsWith("- missing required param:"))) {
-    const missing = diff.filter((d) => d.startsWith("- missing required param:")).map((d) => d.replace("- missing required param: ", ""));
+  if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("- missing required param:"))) {
+    const missing = diff.filter((d) => String(d).startsWith("- missing required param:")).map((d) => String(d).replace("- missing required param: ", ""));
     if (missing.length) {
       suggestions.push(`Add required parameter(s): ${missing.join(", ")}.`);
     }
   }
-  if (diff && diff.some((d) => d.startsWith("+ unexpected param:"))) {
-    const extras = diff.filter((d) => d.startsWith("+ unexpected param:")).map((d) => d.replace("+ unexpected param: ", ""));
+  if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("+ unexpected param:"))) {
+    const extras = diff.filter((d) => String(d).startsWith("+ unexpected param:")).map((d) => String(d).replace("+ unexpected param: ", ""));
     if (extras.length) {
       suggestions.push(`Remove unexpected parameter(s): ${extras.join(", ")}.`);
     }
   }
-  if (diff && diff.some((d) => d.startsWith("@@ param "))) {
-    const targets = diff.filter((d) => d.startsWith("@@ param ")).map((d) => d.replace("@@ param ", ""));
+  if (Array.isArray(diff) && diff.some((d) => String(d).startsWith("@@ param "))) {
+    const targets = diff.filter((d) => String(d).startsWith("@@ param ")).map((d) => String(d).replace("@@ param ", ""));
     for (const param of targets) {
       const allowedLine = diff.find(
-        (d) => d.startsWith("- expected one of:")
+        (d) => String(d).startsWith("- expected one of:")
       );
       if (allowedLine) {
         const allowed = allowedLine.replace("- expected one of: ", "");
@@ -239,13 +239,13 @@ var reporters = {
 };
 // src/evaluate.ts
-async function runSingleBenchmark(model, benchmark, modelKey) {
+async function runSingleBenchmark(model, benchmark, modelKey, config) {
   const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
   try {
     console.log(
       `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
     );
-    const result = await benchmark.run(model);
+    const result = await benchmark.run(model, config);
     console.log(
       `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
     );
@@ -274,7 +274,7 @@ async function runSingleBenchmark(model, benchmark, modelKey) {
   }
 }
 async function evaluate(options) {
-  const { models, benchmarks, reporter = "console" } = options;
+  const { models, benchmarks, reporter = "console", temperature } = options;
   const modelEntries = [];
   if (Array.isArray(models)) {
     for (const m of models) modelEntries.push([void 0, m]);
@@ -293,7 +293,8 @@ async function evaluate(options) {
       const evaluationResult = await runSingleBenchmark(
         model,
         benchmark,
-        modelKey
+        modelKey,
+        temperature !== void 0 ? { temperature } : void 0
       );
       allResults.push(evaluationResult);
     }
@@ -308,17 +309,16 @@ async function evaluate(options) {
   return allResults;
 }
-// src/benchmarks/json-generation.ts
-import { generateText } from "ai";
-import Ajv from "ajv";
+// src/benchmarks/bfcl.ts
+import { generateText, jsonSchema, tool } from "ai";
 import { promises as fs2 } from "fs";
 import path2 from "path";
 // src/utils/paths.ts
 import fs from "fs";
+import { createRequire } from "module";
 import path from "path";
 import { fileURLToPath } from "url";
-import { createRequire } from "module";
 function resolveDataDir(fromModuleUrl) {
   const moduleUrl = fromModuleUrl;
   const override = process.env.BFCL_DATA_DIR;
@@ -366,263 +366,6 @@ function resolveDataDir(fromModuleUrl) {
   return path.join(pkgRoot, "data");
 }
-// src/benchmarks/json-generation.ts
-function extractFirstJsonBlock(text) {
-  try {
-    return JSON.parse(text);
-  } catch {
-  }
-  const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
-  if (fenceMatch) {
-    const inner = fenceMatch[1].trim();
-    try {
-      return JSON.parse(inner);
-    } catch {
-    }
-  }
-  const startIdxObj = text.indexOf("{");
-  const startIdxArr = text.indexOf("[");
-  const start = [startIdxObj, startIdxArr].filter((i) => i >= 0).sort((a, b) => a - b)[0];
-  if (start === void 0) return void 0;
-  const open = text[start] === "{" ? "{" : "[";
-  const close = open === "{" ? "}" : "]";
-  let depth = 0;
-  for (let i = start; i < text.length; i++) {
-    const ch = text[i];
-    if (ch === open) depth++;
-    else if (ch === close) depth--;
-    if (depth === 0) {
-      const candidate = text.slice(start, i + 1);
-      try {
-        return JSON.parse(candidate);
-      } catch {
-      }
-      break;
-    }
-  }
-  return void 0;
-}
-function subsetMatch(expected, actual) {
-  if (expected === null || typeof expected !== "object") {
-    return expected === actual;
-  }
-  if (Array.isArray(expected)) {
-    if (!Array.isArray(actual)) return false;
-    for (let i = 0; i < expected.length; i++) {
-      if (!subsetMatch(expected[i], actual[i])) return false;
-    }
-    return true;
-  }
-  if (actual === null || typeof actual !== "object") return false;
-  const eObj = expected;
-  const aObj = actual;
-  for (const key of Object.keys(eObj)) {
-    if (!subsetMatch(eObj[key], aObj[key])) return false;
-  }
-  return true;
-}
-var jsonGenerationBenchmark = {
-  name: "json-generation",
-  version: "2.1.0",
-  description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
-  async run(model) {
-    const logs = [];
-    const ajv = new Ajv({ allErrors: true, strict: false });
-    let schemaValidCount = 0;
-    let valueMatchCount = 0;
-    let correctCount = 0;
-    let tests = [];
-    const expectedMap = /* @__PURE__ */ new Map();
-    try {
-      const dataDir = resolveDataDir();
-      const testsJsonl = await fs2.readFile(
-        path2.join(dataDir, "json_generation_tests.jsonl"),
-        "utf-8"
-      );
-      const expectedJsonl = await fs2.readFile(
-        path2.join(dataDir, "json_generation_expected.jsonl"),
-        "utf-8"
-      );
-      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
-      const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
-      for (const r of expecteds) expectedMap.set(r.id, r);
-    } catch (e) {
-      const msg = e instanceof Error ? e.message : String(e);
-      return {
-        score: 0,
-        success: false,
-        metrics: {},
-        logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
-        error: e
-      };
-    }
-    for (const tc of tests) {
-      try {
-        const schemaStr = JSON.stringify(tc.schema, null, 2);
-        const messages = [
-          {
-            role: "system",
-            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
-          },
-          {
-            role: "user",
-            content: [
-              "Generate a JSON object that reflects the following facts.",
-              "JSON Schema:",
-              schemaStr,
-              "Facts:",
-              tc.promptFacts,
-              "Output must be a single JSON only, with no additional text."
-            ].join("\n\n")
-          }
-        ];
-        const { text } = await generateText({ model, messages });
-        let parsed;
-        try {
-          parsed = extractFirstJsonBlock(text);
-        } catch {
-        }
-        if (parsed === void 0) {
-          logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
-          continue;
-        }
-        const validate = ajv.compile(tc.schema);
-        const valid = validate(parsed);
-        if (valid) schemaValidCount++;
-        else
-          logs.push(
-            `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
-          );
-        const expectedRec = expectedMap.get(tc.id);
-        if (!expectedRec) {
-          logs.push(
-            `[WARN] ${tc.id}: No expected record found. Skipping value match.`
-          );
-        }
-        const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
-        if (valuesOk) valueMatchCount++;
-        if (valid && valuesOk) {
-          correctCount++;
-          logs.push(`[PASS] ${tc.id}`);
-        } else {
-          logs.push(
-            `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
-              parsed
-            )}`
-          );
-        }
-      } catch (e) {
-        const msg = e instanceof Error ? e.message : String(e);
-        logs.push(`[ERROR] ${tc.id}: ${msg}`);
-      }
-    }
-    const total = tests.length;
-    const score = correctCount / total;
-    return {
-      score,
-      success: score >= 0.8,
-      metrics: {
-        total_cases: total,
-        correct_count: correctCount,
-        schema_valid_count: schemaValidCount,
-        value_match_count: valueMatchCount,
-        accuracy: score
-      },
-      logs
-    };
-  }
-};
-var jsonGenerationSchemaOnlyBenchmark = {
-  name: "json-generation-schema-only",
-  version: "1.0.1",
-  description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
-  async run(model) {
-    const logs = [];
-    const ajv = new Ajv({ allErrors: true, strict: false });
-    let tests = [];
-    try {
-      const dataDir = resolveDataDir();
-      const testsJsonl = await fs2.readFile(
-        path2.join(dataDir, "json_generation_tests.jsonl"),
-        "utf-8"
-      );
-      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
-    } catch (e) {
-      const msg = e instanceof Error ? e.message : String(e);
-      return {
-        score: 0,
-        success: false,
-        metrics: {},
-        logs: [`[FATAL] Failed to load schema-only tests: ${msg}`],
-        error: e
-      };
-    }
-    let schemaValidCount = 0;
-    for (const tc of tests) {
-      try {
-        const schemaStr = JSON.stringify(tc.schema, null, 2);
-        const messages = [
-          {
-            role: "system",
-            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
-          },
-          {
-            role: "user",
-            content: [
-              "Generate a JSON object that reflects the following facts.",
-              "JSON Schema:",
-              schemaStr,
-              "Facts:",
-              tc.promptFacts,
-              "Output must be a single JSON only, with no additional text."
-            ].join("\n\n")
-          }
-        ];
-        const { text } = await generateText({ model, messages });
-        let parsed;
-        try {
-          parsed = extractFirstJsonBlock(text);
-        } catch {
-        }
-        if (parsed === void 0) {
-          logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
-          continue;
-        }
-        const validate = ajv.compile(tc.schema);
-        const valid = validate(parsed);
-        if (valid) {
-          schemaValidCount++;
-          logs.push(`[PASS] ${tc.id}`);
-        } else {
-          logs.push(
-            `[FAIL] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
-          );
-        }
-      } catch (e) {
-        const msg = e instanceof Error ? e.message : String(e);
-        logs.push(`[ERROR] ${tc.id}: ${msg}`);
-      }
-    }
-    const total = tests.length;
-    const score = total > 0 ? schemaValidCount / total : 0;
-    return {
-      score,
-      success: score >= 0.8,
-      metrics: {
-        total_cases: total,
-        schema_valid_count: schemaValidCount,
-        accuracy: score
-      },
-      logs
-    };
-  }
-};
-// src/benchmarks/bfcl.ts
-import { generateText as generateText2, jsonSchema, tool } from "ai";
-import { promises as fs3 } from "fs";
-import path3 from "path";
 // src/benchmarks/bfcl/ast-checker.ts
 function standardizeString(input) {
   if (typeof input !== "string") return input;
@@ -632,7 +375,7 @@ function standardizeString(input) {
 function checkStringValue(param, modelValue, possibleAnswers) {
   const standardizedModelValue = standardizeString(modelValue);
   const standardizedPossibleAnswers = possibleAnswers.map(
-    (ans) => standardizeString(ans)
+    (ans) => standardizeString(String(ans))
   );
   if (!standardizedPossibleAnswers.includes(standardizedModelValue)) {
     return {
@@ -659,8 +402,9 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
     };
   }
   const possibleAnswerParams = possibleAnswer[Object.keys(possibleAnswer)[0]];
+  const argsObj = modelArgs && typeof modelArgs === "object" ? modelArgs : {};
   for (const param of requiredParams) {
-    if (!(param in modelArgs)) {
+    if (!(param in argsObj)) {
       return {
         valid: false,
         error: `Missing required parameter: '${param}'.`,
@@ -668,87 +412,98 @@ function simpleFunctionChecker(funcDescription, modelToolCall, possibleAnswer) {
       };
     }
   }
-  for (const paramName in modelArgs) {
-    const modelValue = modelArgs[paramName];
-    if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
-      return {
-        valid: false,
-        error: `Unexpected parameter: '${paramName}'.`,
-        error_type: "simple_function_checker:unexpected_param"
-      };
-    }
-    const possibleValues = possibleAnswerParams[paramName];
-    if (typeof modelValue === "string") {
-      const result = checkStringValue(paramName, modelValue, possibleValues);
-      if (!result.valid) return result;
-    } else if (Array.isArray(modelValue)) {
-      const modelValueStr = JSON.stringify(
-        modelValue.map((v) => standardizeString(v.toString())).sort()
-      );
-      const hasMatch = possibleValues.some(
-        (p) => JSON.stringify(
-          p.map((v) => standardizeString(v.toString())).sort()
-        ) === modelValueStr
-      );
-      if (!hasMatch) {
+  if (modelArgs && typeof modelArgs === "object") {
+    for (const paramName of Object.keys(argsObj)) {
+      const modelValue = argsObj[paramName];
+      if (!(paramName in expectedParams) || !(paramName in possibleAnswerParams)) {
         return {
           valid: false,
-          error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
-            modelValue
-          )}. Expected one of ${JSON.stringify(possibleValues)}.`,
-          error_type: "value_error:list"
+          error: `Unexpected parameter: '${paramName}'.`,
+          error_type: "simple_function_checker:unexpected_param"
         };
       }
-    } else {
-      const hasMatch = possibleValues.some((possibleValue) => {
-        if (modelValue === possibleValue) return true;
-        if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
-          try {
-            const normalizeObject = (obj) => {
-              if (Array.isArray(obj)) {
-                return obj.map(normalizeObject);
-              }
-              if (obj && typeof obj === "object") {
-                const normalized = {};
-                for (const [key, value] of Object.entries(obj)) {
-                  if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
-                    normalized[key] = value[0];
-                  } else {
-                    normalized[key] = normalizeObject(value);
+      const possibleValues = possibleAnswerParams[paramName];
+      if (typeof modelValue === "string") {
+        const result = checkStringValue(
+          paramName,
+          modelValue,
+          possibleValues ?? []
+        );
+        if (!result.valid) return result;
+      } else if (Array.isArray(modelValue)) {
+        const modelValueStr = JSON.stringify(
+          modelValue.map((v) => standardizeString(String(v))).sort()
+        );
+        const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((p) => {
+          if (!Array.isArray(p)) return false;
+          return JSON.stringify(
+            p.map((v) => standardizeString(String(v))).sort()
+          ) === modelValueStr;
+        }) : false;
+        if (!hasMatch) {
+          return {
+            valid: false,
+            error: `Invalid value for list parameter '${paramName}'. Got ${JSON.stringify(
+              modelValue
+            )}. Expected one of ${JSON.stringify(possibleValues)}.`,
+            error_type: "value_error:list"
+          };
+        }
+      } else {
+        const hasMatch = Array.isArray(possibleValues) ? possibleValues.some((possibleValue) => {
+          if (modelValue === possibleValue) return true;
+          if (typeof modelValue === "object" && modelValue !== null && typeof possibleValue === "object" && possibleValue !== null) {
+            try {
+              const normalizeObject = (obj) => {
+                if (Array.isArray(obj)) {
+                  return obj.map(normalizeObject);
+                }
+                if (obj && typeof obj === "object") {
+                  const normalized = {};
+                  for (const [key, value] of Object.entries(
+                    obj
+                  )) {
+                    if (Array.isArray(value) && value.length === 1 && (typeof value[0] !== "object" || value[0] === null)) {
+                      normalized[key] = value[0];
+                    } else {
+                      normalized[key] = normalizeObject(value);
+                    }
                   }
+                  return normalized;
                 }
-                return normalized;
-              }
-              return obj;
-            };
-            const normalizedModel = normalizeObject(modelValue);
-            const normalizedPossible = normalizeObject(possibleValue);
-            return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
-          } catch {
-            return false;
+                return obj;
+              };
+              const normalizedModel = normalizeObject(modelValue);
+              const normalizedPossible = normalizeObject(possibleValue);
+              return JSON.stringify(normalizedModel) === JSON.stringify(normalizedPossible);
+            } catch {
+              return false;
+            }
           }
+          if (typeof modelValue === "number" && typeof possibleValue === "string") {
+            return modelValue.toString() === possibleValue;
+          }
+          if (typeof modelValue === "string" && typeof possibleValue === "number") {
+            return modelValue === possibleValue.toString();
+          }
+          return false;
+        }) : false;
+        if (!hasMatch) {
+          return {
+            valid: false,
+            error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
+              modelValue
+            )}. Expected one of ${JSON.stringify(possibleValues)}.`,
+            error_type: "value_error:other"
+          };
         }
-        if (typeof modelValue === "number" && typeof possibleValue === "string") {
-          return modelValue.toString() === possibleValue;
-        }
-        if (typeof modelValue === "string" && typeof possibleValue === "number") {
-          return modelValue === possibleValue.toString();
-        }
-        return false;
-      });
-      if (!hasMatch) {
-        return {
-          valid: false,
-          error: `Invalid value for parameter '${paramName}'. Got ${JSON.stringify(
-            modelValue
-          )}. Expected one of ${JSON.stringify(possibleValues)}.`,
-          error_type: "value_error:other"
-        };
       }
     }
   }
   for (const paramName in possibleAnswerParams) {
-    if (!(paramName in modelArgs) && !possibleAnswerParams[paramName].includes("")) {
+    const val = possibleAnswerParams[paramName];
+    const isOptional = Array.isArray(val) && val.includes("");
+    if (!(paramName in argsObj) && !isOptional) {
       return {
         valid: false,
         error: `Missing optional parameter '${paramName}' which was not marked as optional.`,
@@ -834,10 +589,10 @@ function check(testCase, modelOutput, possibleAnswer) {
   const category = testCase.id.split("_")[0];
   try {
     if (category === "simple") {
-      if (!modelOutput || modelOutput.length !== 1) {
+      if (!Array.isArray(modelOutput) || modelOutput.length !== 1) {
         return {
           valid: false,
-          error: `Expected 1 function call, but got ${modelOutput?.length ?? 0}.`,
+          error: `Expected 1 function call, but got ${Array.isArray(modelOutput) ? modelOutput.length : 0}.`,
           error_type: "simple:wrong_count"
         };
       }
@@ -879,19 +634,19 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
     name,
     version: "1.0.0",
     description,
-    async run(model) {
+    async run(model, config) {
       const logs = [];
       let correctCount = 0;
       let testCases = [];
       try {
         const dataPath = resolveDataDir();
         logs.push(`[INFO] Using data dir: ${dataPath}`);
-        const testCasesJson = await fs3.readFile(
-          path3.join(dataPath, testDataFile),
+        const testCasesJson = await fs2.readFile(
+          path2.join(dataPath, testDataFile),
           "utf-8"
         );
-        const possibleAnswersJson = await fs3.readFile(
-          path3.join(dataPath, answerDataFile),
+        const possibleAnswersJson = await fs2.readFile(
+          path2.join(dataPath, answerDataFile),
           "utf-8"
         );
         testCases = testCasesJson.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
@@ -908,19 +663,25 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           );
         }
         const fixSchema = (schema) => {
-          if (!schema || typeof schema !== "object") return schema;
+          if (!schema || typeof schema !== "object")
+            return { type: "object", properties: {} };
           const copy = Array.isArray(schema) ? schema.map((v) => fixSchema(v)) : { ...schema };
-          if (copy.type) {
-            if (copy.type === "dict") copy.type = "object";
-            if (copy.type === "integer" || copy.type === "float")
-              copy.type = "number";
-          }
-          if (copy.properties && typeof copy.properties === "object") {
-            for (const k of Object.keys(copy.properties)) {
-              copy.properties[k] = fixSchema(copy.properties[k]);
+          if (!Array.isArray(copy)) {
+            if (copy.type) {
+              if (copy.type === "dict") copy.type = "object";
+              if (copy.type === "integer" || copy.type === "float")
+                copy.type = "number";
+            }
+            if (copy.properties && typeof copy.properties === "object") {
+              for (const k of Object.keys(copy.properties)) {
+                copy.properties[k] = fixSchema(
+                  copy.properties[k]
+                );
+              }
             }
+            if (copy.items) copy.items = fixSchema(copy.items);
+            return copy;
           }
-          if (copy.items) copy.items = fixSchema(copy.items);
           return copy;
         };
         const concurrencyEnv = process.env.BFCL_CONCURRENCY;
@@ -931,6 +692,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
         const runSingleCase = async (testCase) => {
           const caseLogs = [];
           const { function: tools, question: messages } = testCase;
+          const temp = config?.temperature;
+          const temperature = typeof temp === "number" ? temp : void 0;
           try {
             const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
             const nameMap = /* @__PURE__ */ new Map();
@@ -940,7 +703,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
             };
             const transformedTools = tools.map((t) => {
               const fixed = fixSchema(t.parameters);
-              const inputSchema = fixed && typeof fixed === "object" && fixed.type === "object" ? fixed : { type: "object", properties: {} };
+              const isObjectSchema = fixed && typeof fixed === "object" && fixed.type === "object";
+              const inputSchema = isObjectSchema ? fixed : { type: "object", properties: {} };
               const sanitized = sanitizeName(t.name);
               nameMap.set(sanitized, t.name);
               return {
@@ -970,16 +734,20 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                 `[DEBUG] ${testCase.id}: failed to introspect tools: ${e.message}`
               );
             }
-            const { toolCalls, text, finishReason } = await generateText2({
+            const { toolCalls, text, finishReason } = await generateText({
               model,
               messages: flatMessages,
               tools: toolsMap,
               toolChoice: "auto",
+              ...temperature !== void 0 ? { temperature } : {},
               // Pass original schema information to middleware
               providerOptions: {
                 toolCallMiddleware: {
                   originalToolSchemas: Object.fromEntries(
-                    transformedTools.map((t) => [t.name, t.inputSchema])
+                    transformedTools.map((t) => [
+                      t.name,
+                      t.inputSchema
+                    ])
                   )
                 }
               }
@@ -1032,10 +800,13 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                 const summarizeArgs = (args) => {
                   if (args == null) return args;
                   if (typeof args !== "object") return args;
-                  return Object.keys(args).sort().reduce((acc, k) => {
-                    acc[k] = args[k];
-                    return acc;
-                  }, {});
+                  return Object.keys(args).sort().reduce(
+                    (acc, k) => {
+                      acc[k] = args[k];
+                      return acc;
+                    },
+                    {}
+                  );
                 };
                 const expected = {};
                 const actual = {};
@@ -1056,19 +827,23 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                     diff.push(`- ${expectedFuncName}`);
                     diff.push(`+ ${receivedName}`);
                   }
-                  if (expectedParams && receivedArgs) {
+                  if (expectedParams && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
                     const required = funcDesc?.parameters?.required ?? [];
                     for (const req of required) {
                       if (!(req in receivedArgs)) {
                         diff.push(`- missing required param: ${req}`);
                       }
                     }
-                    for (const k of Object.keys(receivedArgs)) {
+                    for (const k of Object.keys(
+                      receivedArgs
+                    )) {
                       if (!Object.prototype.hasOwnProperty.call(expectedParams, k)) {
                         diff.push(`+ unexpected param: ${k}`);
                       }
                     }
-                    for (const k of Object.keys(receivedArgs)) {
+                    for (const k of Object.keys(
+                      receivedArgs
+                    )) {
                       if (Object.prototype.hasOwnProperty.call(expectedParams, k)) {
                         const allowed = expectedParams[k];
                         const got = receivedArgs[k];
@@ -1141,13 +916,15 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                     );
                     const requiredParams = funcDesc?.parameters?.required ?? [];
                     diff.push(`@@ function ${fname}`);
-                    if (expectedParamsAllowed && receivedArgs) {
+                    if (expectedParamsAllowed && receivedArgs && typeof receivedArgs === "object" && receivedArgs !== null) {
                       for (const req of requiredParams) {
                         if (!(req in receivedArgs)) {
                           diff.push(`- missing required param: ${req}`);
                         }
                       }
-                      for (const k of Object.keys(receivedArgs)) {
+                      for (const k of Object.keys(
+                        receivedArgs
+                      )) {
                         if (!Object.prototype.hasOwnProperty.call(
                           expectedParamsAllowed,
                           k
@@ -1155,7 +932,9 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
                           diff.push(`+ unexpected param: ${k}`);
                         }
                       }
-                      for (const k of Object.keys(receivedArgs)) {
+                      for (const k of Object.keys(
+                        receivedArgs
+                      )) {
                         if (Object.prototype.hasOwnProperty.call(
                           expectedParamsAllowed,
                           k
@@ -1293,6 +1072,274 @@ var bfclParallelMultipleBenchmark = createBfclBenchmark(
   "BFCL_v3_parallel_multiple.json",
   "BFCL_v3_parallel_multiple_possible_answer.json"
 );
+// src/benchmarks/json-generation.ts
+import { generateText as generateText2 } from "ai";
+import Ajv from "ajv";
+import { promises as fs3 } from "fs";
+import path3 from "path";
+/**
+ * Returns the index of the bracket that closes the JSON object/array opening
+ * at `start`, or -1 when the bracket is never balanced.
+ *
+ * Brackets that appear inside double-quoted string literals are ignored so a
+ * value such as {"a": "}"} is not cut short.
+ *
+ * @param {string} text - Text being scanned.
+ * @param {number} start - Index of a "{" or "[" character in `text`.
+ * @returns {number} Index of the matching closing bracket, or -1.
+ */
+function findJsonValueEnd(text, start) {
+  const open = text[start];
+  const close = open === "{" ? "}" : "]";
+  let depth = 0;
+  let inString = false;
+  for (let i = start; i < text.length; i++) {
+    const ch = text[i];
+    if (inString) {
+      // A backslash escapes the next character, so skip over it.
+      if (ch === "\\") i++;
+      else if (ch === '"') inString = false;
+      continue;
+    }
+    if (ch === '"') inString = true;
+    else if (ch === open) depth++;
+    else if (ch === close && --depth === 0) return i;
+  }
+  return -1;
+}
+/**
+ * Best-effort extraction of the first JSON value contained in `text`.
+ *
+ * Strategies, tried in order:
+ *   1. Parse the whole text as JSON.
+ *   2. Parse the contents of the first fenced block (```json preferred, then
+ *      any ``` fence).
+ *   3. Scan for "{" / "[" and parse the first balanced span that is valid
+ *      JSON. Later opening brackets are tried when an earlier candidate is
+ *      unbalanced or does not parse.
+ *
+ * @param {string} text - Raw model output.
+ * @returns {*} The parsed JSON value, or undefined when nothing parseable is
+ *   found (including when `text` is not a string and not directly parseable).
+ */
+function extractFirstJsonBlock(text) {
+  try {
+    return JSON.parse(text);
+  } catch {
+    // Not pure JSON; fall through to the looser strategies below.
+  }
+  if (typeof text !== "string") return void 0;
+  const fenceMatch = text.match(/```json\s*([\s\S]*?)```/i) || text.match(/```\s*([\s\S]*?)```/i);
+  if (fenceMatch) {
+    const inner = fenceMatch[1].trim();
+    try {
+      return JSON.parse(inner);
+    } catch {
+      // Fence content is not valid JSON; fall back to bracket scanning.
+    }
+  }
+  for (let start = 0; start < text.length; start++) {
+    const ch = text[start];
+    if (ch !== "{" && ch !== "[") continue;
+    const end = findJsonValueEnd(text, start);
+    if (end === -1) continue;
+    try {
+      return JSON.parse(text.slice(start, end + 1));
+    } catch {
+      // Balanced but not JSON (e.g. "[note]"); try the next opening bracket.
+    }
+  }
+  return void 0;
+}
+/**
+ * Checks whether `actual` contains everything described by `expected`.
+ *
+ * - Primitives and null must be strictly equal.
+ * - Arrays: `actual` must be an array and each expected element must match the
+ *   actual element at the same index; extra trailing elements are ignored.
+ * - Objects: `actual` must be a non-null, non-array object and every key of
+ *   `expected` must match recursively; extra keys in `actual` are ignored.
+ *
+ * @param {*} expected - Expected (partial) value.
+ * @param {*} actual - Value to check.
+ * @returns {boolean} True when `actual` satisfies `expected`.
+ */
+function subsetMatch(expected, actual) {
+  if (expected === null || typeof expected !== "object") {
+    return expected === actual;
+  }
+  if (Array.isArray(expected)) {
+    if (!Array.isArray(actual)) return false;
+    for (let i = 0; i < expected.length; i++) {
+      if (!subsetMatch(expected[i], actual[i])) return false;
+    }
+    return true;
+  }
+  // typeof [] is "object", so arrays must be rejected explicitly; otherwise an
+  // expected object such as {} would be satisfied by an array.
+  if (actual === null || typeof actual !== "object" || Array.isArray(actual)) return false;
+  const eObj = expected;
+  const aObj = actual;
+  for (const key of Object.keys(eObj)) {
+    if (!subsetMatch(eObj[key], aObj[key])) return false;
+  }
+  return true;
+}
+/**
+ * Benchmark that asks the model to produce JSON for each test case and checks
+ * it both against the case's JSON Schema (via Ajv) and against the expected
+ * values (via subsetMatch). A case is counted as correct only when both
+ * checks pass.
+ */
+var jsonGenerationBenchmark = {
+  name: "json-generation",
+  version: "2.1.0",
+  description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
+  /**
+   * Runs every test case from json_generation_tests.jsonl against `model`.
+   *
+   * @param model - Model passed through to generateText.
+   * @param config - Optional settings; only a numeric `temperature` is read.
+   * @returns Result object with `score` (correct / total, 0 when there are no
+   *   test cases), `success` (score >= 0.8), `metrics` and `logs`. When the
+   *   datasets cannot be loaded, a failed result carrying `error` is returned.
+   */
+  async run(model, config) {
+    const logs = [];
+    const ajv = new Ajv({ allErrors: true, strict: false });
+    let schemaValidCount = 0;
+    let valueMatchCount = 0;
+    let correctCount = 0;
+    let tests = [];
+    // Expected records keyed by test-case id.
+    const expectedMap = /* @__PURE__ */ new Map();
+    try {
+      const dataDir = resolveDataDir();
+      const testsJsonl = await fs3.readFile(
+        path3.join(dataDir, "json_generation_tests.jsonl"),
+        "utf-8"
+      );
+      const expectedJsonl = await fs3.readFile(
+        path3.join(dataDir, "json_generation_expected.jsonl"),
+        "utf-8"
+      );
+      // JSONL: one JSON document per non-blank line.
+      tests = testsJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+      const expecteds = expectedJsonl.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
+      for (const r of expecteds) expectedMap.set(r.id, r);
+    } catch (e) {
+      const msg = e instanceof Error ? e.message : String(e);
+      return {
+        score: 0,
+        success: false,
+        metrics: {},
+        logs: [`[FATAL] Failed to load json-generation datasets: ${msg}`],
+        error: e
+      };
+    }
+    for (const tc of tests) {
+      try {
+        const schemaStr = JSON.stringify(tc.schema, null, 2);
+        const messages = [
+          {
+            role: "system",
+            content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
+          },
+          {
+            role: "user",
+            content: [
+              "Generate a JSON object that reflects the following facts.",
+              "JSON Schema:",
+              schemaStr,
+              "Facts:",
+              tc.promptFacts,
+              "Output must be a single JSON only, with no additional text."
+            ].join("\n\n")
+          }
+        ];
+        // Only forward temperature when the caller supplied a number.
+        const temp = config?.temperature;
+        const temperature = typeof temp === "number" ? temp : void 0;
+        const { text } = await generateText2({
+          model,
+          messages,
+          ...temperature !== void 0 ? { temperature } : {}
+        });
+        let parsed;
+        try {
+          parsed = extractFirstJsonBlock(text);
+        } catch {
+          // Leave `parsed` undefined; reported as a parse failure below.
+        }
+        if (parsed === void 0) {
+          // Unparseable output still counts toward the total as a failure.
+          logs.push(`[FAIL] ${tc.id}: Unable to parse JSON from model output.`);
+          continue;
+        }
+        const validate = ajv.compile(tc.schema);
+        const valid = validate(parsed);
+        if (valid) schemaValidCount++;
+        else
+          logs.push(
+            `[INFO] ${tc.id}: Schema validation errors: ${(validate.errors || []).map((e) => `${e.instancePath} ${e.message}`).join(", ") || "unknown"}`
+          );
+        const expectedRec = expectedMap.get(tc.id);
+        if (!expectedRec) {
+          logs.push(
+            `[WARN] ${tc.id}: No expected record found. Skipping value match.`
+          );
+        }
+        // Without an expected record the values cannot match.
+        const valuesOk = expectedRec ? subsetMatch(expectedRec.expected, parsed) : false;
+        if (valuesOk) valueMatchCount++;
+        if (valid && valuesOk) {
+          correctCount++;
+          logs.push(`[PASS] ${tc.id}`);
+        } else {
+          logs.push(
+            `[FAIL] ${tc.id}: schemaValid=${valid}, valuesOk=${valuesOk}. Output=${JSON.stringify(
+              parsed
+            )}`
+          );
+        }
+      } catch (e) {
+        // Errors in a single case are logged; the run continues.
+        const msg = e instanceof Error ? e.message : String(e);
+        logs.push(`[ERROR] ${tc.id}: ${msg}`);
+      }
+    }
+    const total = tests.length;
+    // Guard against an empty dataset: 0 / 0 would make the score NaN.
+    const score = total > 0 ? correctCount / total : 0;
+    return {
+      score,
+      success: score >= 0.8,
+      metrics: {
+        total_cases: total,
+        correct_count: correctCount,
+        schema_valid_count: schemaValidCount,
+        value_match_count: valueMatchCount,
+        accuracy: score
+      },
+      logs
+    };
+  }
+};
+/**
+ * Benchmark that only checks structure: each model output is parsed and
+ * validated against the test case's JSON Schema with Ajv. Expected values are
+ * not compared.
+ */
+var jsonGenerationSchemaOnlyBenchmark = {
+  name: "json-generation-schema-only",
+  version: "1.0.1",
+  description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
+  /**
+   * Runs every case from json_generation_tests.jsonl against `model`.
+   *
+   * @param model - Model passed through to generateText.
+   * @param config - Optional settings; only a numeric `temperature` is read.
+   * @returns Result object with `score` (schema-valid / total, 0 when there
+   *   are no cases), `success` (score >= 0.8), `metrics` and `logs`. When the
+   *   test file cannot be loaded, a failed result carrying `error` is returned.
+   */
+  async run(model, config) {
+    const logs = [];
+    const ajv = new Ajv({ allErrors: true, strict: false });
+    let tests = [];
+    try {
+      const testsPath = path3.join(resolveDataDir(), "json_generation_tests.jsonl");
+      const rawJsonl = await fs3.readFile(testsPath, "utf-8");
+      // JSONL: one JSON document per non-blank line.
+      const loaded = [];
+      for (const line of rawJsonl.split(/\r?\n/)) {
+        if (line.trim().length > 0) {
+          loaded.push(JSON.parse(line));
+        }
+      }
+      tests = loaded;
+    } catch (loadError) {
+      const reason = loadError instanceof Error ? loadError.message : String(loadError);
+      return {
+        score: 0,
+        success: false,
+        metrics: {},
+        logs: [`[FATAL] Failed to load schema-only tests: ${reason}`],
+        error: loadError
+      };
+    }
+    let schemaValidCount = 0;
+    for (const tc of tests) {
+      try {
+        const userPrompt = [
+          "Generate a JSON object that reflects the following facts.",
+          "JSON Schema:",
+          JSON.stringify(tc.schema, null, 2),
+          "Facts:",
+          tc.promptFacts,
+          "Output must be a single JSON only, with no additional text."
+        ].join("\n\n");
+        const request = {
+          model,
+          messages: [
+            {
+              role: "system",
+              content: "You must output only a single JSON document that strictly conforms to the given JSON Schema. Do not include any extra text or code fences."
+            },
+            { role: "user", content: userPrompt }
+          ]
+        };
+        // Only forward temperature when the caller supplied a number.
+        const temp = config?.temperature;
+        if (typeof temp === "number") {
+          request.temperature = temp;
+        }
+        const { text } = await generateText2(request);
+        let parsed;
+        try {
+          parsed = extractFirstJsonBlock(text);
+        } catch {
+          // Leave `parsed` undefined; reported as a parse failure below.
+        }
+        if (parsed === void 0) {
+          logs.push(`[FAIL] ${tc.id}: Could not parse JSON from model output.`);
+          continue;
+        }
+        const validate = ajv.compile(tc.schema);
+        if (validate(parsed)) {
+          schemaValidCount++;
+          logs.push(`[PASS] ${tc.id}`);
+          continue;
+        }
+        const details = (validate.errors || []).map((err) => `${err.instancePath} ${err.message}`).join(", ");
+        logs.push(
+          `[FAIL] ${tc.id}: Schema validation errors: ${details || "unknown"}`
+        );
+      } catch (caseError) {
+        // Errors in a single case are logged; the run continues.
+        const reason = caseError instanceof Error ? caseError.message : String(caseError);
+        logs.push(`[ERROR] ${tc.id}: ${reason}`);
+      }
+    }
+    const total = tests.length;
+    const score = total > 0 ? schemaValidCount / total : 0;
+    return {
+      score,
+      success: score >= 0.8,
+      metrics: {
+        total_cases: total,
+        schema_valid_count: schemaValidCount,
+        accuracy: score
+      },
+      logs
+    };
+  }
+};
 export {
   bfclMultipleBenchmark,
   bfclParallelBenchmark,