@ai-sdk-tool/eval 0.1.4 → 0.1.5

package/dist/index.d.cts CHANGED
@@ -84,6 +84,10 @@ interface EvaluateOptions {
     * Defaults to 'console'.
     */
    reporter?: ReporterType;
+    /**
+     * Optional temperature setting to pass to model generation during evaluation.
+     */
+    temperature?: number;
 }
 
 declare function evaluate(options: EvaluateOptions): Promise<EvaluationResult[]>;
package/dist/index.d.ts CHANGED
@@ -84,6 +84,10 @@ interface EvaluateOptions {
     * Defaults to 'console'.
     */
    reporter?: ReporterType;
+    /**
+     * Optional temperature setting to pass to model generation during evaluation.
+     */
+    temperature?: number;
 }
 
 declare function evaluate(options: EvaluateOptions): Promise<EvaluationResult[]>;
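The two declaration files above add one public option. A minimal usage sketch, assuming `evaluate` and `jsonGenerationBenchmark` are exported from the package root (the export names and model id here are illustrative, not confirmed by this diff):

import { evaluate, jsonGenerationBenchmark } from "@ai-sdk-tool/eval";
import { openai } from "@ai-sdk/openai";

const results = await evaluate({
  models: [openai("gpt-4o-mini")], // array form, per the compiled evaluate() below
  benchmarks: [jsonGenerationBenchmark],
  reporter: "console",
  temperature: 0, // new in 0.1.5: forwarded to each benchmark's model calls
});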
package/dist/index.js CHANGED
@@ -239,13 +239,13 @@ var reporters = {
 };
 
 // src/evaluate.ts
-async function runSingleBenchmark(model, benchmark, modelKey) {
+async function runSingleBenchmark(model, benchmark, modelKey, config) {
   const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
   try {
     console.log(
       `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
     );
-    const result = await benchmark.run(model);
+    const result = await benchmark.run(model, config);
     console.log(
       `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
     );
@@ -274,7 +274,7 @@ async function runSingleBenchmark(model, benchmark, modelKey) {
   }
 }
 async function evaluate(options) {
-  const { models, benchmarks, reporter = "console" } = options;
+  const { models, benchmarks, reporter = "console", temperature } = options;
   const modelEntries = [];
   if (Array.isArray(models)) {
     for (const m of models) modelEntries.push([void 0, m]);
@@ -293,7 +293,8 @@ async function evaluate(options) {
       const evaluationResult = await runSingleBenchmark(
         model,
         benchmark,
-        modelKey
+        modelKey,
+        temperature !== void 0 ? { temperature } : void 0
       );
       allResults.push(evaluationResult);
     }
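The two hunks above thread the option from evaluate() into each benchmark: the config object is built only when temperature is defined, so run(model, config) receives undefined rather than { temperature: undefined } when the caller omits it. The same guard as a standalone sketch (type and function names are illustrative):

interface BenchmarkConfig {
  temperature?: number;
}

function toBenchmarkConfig(temperature?: number): BenchmarkConfig | undefined {
  // Mirrors `temperature !== void 0 ? { temperature } : void 0` above.
  return temperature !== undefined ? { temperature } : undefined;
}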
@@ -633,7 +634,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
     name,
     version: "1.0.0",
     description,
-    async run(model) {
+    async run(model, config) {
       const logs = [];
       let correctCount = 0;
       let testCases = [];
@@ -691,6 +692,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
     const runSingleCase = async (testCase) => {
       const caseLogs = [];
       const { function: tools, question: messages } = testCase;
+      const temp = config?.temperature;
+      const temperature = typeof temp === "number" ? temp : void 0;
       try {
         const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
         const nameMap = /* @__PURE__ */ new Map();
@@ -736,6 +739,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           messages: flatMessages,
           tools: toolsMap,
           toolChoice: "auto",
+          ...temperature !== void 0 ? { temperature } : {},
           // Pass original schema information to middleware
           providerOptions: {
             toolCallMiddleware: {
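Inside the BFCL runner, the value reaches the actual generation call through a conditional spread, so the temperature key is absent, rather than undefined, when unset and provider defaults still apply. A self-contained sketch of that pattern, assuming AI SDK v5 type names (the wrapper function is hypothetical):

import { generateText, type LanguageModel, type ModelMessage } from "ai";

async function callWithOptionalTemperature(
  model: LanguageModel,
  messages: ModelMessage[],
  temperature?: number
) {
  return generateText({
    model,
    messages,
    // The spread adds the key only when a numeric temperature was supplied.
    ...(temperature !== undefined ? { temperature } : {}),
  });
}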
@@ -1132,7 +1136,7 @@ var jsonGenerationBenchmark = {
   name: "json-generation",
   version: "2.1.0",
   description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
-  async run(model) {
+  async run(model, config) {
     const logs = [];
     const ajv = new Ajv({ allErrors: true, strict: false });
     let schemaValidCount = 0;
@@ -1183,7 +1187,13 @@ var jsonGenerationBenchmark = {
         ].join("\n\n")
       }
     ];
-    const { text } = await generateText2({ model, messages });
+    const temp = config?.temperature;
+    const temperature = typeof temp === "number" ? temp : void 0;
+    const { text } = await generateText2({
+      model,
+      messages,
+      ...temperature !== void 0 ? { temperature } : {}
+    });
     let parsed;
     try {
       parsed = extractFirstJsonBlock(text);
@@ -1243,7 +1253,7 @@ var jsonGenerationSchemaOnlyBenchmark = {
   name: "json-generation-schema-only",
   version: "1.0.1",
   description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
-  async run(model) {
+  async run(model, config) {
     const logs = [];
     const ajv = new Ajv({ allErrors: true, strict: false });
     let tests = [];
@@ -1285,7 +1295,13 @@ var jsonGenerationSchemaOnlyBenchmark = {
         ].join("\n\n")
       }
     ];
-    const { text } = await generateText2({ model, messages });
+    const temp = config?.temperature;
+    const temperature = typeof temp === "number" ? temp : void 0;
+    const { text } = await generateText2({
+      model,
+      messages,
+      ...temperature !== void 0 ? { temperature } : {}
+    });
     let parsed;
     try {
       parsed = extractFirstJsonBlock(text);
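The json-generation and json-generation-schema-only benchmarks apply the identical guard before their generateText2 calls, so every built-in benchmark now honors the run(model, config) contract. A sketch of a custom benchmark written against that contract; the benchmark shape (name, version, description, run returning an object with score and logs) is inferred from this compiled output, not from documented API:

import { generateText, type LanguageModel } from "ai";

const echoBenchmark = {
  name: "echo-smoke-test",
  version: "1.0.0",
  description: "Checks that the model responds to a trivial prompt.",
  async run(model: LanguageModel, config?: { temperature?: number }) {
    const { text } = await generateText({
      model,
      prompt: "Reply with the single word: pong",
      // Same conditional spread as the compiled benchmarks above.
      ...(config?.temperature !== undefined
        ? { temperature: config.temperature }
        : {}),
    });
    return { score: /pong/i.test(text) ? 1 : 0, logs: [] };
  },
};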