@ai-sdk-tool/eval 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +25 -9
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +25 -9
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -84,6 +84,10 @@ interface EvaluateOptions {
|
|
|
84
84
|
* Defaults to 'console'.
|
|
85
85
|
*/
|
|
86
86
|
reporter?: ReporterType;
|
|
87
|
+
/**
|
|
88
|
+
* Optional temperature setting to pass to model generation during evaluation.
|
|
89
|
+
*/
|
|
90
|
+
temperature?: number;
|
|
87
91
|
}
|
|
88
92
|
|
|
89
93
|
declare function evaluate(options: EvaluateOptions): Promise<EvaluationResult[]>;
|
package/dist/index.d.ts
CHANGED
|
@@ -84,6 +84,10 @@ interface EvaluateOptions {
|
|
|
84
84
|
* Defaults to 'console'.
|
|
85
85
|
*/
|
|
86
86
|
reporter?: ReporterType;
|
|
87
|
+
/**
|
|
88
|
+
* Optional temperature setting to pass to model generation during evaluation.
|
|
89
|
+
*/
|
|
90
|
+
temperature?: number;
|
|
87
91
|
}
|
|
88
92
|
|
|
89
93
|
declare function evaluate(options: EvaluateOptions): Promise<EvaluationResult[]>;
|
package/dist/index.js
CHANGED
|
@@ -239,13 +239,13 @@ var reporters = {
|
|
|
239
239
|
};
|
|
240
240
|
|
|
241
241
|
// src/evaluate.ts
|
|
242
|
-
async function runSingleBenchmark(model, benchmark, modelKey) {
|
|
242
|
+
async function runSingleBenchmark(model, benchmark, modelKey, config) {
|
|
243
243
|
const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
|
|
244
244
|
try {
|
|
245
245
|
console.log(
|
|
246
246
|
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
|
|
247
247
|
);
|
|
248
|
-
const result = await benchmark.run(model);
|
|
248
|
+
const result = await benchmark.run(model, config);
|
|
249
249
|
console.log(
|
|
250
250
|
`[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
|
|
251
251
|
);
|
|
@@ -274,7 +274,7 @@ async function runSingleBenchmark(model, benchmark, modelKey) {
|
|
|
274
274
|
}
|
|
275
275
|
}
|
|
276
276
|
async function evaluate(options) {
|
|
277
|
-
const { models, benchmarks, reporter = "console" } = options;
|
|
277
|
+
const { models, benchmarks, reporter = "console", temperature } = options;
|
|
278
278
|
const modelEntries = [];
|
|
279
279
|
if (Array.isArray(models)) {
|
|
280
280
|
for (const m of models) modelEntries.push([void 0, m]);
|
|
@@ -293,7 +293,8 @@ async function evaluate(options) {
|
|
|
293
293
|
const evaluationResult = await runSingleBenchmark(
|
|
294
294
|
model,
|
|
295
295
|
benchmark,
|
|
296
|
-
modelKey
|
|
296
|
+
modelKey,
|
|
297
|
+
temperature !== void 0 ? { temperature } : void 0
|
|
297
298
|
);
|
|
298
299
|
allResults.push(evaluationResult);
|
|
299
300
|
}
|
|
@@ -633,7 +634,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
633
634
|
name,
|
|
634
635
|
version: "1.0.0",
|
|
635
636
|
description,
|
|
636
|
-
async run(model) {
|
|
637
|
+
async run(model, config) {
|
|
637
638
|
const logs = [];
|
|
638
639
|
let correctCount = 0;
|
|
639
640
|
let testCases = [];
|
|
@@ -691,6 +692,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
691
692
|
const runSingleCase = async (testCase) => {
|
|
692
693
|
const caseLogs = [];
|
|
693
694
|
const { function: tools, question: messages } = testCase;
|
|
695
|
+
const temp = config?.temperature;
|
|
696
|
+
const temperature = typeof temp === "number" ? temp : void 0;
|
|
694
697
|
try {
|
|
695
698
|
const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
|
|
696
699
|
const nameMap = /* @__PURE__ */ new Map();
|
|
@@ -736,6 +739,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
|
|
|
736
739
|
messages: flatMessages,
|
|
737
740
|
tools: toolsMap,
|
|
738
741
|
toolChoice: "auto",
|
|
742
|
+
...temperature !== void 0 ? { temperature } : {},
|
|
739
743
|
// Pass original schema information to middleware
|
|
740
744
|
providerOptions: {
|
|
741
745
|
toolCallMiddleware: {
|
|
@@ -1132,7 +1136,7 @@ var jsonGenerationBenchmark = {
|
|
|
1132
1136
|
name: "json-generation",
|
|
1133
1137
|
version: "2.1.0",
|
|
1134
1138
|
description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
|
|
1135
|
-
async run(model) {
|
|
1139
|
+
async run(model, config) {
|
|
1136
1140
|
const logs = [];
|
|
1137
1141
|
const ajv = new Ajv({ allErrors: true, strict: false });
|
|
1138
1142
|
let schemaValidCount = 0;
|
|
@@ -1183,7 +1187,13 @@ var jsonGenerationBenchmark = {
|
|
|
1183
1187
|
].join("\n\n")
|
|
1184
1188
|
}
|
|
1185
1189
|
];
|
|
1186
|
-
const { text } = await generateText2({ model, messages });
|
|
1190
|
+
const temp = config?.temperature;
|
|
1191
|
+
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1192
|
+
const { text } = await generateText2({
|
|
1193
|
+
model,
|
|
1194
|
+
messages,
|
|
1195
|
+
...temperature !== void 0 ? { temperature } : {}
|
|
1196
|
+
});
|
|
1187
1197
|
let parsed;
|
|
1188
1198
|
try {
|
|
1189
1199
|
parsed = extractFirstJsonBlock(text);
|
|
@@ -1243,7 +1253,7 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
1243
1253
|
name: "json-generation-schema-only",
|
|
1244
1254
|
version: "1.0.1",
|
|
1245
1255
|
description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
|
|
1246
|
-
async run(model) {
|
|
1256
|
+
async run(model, config) {
|
|
1247
1257
|
const logs = [];
|
|
1248
1258
|
const ajv = new Ajv({ allErrors: true, strict: false });
|
|
1249
1259
|
let tests = [];
|
|
@@ -1285,7 +1295,13 @@ var jsonGenerationSchemaOnlyBenchmark = {
|
|
|
1285
1295
|
].join("\n\n")
|
|
1286
1296
|
}
|
|
1287
1297
|
];
|
|
1288
|
-
const { text } = await generateText2({ model, messages });
|
|
1298
|
+
const temp = config?.temperature;
|
|
1299
|
+
const temperature = typeof temp === "number" ? temp : void 0;
|
|
1300
|
+
const { text } = await generateText2({
|
|
1301
|
+
model,
|
|
1302
|
+
messages,
|
|
1303
|
+
...temperature !== void 0 ? { temperature } : {}
|
|
1304
|
+
});
|
|
1289
1305
|
let parsed;
|
|
1290
1306
|
try {
|
|
1291
1307
|
parsed = extractFirstJsonBlock(text);
|