@ai-sdk-tool/eval 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +13 -0
- package/README.md +4 -0
- package/dist/index.cjs +25 -9
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +25 -9
- package/dist/index.js.map +1 -1
- package/package.json +2 -1
package/LICENSE
ADDED
@@ -0,0 +1,13 @@
+Copyright 2025 Woonggi Min (https://github.com/minpeter)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
package/README.md
CHANGED
package/dist/index.cjs
CHANGED
@@ -281,13 +281,13 @@ var reporters = {
 };
 
 // src/evaluate.ts
-async function runSingleBenchmark(model, benchmark, modelKey) {
+async function runSingleBenchmark(model, benchmark, modelKey, config) {
   const modelId = typeof model === "object" && model !== null && "modelId" in model && typeof model.modelId === "string" ? model.modelId : "unknown-model";
   try {
     console.log(
       `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Running benchmark: ${benchmark.name}...`
     );
-    const result = await benchmark.run(model);
+    const result = await benchmark.run(model, config);
     console.log(
       `[${modelId}]${modelKey ? ` (${modelKey})` : ""} Finished benchmark: ${benchmark.name}. Score: ${result.score}`
     );
@@ -316,7 +316,7 @@ async function runSingleBenchmark(model, benchmark, modelKey) {
   }
 }
 async function evaluate(options) {
-  const { models, benchmarks, reporter = "console" } = options;
+  const { models, benchmarks, reporter = "console", temperature } = options;
   const modelEntries = [];
   if (Array.isArray(models)) {
     for (const m of models) modelEntries.push([void 0, m]);
@@ -335,7 +335,8 @@ async function evaluate(options) {
       const evaluationResult = await runSingleBenchmark(
         model,
         benchmark,
-        modelKey
+        modelKey,
+        temperature !== void 0 ? { temperature } : void 0
       );
       allResults.push(evaluationResult);
     }
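The hunks above thread a new optional `temperature` setting from `evaluate(options)` through `runSingleBenchmark` into each benchmark's `run(model, config)` call. A minimal caller-side sketch of the new option, assuming `evaluate` and `jsonGenerationBenchmark` are exported under those names; the provider and model shown are illustrative and not part of this diff:

import { evaluate, jsonGenerationBenchmark } from "@ai-sdk-tool/eval";
import { openai } from "@ai-sdk/openai"; // illustrative provider; any AI SDK model works

await evaluate({
  models: [openai("gpt-4o-mini")],   // an array or a keyed object of models, per the diff
  benchmarks: [jsonGenerationBenchmark],
  reporter: "console",               // default, per the diff
  temperature: 0,                    // new in this release; forwarded as config.temperature
});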
@@ -675,7 +676,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
     name,
     version: "1.0.0",
     description,
-    async run(model) {
+    async run(model, config) {
       const logs = [];
       let correctCount = 0;
       let testCases = [];
@@ -733,6 +734,8 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
       const runSingleCase = async (testCase) => {
         const caseLogs = [];
         const { function: tools, question: messages } = testCase;
+        const temp = config?.temperature;
+        const temperature = typeof temp === "number" ? temp : void 0;
         try {
           const flatMessages = Array.isArray(messages) && messages.some((m) => Array.isArray(m)) ? messages.flat(1) : messages;
           const nameMap = /* @__PURE__ */ new Map();
@@ -778,6 +781,7 @@ function createBfclBenchmark(name, description, testDataFile, answerDataFile) {
           messages: flatMessages,
           tools: toolsMap,
           toolChoice: "auto",
+          ...temperature !== void 0 ? { temperature } : {},
           // Pass original schema information to middleware
           providerOptions: {
             toolCallMiddleware: {
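In the BFCL benchmark, the new `config` argument is consumed defensively: `config?.temperature` is kept only when it is a number, then spread conditionally into the generation call so that a `temperature: undefined` key is never passed to the provider. A sketch of a custom benchmark following the same pattern, assuming the `run(model, config)` shape inferred from this diff (only the `score` field of the result is confirmed by the surrounding code):

import { generateText, type LanguageModel } from "ai";

// Config shape inferred from the diff; only `temperature` is visible here.
interface BenchmarkConfig {
  temperature?: number;
}

const echoBenchmark = {
  name: "echo-check",
  version: "1.0.0",
  description: "Toy benchmark illustrating the run(model, config) signature.",
  async run(model: LanguageModel, config?: BenchmarkConfig) {
    const temp = config?.temperature;
    const temperature = typeof temp === "number" ? temp : undefined;
    const { text } = await generateText({
      model,
      messages: [{ role: "user", content: "Reply with the single word: pong" }],
      // Same conditional spread as the diff: omit the key entirely when unset.
      ...(temperature !== undefined ? { temperature } : {}),
    });
    return { score: /pong/i.test(text) ? 1 : 0 };
  },
};

Spreading conditionally leaves the provider's default sampling settings untouched when no temperature is configured, instead of overriding them with an explicit undefined.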
@@ -1174,7 +1178,7 @@ var jsonGenerationBenchmark = {
   name: "json-generation",
   version: "2.1.0",
   description: "Evaluates schema-compliant JSON generation from natural language using JSON Schema prompts.",
-  async run(model) {
+  async run(model, config) {
     const logs = [];
     const ajv = new import_ajv.default({ allErrors: true, strict: false });
     let schemaValidCount = 0;
@@ -1225,7 +1229,13 @@ var jsonGenerationBenchmark = {
           ].join("\n\n")
         }
       ];
-      const
+      const temp = config?.temperature;
+      const temperature = typeof temp === "number" ? temp : void 0;
+      const { text } = await (0, import_ai2.generateText)({
+        model,
+        messages,
+        ...temperature !== void 0 ? { temperature } : {}
+      });
       let parsed;
       try {
         parsed = extractFirstJsonBlock(text);
@@ -1285,7 +1295,7 @@ var jsonGenerationSchemaOnlyBenchmark = {
   name: "json-generation-schema-only",
   version: "1.0.1",
   description: "Evaluates whether model outputs strictly conform to the provided JSON Schema (structure only).",
-  async run(model) {
+  async run(model, config) {
     const logs = [];
     const ajv = new import_ajv.default({ allErrors: true, strict: false });
     let tests = [];
@@ -1327,7 +1337,13 @@ var jsonGenerationSchemaOnlyBenchmark = {
           ].join("\n\n")
         }
       ];
-      const
+      const temp = config?.temperature;
+      const temperature = typeof temp === "number" ? temp : void 0;
+      const { text } = await (0, import_ai2.generateText)({
+        model,
+        messages,
+        ...temperature !== void 0 ? { temperature } : {}
+      });
       let parsed;
       try {
         parsed = extractFirstJsonBlock(text);
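The type declaration files (dist/index.d.ts and dist/index.d.cts) each gain four lines in this release; their contents are not shown in this diff, but from the runtime changes above they presumably declare the new option and parameter. A hypothetical sketch only, with illustrative names not taken from the package:

// Hypothetical declarations; the actual .d.ts additions are not shown in this diff.
interface EvaluateOptions {
  // ...existing fields such as models, benchmarks, reporter...
  temperature?: number; // forwarded to each benchmark as config.temperature
}

interface Benchmark {
  run(
    model: unknown, // the real declarations likely use the AI SDK LanguageModel type
    config?: { temperature?: number }
  ): Promise<{ score: number }>;
}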