@m4trix/evals 0.19.0 → 0.20.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +51 -13
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +51 -13
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +39 -11
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +39 -11
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +39 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +8 -1
- package/dist/index.js +39 -11
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -986,6 +986,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
986
986
|
function formatLogMessage(msg) {
|
|
987
987
|
if (typeof msg === "string")
|
|
988
988
|
return msg;
|
|
989
|
+
if (msg instanceof Error)
|
|
990
|
+
return msg.stack ?? msg.message;
|
|
989
991
|
try {
|
|
990
992
|
if (msg !== null && typeof msg === "object") {
|
|
991
993
|
return JSON.stringify(msg, null, 2);
|
|
@@ -1288,6 +1290,7 @@ function toNumericScore(value) {
|
|
|
1288
1290
|
}
|
|
1289
1291
|
|
|
1290
1292
|
// src/runner/execution.ts
|
|
1293
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
1291
1294
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1292
1295
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1293
1296
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1344,20 +1347,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1344
1347
|
if (!evaluateFn) {
|
|
1345
1348
|
continue;
|
|
1346
1349
|
}
|
|
1350
|
+
const logs = [];
|
|
1351
|
+
const logDiff = (expected, actual, options) => {
|
|
1352
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1353
|
+
};
|
|
1354
|
+
const log = (message, options) => {
|
|
1355
|
+
logs.push(createLogEntry(message, options));
|
|
1356
|
+
};
|
|
1357
|
+
const createError = (message, options) => {
|
|
1358
|
+
const entry = createLogEntry(message, options);
|
|
1359
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1360
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1361
|
+
return error;
|
|
1362
|
+
};
|
|
1347
1363
|
try {
|
|
1348
|
-
const logs = [];
|
|
1349
|
-
const logDiff = (expected, actual, options) => {
|
|
1350
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1351
|
-
};
|
|
1352
|
-
const log = (message, options) => {
|
|
1353
|
-
logs.push(createLogEntry(message, options));
|
|
1354
|
-
};
|
|
1355
1364
|
const ctx = yield* Effect.promise(
|
|
1356
1365
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1357
1366
|
);
|
|
1358
1367
|
const result = yield* Effect.promise(
|
|
1359
|
-
() => Promise.resolve(
|
|
1360
|
-
evaluateFn({
|
|
1368
|
+
() => Promise.resolve().then(
|
|
1369
|
+
() => evaluateFn({
|
|
1361
1370
|
input: testCaseItem.testCase.getInput(),
|
|
1362
1371
|
ctx,
|
|
1363
1372
|
output,
|
|
@@ -1367,10 +1376,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1367
1376
|
datasetId: task.datasetId
|
|
1368
1377
|
},
|
|
1369
1378
|
logDiff,
|
|
1370
|
-
log
|
|
1379
|
+
log,
|
|
1380
|
+
createError
|
|
1371
1381
|
})
|
|
1372
1382
|
)
|
|
1373
1383
|
);
|
|
1384
|
+
if (result instanceof Error) {
|
|
1385
|
+
const evaluatorError = result;
|
|
1386
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1387
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1388
|
+
testCaseError = result.message;
|
|
1389
|
+
evaluatorScores.push({
|
|
1390
|
+
evaluatorId,
|
|
1391
|
+
scores: [],
|
|
1392
|
+
passed: false,
|
|
1393
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1394
|
+
});
|
|
1395
|
+
continue;
|
|
1396
|
+
}
|
|
1374
1397
|
const { scores, metrics } = normalizeResult(result);
|
|
1375
1398
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1376
1399
|
evaluatorScores.push({
|
|
@@ -1381,11 +1404,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1381
1404
|
logs: logs.length > 0 ? logs : void 0
|
|
1382
1405
|
});
|
|
1383
1406
|
} catch (error) {
|
|
1407
|
+
if (error instanceof Error) {
|
|
1408
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1409
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1410
|
+
}
|
|
1384
1411
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1385
1412
|
evaluatorScores.push({
|
|
1386
1413
|
evaluatorId,
|
|
1387
1414
|
scores: [],
|
|
1388
|
-
passed: false
|
|
1415
|
+
passed: false,
|
|
1416
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1389
1417
|
});
|
|
1390
1418
|
}
|
|
1391
1419
|
}
|