@m4trix/evals 0.19.0 → 0.20.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +51 -13
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +51 -13
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +39 -11
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +39 -11
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +39 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +8 -1
- package/dist/index.js +39 -11
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.cjs
CHANGED
|
@@ -1012,6 +1012,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
1012
1012
|
function formatLogMessage(msg) {
|
|
1013
1013
|
if (typeof msg === "string")
|
|
1014
1014
|
return msg;
|
|
1015
|
+
if (msg instanceof Error)
|
|
1016
|
+
return msg.stack ?? msg.message;
|
|
1015
1017
|
try {
|
|
1016
1018
|
if (msg !== null && typeof msg === "object") {
|
|
1017
1019
|
return JSON.stringify(msg, null, 2);
|
|
@@ -1314,6 +1316,7 @@ function toNumericScore(value) {
|
|
|
1314
1316
|
}
|
|
1315
1317
|
|
|
1316
1318
|
// src/runner/execution.ts
|
|
1319
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
1317
1320
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1318
1321
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1319
1322
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1370,20 +1373,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1370
1373
|
if (!evaluateFn) {
|
|
1371
1374
|
continue;
|
|
1372
1375
|
}
|
|
1376
|
+
const logs = [];
|
|
1377
|
+
const logDiff = (expected, actual, options) => {
|
|
1378
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1379
|
+
};
|
|
1380
|
+
const log = (message, options) => {
|
|
1381
|
+
logs.push(createLogEntry(message, options));
|
|
1382
|
+
};
|
|
1383
|
+
const createError = (message, options) => {
|
|
1384
|
+
const entry = createLogEntry(message, options);
|
|
1385
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1386
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1387
|
+
return error;
|
|
1388
|
+
};
|
|
1373
1389
|
try {
|
|
1374
|
-
const logs = [];
|
|
1375
|
-
const logDiff = (expected, actual, options) => {
|
|
1376
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1377
|
-
};
|
|
1378
|
-
const log = (message, options) => {
|
|
1379
|
-
logs.push(createLogEntry(message, options));
|
|
1380
|
-
};
|
|
1381
1390
|
const ctx = yield* effect.Effect.promise(
|
|
1382
1391
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1383
1392
|
);
|
|
1384
1393
|
const result = yield* effect.Effect.promise(
|
|
1385
|
-
() => Promise.resolve(
|
|
1386
|
-
evaluateFn({
|
|
1394
|
+
() => Promise.resolve().then(
|
|
1395
|
+
() => evaluateFn({
|
|
1387
1396
|
input: testCaseItem.testCase.getInput(),
|
|
1388
1397
|
ctx,
|
|
1389
1398
|
output,
|
|
@@ -1393,10 +1402,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1393
1402
|
datasetId: task.datasetId
|
|
1394
1403
|
},
|
|
1395
1404
|
logDiff,
|
|
1396
|
-
log
|
|
1405
|
+
log,
|
|
1406
|
+
createError
|
|
1397
1407
|
})
|
|
1398
1408
|
)
|
|
1399
1409
|
);
|
|
1410
|
+
if (result instanceof Error) {
|
|
1411
|
+
const evaluatorError = result;
|
|
1412
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1413
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1414
|
+
testCaseError = result.message;
|
|
1415
|
+
evaluatorScores.push({
|
|
1416
|
+
evaluatorId,
|
|
1417
|
+
scores: [],
|
|
1418
|
+
passed: false,
|
|
1419
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1420
|
+
});
|
|
1421
|
+
continue;
|
|
1422
|
+
}
|
|
1400
1423
|
const { scores, metrics } = normalizeResult(result);
|
|
1401
1424
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1402
1425
|
evaluatorScores.push({
|
|
@@ -1407,11 +1430,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1407
1430
|
logs: logs.length > 0 ? logs : void 0
|
|
1408
1431
|
});
|
|
1409
1432
|
} catch (error) {
|
|
1433
|
+
if (error instanceof Error) {
|
|
1434
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1435
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1436
|
+
}
|
|
1410
1437
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1411
1438
|
evaluatorScores.push({
|
|
1412
1439
|
evaluatorId,
|
|
1413
1440
|
scores: [],
|
|
1414
|
-
passed: false
|
|
1441
|
+
passed: false,
|
|
1442
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1415
1443
|
});
|
|
1416
1444
|
}
|
|
1417
1445
|
}
|