@m4trix/evals 0.19.0 → 0.21.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +135 -26
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +135 -23
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +56 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +56 -12
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +56 -12
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +17 -1
- package/dist/index.js +56 -12
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -986,6 +986,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
986
986
|
function formatLogMessage(msg) {
|
|
987
987
|
if (typeof msg === "string")
|
|
988
988
|
return msg;
|
|
989
|
+
if (msg instanceof Error)
|
|
990
|
+
return msg.stack ?? msg.message;
|
|
989
991
|
try {
|
|
990
992
|
if (msg !== null && typeof msg === "object") {
|
|
991
993
|
return JSON.stringify(msg, null, 2);
|
|
@@ -1288,6 +1290,7 @@ function toNumericScore(value) {
|
|
|
1288
1290
|
}
|
|
1289
1291
|
|
|
1290
1292
|
// src/runner/execution.ts
|
|
1293
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
1291
1294
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1292
1295
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1293
1296
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1329,13 +1332,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1329
1332
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1330
1333
|
);
|
|
1331
1334
|
}
|
|
1332
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1335
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
1333
1336
|
return Effect.gen(function* () {
|
|
1334
1337
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1335
1338
|
const rerunPassed = [];
|
|
1336
1339
|
for (let r = 0; r < reruns; r++) {
|
|
1337
1340
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1338
1341
|
const started = Date.now();
|
|
1342
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
1343
|
+
n + 1,
|
|
1344
|
+
n + 1
|
|
1345
|
+
]);
|
|
1346
|
+
yield* publishEvent({
|
|
1347
|
+
type: "TestCaseStarted",
|
|
1348
|
+
runId: task.runId,
|
|
1349
|
+
testCaseId: testCaseItem.id,
|
|
1350
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1351
|
+
startedTestCases: startedEvaluations,
|
|
1352
|
+
totalTestCases: totalEvaluations,
|
|
1353
|
+
rerunIndex: r + 1,
|
|
1354
|
+
rerunTotal: reruns
|
|
1355
|
+
});
|
|
1339
1356
|
const evaluatorScores = [];
|
|
1340
1357
|
let testCaseError;
|
|
1341
1358
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -1344,20 +1361,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1344
1361
|
if (!evaluateFn) {
|
|
1345
1362
|
continue;
|
|
1346
1363
|
}
|
|
1364
|
+
const logs = [];
|
|
1365
|
+
const logDiff = (expected, actual, options) => {
|
|
1366
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1367
|
+
};
|
|
1368
|
+
const log = (message, options) => {
|
|
1369
|
+
logs.push(createLogEntry(message, options));
|
|
1370
|
+
};
|
|
1371
|
+
const createError = (message, options) => {
|
|
1372
|
+
const entry = createLogEntry(message, options);
|
|
1373
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1374
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1375
|
+
return error;
|
|
1376
|
+
};
|
|
1347
1377
|
try {
|
|
1348
|
-
const logs = [];
|
|
1349
|
-
const logDiff = (expected, actual, options) => {
|
|
1350
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1351
|
-
};
|
|
1352
|
-
const log = (message, options) => {
|
|
1353
|
-
logs.push(createLogEntry(message, options));
|
|
1354
|
-
};
|
|
1355
1378
|
const ctx = yield* Effect.promise(
|
|
1356
1379
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1357
1380
|
);
|
|
1358
1381
|
const result = yield* Effect.promise(
|
|
1359
|
-
() => Promise.resolve(
|
|
1360
|
-
evaluateFn({
|
|
1382
|
+
() => Promise.resolve().then(
|
|
1383
|
+
() => evaluateFn({
|
|
1361
1384
|
input: testCaseItem.testCase.getInput(),
|
|
1362
1385
|
ctx,
|
|
1363
1386
|
output,
|
|
@@ -1367,10 +1390,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1367
1390
|
datasetId: task.datasetId
|
|
1368
1391
|
},
|
|
1369
1392
|
logDiff,
|
|
1370
|
-
log
|
|
1393
|
+
log,
|
|
1394
|
+
createError
|
|
1371
1395
|
})
|
|
1372
1396
|
)
|
|
1373
1397
|
);
|
|
1398
|
+
if (result instanceof Error) {
|
|
1399
|
+
const evaluatorError = result;
|
|
1400
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1401
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1402
|
+
testCaseError = result.message;
|
|
1403
|
+
evaluatorScores.push({
|
|
1404
|
+
evaluatorId,
|
|
1405
|
+
scores: [],
|
|
1406
|
+
passed: false,
|
|
1407
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1408
|
+
});
|
|
1409
|
+
continue;
|
|
1410
|
+
}
|
|
1374
1411
|
const { scores, metrics } = normalizeResult(result);
|
|
1375
1412
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1376
1413
|
evaluatorScores.push({
|
|
@@ -1381,11 +1418,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1381
1418
|
logs: logs.length > 0 ? logs : void 0
|
|
1382
1419
|
});
|
|
1383
1420
|
} catch (error) {
|
|
1421
|
+
if (error instanceof Error) {
|
|
1422
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1423
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1424
|
+
}
|
|
1384
1425
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1385
1426
|
evaluatorScores.push({
|
|
1386
1427
|
evaluatorId,
|
|
1387
1428
|
scores: [],
|
|
1388
|
-
passed: false
|
|
1429
|
+
passed: false,
|
|
1430
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1389
1431
|
});
|
|
1390
1432
|
}
|
|
1391
1433
|
}
|
|
@@ -1456,6 +1498,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1456
1498
|
);
|
|
1457
1499
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1458
1500
|
const completedRef = yield* Ref.make(0);
|
|
1501
|
+
const startedRef = yield* Ref.make(0);
|
|
1459
1502
|
const passedRef = yield* Ref.make(0);
|
|
1460
1503
|
const failedRef = yield* Ref.make(0);
|
|
1461
1504
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -1465,6 +1508,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1465
1508
|
publishEvent,
|
|
1466
1509
|
persistenceQueue,
|
|
1467
1510
|
updateSnapshot,
|
|
1511
|
+
startedRef,
|
|
1468
1512
|
completedRef,
|
|
1469
1513
|
passedRef,
|
|
1470
1514
|
failedRef
|