@m4trix/evals 0.19.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +135 -26
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +135 -23
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +56 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +56 -12
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +56 -12
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +17 -1
- package/dist/index.js +56 -12
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.cjs
CHANGED
|
@@ -1012,6 +1012,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
1012
1012
|
function formatLogMessage(msg) {
|
|
1013
1013
|
if (typeof msg === "string")
|
|
1014
1014
|
return msg;
|
|
1015
|
+
if (msg instanceof Error)
|
|
1016
|
+
return msg.stack ?? msg.message;
|
|
1015
1017
|
try {
|
|
1016
1018
|
if (msg !== null && typeof msg === "object") {
|
|
1017
1019
|
return JSON.stringify(msg, null, 2);
|
|
@@ -1314,6 +1316,7 @@ function toNumericScore(value) {
|
|
|
1314
1316
|
}
|
|
1315
1317
|
|
|
1316
1318
|
// src/runner/execution.ts
|
|
1319
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
1317
1320
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1318
1321
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1319
1322
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1355,13 +1358,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1355
1358
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1356
1359
|
);
|
|
1357
1360
|
}
|
|
1358
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1361
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
1359
1362
|
return effect.Effect.gen(function* () {
|
|
1360
1363
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1361
1364
|
const rerunPassed = [];
|
|
1362
1365
|
for (let r = 0; r < reruns; r++) {
|
|
1363
1366
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1364
1367
|
const started = Date.now();
|
|
1368
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
1369
|
+
n + 1,
|
|
1370
|
+
n + 1
|
|
1371
|
+
]);
|
|
1372
|
+
yield* publishEvent({
|
|
1373
|
+
type: "TestCaseStarted",
|
|
1374
|
+
runId: task.runId,
|
|
1375
|
+
testCaseId: testCaseItem.id,
|
|
1376
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1377
|
+
startedTestCases: startedEvaluations,
|
|
1378
|
+
totalTestCases: totalEvaluations,
|
|
1379
|
+
rerunIndex: r + 1,
|
|
1380
|
+
rerunTotal: reruns
|
|
1381
|
+
});
|
|
1365
1382
|
const evaluatorScores = [];
|
|
1366
1383
|
let testCaseError;
|
|
1367
1384
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -1370,20 +1387,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1370
1387
|
if (!evaluateFn) {
|
|
1371
1388
|
continue;
|
|
1372
1389
|
}
|
|
1390
|
+
const logs = [];
|
|
1391
|
+
const logDiff = (expected, actual, options) => {
|
|
1392
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1393
|
+
};
|
|
1394
|
+
const log = (message, options) => {
|
|
1395
|
+
logs.push(createLogEntry(message, options));
|
|
1396
|
+
};
|
|
1397
|
+
const createError = (message, options) => {
|
|
1398
|
+
const entry = createLogEntry(message, options);
|
|
1399
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1400
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1401
|
+
return error;
|
|
1402
|
+
};
|
|
1373
1403
|
try {
|
|
1374
|
-
const logs = [];
|
|
1375
|
-
const logDiff = (expected, actual, options) => {
|
|
1376
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1377
|
-
};
|
|
1378
|
-
const log = (message, options) => {
|
|
1379
|
-
logs.push(createLogEntry(message, options));
|
|
1380
|
-
};
|
|
1381
1404
|
const ctx = yield* effect.Effect.promise(
|
|
1382
1405
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1383
1406
|
);
|
|
1384
1407
|
const result = yield* effect.Effect.promise(
|
|
1385
|
-
() => Promise.resolve(
|
|
1386
|
-
evaluateFn({
|
|
1408
|
+
() => Promise.resolve().then(
|
|
1409
|
+
() => evaluateFn({
|
|
1387
1410
|
input: testCaseItem.testCase.getInput(),
|
|
1388
1411
|
ctx,
|
|
1389
1412
|
output,
|
|
@@ -1393,10 +1416,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1393
1416
|
datasetId: task.datasetId
|
|
1394
1417
|
},
|
|
1395
1418
|
logDiff,
|
|
1396
|
-
log
|
|
1419
|
+
log,
|
|
1420
|
+
createError
|
|
1397
1421
|
})
|
|
1398
1422
|
)
|
|
1399
1423
|
);
|
|
1424
|
+
if (result instanceof Error) {
|
|
1425
|
+
const evaluatorError = result;
|
|
1426
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1427
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1428
|
+
testCaseError = result.message;
|
|
1429
|
+
evaluatorScores.push({
|
|
1430
|
+
evaluatorId,
|
|
1431
|
+
scores: [],
|
|
1432
|
+
passed: false,
|
|
1433
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1434
|
+
});
|
|
1435
|
+
continue;
|
|
1436
|
+
}
|
|
1400
1437
|
const { scores, metrics } = normalizeResult(result);
|
|
1401
1438
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1402
1439
|
evaluatorScores.push({
|
|
@@ -1407,11 +1444,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1407
1444
|
logs: logs.length > 0 ? logs : void 0
|
|
1408
1445
|
});
|
|
1409
1446
|
} catch (error) {
|
|
1447
|
+
if (error instanceof Error) {
|
|
1448
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1449
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1450
|
+
}
|
|
1410
1451
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1411
1452
|
evaluatorScores.push({
|
|
1412
1453
|
evaluatorId,
|
|
1413
1454
|
scores: [],
|
|
1414
|
-
passed: false
|
|
1455
|
+
passed: false,
|
|
1456
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1415
1457
|
});
|
|
1416
1458
|
}
|
|
1417
1459
|
}
|
|
@@ -1482,6 +1524,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1482
1524
|
);
|
|
1483
1525
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1484
1526
|
const completedRef = yield* effect.Ref.make(0);
|
|
1527
|
+
const startedRef = yield* effect.Ref.make(0);
|
|
1485
1528
|
const passedRef = yield* effect.Ref.make(0);
|
|
1486
1529
|
const failedRef = yield* effect.Ref.make(0);
|
|
1487
1530
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -1491,6 +1534,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1491
1534
|
publishEvent,
|
|
1492
1535
|
persistenceQueue,
|
|
1493
1536
|
updateSnapshot,
|
|
1537
|
+
startedRef,
|
|
1494
1538
|
completedRef,
|
|
1495
1539
|
passedRef,
|
|
1496
1540
|
failedRef
|