@m4trix/evals 0.19.0 → 0.20.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -1012,6 +1012,8 @@ function createDiffString(expected, actual, diffOptions) {
1012
1012
  function formatLogMessage(msg) {
1013
1013
  if (typeof msg === "string")
1014
1014
  return msg;
1015
+ if (msg instanceof Error)
1016
+ return msg.stack ?? msg.message;
1015
1017
  try {
1016
1018
  if (msg !== null && typeof msg === "object") {
1017
1019
  return JSON.stringify(msg, null, 2);
@@ -1314,6 +1316,7 @@ function toNumericScore(value) {
1314
1316
  }
1315
1317
 
1316
1318
  // src/runner/execution.ts
1319
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
1317
1320
  function computeEvaluatorPassed(evaluator, result, scores) {
1318
1321
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1319
1322
  if (scoresWithPassed.length > 0) {
@@ -1370,20 +1373,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1370
1373
  if (!evaluateFn) {
1371
1374
  continue;
1372
1375
  }
1376
+ const logs = [];
1377
+ const logDiff = (expected, actual, options) => {
1378
+ logs.push(createDiffLogEntry(expected, actual, options));
1379
+ };
1380
+ const log = (message, options) => {
1381
+ logs.push(createLogEntry(message, options));
1382
+ };
1383
+ const createError = (message, options) => {
1384
+ const entry = createLogEntry(message, options);
1385
+ const error = message instanceof Error ? message : new Error(entry.message);
1386
+ error[evaluatorErrorLogEntryKey] = entry;
1387
+ return error;
1388
+ };
1373
1389
  try {
1374
- const logs = [];
1375
- const logDiff = (expected, actual, options) => {
1376
- logs.push(createDiffLogEntry(expected, actual, options));
1377
- };
1378
- const log = (message, options) => {
1379
- logs.push(createLogEntry(message, options));
1380
- };
1381
1390
  const ctx = yield* effect.Effect.promise(
1382
1391
  () => Promise.resolve(evaluator.resolveContext())
1383
1392
  );
1384
1393
  const result = yield* effect.Effect.promise(
1385
- () => Promise.resolve(
1386
- evaluateFn({
1394
+ () => Promise.resolve().then(
1395
+ () => evaluateFn({
1387
1396
  input: testCaseItem.testCase.getInput(),
1388
1397
  ctx,
1389
1398
  output,
@@ -1393,10 +1402,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1393
1402
  datasetId: task.datasetId
1394
1403
  },
1395
1404
  logDiff,
1396
- log
1405
+ log,
1406
+ createError
1397
1407
  })
1398
1408
  )
1399
1409
  );
1410
+ if (result instanceof Error) {
1411
+ const evaluatorError = result;
1412
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1413
+ logs.push(taggedEntry ?? createLogEntry(result));
1414
+ testCaseError = result.message;
1415
+ evaluatorScores.push({
1416
+ evaluatorId,
1417
+ scores: [],
1418
+ passed: false,
1419
+ logs: logs.length > 0 ? logs : void 0
1420
+ });
1421
+ continue;
1422
+ }
1400
1423
  const { scores, metrics } = normalizeResult(result);
1401
1424
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1402
1425
  evaluatorScores.push({
@@ -1407,11 +1430,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1407
1430
  logs: logs.length > 0 ? logs : void 0
1408
1431
  });
1409
1432
  } catch (error) {
1433
+ if (error instanceof Error) {
1434
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1435
+ logs.push(taggedEntry ?? createLogEntry(error));
1436
+ }
1410
1437
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1411
1438
  evaluatorScores.push({
1412
1439
  evaluatorId,
1413
1440
  scores: [],
1414
- passed: false
1441
+ passed: false,
1442
+ logs: logs.length > 0 ? logs : void 0
1415
1443
  });
1416
1444
  }
1417
1445
  }