@m4trix/evals 0.19.0 → 0.20.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -986,6 +986,8 @@ function createDiffString(expected, actual, diffOptions) {
986
986
  function formatLogMessage(msg) {
987
987
  if (typeof msg === "string")
988
988
  return msg;
989
+ if (msg instanceof Error)
990
+ return msg.stack ?? msg.message;
989
991
  try {
990
992
  if (msg !== null && typeof msg === "object") {
991
993
  return JSON.stringify(msg, null, 2);
@@ -1288,6 +1290,7 @@ function toNumericScore(value) {
1288
1290
  }
1289
1291
 
1290
1292
  // src/runner/execution.ts
1293
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
1291
1294
  function computeEvaluatorPassed(evaluator, result, scores) {
1292
1295
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1293
1296
  if (scoresWithPassed.length > 0) {
@@ -1344,20 +1347,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1344
1347
  if (!evaluateFn) {
1345
1348
  continue;
1346
1349
  }
1350
+ const logs = [];
1351
+ const logDiff = (expected, actual, options) => {
1352
+ logs.push(createDiffLogEntry(expected, actual, options));
1353
+ };
1354
+ const log = (message, options) => {
1355
+ logs.push(createLogEntry(message, options));
1356
+ };
1357
+ const createError = (message, options) => {
1358
+ const entry = createLogEntry(message, options);
1359
+ const error = message instanceof Error ? message : new Error(entry.message);
1360
+ error[evaluatorErrorLogEntryKey] = entry;
1361
+ return error;
1362
+ };
1347
1363
  try {
1348
- const logs = [];
1349
- const logDiff = (expected, actual, options) => {
1350
- logs.push(createDiffLogEntry(expected, actual, options));
1351
- };
1352
- const log = (message, options) => {
1353
- logs.push(createLogEntry(message, options));
1354
- };
1355
1364
  const ctx = yield* Effect.promise(
1356
1365
  () => Promise.resolve(evaluator.resolveContext())
1357
1366
  );
1358
1367
  const result = yield* Effect.promise(
1359
- () => Promise.resolve(
1360
- evaluateFn({
1368
+ () => Promise.resolve().then(
1369
+ () => evaluateFn({
1361
1370
  input: testCaseItem.testCase.getInput(),
1362
1371
  ctx,
1363
1372
  output,
@@ -1367,10 +1376,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1367
1376
  datasetId: task.datasetId
1368
1377
  },
1369
1378
  logDiff,
1370
- log
1379
+ log,
1380
+ createError
1371
1381
  })
1372
1382
  )
1373
1383
  );
1384
+ if (result instanceof Error) {
1385
+ const evaluatorError = result;
1386
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1387
+ logs.push(taggedEntry ?? createLogEntry(result));
1388
+ testCaseError = result.message;
1389
+ evaluatorScores.push({
1390
+ evaluatorId,
1391
+ scores: [],
1392
+ passed: false,
1393
+ logs: logs.length > 0 ? logs : void 0
1394
+ });
1395
+ continue;
1396
+ }
1374
1397
  const { scores, metrics } = normalizeResult(result);
1375
1398
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1376
1399
  evaluatorScores.push({
@@ -1381,11 +1404,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1381
1404
  logs: logs.length > 0 ? logs : void 0
1382
1405
  });
1383
1406
  } catch (error) {
1407
+ if (error instanceof Error) {
1408
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1409
+ logs.push(taggedEntry ?? createLogEntry(error));
1410
+ }
1384
1411
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1385
1412
  evaluatorScores.push({
1386
1413
  evaluatorId,
1387
1414
  scores: [],
1388
- passed: false
1415
+ passed: false,
1416
+ logs: logs.length > 0 ? logs : void 0
1389
1417
  });
1390
1418
  }
1391
1419
  }