@m4trix/evals 0.19.0 → 0.20.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -268,6 +268,8 @@ function createDiffString(expected, actual, diffOptions) {
268
268
  function formatLogMessage(msg) {
269
269
  if (typeof msg === "string")
270
270
  return msg;
271
+ if (msg instanceof Error)
272
+ return msg.stack ?? msg.message;
271
273
  try {
272
274
  if (msg !== null && typeof msg === "object") {
273
275
  return JSON.stringify(msg, null, 2);
@@ -607,6 +609,7 @@ function toNumericScore(value) {
607
609
  }
608
610
 
609
611
  // src/runner/execution.ts
612
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
610
613
  function computeEvaluatorPassed(evaluator, result, scores) {
611
614
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
612
615
  if (scoresWithPassed.length > 0) {
@@ -663,20 +666,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
663
666
  if (!evaluateFn) {
664
667
  continue;
665
668
  }
669
+ const logs = [];
670
+ const logDiff = (expected, actual, options) => {
671
+ logs.push(createDiffLogEntry(expected, actual, options));
672
+ };
673
+ const log = (message, options) => {
674
+ logs.push(createLogEntry(message, options));
675
+ };
676
+ const createError = (message, options) => {
677
+ const entry = createLogEntry(message, options);
678
+ const error = message instanceof Error ? message : new Error(entry.message);
679
+ error[evaluatorErrorLogEntryKey] = entry;
680
+ return error;
681
+ };
666
682
  try {
667
- const logs = [];
668
- const logDiff = (expected, actual, options) => {
669
- logs.push(createDiffLogEntry(expected, actual, options));
670
- };
671
- const log = (message, options) => {
672
- logs.push(createLogEntry(message, options));
673
- };
674
683
  const ctx = yield* Effect.promise(
675
684
  () => Promise.resolve(evaluator.resolveContext())
676
685
  );
677
686
  const result = yield* Effect.promise(
678
- () => Promise.resolve(
679
- evaluateFn({
687
+ () => Promise.resolve().then(
688
+ () => evaluateFn({
680
689
  input: testCaseItem.testCase.getInput(),
681
690
  ctx,
682
691
  output,
@@ -686,10 +695,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
686
695
  datasetId: task.datasetId
687
696
  },
688
697
  logDiff,
689
- log
698
+ log,
699
+ createError
690
700
  })
691
701
  )
692
702
  );
703
+ if (result instanceof Error) {
704
+ const evaluatorError = result;
705
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
706
+ logs.push(taggedEntry ?? createLogEntry(result));
707
+ testCaseError = result.message;
708
+ evaluatorScores.push({
709
+ evaluatorId,
710
+ scores: [],
711
+ passed: false,
712
+ logs: logs.length > 0 ? logs : void 0
713
+ });
714
+ continue;
715
+ }
693
716
  const { scores, metrics } = normalizeResult(result);
694
717
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
695
718
  evaluatorScores.push({
@@ -700,11 +723,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
700
723
  logs: logs.length > 0 ? logs : void 0
701
724
  });
702
725
  } catch (error) {
726
+ if (error instanceof Error) {
727
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
728
+ logs.push(taggedEntry ?? createLogEntry(error));
729
+ }
703
730
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
704
731
  evaluatorScores.push({
705
732
  evaluatorId,
706
733
  scores: [],
707
- passed: false
734
+ passed: false,
735
+ logs: logs.length > 0 ? logs : void 0
708
736
  });
709
737
  }
710
738
  }
@@ -1688,6 +1716,7 @@ function RunView({
1688
1716
  rerunTotal: event.rerunTotal,
1689
1717
  durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1690
1718
  passed: events.every((e) => e.passed),
1719
+ errorMessage: event.errorMessage,
1691
1720
  events,
1692
1721
  aggregatedEvaluatorScores,
1693
1722
  isAggregated
@@ -1798,8 +1827,13 @@ function RunView({
1798
1827
  " (",
1799
1828
  tc.durationMs,
1800
1829
  "ms)"
1801
- ] })
1830
+ ] }),
1831
+ tc.errorMessage ? /* @__PURE__ */ jsxs(Text, { color: "red", bold: true, children: [
1832
+ " ",
1833
+ "ERROR"
1834
+ ] }) : null
1802
1835
  ] }),
1836
+ tc.errorMessage ? /* @__PURE__ */ jsx(Text, { color: "red", children: tc.errorMessage }) : null,
1803
1837
  tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
1804
1838
  Box,
1805
1839
  {
@@ -2337,9 +2371,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2337
2371
  0
2338
2372
  );
2339
2373
  const lines = [];
2374
+ const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
2340
2375
  lines.push(
2341
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
2376
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
2342
2377
  );
2378
+ if (event.errorMessage) {
2379
+ lines.push(colorize(event.errorMessage, ansi2.red));
2380
+ }
2343
2381
  for (const item of aggregatedScores) {
2344
2382
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2345
2383
  lines.push(