@m4trix/evals 0.19.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -77,8 +77,15 @@ export const myEvaluator = Evaluator.define({
77
77
  inputSchema,
78
78
  outputSchema: S.Unknown,
79
79
  scoreSchema: S.Struct({ scores: S.Array(S.Unknown) }),
80
- }).evaluate(async ({ input, ctx: _ctx, output }) => {
80
+ }).evaluate(async ({ input, ctx: _ctx, output, createError }) => {
81
81
  const start = Date.now();
82
+ const value = 85;
83
+ if (value < 50) {
84
+ return createError(
85
+ { reason: 'score below minimum', value, prompt: input.prompt, output },
86
+ { label: 'quality-check' },
87
+ );
88
+ }
82
89
  const latencyMs = Date.now() - start;
83
90
  const minScore =
84
91
  typeof output === 'object' &&
@@ -90,7 +97,7 @@ export const myEvaluator = Evaluator.define({
90
97
  return {
91
98
  scores: [
92
99
  percentScore.make(
93
- { value: 85 },
100
+ { value },
94
101
  { definePassed: (d) => d.value >= (minScore ?? 50) },
95
102
  ),
96
103
  ],
@@ -294,6 +294,8 @@ function createDiffString(expected, actual, diffOptions) {
294
294
  function formatLogMessage(msg) {
295
295
  if (typeof msg === "string")
296
296
  return msg;
297
+ if (msg instanceof Error)
298
+ return msg.stack ?? msg.message;
297
299
  try {
298
300
  if (msg !== null && typeof msg === "object") {
299
301
  return JSON.stringify(msg, null, 2);
@@ -633,6 +635,7 @@ function toNumericScore(value) {
633
635
  }
634
636
 
635
637
  // src/runner/execution.ts
638
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
636
639
  function computeEvaluatorPassed(evaluator, result, scores) {
637
640
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
638
641
  if (scoresWithPassed.length > 0) {
@@ -689,20 +692,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
689
692
  if (!evaluateFn) {
690
693
  continue;
691
694
  }
695
+ const logs = [];
696
+ const logDiff = (expected, actual, options) => {
697
+ logs.push(createDiffLogEntry(expected, actual, options));
698
+ };
699
+ const log = (message, options) => {
700
+ logs.push(createLogEntry(message, options));
701
+ };
702
+ const createError = (message, options) => {
703
+ const entry = createLogEntry(message, options);
704
+ const error = message instanceof Error ? message : new Error(entry.message);
705
+ error[evaluatorErrorLogEntryKey] = entry;
706
+ return error;
707
+ };
692
708
  try {
693
- const logs = [];
694
- const logDiff = (expected, actual, options) => {
695
- logs.push(createDiffLogEntry(expected, actual, options));
696
- };
697
- const log = (message, options) => {
698
- logs.push(createLogEntry(message, options));
699
- };
700
709
  const ctx = yield* effect.Effect.promise(
701
710
  () => Promise.resolve(evaluator.resolveContext())
702
711
  );
703
712
  const result = yield* effect.Effect.promise(
704
- () => Promise.resolve(
705
- evaluateFn({
713
+ () => Promise.resolve().then(
714
+ () => evaluateFn({
706
715
  input: testCaseItem.testCase.getInput(),
707
716
  ctx,
708
717
  output,
@@ -712,10 +721,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
712
721
  datasetId: task.datasetId
713
722
  },
714
723
  logDiff,
715
- log
724
+ log,
725
+ createError
716
726
  })
717
727
  )
718
728
  );
729
+ if (result instanceof Error) {
730
+ const evaluatorError = result;
731
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
732
+ logs.push(taggedEntry ?? createLogEntry(result));
733
+ testCaseError = result.message;
734
+ evaluatorScores.push({
735
+ evaluatorId,
736
+ scores: [],
737
+ passed: false,
738
+ logs: logs.length > 0 ? logs : void 0
739
+ });
740
+ continue;
741
+ }
719
742
  const { scores, metrics } = normalizeResult(result);
720
743
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
721
744
  evaluatorScores.push({
@@ -726,11 +749,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
726
749
  logs: logs.length > 0 ? logs : void 0
727
750
  });
728
751
  } catch (error) {
752
+ if (error instanceof Error) {
753
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
754
+ logs.push(taggedEntry ?? createLogEntry(error));
755
+ }
729
756
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
730
757
  evaluatorScores.push({
731
758
  evaluatorId,
732
759
  scores: [],
733
- passed: false
760
+ passed: false,
761
+ logs: logs.length > 0 ? logs : void 0
734
762
  });
735
763
  }
736
764
  }
@@ -1714,6 +1742,7 @@ function RunView({
1714
1742
  rerunTotal: event.rerunTotal,
1715
1743
  durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1716
1744
  passed: events.every((e) => e.passed),
1745
+ errorMessage: event.errorMessage,
1717
1746
  events,
1718
1747
  aggregatedEvaluatorScores,
1719
1748
  isAggregated
@@ -1824,8 +1853,13 @@ function RunView({
1824
1853
  " (",
1825
1854
  tc.durationMs,
1826
1855
  "ms)"
1827
- ] })
1856
+ ] }),
1857
+ tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "red", bold: true, children: [
1858
+ " ",
1859
+ "ERROR"
1860
+ ] }) : null
1828
1861
  ] }),
1862
+ tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: tc.errorMessage }) : null,
1829
1863
  tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
1830
1864
  ink.Box,
1831
1865
  {
@@ -2363,9 +2397,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2363
2397
  0
2364
2398
  );
2365
2399
  const lines = [];
2400
+ const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
2366
2401
  lines.push(
2367
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
2402
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
2368
2403
  );
2404
+ if (event.errorMessage) {
2405
+ lines.push(colorize(event.errorMessage, ansi2.red));
2406
+ }
2369
2407
  for (const item of aggregatedScores) {
2370
2408
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2371
2409
  lines.push(