@m4trix/evals 0.19.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -257,8 +257,15 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
257
257
  log: (message: unknown, options?: {
258
258
  label?: string;
259
259
  }) => void;
260
+ /**
261
+ * Creates an Error from string/object payloads for `return createError(...)` (or `throw createError(...)`).
262
+ * The payload is also logged and shown by the CLI when the evaluator fails.
263
+ */
264
+ createError: (message: unknown, options?: {
265
+ label?: string;
266
+ }) => Error;
260
267
  }
261
- type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Promise<TScore>;
268
+ type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Error | Promise<TScore | Error>;
262
269
  interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
263
270
  name: string;
264
271
  inputSchema: TI;
package/dist/index.js CHANGED
@@ -716,6 +716,8 @@ function createDiffString(expected, actual, diffOptions) {
716
716
  function formatLogMessage(msg) {
717
717
  if (typeof msg === "string")
718
718
  return msg;
719
+ if (msg instanceof Error)
720
+ return msg.stack ?? msg.message;
719
721
  try {
720
722
  if (msg !== null && typeof msg === "object") {
721
723
  return JSON.stringify(msg, null, 2);
@@ -1062,6 +1064,7 @@ function toNumericScore(value) {
1062
1064
  }
1063
1065
 
1064
1066
  // src/runner/execution.ts
1067
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
1065
1068
  function computeEvaluatorPassed(evaluator, result, scores) {
1066
1069
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1067
1070
  if (scoresWithPassed.length > 0) {
@@ -1118,20 +1121,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1118
1121
  if (!evaluateFn) {
1119
1122
  continue;
1120
1123
  }
1124
+ const logs = [];
1125
+ const logDiff = (expected, actual, options) => {
1126
+ logs.push(createDiffLogEntry(expected, actual, options));
1127
+ };
1128
+ const log = (message, options) => {
1129
+ logs.push(createLogEntry(message, options));
1130
+ };
1131
+ const createError = (message, options) => {
1132
+ const entry = createLogEntry(message, options);
1133
+ const error = message instanceof Error ? message : new Error(entry.message);
1134
+ error[evaluatorErrorLogEntryKey] = entry;
1135
+ return error;
1136
+ };
1121
1137
  try {
1122
- const logs = [];
1123
- const logDiff = (expected, actual, options) => {
1124
- logs.push(createDiffLogEntry(expected, actual, options));
1125
- };
1126
- const log = (message, options) => {
1127
- logs.push(createLogEntry(message, options));
1128
- };
1129
1138
  const ctx = yield* Effect.promise(
1130
1139
  () => Promise.resolve(evaluator.resolveContext())
1131
1140
  );
1132
1141
  const result = yield* Effect.promise(
1133
- () => Promise.resolve(
1134
- evaluateFn({
1142
+ () => Promise.resolve().then(
1143
+ () => evaluateFn({
1135
1144
  input: testCaseItem.testCase.getInput(),
1136
1145
  ctx,
1137
1146
  output,
@@ -1141,10 +1150,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1141
1150
  datasetId: task.datasetId
1142
1151
  },
1143
1152
  logDiff,
1144
- log
1153
+ log,
1154
+ createError
1145
1155
  })
1146
1156
  )
1147
1157
  );
1158
+ if (result instanceof Error) {
1159
+ const evaluatorError = result;
1160
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1161
+ logs.push(taggedEntry ?? createLogEntry(result));
1162
+ testCaseError = result.message;
1163
+ evaluatorScores.push({
1164
+ evaluatorId,
1165
+ scores: [],
1166
+ passed: false,
1167
+ logs: logs.length > 0 ? logs : void 0
1168
+ });
1169
+ continue;
1170
+ }
1148
1171
  const { scores, metrics } = normalizeResult(result);
1149
1172
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1150
1173
  evaluatorScores.push({
@@ -1155,11 +1178,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1155
1178
  logs: logs.length > 0 ? logs : void 0
1156
1179
  });
1157
1180
  } catch (error) {
1181
+ if (error instanceof Error) {
1182
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1183
+ logs.push(taggedEntry ?? createLogEntry(error));
1184
+ }
1158
1185
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1159
1186
  evaluatorScores.push({
1160
1187
  evaluatorId,
1161
1188
  scores: [],
1162
- passed: false
1189
+ passed: false,
1190
+ logs: logs.length > 0 ? logs : void 0
1163
1191
  });
1164
1192
  }
1165
1193
  }