@m4trix/evals 0.19.0 → 0.20.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -738,6 +738,8 @@ function createDiffString(expected, actual, diffOptions) {
738
738
  function formatLogMessage(msg) {
739
739
  if (typeof msg === "string")
740
740
  return msg;
741
+ if (msg instanceof Error)
742
+ return msg.stack ?? msg.message;
741
743
  try {
742
744
  if (msg !== null && typeof msg === "object") {
743
745
  return JSON.stringify(msg, null, 2);
@@ -1084,6 +1086,7 @@ function toNumericScore(value) {
1084
1086
  }
1085
1087
 
1086
1088
  // src/runner/execution.ts
1089
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
1087
1090
  function computeEvaluatorPassed(evaluator, result, scores) {
1088
1091
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1089
1092
  if (scoresWithPassed.length > 0) {
@@ -1140,20 +1143,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1140
1143
  if (!evaluateFn) {
1141
1144
  continue;
1142
1145
  }
1146
+ const logs = [];
1147
+ const logDiff = (expected, actual, options) => {
1148
+ logs.push(createDiffLogEntry(expected, actual, options));
1149
+ };
1150
+ const log = (message, options) => {
1151
+ logs.push(createLogEntry(message, options));
1152
+ };
1153
+ const createError = (message, options) => {
1154
+ const entry = createLogEntry(message, options);
1155
+ const error = message instanceof Error ? message : new Error(entry.message);
1156
+ error[evaluatorErrorLogEntryKey] = entry;
1157
+ return error;
1158
+ };
1143
1159
  try {
1144
- const logs = [];
1145
- const logDiff = (expected, actual, options) => {
1146
- logs.push(createDiffLogEntry(expected, actual, options));
1147
- };
1148
- const log = (message, options) => {
1149
- logs.push(createLogEntry(message, options));
1150
- };
1151
1160
  const ctx = yield* effect.Effect.promise(
1152
1161
  () => Promise.resolve(evaluator.resolveContext())
1153
1162
  );
1154
1163
  const result = yield* effect.Effect.promise(
1155
- () => Promise.resolve(
1156
- evaluateFn({
1164
+ () => Promise.resolve().then(
1165
+ () => evaluateFn({
1157
1166
  input: testCaseItem.testCase.getInput(),
1158
1167
  ctx,
1159
1168
  output,
@@ -1163,10 +1172,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1163
1172
  datasetId: task.datasetId
1164
1173
  },
1165
1174
  logDiff,
1166
- log
1175
+ log,
1176
+ createError
1167
1177
  })
1168
1178
  )
1169
1179
  );
1180
+ if (result instanceof Error) {
1181
+ const evaluatorError = result;
1182
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1183
+ logs.push(taggedEntry ?? createLogEntry(result));
1184
+ testCaseError = result.message;
1185
+ evaluatorScores.push({
1186
+ evaluatorId,
1187
+ scores: [],
1188
+ passed: false,
1189
+ logs: logs.length > 0 ? logs : void 0
1190
+ });
1191
+ continue;
1192
+ }
1170
1193
  const { scores, metrics } = normalizeResult(result);
1171
1194
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1172
1195
  evaluatorScores.push({
@@ -1177,11 +1200,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1177
1200
  logs: logs.length > 0 ? logs : void 0
1178
1201
  });
1179
1202
  } catch (error) {
1203
+ if (error instanceof Error) {
1204
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1205
+ logs.push(taggedEntry ?? createLogEntry(error));
1206
+ }
1180
1207
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1181
1208
  evaluatorScores.push({
1182
1209
  evaluatorId,
1183
1210
  scores: [],
1184
- passed: false
1211
+ passed: false,
1212
+ logs: logs.length > 0 ? logs : void 0
1185
1213
  });
1186
1214
  }
1187
1215
  }