@m4trix/evals 0.19.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -738,6 +738,8 @@ function createDiffString(expected, actual, diffOptions) {
738
738
  function formatLogMessage(msg) {
739
739
  if (typeof msg === "string")
740
740
  return msg;
741
+ if (msg instanceof Error)
742
+ return msg.stack ?? msg.message;
741
743
  try {
742
744
  if (msg !== null && typeof msg === "object") {
743
745
  return JSON.stringify(msg, null, 2);
@@ -1084,6 +1086,7 @@ function toNumericScore(value) {
1084
1086
  }
1085
1087
 
1086
1088
  // src/runner/execution.ts
1089
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
1087
1090
  function computeEvaluatorPassed(evaluator, result, scores) {
1088
1091
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1089
1092
  if (scoresWithPassed.length > 0) {
@@ -1125,13 +1128,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1125
1128
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1126
1129
  );
1127
1130
  }
1128
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1131
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1129
1132
  return effect.Effect.gen(function* () {
1130
1133
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1131
1134
  const rerunPassed = [];
1132
1135
  for (let r = 0; r < reruns; r++) {
1133
1136
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
1134
1137
  const started = Date.now();
1138
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
1139
+ n + 1,
1140
+ n + 1
1141
+ ]);
1142
+ yield* publishEvent({
1143
+ type: "TestCaseStarted",
1144
+ runId: task.runId,
1145
+ testCaseId: testCaseItem.id,
1146
+ testCaseName: testCaseItem.testCase.getName(),
1147
+ startedTestCases: startedEvaluations,
1148
+ totalTestCases: totalEvaluations,
1149
+ rerunIndex: r + 1,
1150
+ rerunTotal: reruns
1151
+ });
1135
1152
  const evaluatorScores = [];
1136
1153
  let testCaseError;
1137
1154
  const output = readOutput(testCaseItem.testCase);
@@ -1140,20 +1157,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1140
1157
  if (!evaluateFn) {
1141
1158
  continue;
1142
1159
  }
1160
+ const logs = [];
1161
+ const logDiff = (expected, actual, options) => {
1162
+ logs.push(createDiffLogEntry(expected, actual, options));
1163
+ };
1164
+ const log = (message, options) => {
1165
+ logs.push(createLogEntry(message, options));
1166
+ };
1167
+ const createError = (message, options) => {
1168
+ const entry = createLogEntry(message, options);
1169
+ const error = message instanceof Error ? message : new Error(entry.message);
1170
+ error[evaluatorErrorLogEntryKey] = entry;
1171
+ return error;
1172
+ };
1143
1173
  try {
1144
- const logs = [];
1145
- const logDiff = (expected, actual, options) => {
1146
- logs.push(createDiffLogEntry(expected, actual, options));
1147
- };
1148
- const log = (message, options) => {
1149
- logs.push(createLogEntry(message, options));
1150
- };
1151
1174
  const ctx = yield* effect.Effect.promise(
1152
1175
  () => Promise.resolve(evaluator.resolveContext())
1153
1176
  );
1154
1177
  const result = yield* effect.Effect.promise(
1155
- () => Promise.resolve(
1156
- evaluateFn({
1178
+ () => Promise.resolve().then(
1179
+ () => evaluateFn({
1157
1180
  input: testCaseItem.testCase.getInput(),
1158
1181
  ctx,
1159
1182
  output,
@@ -1163,10 +1186,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1163
1186
  datasetId: task.datasetId
1164
1187
  },
1165
1188
  logDiff,
1166
- log
1189
+ log,
1190
+ createError
1167
1191
  })
1168
1192
  )
1169
1193
  );
1194
+ if (result instanceof Error) {
1195
+ const evaluatorError = result;
1196
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1197
+ logs.push(taggedEntry ?? createLogEntry(result));
1198
+ testCaseError = result.message;
1199
+ evaluatorScores.push({
1200
+ evaluatorId,
1201
+ scores: [],
1202
+ passed: false,
1203
+ logs: logs.length > 0 ? logs : void 0
1204
+ });
1205
+ continue;
1206
+ }
1170
1207
  const { scores, metrics } = normalizeResult(result);
1171
1208
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1172
1209
  evaluatorScores.push({
@@ -1177,11 +1214,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1177
1214
  logs: logs.length > 0 ? logs : void 0
1178
1215
  });
1179
1216
  } catch (error) {
1217
+ if (error instanceof Error) {
1218
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1219
+ logs.push(taggedEntry ?? createLogEntry(error));
1220
+ }
1180
1221
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1181
1222
  evaluatorScores.push({
1182
1223
  evaluatorId,
1183
1224
  scores: [],
1184
- passed: false
1225
+ passed: false,
1226
+ logs: logs.length > 0 ? logs : void 0
1185
1227
  });
1186
1228
  }
1187
1229
  }
@@ -1252,6 +1294,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1252
1294
  );
1253
1295
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1254
1296
  const completedRef = yield* effect.Ref.make(0);
1297
+ const startedRef = yield* effect.Ref.make(0);
1255
1298
  const passedRef = yield* effect.Ref.make(0);
1256
1299
  const failedRef = yield* effect.Ref.make(0);
1257
1300
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -1261,6 +1304,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1261
1304
  publishEvent,
1262
1305
  persistenceQueue,
1263
1306
  updateSnapshot,
1307
+ startedRef,
1264
1308
  completedRef,
1265
1309
  passedRef,
1266
1310
  failedRef