@m4trix/evals 0.19.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -257,8 +257,15 @@ interface EvaluateArgs<TInput, TOutput = unknown, TCtx = Record<string, never>>
257
257
  log: (message: unknown, options?: {
258
258
  label?: string;
259
259
  }) => void;
260
+ /**
261
+ * Creates an Error from string/object payloads for `return createError(...)` (or `throw createError(...)`).
262
+ * The payload is also logged and shown by the CLI when the evaluator fails.
263
+ */
264
+ createError: (message: unknown, options?: {
265
+ label?: string;
266
+ }) => Error;
260
267
  }
261
- type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Promise<TScore>;
268
+ type EvaluateFn<TInput, TOutput, TScore, TCtx> = (args: EvaluateArgs<TInput, TOutput, TCtx>) => TScore | Error | Promise<TScore | Error>;
262
269
  interface EvaluatorDefineConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any, TS extends Schema.Schema.Any> {
263
270
  name: string;
264
271
  inputSchema: TI;
@@ -430,6 +437,15 @@ type RunnerEvent = {
430
437
  type: 'RunStarted';
431
438
  runId: string;
432
439
  startedAt: number;
440
+ } | {
441
+ type: 'TestCaseStarted';
442
+ runId: string;
443
+ testCaseId: string;
444
+ testCaseName: string;
445
+ startedTestCases: number;
446
+ totalTestCases: number;
447
+ rerunIndex: number;
448
+ rerunTotal: number;
433
449
  } | {
434
450
  type: 'TestCaseProgress';
435
451
  runId: string;
package/dist/index.js CHANGED
@@ -716,6 +716,8 @@ function createDiffString(expected, actual, diffOptions) {
716
716
  function formatLogMessage(msg) {
717
717
  if (typeof msg === "string")
718
718
  return msg;
719
+ if (msg instanceof Error)
720
+ return msg.stack ?? msg.message;
719
721
  try {
720
722
  if (msg !== null && typeof msg === "object") {
721
723
  return JSON.stringify(msg, null, 2);
@@ -1062,6 +1064,7 @@ function toNumericScore(value) {
1062
1064
  }
1063
1065
 
1064
1066
  // src/runner/execution.ts
1067
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
1065
1068
  function computeEvaluatorPassed(evaluator, result, scores) {
1066
1069
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1067
1070
  if (scoresWithPassed.length > 0) {
@@ -1103,13 +1106,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1103
1106
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1104
1107
  );
1105
1108
  }
1106
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1109
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1107
1110
  return Effect.gen(function* () {
1108
1111
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1109
1112
  const rerunPassed = [];
1110
1113
  for (let r = 0; r < reruns; r++) {
1111
1114
  const evaluatorRunId = `run-${randomUUID()}`;
1112
1115
  const started = Date.now();
1116
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1117
+ n + 1,
1118
+ n + 1
1119
+ ]);
1120
+ yield* publishEvent({
1121
+ type: "TestCaseStarted",
1122
+ runId: task.runId,
1123
+ testCaseId: testCaseItem.id,
1124
+ testCaseName: testCaseItem.testCase.getName(),
1125
+ startedTestCases: startedEvaluations,
1126
+ totalTestCases: totalEvaluations,
1127
+ rerunIndex: r + 1,
1128
+ rerunTotal: reruns
1129
+ });
1113
1130
  const evaluatorScores = [];
1114
1131
  let testCaseError;
1115
1132
  const output = readOutput(testCaseItem.testCase);
@@ -1118,20 +1135,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1118
1135
  if (!evaluateFn) {
1119
1136
  continue;
1120
1137
  }
1138
+ const logs = [];
1139
+ const logDiff = (expected, actual, options) => {
1140
+ logs.push(createDiffLogEntry(expected, actual, options));
1141
+ };
1142
+ const log = (message, options) => {
1143
+ logs.push(createLogEntry(message, options));
1144
+ };
1145
+ const createError = (message, options) => {
1146
+ const entry = createLogEntry(message, options);
1147
+ const error = message instanceof Error ? message : new Error(entry.message);
1148
+ error[evaluatorErrorLogEntryKey] = entry;
1149
+ return error;
1150
+ };
1121
1151
  try {
1122
- const logs = [];
1123
- const logDiff = (expected, actual, options) => {
1124
- logs.push(createDiffLogEntry(expected, actual, options));
1125
- };
1126
- const log = (message, options) => {
1127
- logs.push(createLogEntry(message, options));
1128
- };
1129
1152
  const ctx = yield* Effect.promise(
1130
1153
  () => Promise.resolve(evaluator.resolveContext())
1131
1154
  );
1132
1155
  const result = yield* Effect.promise(
1133
- () => Promise.resolve(
1134
- evaluateFn({
1156
+ () => Promise.resolve().then(
1157
+ () => evaluateFn({
1135
1158
  input: testCaseItem.testCase.getInput(),
1136
1159
  ctx,
1137
1160
  output,
@@ -1141,10 +1164,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1141
1164
  datasetId: task.datasetId
1142
1165
  },
1143
1166
  logDiff,
1144
- log
1167
+ log,
1168
+ createError
1145
1169
  })
1146
1170
  )
1147
1171
  );
1172
+ if (result instanceof Error) {
1173
+ const evaluatorError = result;
1174
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1175
+ logs.push(taggedEntry ?? createLogEntry(result));
1176
+ testCaseError = result.message;
1177
+ evaluatorScores.push({
1178
+ evaluatorId,
1179
+ scores: [],
1180
+ passed: false,
1181
+ logs: logs.length > 0 ? logs : void 0
1182
+ });
1183
+ continue;
1184
+ }
1148
1185
  const { scores, metrics } = normalizeResult(result);
1149
1186
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1150
1187
  evaluatorScores.push({
@@ -1155,11 +1192,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1155
1192
  logs: logs.length > 0 ? logs : void 0
1156
1193
  });
1157
1194
  } catch (error) {
1195
+ if (error instanceof Error) {
1196
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1197
+ logs.push(taggedEntry ?? createLogEntry(error));
1198
+ }
1158
1199
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1159
1200
  evaluatorScores.push({
1160
1201
  evaluatorId,
1161
1202
  scores: [],
1162
- passed: false
1203
+ passed: false,
1204
+ logs: logs.length > 0 ? logs : void 0
1163
1205
  });
1164
1206
  }
1165
1207
  }
@@ -1230,6 +1272,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1230
1272
  );
1231
1273
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1232
1274
  const completedRef = yield* Ref.make(0);
1275
+ const startedRef = yield* Ref.make(0);
1233
1276
  const passedRef = yield* Ref.make(0);
1234
1277
  const failedRef = yield* Ref.make(0);
1235
1278
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -1239,6 +1282,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1239
1282
  publishEvent,
1240
1283
  persistenceQueue,
1241
1284
  updateSnapshot,
1285
+ startedRef,
1242
1286
  completedRef,
1243
1287
  passedRef,
1244
1288
  failedRef