@m4trix/evals 0.19.0 → 0.21.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -986,6 +986,8 @@ function createDiffString(expected, actual, diffOptions) {
986
986
  function formatLogMessage(msg) {
987
987
  if (typeof msg === "string")
988
988
  return msg;
989
+ if (msg instanceof Error)
990
+ return msg.stack ?? msg.message;
989
991
  try {
990
992
  if (msg !== null && typeof msg === "object") {
991
993
  return JSON.stringify(msg, null, 2);
@@ -1288,6 +1290,7 @@ function toNumericScore(value) {
1288
1290
  }
1289
1291
 
1290
1292
  // src/runner/execution.ts
1293
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
1291
1294
  function computeEvaluatorPassed(evaluator, result, scores) {
1292
1295
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1293
1296
  if (scoresWithPassed.length > 0) {
@@ -1329,13 +1332,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1329
1332
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1330
1333
  );
1331
1334
  }
1332
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1335
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1333
1336
  return Effect.gen(function* () {
1334
1337
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1335
1338
  const rerunPassed = [];
1336
1339
  for (let r = 0; r < reruns; r++) {
1337
1340
  const evaluatorRunId = `run-${randomUUID()}`;
1338
1341
  const started = Date.now();
1342
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1343
+ n + 1,
1344
+ n + 1
1345
+ ]);
1346
+ yield* publishEvent({
1347
+ type: "TestCaseStarted",
1348
+ runId: task.runId,
1349
+ testCaseId: testCaseItem.id,
1350
+ testCaseName: testCaseItem.testCase.getName(),
1351
+ startedTestCases: startedEvaluations,
1352
+ totalTestCases: totalEvaluations,
1353
+ rerunIndex: r + 1,
1354
+ rerunTotal: reruns
1355
+ });
1339
1356
  const evaluatorScores = [];
1340
1357
  let testCaseError;
1341
1358
  const output = readOutput(testCaseItem.testCase);
@@ -1344,20 +1361,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1344
1361
  if (!evaluateFn) {
1345
1362
  continue;
1346
1363
  }
1364
+ const logs = [];
1365
+ const logDiff = (expected, actual, options) => {
1366
+ logs.push(createDiffLogEntry(expected, actual, options));
1367
+ };
1368
+ const log = (message, options) => {
1369
+ logs.push(createLogEntry(message, options));
1370
+ };
1371
+ const createError = (message, options) => {
1372
+ const entry = createLogEntry(message, options);
1373
+ const error = message instanceof Error ? message : new Error(entry.message);
1374
+ error[evaluatorErrorLogEntryKey] = entry;
1375
+ return error;
1376
+ };
1347
1377
  try {
1348
- const logs = [];
1349
- const logDiff = (expected, actual, options) => {
1350
- logs.push(createDiffLogEntry(expected, actual, options));
1351
- };
1352
- const log = (message, options) => {
1353
- logs.push(createLogEntry(message, options));
1354
- };
1355
1378
  const ctx = yield* Effect.promise(
1356
1379
  () => Promise.resolve(evaluator.resolveContext())
1357
1380
  );
1358
1381
  const result = yield* Effect.promise(
1359
- () => Promise.resolve(
1360
- evaluateFn({
1382
+ () => Promise.resolve().then(
1383
+ () => evaluateFn({
1361
1384
  input: testCaseItem.testCase.getInput(),
1362
1385
  ctx,
1363
1386
  output,
@@ -1367,10 +1390,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1367
1390
  datasetId: task.datasetId
1368
1391
  },
1369
1392
  logDiff,
1370
- log
1393
+ log,
1394
+ createError
1371
1395
  })
1372
1396
  )
1373
1397
  );
1398
+ if (result instanceof Error) {
1399
+ const evaluatorError = result;
1400
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1401
+ logs.push(taggedEntry ?? createLogEntry(result));
1402
+ testCaseError = result.message;
1403
+ evaluatorScores.push({
1404
+ evaluatorId,
1405
+ scores: [],
1406
+ passed: false,
1407
+ logs: logs.length > 0 ? logs : void 0
1408
+ });
1409
+ continue;
1410
+ }
1374
1411
  const { scores, metrics } = normalizeResult(result);
1375
1412
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1376
1413
  evaluatorScores.push({
@@ -1381,11 +1418,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1381
1418
  logs: logs.length > 0 ? logs : void 0
1382
1419
  });
1383
1420
  } catch (error) {
1421
+ if (error instanceof Error) {
1422
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1423
+ logs.push(taggedEntry ?? createLogEntry(error));
1424
+ }
1384
1425
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1385
1426
  evaluatorScores.push({
1386
1427
  evaluatorId,
1387
1428
  scores: [],
1388
- passed: false
1429
+ passed: false,
1430
+ logs: logs.length > 0 ? logs : void 0
1389
1431
  });
1390
1432
  }
1391
1433
  }
@@ -1456,6 +1498,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1456
1498
  );
1457
1499
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1458
1500
  const completedRef = yield* Ref.make(0);
1501
+ const startedRef = yield* Ref.make(0);
1459
1502
  const passedRef = yield* Ref.make(0);
1460
1503
  const failedRef = yield* Ref.make(0);
1461
1504
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -1465,6 +1508,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1465
1508
  publishEvent,
1466
1509
  persistenceQueue,
1467
1510
  updateSnapshot,
1511
+ startedRef,
1468
1512
  completedRef,
1469
1513
  passedRef,
1470
1514
  failedRef