@m4trix/evals 0.19.0 → 0.21.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -1012,6 +1012,8 @@ function createDiffString(expected, actual, diffOptions) {
1012
1012
  function formatLogMessage(msg) {
1013
1013
  if (typeof msg === "string")
1014
1014
  return msg;
1015
+ if (msg instanceof Error)
1016
+ return msg.stack ?? msg.message;
1015
1017
  try {
1016
1018
  if (msg !== null && typeof msg === "object") {
1017
1019
  return JSON.stringify(msg, null, 2);
@@ -1314,6 +1316,7 @@ function toNumericScore(value) {
1314
1316
  }
1315
1317
 
1316
1318
  // src/runner/execution.ts
1319
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
1317
1320
  function computeEvaluatorPassed(evaluator, result, scores) {
1318
1321
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1319
1322
  if (scoresWithPassed.length > 0) {
@@ -1355,13 +1358,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1355
1358
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1356
1359
  );
1357
1360
  }
1358
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1361
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1359
1362
  return effect.Effect.gen(function* () {
1360
1363
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1361
1364
  const rerunPassed = [];
1362
1365
  for (let r = 0; r < reruns; r++) {
1363
1366
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
1364
1367
  const started = Date.now();
1368
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
1369
+ n + 1,
1370
+ n + 1
1371
+ ]);
1372
+ yield* publishEvent({
1373
+ type: "TestCaseStarted",
1374
+ runId: task.runId,
1375
+ testCaseId: testCaseItem.id,
1376
+ testCaseName: testCaseItem.testCase.getName(),
1377
+ startedTestCases: startedEvaluations,
1378
+ totalTestCases: totalEvaluations,
1379
+ rerunIndex: r + 1,
1380
+ rerunTotal: reruns
1381
+ });
1365
1382
  const evaluatorScores = [];
1366
1383
  let testCaseError;
1367
1384
  const output = readOutput(testCaseItem.testCase);
@@ -1370,20 +1387,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1370
1387
  if (!evaluateFn) {
1371
1388
  continue;
1372
1389
  }
1390
+ const logs = [];
1391
+ const logDiff = (expected, actual, options) => {
1392
+ logs.push(createDiffLogEntry(expected, actual, options));
1393
+ };
1394
+ const log = (message, options) => {
1395
+ logs.push(createLogEntry(message, options));
1396
+ };
1397
+ const createError = (message, options) => {
1398
+ const entry = createLogEntry(message, options);
1399
+ const error = message instanceof Error ? message : new Error(entry.message);
1400
+ error[evaluatorErrorLogEntryKey] = entry;
1401
+ return error;
1402
+ };
1373
1403
  try {
1374
- const logs = [];
1375
- const logDiff = (expected, actual, options) => {
1376
- logs.push(createDiffLogEntry(expected, actual, options));
1377
- };
1378
- const log = (message, options) => {
1379
- logs.push(createLogEntry(message, options));
1380
- };
1381
1404
  const ctx = yield* effect.Effect.promise(
1382
1405
  () => Promise.resolve(evaluator.resolveContext())
1383
1406
  );
1384
1407
  const result = yield* effect.Effect.promise(
1385
- () => Promise.resolve(
1386
- evaluateFn({
1408
+ () => Promise.resolve().then(
1409
+ () => evaluateFn({
1387
1410
  input: testCaseItem.testCase.getInput(),
1388
1411
  ctx,
1389
1412
  output,
@@ -1393,10 +1416,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1393
1416
  datasetId: task.datasetId
1394
1417
  },
1395
1418
  logDiff,
1396
- log
1419
+ log,
1420
+ createError
1397
1421
  })
1398
1422
  )
1399
1423
  );
1424
+ if (result instanceof Error) {
1425
+ const evaluatorError = result;
1426
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
1427
+ logs.push(taggedEntry ?? createLogEntry(result));
1428
+ testCaseError = result.message;
1429
+ evaluatorScores.push({
1430
+ evaluatorId,
1431
+ scores: [],
1432
+ passed: false,
1433
+ logs: logs.length > 0 ? logs : void 0
1434
+ });
1435
+ continue;
1436
+ }
1400
1437
  const { scores, metrics } = normalizeResult(result);
1401
1438
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1402
1439
  evaluatorScores.push({
@@ -1407,11 +1444,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
1407
1444
  logs: logs.length > 0 ? logs : void 0
1408
1445
  });
1409
1446
  } catch (error) {
1447
+ if (error instanceof Error) {
1448
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
1449
+ logs.push(taggedEntry ?? createLogEntry(error));
1450
+ }
1410
1451
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1411
1452
  evaluatorScores.push({
1412
1453
  evaluatorId,
1413
1454
  scores: [],
1414
- passed: false
1455
+ passed: false,
1456
+ logs: logs.length > 0 ? logs : void 0
1415
1457
  });
1416
1458
  }
1417
1459
  }
@@ -1482,6 +1524,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1482
1524
  );
1483
1525
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1484
1526
  const completedRef = yield* effect.Ref.make(0);
1527
+ const startedRef = yield* effect.Ref.make(0);
1485
1528
  const passedRef = yield* effect.Ref.make(0);
1486
1529
  const failedRef = yield* effect.Ref.make(0);
1487
1530
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -1491,6 +1534,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1491
1534
  publishEvent,
1492
1535
  persistenceQueue,
1493
1536
  updateSnapshot,
1537
+ startedRef,
1494
1538
  completedRef,
1495
1539
  passedRef,
1496
1540
  failedRef