@m4trix/evals 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -860,6 +860,7 @@ var Metric = {
860
860
  const def = {
861
861
  id: config.id,
862
862
  name: config.name,
863
+ aggregate: config.aggregate,
863
864
  format: config.format,
864
865
  make: (data) => ({ id: config.id, data })
865
866
  };
@@ -879,6 +880,7 @@ var Score = {
879
880
  id: config.id,
880
881
  name: config.name,
881
882
  displayStrategy: config.displayStrategy,
883
+ aggregate: config.aggregate,
882
884
  format: config.format,
883
885
  make: (data, options) => {
884
886
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
@@ -897,23 +899,62 @@ function getScoreById(id) {
897
899
  return registry2.get(id);
898
900
  }
899
901
 
902
+ // src/evals/aggregators.ts
903
+ function aggregateAverage(values) {
904
+ if (values.length === 0) {
905
+ return { value: 0 };
906
+ }
907
+ const sum = values.reduce((s, v) => s + v.value, 0);
908
+ return { value: sum / values.length };
909
+ }
910
+ function aggregateAll(values) {
911
+ return { passed: values.length > 0 && values.every((v) => v.passed) };
912
+ }
913
+ function aggregateTokenCountSum(values) {
914
+ const initial = {
915
+ input: 0,
916
+ output: 0,
917
+ inputCached: 0,
918
+ outputCached: 0
919
+ };
920
+ return values.reduce(
921
+ (acc, v) => ({
922
+ input: acc.input + (v.input ?? 0),
923
+ output: acc.output + (v.output ?? 0),
924
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
925
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
926
+ }),
927
+ initial
928
+ );
929
+ }
930
+ function aggregateLatencyAverage(values) {
931
+ if (values.length === 0) {
932
+ return { ms: 0 };
933
+ }
934
+ const sum = values.reduce((s, v) => s + v.ms, 0);
935
+ return { ms: sum / values.length };
936
+ }
937
+
900
938
  // src/evals/metrics/standard.ts
901
939
  Metric.of({
902
940
  id: "token-count",
903
941
  name: "Tokens",
904
- format: (data) => {
942
+ aggregate: aggregateTokenCountSum,
943
+ format: (data, options) => {
905
944
  const input = data.input ?? 0;
906
945
  const output = data.output ?? 0;
907
946
  const inputCached = data.inputCached ?? 0;
908
947
  const outputCached = data.outputCached ?? 0;
909
948
  const cached = inputCached + outputCached;
910
- return `in:${input} out:${output} cached:${cached}`;
949
+ const base = `in:${input} out:${output} cached:${cached}`;
950
+ return options?.isAggregated ? `Total: ${base}` : base;
911
951
  }
912
952
  });
913
953
  Metric.of({
914
954
  id: "latency",
915
955
  name: "Latency",
916
- format: (data) => `${data.ms}ms`
956
+ aggregate: aggregateLatencyAverage,
957
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
917
958
  });
918
959
 
919
960
  // src/evals/scores/standard.ts
@@ -921,13 +962,15 @@ Score.of({
921
962
  id: "percent",
922
963
  name: "Score",
923
964
  displayStrategy: "bar",
924
- format: (data) => data.value.toFixed(2)
965
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
966
+ aggregate: aggregateAverage
925
967
  });
926
968
  Score.of({
927
969
  id: "binary",
928
970
  name: "Result",
929
971
  displayStrategy: "passFail",
930
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
972
+ format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
973
+ aggregate: aggregateAll
931
974
  });
932
975
  function createDiffLogEntry(expected, actual, options) {
933
976
  const diff = jsonDiff.diffString(expected, actual, { color: false });
@@ -1009,7 +1052,8 @@ var defaultRunnerConfig = {
1009
1052
  ],
1010
1053
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
1011
1054
  },
1012
- artifactDirectory: ".eval-results"
1055
+ artifactDirectory: ".eval-results",
1056
+ maxConcurrency: 1
1013
1057
  };
1014
1058
  function toRunnerConfigOverrides(config) {
1015
1059
  if (!config) {
@@ -1042,6 +1086,9 @@ function toRunnerConfigOverrides(config) {
1042
1086
  if (config.artifactDirectory !== void 0) {
1043
1087
  overrides.artifactDirectory = config.artifactDirectory;
1044
1088
  }
1089
+ if (config.maxConcurrency !== void 0) {
1090
+ overrides.maxConcurrency = config.maxConcurrency;
1091
+ }
1045
1092
  if (Object.keys(discovery).length > 0) {
1046
1093
  overrides.discovery = discovery;
1047
1094
  }
@@ -1276,6 +1323,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1276
1323
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1277
1324
  );
1278
1325
  }
1326
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1327
+ return effect.Effect.gen(function* () {
1328
+ const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1329
+ const rerunPassed = [];
1330
+ for (let r = 0; r < reruns; r++) {
1331
+ const started = Date.now();
1332
+ const evaluatorScores = [];
1333
+ let testCaseError;
1334
+ const output = readOutput(testCaseItem.testCase);
1335
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
1336
+ const evaluateFn = evaluator.getEvaluateFn();
1337
+ if (!evaluateFn) {
1338
+ continue;
1339
+ }
1340
+ try {
1341
+ const logs = [];
1342
+ const logDiff = (expected, actual, options) => {
1343
+ logs.push(createDiffLogEntry(expected, actual, options));
1344
+ };
1345
+ const ctx = yield* effect.Effect.promise(
1346
+ () => Promise.resolve(evaluator.resolveContext())
1347
+ );
1348
+ const result = yield* effect.Effect.promise(
1349
+ () => Promise.resolve(
1350
+ evaluateFn({
1351
+ input: testCaseItem.testCase.getInput(),
1352
+ ctx,
1353
+ output,
1354
+ logDiff
1355
+ })
1356
+ )
1357
+ );
1358
+ const { scores, metrics } = normalizeResult(result);
1359
+ const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1360
+ evaluatorScores.push({
1361
+ evaluatorId,
1362
+ scores,
1363
+ passed: passed2,
1364
+ metrics,
1365
+ logs: logs.length > 0 ? logs : void 0
1366
+ });
1367
+ } catch (error) {
1368
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1369
+ evaluatorScores.push({
1370
+ evaluatorId,
1371
+ scores: [],
1372
+ passed: false
1373
+ });
1374
+ }
1375
+ }
1376
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1377
+ rerunPassed.push(rerunPassedThis);
1378
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
1379
+ n + 1,
1380
+ n + 1
1381
+ ]);
1382
+ const progressEvent = {
1383
+ type: "TestCaseProgress",
1384
+ runId: task.runId,
1385
+ testCaseId: testCaseItem.id,
1386
+ testCaseName: testCaseItem.testCase.getName(),
1387
+ completedTestCases: completedEvaluations,
1388
+ totalTestCases: totalEvaluations,
1389
+ rerunIndex: r + 1,
1390
+ rerunTotal: reruns,
1391
+ passed: rerunPassedThis,
1392
+ durationMs: Date.now() - started,
1393
+ evaluatorScores,
1394
+ output,
1395
+ errorMessage: testCaseError
1396
+ };
1397
+ updateSnapshot(task.runId, (snapshot) => ({
1398
+ ...snapshot,
1399
+ completedTestCases: completedEvaluations
1400
+ }));
1401
+ yield* publishEvent(progressEvent);
1402
+ yield* effect.Queue.offer(persistenceQueue, {
1403
+ runId: task.runId,
1404
+ artifactPath: task.snapshot.artifactPath,
1405
+ payload: progressEvent
1406
+ });
1407
+ }
1408
+ const testCasePassed = rerunPassed.every(Boolean);
1409
+ if (testCasePassed) {
1410
+ yield* effect.Ref.update(passedRef, (n) => n + 1);
1411
+ } else {
1412
+ yield* effect.Ref.update(failedRef, (n) => n + 1);
1413
+ }
1414
+ const [passed, failed] = yield* effect.Effect.all([
1415
+ effect.Ref.get(passedRef),
1416
+ effect.Ref.get(failedRef)
1417
+ ]);
1418
+ updateSnapshot(task.runId, (snapshot) => ({
1419
+ ...snapshot,
1420
+ passedTestCases: passed,
1421
+ failedTestCases: failed
1422
+ }));
1423
+ });
1424
+ }
1279
1425
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
1280
1426
  const startedAt = Date.now();
1281
1427
  updateSnapshot(task.runId, (snapshot) => ({
@@ -1288,104 +1434,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1288
1434
  runId: task.runId,
1289
1435
  startedAt
1290
1436
  });
1291
- let completedTestCases = 0;
1292
- let passedTestCases = 0;
1293
- let failedTestCases = 0;
1294
- for (const testCaseItem of task.testCases) {
1295
- const started = Date.now();
1296
- const evaluatorScores = [];
1297
- let testCaseError;
1298
- const output = readOutput(testCaseItem.testCase);
1299
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
1300
- const evaluateFn = evaluator.getEvaluateFn();
1301
- if (!evaluateFn) {
1302
- continue;
1303
- }
1304
- try {
1305
- const logs = [];
1306
- const logDiff = (expected, actual, options) => {
1307
- logs.push(createDiffLogEntry(expected, actual, options));
1308
- };
1309
- const ctx = yield* effect.Effect.promise(
1310
- () => Promise.resolve(evaluator.resolveContext())
1311
- );
1312
- const result = yield* effect.Effect.promise(
1313
- () => Promise.resolve(
1314
- evaluateFn({
1315
- input: testCaseItem.testCase.getInput(),
1316
- ctx,
1317
- output,
1318
- logDiff
1319
- })
1320
- )
1321
- );
1322
- const { scores, metrics } = normalizeResult(result);
1323
- const passed = computeEvaluatorPassed(evaluator, result, scores);
1324
- evaluatorScores.push({
1325
- evaluatorId,
1326
- scores,
1327
- passed,
1328
- metrics,
1329
- logs: logs.length > 0 ? logs : void 0
1330
- });
1331
- } catch (error) {
1332
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1333
- evaluatorScores.push({
1334
- evaluatorId,
1335
- scores: [],
1336
- passed: false
1337
- });
1338
- }
1339
- }
1340
- const testCasePassed = evaluatorScores.every((s) => s.passed);
1341
- completedTestCases += 1;
1342
- if (testCasePassed) {
1343
- passedTestCases += 1;
1344
- } else {
1345
- failedTestCases += 1;
1346
- }
1347
- const progressEvent = {
1348
- type: "TestCaseProgress",
1349
- runId: task.runId,
1350
- testCaseId: testCaseItem.id,
1351
- testCaseName: testCaseItem.testCase.getName(),
1352
- completedTestCases,
1353
- totalTestCases: task.testCases.length,
1354
- passed: testCasePassed,
1355
- durationMs: Date.now() - started,
1356
- evaluatorScores,
1357
- output,
1358
- errorMessage: testCaseError
1359
- };
1360
- updateSnapshot(task.runId, (snapshot) => ({
1361
- ...snapshot,
1362
- completedTestCases,
1363
- passedTestCases,
1364
- failedTestCases
1365
- }));
1366
- yield* publishEvent(progressEvent);
1367
- yield* effect.Queue.offer(persistenceQueue, {
1368
- runId: task.runId,
1369
- artifactPath: task.snapshot.artifactPath,
1370
- payload: progressEvent
1371
- });
1372
- }
1437
+ const totalEvaluations = task.testCases.reduce(
1438
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1439
+ 0
1440
+ );
1441
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1442
+ const completedRef = yield* effect.Ref.make(0);
1443
+ const passedRef = yield* effect.Ref.make(0);
1444
+ const failedRef = yield* effect.Ref.make(0);
1445
+ const processTestCase = (testCaseItem) => processOneTestCase(
1446
+ task,
1447
+ testCaseItem,
1448
+ totalEvaluations,
1449
+ publishEvent,
1450
+ persistenceQueue,
1451
+ updateSnapshot,
1452
+ completedRef,
1453
+ passedRef,
1454
+ failedRef
1455
+ );
1456
+ yield* effect.Effect.forEach(
1457
+ task.testCases,
1458
+ processTestCase,
1459
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1460
+ );
1461
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
1462
+ effect.Ref.get(completedRef),
1463
+ effect.Ref.get(passedRef),
1464
+ effect.Ref.get(failedRef)
1465
+ ]);
1373
1466
  const finishedAt = Date.now();
1374
1467
  const completedEvent = {
1375
1468
  type: "RunCompleted",
1376
1469
  runId: task.runId,
1377
1470
  finishedAt,
1378
- passedTestCases,
1379
- failedTestCases,
1471
+ passedTestCases: passedUniqueTestCases,
1472
+ failedTestCases: failedUniqueTestCases,
1380
1473
  totalTestCases: task.testCases.length,
1381
1474
  artifactPath: task.snapshot.artifactPath
1382
1475
  };
1383
1476
  updateSnapshot(task.runId, (snapshot) => ({
1384
1477
  ...snapshot,
1385
1478
  status: "completed",
1386
- completedTestCases,
1387
- passedTestCases,
1388
- failedTestCases,
1479
+ completedTestCases: completedEvaluations,
1480
+ passedTestCases: passedUniqueTestCases,
1481
+ failedTestCases: failedUniqueTestCases,
1389
1482
  finishedAt
1390
1483
  }));
1391
1484
  yield* publishEvent(completedEvent);
@@ -1473,7 +1566,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
1473
1566
  const artifactPath = filePath;
1474
1567
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1475
1568
  const progress = aggregateTestCaseProgress(lines);
1476
- const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
1569
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1477
1570
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1478
1571
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1479
1572
  return {
@@ -1495,23 +1588,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
1495
1588
  }
1496
1589
  function aggregateTestCaseProgress(lines) {
1497
1590
  let completedTestCases = 0;
1498
- let passedTestCases = 0;
1499
- let failedTestCases = 0;
1591
+ const testCasePassedBy = /* @__PURE__ */ new Map();
1500
1592
  for (const line of lines) {
1501
1593
  try {
1502
1594
  const event = JSON.parse(line);
1503
1595
  if (event.type === "TestCaseProgress") {
1504
1596
  const ev = event;
1505
1597
  completedTestCases = ev.completedTestCases ?? completedTestCases;
1506
- if (ev.passed) {
1507
- passedTestCases += 1;
1508
- } else {
1509
- failedTestCases += 1;
1510
- }
1598
+ const id = ev.testCaseId;
1599
+ const current = testCasePassedBy.get(id);
1600
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1511
1601
  }
1512
1602
  } catch {
1513
1603
  }
1514
1604
  }
1605
+ let passedTestCases = 0;
1606
+ let failedTestCases = 0;
1607
+ for (const passed of testCasePassedBy.values()) {
1608
+ if (passed) {
1609
+ passedTestCases += 1;
1610
+ } else {
1611
+ failedTestCases += 1;
1612
+ }
1613
+ }
1515
1614
  return { completedTestCases, passedTestCases, failedTestCases };
1516
1615
  }
1517
1616
  async function parseArtifactFile(artifactPath) {
@@ -1529,6 +1628,8 @@ async function parseArtifactFile(artifactPath) {
1529
1628
  testCaseName: ev.testCaseName,
1530
1629
  completedTestCases: ev.completedTestCases,
1531
1630
  totalTestCases: ev.totalTestCases,
1631
+ rerunIndex: ev.rerunIndex,
1632
+ rerunTotal: ev.rerunTotal,
1532
1633
  passed: ev.passed,
1533
1634
  durationMs: ev.durationMs,
1534
1635
  evaluatorScores: ev.evaluatorScores ?? []
@@ -1734,6 +1835,10 @@ var EffectRunner = class {
1734
1835
  throw new Error("No evaluators selected for run");
1735
1836
  }
1736
1837
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1838
+ const totalEvaluations = selectedTestCases.reduce(
1839
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1840
+ 0
1841
+ );
1737
1842
  const runId = `run-${crypto.randomUUID()}`;
1738
1843
  const artifactPath = createArtifactPath(
1739
1844
  this.config.artifactDirectory,
@@ -1746,7 +1851,7 @@ var EffectRunner = class {
1746
1851
  datasetName: dataset.dataset.getName(),
1747
1852
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1748
1853
  queuedAt: Date.now(),
1749
- totalTestCases: selectedTestCases.length,
1854
+ totalTestCases: totalEvaluations,
1750
1855
  completedTestCases: 0,
1751
1856
  passedTestCases: 0,
1752
1857
  failedTestCases: 0,
@@ -1760,7 +1865,7 @@ var EffectRunner = class {
1760
1865
  datasetId: request.datasetId,
1761
1866
  datasetName: dataset.dataset.getName(),
1762
1867
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1763
- totalTestCases: selectedTestCases.length,
1868
+ totalTestCases: totalEvaluations,
1764
1869
  artifactPath
1765
1870
  };
1766
1871
  await effect.Effect.runPromise(this.publishEvent(queuedEvent));
@@ -1771,6 +1876,7 @@ var EffectRunner = class {
1771
1876
  payload: queuedEvent
1772
1877
  })
1773
1878
  );
1879
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1774
1880
  await effect.Effect.runPromise(
1775
1881
  effect.Queue.offer(this.runQueue, {
1776
1882
  runId,
@@ -1778,7 +1884,8 @@ var EffectRunner = class {
1778
1884
  dataset: dataset.dataset,
1779
1885
  evaluators: selectedEvaluators,
1780
1886
  testCases: selectedTestCases,
1781
- snapshot
1887
+ snapshot,
1888
+ maxConcurrency
1782
1889
  })
1783
1890
  );
1784
1891
  return snapshot;
@@ -1975,6 +2082,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1975
2082
  rows.push(/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp6"));
1976
2083
  rows.push(/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
1977
2084
  for (const tc of testCases) {
2085
+ const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
1978
2086
  rows.push(
1979
2087
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1980
2088
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
@@ -1986,12 +2094,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1986
2094
  ] }),
1987
2095
  " ",
1988
2096
  tc.testCaseName,
2097
+ rerunPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: rerunPart }) : null,
1989
2098
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1990
2099
  " (",
1991
2100
  tc.durationMs,
1992
2101
  "ms)"
1993
2102
  ] })
1994
- ] }, `tc-${tc.testCaseId}`)
2103
+ ] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
1995
2104
  );
1996
2105
  for (const item of tc.evaluatorScores) {
1997
2106
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;