@m4trix/evals 0.12.0 → 0.13.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -6,7 +6,7 @@ import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
6
6
  import { resolve, relative, join, dirname } from 'path';
7
7
  import { diffString } from 'json-diff';
8
8
  import { randomUUID } from 'crypto';
9
- import { Effect, PubSub, Queue, Fiber } from 'effect';
9
+ import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
10
10
  import { existsSync } from 'fs';
11
11
  import * as jitiModule from 'jiti';
12
12
  import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
@@ -834,6 +834,7 @@ var Metric = {
834
834
  const def = {
835
835
  id: config.id,
836
836
  name: config.name,
837
+ aggregate: config.aggregate,
837
838
  format: config.format,
838
839
  make: (data) => ({ id: config.id, data })
839
840
  };
@@ -853,6 +854,7 @@ var Score = {
853
854
  id: config.id,
854
855
  name: config.name,
855
856
  displayStrategy: config.displayStrategy,
857
+ aggregate: config.aggregate,
856
858
  format: config.format,
857
859
  make: (data, options) => {
858
860
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
@@ -871,23 +873,62 @@ function getScoreById(id) {
871
873
  return registry2.get(id);
872
874
  }
873
875
 
876
+ // src/evals/aggregators.ts
877
+ function aggregateAverage(values) {
878
+ if (values.length === 0) {
879
+ return { value: 0 };
880
+ }
881
+ const sum = values.reduce((s, v) => s + v.value, 0);
882
+ return { value: sum / values.length };
883
+ }
884
+ function aggregateAll(values) {
885
+ return { passed: values.length > 0 && values.every((v) => v.passed) };
886
+ }
887
+ function aggregateTokenCountSum(values) {
888
+ const initial = {
889
+ input: 0,
890
+ output: 0,
891
+ inputCached: 0,
892
+ outputCached: 0
893
+ };
894
+ return values.reduce(
895
+ (acc, v) => ({
896
+ input: acc.input + (v.input ?? 0),
897
+ output: acc.output + (v.output ?? 0),
898
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
899
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
900
+ }),
901
+ initial
902
+ );
903
+ }
904
+ function aggregateLatencyAverage(values) {
905
+ if (values.length === 0) {
906
+ return { ms: 0 };
907
+ }
908
+ const sum = values.reduce((s, v) => s + v.ms, 0);
909
+ return { ms: sum / values.length };
910
+ }
911
+
874
912
  // src/evals/metrics/standard.ts
875
913
  Metric.of({
876
914
  id: "token-count",
877
915
  name: "Tokens",
878
- format: (data) => {
916
+ aggregate: aggregateTokenCountSum,
917
+ format: (data, options) => {
879
918
  const input = data.input ?? 0;
880
919
  const output = data.output ?? 0;
881
920
  const inputCached = data.inputCached ?? 0;
882
921
  const outputCached = data.outputCached ?? 0;
883
922
  const cached = inputCached + outputCached;
884
- return `in:${input} out:${output} cached:${cached}`;
923
+ const base = `in:${input} out:${output} cached:${cached}`;
924
+ return options?.isAggregated ? `Total: ${base}` : base;
885
925
  }
886
926
  });
887
927
  Metric.of({
888
928
  id: "latency",
889
929
  name: "Latency",
890
- format: (data) => `${data.ms}ms`
930
+ aggregate: aggregateLatencyAverage,
931
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
891
932
  });
892
933
 
893
934
  // src/evals/scores/standard.ts
@@ -895,13 +936,15 @@ Score.of({
895
936
  id: "percent",
896
937
  name: "Score",
897
938
  displayStrategy: "bar",
898
- format: (data) => data.value.toFixed(2)
939
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
940
+ aggregate: aggregateAverage
899
941
  });
900
942
  Score.of({
901
943
  id: "binary",
902
944
  name: "Result",
903
945
  displayStrategy: "passFail",
904
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
946
+ format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
947
+ aggregate: aggregateAll
905
948
  });
906
949
  function createDiffLogEntry(expected, actual, options) {
907
950
  const diff = diffString(expected, actual, { color: false });
@@ -983,7 +1026,8 @@ var defaultRunnerConfig = {
983
1026
  ],
984
1027
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
985
1028
  },
986
- artifactDirectory: ".eval-results"
1029
+ artifactDirectory: ".eval-results",
1030
+ maxConcurrency: 1
987
1031
  };
988
1032
  function toRunnerConfigOverrides(config) {
989
1033
  if (!config) {
@@ -1016,6 +1060,9 @@ function toRunnerConfigOverrides(config) {
1016
1060
  if (config.artifactDirectory !== void 0) {
1017
1061
  overrides.artifactDirectory = config.artifactDirectory;
1018
1062
  }
1063
+ if (config.maxConcurrency !== void 0) {
1064
+ overrides.maxConcurrency = config.maxConcurrency;
1065
+ }
1019
1066
  if (Object.keys(discovery).length > 0) {
1020
1067
  overrides.discovery = discovery;
1021
1068
  }
@@ -1250,6 +1297,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1250
1297
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1251
1298
  );
1252
1299
  }
1300
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1301
+ return Effect.gen(function* () {
1302
+ const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1303
+ const rerunPassed = [];
1304
+ for (let r = 0; r < reruns; r++) {
1305
+ const started = Date.now();
1306
+ const evaluatorScores = [];
1307
+ let testCaseError;
1308
+ const output = readOutput(testCaseItem.testCase);
1309
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
1310
+ const evaluateFn = evaluator.getEvaluateFn();
1311
+ if (!evaluateFn) {
1312
+ continue;
1313
+ }
1314
+ try {
1315
+ const logs = [];
1316
+ const logDiff = (expected, actual, options) => {
1317
+ logs.push(createDiffLogEntry(expected, actual, options));
1318
+ };
1319
+ const ctx = yield* Effect.promise(
1320
+ () => Promise.resolve(evaluator.resolveContext())
1321
+ );
1322
+ const result = yield* Effect.promise(
1323
+ () => Promise.resolve(
1324
+ evaluateFn({
1325
+ input: testCaseItem.testCase.getInput(),
1326
+ ctx,
1327
+ output,
1328
+ logDiff
1329
+ })
1330
+ )
1331
+ );
1332
+ const { scores, metrics } = normalizeResult(result);
1333
+ const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1334
+ evaluatorScores.push({
1335
+ evaluatorId,
1336
+ scores,
1337
+ passed: passed2,
1338
+ metrics,
1339
+ logs: logs.length > 0 ? logs : void 0
1340
+ });
1341
+ } catch (error) {
1342
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1343
+ evaluatorScores.push({
1344
+ evaluatorId,
1345
+ scores: [],
1346
+ passed: false
1347
+ });
1348
+ }
1349
+ }
1350
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1351
+ rerunPassed.push(rerunPassedThis);
1352
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1353
+ n + 1,
1354
+ n + 1
1355
+ ]);
1356
+ const progressEvent = {
1357
+ type: "TestCaseProgress",
1358
+ runId: task.runId,
1359
+ testCaseId: testCaseItem.id,
1360
+ testCaseName: testCaseItem.testCase.getName(),
1361
+ completedTestCases: completedEvaluations,
1362
+ totalTestCases: totalEvaluations,
1363
+ rerunIndex: r + 1,
1364
+ rerunTotal: reruns,
1365
+ passed: rerunPassedThis,
1366
+ durationMs: Date.now() - started,
1367
+ evaluatorScores,
1368
+ output,
1369
+ errorMessage: testCaseError
1370
+ };
1371
+ updateSnapshot(task.runId, (snapshot) => ({
1372
+ ...snapshot,
1373
+ completedTestCases: completedEvaluations
1374
+ }));
1375
+ yield* publishEvent(progressEvent);
1376
+ yield* Queue.offer(persistenceQueue, {
1377
+ runId: task.runId,
1378
+ artifactPath: task.snapshot.artifactPath,
1379
+ payload: progressEvent
1380
+ });
1381
+ }
1382
+ const testCasePassed = rerunPassed.every(Boolean);
1383
+ if (testCasePassed) {
1384
+ yield* Ref.update(passedRef, (n) => n + 1);
1385
+ } else {
1386
+ yield* Ref.update(failedRef, (n) => n + 1);
1387
+ }
1388
+ const [passed, failed] = yield* Effect.all([
1389
+ Ref.get(passedRef),
1390
+ Ref.get(failedRef)
1391
+ ]);
1392
+ updateSnapshot(task.runId, (snapshot) => ({
1393
+ ...snapshot,
1394
+ passedTestCases: passed,
1395
+ failedTestCases: failed
1396
+ }));
1397
+ });
1398
+ }
1253
1399
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
1254
1400
  const startedAt = Date.now();
1255
1401
  updateSnapshot(task.runId, (snapshot) => ({
@@ -1262,104 +1408,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1262
1408
  runId: task.runId,
1263
1409
  startedAt
1264
1410
  });
1265
- let completedTestCases = 0;
1266
- let passedTestCases = 0;
1267
- let failedTestCases = 0;
1268
- for (const testCaseItem of task.testCases) {
1269
- const started = Date.now();
1270
- const evaluatorScores = [];
1271
- let testCaseError;
1272
- const output = readOutput(testCaseItem.testCase);
1273
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
1274
- const evaluateFn = evaluator.getEvaluateFn();
1275
- if (!evaluateFn) {
1276
- continue;
1277
- }
1278
- try {
1279
- const logs = [];
1280
- const logDiff = (expected, actual, options) => {
1281
- logs.push(createDiffLogEntry(expected, actual, options));
1282
- };
1283
- const ctx = yield* Effect.promise(
1284
- () => Promise.resolve(evaluator.resolveContext())
1285
- );
1286
- const result = yield* Effect.promise(
1287
- () => Promise.resolve(
1288
- evaluateFn({
1289
- input: testCaseItem.testCase.getInput(),
1290
- ctx,
1291
- output,
1292
- logDiff
1293
- })
1294
- )
1295
- );
1296
- const { scores, metrics } = normalizeResult(result);
1297
- const passed = computeEvaluatorPassed(evaluator, result, scores);
1298
- evaluatorScores.push({
1299
- evaluatorId,
1300
- scores,
1301
- passed,
1302
- metrics,
1303
- logs: logs.length > 0 ? logs : void 0
1304
- });
1305
- } catch (error) {
1306
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1307
- evaluatorScores.push({
1308
- evaluatorId,
1309
- scores: [],
1310
- passed: false
1311
- });
1312
- }
1313
- }
1314
- const testCasePassed = evaluatorScores.every((s) => s.passed);
1315
- completedTestCases += 1;
1316
- if (testCasePassed) {
1317
- passedTestCases += 1;
1318
- } else {
1319
- failedTestCases += 1;
1320
- }
1321
- const progressEvent = {
1322
- type: "TestCaseProgress",
1323
- runId: task.runId,
1324
- testCaseId: testCaseItem.id,
1325
- testCaseName: testCaseItem.testCase.getName(),
1326
- completedTestCases,
1327
- totalTestCases: task.testCases.length,
1328
- passed: testCasePassed,
1329
- durationMs: Date.now() - started,
1330
- evaluatorScores,
1331
- output,
1332
- errorMessage: testCaseError
1333
- };
1334
- updateSnapshot(task.runId, (snapshot) => ({
1335
- ...snapshot,
1336
- completedTestCases,
1337
- passedTestCases,
1338
- failedTestCases
1339
- }));
1340
- yield* publishEvent(progressEvent);
1341
- yield* Queue.offer(persistenceQueue, {
1342
- runId: task.runId,
1343
- artifactPath: task.snapshot.artifactPath,
1344
- payload: progressEvent
1345
- });
1346
- }
1411
+ const totalEvaluations = task.testCases.reduce(
1412
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1413
+ 0
1414
+ );
1415
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1416
+ const completedRef = yield* Ref.make(0);
1417
+ const passedRef = yield* Ref.make(0);
1418
+ const failedRef = yield* Ref.make(0);
1419
+ const processTestCase = (testCaseItem) => processOneTestCase(
1420
+ task,
1421
+ testCaseItem,
1422
+ totalEvaluations,
1423
+ publishEvent,
1424
+ persistenceQueue,
1425
+ updateSnapshot,
1426
+ completedRef,
1427
+ passedRef,
1428
+ failedRef
1429
+ );
1430
+ yield* Effect.forEach(
1431
+ task.testCases,
1432
+ processTestCase,
1433
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1434
+ );
1435
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
1436
+ Ref.get(completedRef),
1437
+ Ref.get(passedRef),
1438
+ Ref.get(failedRef)
1439
+ ]);
1347
1440
  const finishedAt = Date.now();
1348
1441
  const completedEvent = {
1349
1442
  type: "RunCompleted",
1350
1443
  runId: task.runId,
1351
1444
  finishedAt,
1352
- passedTestCases,
1353
- failedTestCases,
1445
+ passedTestCases: passedUniqueTestCases,
1446
+ failedTestCases: failedUniqueTestCases,
1354
1447
  totalTestCases: task.testCases.length,
1355
1448
  artifactPath: task.snapshot.artifactPath
1356
1449
  };
1357
1450
  updateSnapshot(task.runId, (snapshot) => ({
1358
1451
  ...snapshot,
1359
1452
  status: "completed",
1360
- completedTestCases,
1361
- passedTestCases,
1362
- failedTestCases,
1453
+ completedTestCases: completedEvaluations,
1454
+ passedTestCases: passedUniqueTestCases,
1455
+ failedTestCases: failedUniqueTestCases,
1363
1456
  finishedAt
1364
1457
  }));
1365
1458
  yield* publishEvent(completedEvent);
@@ -1447,7 +1540,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
1447
1540
  const artifactPath = filePath;
1448
1541
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1449
1542
  const progress = aggregateTestCaseProgress(lines);
1450
- const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
1543
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1451
1544
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1452
1545
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1453
1546
  return {
@@ -1469,23 +1562,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
1469
1562
  }
1470
1563
  function aggregateTestCaseProgress(lines) {
1471
1564
  let completedTestCases = 0;
1472
- let passedTestCases = 0;
1473
- let failedTestCases = 0;
1565
+ const testCasePassedBy = /* @__PURE__ */ new Map();
1474
1566
  for (const line of lines) {
1475
1567
  try {
1476
1568
  const event = JSON.parse(line);
1477
1569
  if (event.type === "TestCaseProgress") {
1478
1570
  const ev = event;
1479
1571
  completedTestCases = ev.completedTestCases ?? completedTestCases;
1480
- if (ev.passed) {
1481
- passedTestCases += 1;
1482
- } else {
1483
- failedTestCases += 1;
1484
- }
1572
+ const id = ev.testCaseId;
1573
+ const current = testCasePassedBy.get(id);
1574
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1485
1575
  }
1486
1576
  } catch {
1487
1577
  }
1488
1578
  }
1579
+ let passedTestCases = 0;
1580
+ let failedTestCases = 0;
1581
+ for (const passed of testCasePassedBy.values()) {
1582
+ if (passed) {
1583
+ passedTestCases += 1;
1584
+ } else {
1585
+ failedTestCases += 1;
1586
+ }
1587
+ }
1489
1588
  return { completedTestCases, passedTestCases, failedTestCases };
1490
1589
  }
1491
1590
  async function parseArtifactFile(artifactPath) {
@@ -1503,6 +1602,8 @@ async function parseArtifactFile(artifactPath) {
1503
1602
  testCaseName: ev.testCaseName,
1504
1603
  completedTestCases: ev.completedTestCases,
1505
1604
  totalTestCases: ev.totalTestCases,
1605
+ rerunIndex: ev.rerunIndex,
1606
+ rerunTotal: ev.rerunTotal,
1506
1607
  passed: ev.passed,
1507
1608
  durationMs: ev.durationMs,
1508
1609
  evaluatorScores: ev.evaluatorScores ?? []
@@ -1708,6 +1809,10 @@ var EffectRunner = class {
1708
1809
  throw new Error("No evaluators selected for run");
1709
1810
  }
1710
1811
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1812
+ const totalEvaluations = selectedTestCases.reduce(
1813
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1814
+ 0
1815
+ );
1711
1816
  const runId = `run-${randomUUID()}`;
1712
1817
  const artifactPath = createArtifactPath(
1713
1818
  this.config.artifactDirectory,
@@ -1720,7 +1825,7 @@ var EffectRunner = class {
1720
1825
  datasetName: dataset.dataset.getName(),
1721
1826
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1722
1827
  queuedAt: Date.now(),
1723
- totalTestCases: selectedTestCases.length,
1828
+ totalTestCases: totalEvaluations,
1724
1829
  completedTestCases: 0,
1725
1830
  passedTestCases: 0,
1726
1831
  failedTestCases: 0,
@@ -1734,7 +1839,7 @@ var EffectRunner = class {
1734
1839
  datasetId: request.datasetId,
1735
1840
  datasetName: dataset.dataset.getName(),
1736
1841
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1737
- totalTestCases: selectedTestCases.length,
1842
+ totalTestCases: totalEvaluations,
1738
1843
  artifactPath
1739
1844
  };
1740
1845
  await Effect.runPromise(this.publishEvent(queuedEvent));
@@ -1745,6 +1850,7 @@ var EffectRunner = class {
1745
1850
  payload: queuedEvent
1746
1851
  })
1747
1852
  );
1853
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1748
1854
  await Effect.runPromise(
1749
1855
  Queue.offer(this.runQueue, {
1750
1856
  runId,
@@ -1752,7 +1858,8 @@ var EffectRunner = class {
1752
1858
  dataset: dataset.dataset,
1753
1859
  evaluators: selectedEvaluators,
1754
1860
  testCases: selectedTestCases,
1755
- snapshot
1861
+ snapshot,
1862
+ maxConcurrency
1756
1863
  })
1757
1864
  );
1758
1865
  return snapshot;
@@ -1949,6 +2056,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1949
2056
  rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp6"));
1950
2057
  rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
1951
2058
  for (const tc of testCases) {
2059
+ const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
1952
2060
  rows.push(
1953
2061
  /* @__PURE__ */ jsxs(Text, { children: [
1954
2062
  /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
@@ -1960,12 +2068,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1960
2068
  ] }),
1961
2069
  " ",
1962
2070
  tc.testCaseName,
2071
+ rerunPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: rerunPart }) : null,
1963
2072
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1964
2073
  " (",
1965
2074
  tc.durationMs,
1966
2075
  "ms)"
1967
2076
  ] })
1968
- ] }, `tc-${tc.testCaseId}`)
2077
+ ] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
1969
2078
  );
1970
2079
  for (const item of tc.evaluatorScores) {
1971
2080
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;