@m4trix/evals 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1,7 +1,7 @@
1
1
  'use strict';
2
2
 
3
3
  var effect = require('effect');
4
- var jsonDiff = require('json-diff');
4
+ var diff = require('diff');
5
5
  var crypto = require('crypto');
6
6
  var fs = require('fs');
7
7
  var path = require('path');
@@ -331,15 +331,23 @@ var TestCase = class _TestCase {
331
331
  this._config = config;
332
332
  }
333
333
  static describe(config) {
334
+ const reruns = config.reruns ?? 1;
335
+ if (reruns < 1 || !Number.isInteger(reruns)) {
336
+ throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
337
+ }
334
338
  return new _TestCase({
335
339
  name: config.name,
336
340
  tags: config.tags,
341
+ reruns,
337
342
  inputSchema: config.inputSchema,
338
343
  input: config.input,
339
344
  outputSchema: config.outputSchema,
340
345
  output: config.output
341
346
  });
342
347
  }
348
+ getReruns() {
349
+ return this._config.reruns;
350
+ }
343
351
  getName() {
344
352
  return this._config.name;
345
353
  }
@@ -513,6 +521,7 @@ var Metric = {
513
521
  const def = {
514
522
  id: config.id,
515
523
  name: config.name,
524
+ aggregate: config.aggregate,
516
525
  format: config.format,
517
526
  make: (data) => ({ id: config.id, data })
518
527
  };
@@ -532,6 +541,7 @@ var Score = {
532
541
  id: config.id,
533
542
  name: config.name,
534
543
  displayStrategy: config.displayStrategy,
544
+ aggregate: config.aggregate,
535
545
  format: config.format,
536
546
  make: (data, options) => {
537
547
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
@@ -550,23 +560,75 @@ function getScoreById(id) {
550
560
  return registry2.get(id);
551
561
  }
552
562
 
563
+ // src/evals/aggregators.ts
564
+ function aggregateAverageWithVariance(values) {
565
+ if (values.length === 0) {
566
+ return { value: 0, count: 0 };
567
+ }
568
+ const sum = values.reduce((s, v) => s + v.value, 0);
569
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
570
+ const mean = sum / values.length;
571
+ let stdDev;
572
+ if (values.length >= 2) {
573
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
574
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
575
+ }
576
+ return { value: mean, stdDev, count: values.length };
577
+ }
578
+ function aggregateAll(values) {
579
+ const total = values.length;
580
+ const passedCount = values.filter((v) => v.passed).length;
581
+ return {
582
+ passed: total > 0 && values.every((v) => v.passed),
583
+ passedCount,
584
+ totalCount: total
585
+ };
586
+ }
587
+ function aggregateTokenCountSum(values) {
588
+ const initial = {
589
+ input: 0,
590
+ output: 0,
591
+ inputCached: 0,
592
+ outputCached: 0
593
+ };
594
+ return values.reduce(
595
+ (acc, v) => ({
596
+ input: acc.input + (v.input ?? 0),
597
+ output: acc.output + (v.output ?? 0),
598
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
599
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
600
+ }),
601
+ initial
602
+ );
603
+ }
604
+ function aggregateLatencyAverage(values) {
605
+ if (values.length === 0) {
606
+ return { ms: 0 };
607
+ }
608
+ const sum = values.reduce((s, v) => s + v.ms, 0);
609
+ return { ms: sum / values.length };
610
+ }
611
+
553
612
  // src/evals/metrics/standard.ts
554
613
  var tokenCountMetric = Metric.of({
555
614
  id: "token-count",
556
615
  name: "Tokens",
557
- format: (data) => {
616
+ aggregate: aggregateTokenCountSum,
617
+ format: (data, options) => {
558
618
  const input = data.input ?? 0;
559
619
  const output = data.output ?? 0;
560
620
  const inputCached = data.inputCached ?? 0;
561
621
  const outputCached = data.outputCached ?? 0;
562
622
  const cached = inputCached + outputCached;
563
- return `in:${input} out:${output} cached:${cached}`;
623
+ const base = `in:${input} out:${output} cached:${cached}`;
624
+ return options?.isAggregated ? `Total: ${base}` : base;
564
625
  }
565
626
  });
566
627
  var latencyMetric = Metric.of({
567
628
  id: "latency",
568
629
  name: "Latency",
569
- format: (data) => `${data.ms}ms`
630
+ aggregate: aggregateLatencyAverage,
631
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
570
632
  });
571
633
 
572
634
  // src/evals/scores/standard.ts
@@ -574,16 +636,59 @@ var percentScore = Score.of({
574
636
  id: "percent",
575
637
  name: "Score",
576
638
  displayStrategy: "bar",
577
- format: (data) => data.value.toFixed(2)
639
+ format: (data, options) => {
640
+ if (options?.isAggregated) {
641
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
642
+ }
643
+ return data.value.toFixed(2);
644
+ },
645
+ aggregate: aggregateAverageWithVariance
578
646
  });
579
647
  var binaryScore = Score.of({
580
648
  id: "binary",
581
649
  name: "Result",
582
650
  displayStrategy: "passFail",
583
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
651
+ format: (data, options) => {
652
+ if (options?.isAggregated) {
653
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
654
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
655
+ return `${base} (${data.passedCount}/${data.totalCount})`;
656
+ }
657
+ return base;
658
+ }
659
+ return data.passed ? "PASSED" : "NOT PASSED";
660
+ },
661
+ aggregate: aggregateAll
584
662
  });
663
+ function toJsonLines(value) {
664
+ try {
665
+ return JSON.stringify(value, null, 2);
666
+ } catch {
667
+ return String(value);
668
+ }
669
+ }
670
+ function formatDiffString(changes) {
671
+ const lines = [];
672
+ for (const part of changes) {
673
+ const prefix = part.added ? "+" : part.removed ? "-" : " ";
674
+ const partLines = part.value.split("\n");
675
+ if (partLines[partLines.length - 1] === "") {
676
+ partLines.pop();
677
+ }
678
+ for (const line of partLines) {
679
+ lines.push(`${prefix} ${line}`);
680
+ }
681
+ }
682
+ return lines.join("\n");
683
+ }
684
+ function createDiffString(expected, actual) {
685
+ const expectedStr = toJsonLines(expected);
686
+ const actualStr = toJsonLines(actual);
687
+ const changes = diff.diffLines(expectedStr, actualStr);
688
+ return formatDiffString(changes);
689
+ }
585
690
  function createDiffLogEntry(expected, actual, options) {
586
- const diff = jsonDiff.diffString(expected, actual, { color: false });
691
+ const diff = createDiffString(expected, actual);
587
692
  return {
588
693
  type: "diff",
589
694
  label: options?.label,
@@ -593,8 +698,22 @@ function createDiffLogEntry(expected, actual, options) {
593
698
  };
594
699
  }
595
700
  function printJsonDiff(expected, actual, options = {}) {
596
- const { color = true } = options;
597
- const diff = jsonDiff.diffString(expected, actual, { color });
701
+ const diff = createDiffString(expected, actual);
702
+ if (options.color) {
703
+ const lines = diff.split("\n").map((line) => {
704
+ const trimmed = line.trimStart();
705
+ if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
706
+ return `\x1B[31m${line}\x1B[0m`;
707
+ }
708
+ if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
709
+ return `\x1B[32m${line}\x1B[0m`;
710
+ }
711
+ return line;
712
+ });
713
+ const colored = lines.join("\n");
714
+ console.log(colored || "(no differences)");
715
+ return colored;
716
+ }
598
717
  console.log(diff || "(no differences)");
599
718
  return diff;
600
719
  }
@@ -621,7 +740,8 @@ var defaultRunnerConfig = {
621
740
  ],
622
741
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
623
742
  },
624
- artifactDirectory: ".eval-results"
743
+ artifactDirectory: ".eval-results",
744
+ maxConcurrency: 1
625
745
  };
626
746
  function toRunnerConfigOverrides(config) {
627
747
  if (!config) {
@@ -654,6 +774,9 @@ function toRunnerConfigOverrides(config) {
654
774
  if (config.artifactDirectory !== void 0) {
655
775
  overrides.artifactDirectory = config.artifactDirectory;
656
776
  }
777
+ if (config.maxConcurrency !== void 0) {
778
+ overrides.maxConcurrency = config.maxConcurrency;
779
+ }
657
780
  if (Object.keys(discovery).length > 0) {
658
781
  overrides.discovery = discovery;
659
782
  }
@@ -927,6 +1050,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
927
1050
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
928
1051
  );
929
1052
  }
1053
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1054
+ return effect.Effect.gen(function* () {
1055
+ const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1056
+ const rerunPassed = [];
1057
+ for (let r = 0; r < reruns; r++) {
1058
+ const started = Date.now();
1059
+ const evaluatorScores = [];
1060
+ let testCaseError;
1061
+ const output = readOutput(testCaseItem.testCase);
1062
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
1063
+ const evaluateFn = evaluator.getEvaluateFn();
1064
+ if (!evaluateFn) {
1065
+ continue;
1066
+ }
1067
+ try {
1068
+ const logs = [];
1069
+ const logDiff = (expected, actual, options) => {
1070
+ logs.push(createDiffLogEntry(expected, actual, options));
1071
+ };
1072
+ const ctx = yield* effect.Effect.promise(
1073
+ () => Promise.resolve(evaluator.resolveContext())
1074
+ );
1075
+ const result = yield* effect.Effect.promise(
1076
+ () => Promise.resolve(
1077
+ evaluateFn({
1078
+ input: testCaseItem.testCase.getInput(),
1079
+ ctx,
1080
+ output,
1081
+ logDiff
1082
+ })
1083
+ )
1084
+ );
1085
+ const { scores, metrics } = normalizeResult(result);
1086
+ const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1087
+ evaluatorScores.push({
1088
+ evaluatorId,
1089
+ scores,
1090
+ passed: passed2,
1091
+ metrics,
1092
+ logs: logs.length > 0 ? logs : void 0
1093
+ });
1094
+ } catch (error) {
1095
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1096
+ evaluatorScores.push({
1097
+ evaluatorId,
1098
+ scores: [],
1099
+ passed: false
1100
+ });
1101
+ }
1102
+ }
1103
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1104
+ rerunPassed.push(rerunPassedThis);
1105
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
1106
+ n + 1,
1107
+ n + 1
1108
+ ]);
1109
+ const progressEvent = {
1110
+ type: "TestCaseProgress",
1111
+ runId: task.runId,
1112
+ testCaseId: testCaseItem.id,
1113
+ testCaseName: testCaseItem.testCase.getName(),
1114
+ completedTestCases: completedEvaluations,
1115
+ totalTestCases: totalEvaluations,
1116
+ rerunIndex: r + 1,
1117
+ rerunTotal: reruns,
1118
+ passed: rerunPassedThis,
1119
+ durationMs: Date.now() - started,
1120
+ evaluatorScores,
1121
+ output,
1122
+ errorMessage: testCaseError
1123
+ };
1124
+ updateSnapshot(task.runId, (snapshot) => ({
1125
+ ...snapshot,
1126
+ completedTestCases: completedEvaluations
1127
+ }));
1128
+ yield* publishEvent(progressEvent);
1129
+ yield* effect.Queue.offer(persistenceQueue, {
1130
+ runId: task.runId,
1131
+ artifactPath: task.snapshot.artifactPath,
1132
+ payload: progressEvent
1133
+ });
1134
+ }
1135
+ const testCasePassed = rerunPassed.every(Boolean);
1136
+ if (testCasePassed) {
1137
+ yield* effect.Ref.update(passedRef, (n) => n + 1);
1138
+ } else {
1139
+ yield* effect.Ref.update(failedRef, (n) => n + 1);
1140
+ }
1141
+ const [passed, failed] = yield* effect.Effect.all([
1142
+ effect.Ref.get(passedRef),
1143
+ effect.Ref.get(failedRef)
1144
+ ]);
1145
+ updateSnapshot(task.runId, (snapshot) => ({
1146
+ ...snapshot,
1147
+ passedTestCases: passed,
1148
+ failedTestCases: failed
1149
+ }));
1150
+ });
1151
+ }
930
1152
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
931
1153
  const startedAt = Date.now();
932
1154
  updateSnapshot(task.runId, (snapshot) => ({
@@ -939,104 +1161,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
939
1161
  runId: task.runId,
940
1162
  startedAt
941
1163
  });
942
- let completedTestCases = 0;
943
- let passedTestCases = 0;
944
- let failedTestCases = 0;
945
- for (const testCaseItem of task.testCases) {
946
- const started = Date.now();
947
- const evaluatorScores = [];
948
- let testCaseError;
949
- const output = readOutput(testCaseItem.testCase);
950
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
951
- const evaluateFn = evaluator.getEvaluateFn();
952
- if (!evaluateFn) {
953
- continue;
954
- }
955
- try {
956
- const logs = [];
957
- const logDiff = (expected, actual, options) => {
958
- logs.push(createDiffLogEntry(expected, actual, options));
959
- };
960
- const ctx = yield* effect.Effect.promise(
961
- () => Promise.resolve(evaluator.resolveContext())
962
- );
963
- const result = yield* effect.Effect.promise(
964
- () => Promise.resolve(
965
- evaluateFn({
966
- input: testCaseItem.testCase.getInput(),
967
- ctx,
968
- output,
969
- logDiff
970
- })
971
- )
972
- );
973
- const { scores, metrics } = normalizeResult(result);
974
- const passed = computeEvaluatorPassed(evaluator, result, scores);
975
- evaluatorScores.push({
976
- evaluatorId,
977
- scores,
978
- passed,
979
- metrics,
980
- logs: logs.length > 0 ? logs : void 0
981
- });
982
- } catch (error) {
983
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
984
- evaluatorScores.push({
985
- evaluatorId,
986
- scores: [],
987
- passed: false
988
- });
989
- }
990
- }
991
- const testCasePassed = evaluatorScores.every((s) => s.passed);
992
- completedTestCases += 1;
993
- if (testCasePassed) {
994
- passedTestCases += 1;
995
- } else {
996
- failedTestCases += 1;
997
- }
998
- const progressEvent = {
999
- type: "TestCaseProgress",
1000
- runId: task.runId,
1001
- testCaseId: testCaseItem.id,
1002
- testCaseName: testCaseItem.testCase.getName(),
1003
- completedTestCases,
1004
- totalTestCases: task.testCases.length,
1005
- passed: testCasePassed,
1006
- durationMs: Date.now() - started,
1007
- evaluatorScores,
1008
- output,
1009
- errorMessage: testCaseError
1010
- };
1011
- updateSnapshot(task.runId, (snapshot) => ({
1012
- ...snapshot,
1013
- completedTestCases,
1014
- passedTestCases,
1015
- failedTestCases
1016
- }));
1017
- yield* publishEvent(progressEvent);
1018
- yield* effect.Queue.offer(persistenceQueue, {
1019
- runId: task.runId,
1020
- artifactPath: task.snapshot.artifactPath,
1021
- payload: progressEvent
1022
- });
1023
- }
1164
+ const totalEvaluations = task.testCases.reduce(
1165
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1166
+ 0
1167
+ );
1168
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1169
+ const completedRef = yield* effect.Ref.make(0);
1170
+ const passedRef = yield* effect.Ref.make(0);
1171
+ const failedRef = yield* effect.Ref.make(0);
1172
+ const processTestCase = (testCaseItem) => processOneTestCase(
1173
+ task,
1174
+ testCaseItem,
1175
+ totalEvaluations,
1176
+ publishEvent,
1177
+ persistenceQueue,
1178
+ updateSnapshot,
1179
+ completedRef,
1180
+ passedRef,
1181
+ failedRef
1182
+ );
1183
+ yield* effect.Effect.forEach(
1184
+ task.testCases,
1185
+ processTestCase,
1186
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1187
+ );
1188
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
1189
+ effect.Ref.get(completedRef),
1190
+ effect.Ref.get(passedRef),
1191
+ effect.Ref.get(failedRef)
1192
+ ]);
1024
1193
  const finishedAt = Date.now();
1025
1194
  const completedEvent = {
1026
1195
  type: "RunCompleted",
1027
1196
  runId: task.runId,
1028
1197
  finishedAt,
1029
- passedTestCases,
1030
- failedTestCases,
1198
+ passedTestCases: passedUniqueTestCases,
1199
+ failedTestCases: failedUniqueTestCases,
1031
1200
  totalTestCases: task.testCases.length,
1032
1201
  artifactPath: task.snapshot.artifactPath
1033
1202
  };
1034
1203
  updateSnapshot(task.runId, (snapshot) => ({
1035
1204
  ...snapshot,
1036
1205
  status: "completed",
1037
- completedTestCases,
1038
- passedTestCases,
1039
- failedTestCases,
1206
+ completedTestCases: completedEvaluations,
1207
+ passedTestCases: passedUniqueTestCases,
1208
+ failedTestCases: failedUniqueTestCases,
1040
1209
  finishedAt
1041
1210
  }));
1042
1211
  yield* publishEvent(completedEvent);
@@ -1124,7 +1293,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
1124
1293
  const artifactPath = filePath;
1125
1294
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1126
1295
  const progress = aggregateTestCaseProgress(lines);
1127
- const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
1296
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1128
1297
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1129
1298
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1130
1299
  return {
@@ -1146,23 +1315,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
1146
1315
  }
1147
1316
  function aggregateTestCaseProgress(lines) {
1148
1317
  let completedTestCases = 0;
1149
- let passedTestCases = 0;
1150
- let failedTestCases = 0;
1318
+ const testCasePassedBy = /* @__PURE__ */ new Map();
1151
1319
  for (const line of lines) {
1152
1320
  try {
1153
1321
  const event = JSON.parse(line);
1154
1322
  if (event.type === "TestCaseProgress") {
1155
1323
  const ev = event;
1156
1324
  completedTestCases = ev.completedTestCases ?? completedTestCases;
1157
- if (ev.passed) {
1158
- passedTestCases += 1;
1159
- } else {
1160
- failedTestCases += 1;
1161
- }
1325
+ const id = ev.testCaseId;
1326
+ const current = testCasePassedBy.get(id);
1327
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1162
1328
  }
1163
1329
  } catch {
1164
1330
  }
1165
1331
  }
1332
+ let passedTestCases = 0;
1333
+ let failedTestCases = 0;
1334
+ for (const passed of testCasePassedBy.values()) {
1335
+ if (passed) {
1336
+ passedTestCases += 1;
1337
+ } else {
1338
+ failedTestCases += 1;
1339
+ }
1340
+ }
1166
1341
  return { completedTestCases, passedTestCases, failedTestCases };
1167
1342
  }
1168
1343
  async function appendJsonLine(artifactPath, payload) {
@@ -1357,6 +1532,10 @@ var EffectRunner = class {
1357
1532
  throw new Error("No evaluators selected for run");
1358
1533
  }
1359
1534
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1535
+ const totalEvaluations = selectedTestCases.reduce(
1536
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1537
+ 0
1538
+ );
1360
1539
  const runId = `run-${crypto.randomUUID()}`;
1361
1540
  const artifactPath = createArtifactPath(
1362
1541
  this.config.artifactDirectory,
@@ -1369,7 +1548,7 @@ var EffectRunner = class {
1369
1548
  datasetName: dataset.dataset.getName(),
1370
1549
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1371
1550
  queuedAt: Date.now(),
1372
- totalTestCases: selectedTestCases.length,
1551
+ totalTestCases: totalEvaluations,
1373
1552
  completedTestCases: 0,
1374
1553
  passedTestCases: 0,
1375
1554
  failedTestCases: 0,
@@ -1383,7 +1562,7 @@ var EffectRunner = class {
1383
1562
  datasetId: request.datasetId,
1384
1563
  datasetName: dataset.dataset.getName(),
1385
1564
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1386
- totalTestCases: selectedTestCases.length,
1565
+ totalTestCases: totalEvaluations,
1387
1566
  artifactPath
1388
1567
  };
1389
1568
  await effect.Effect.runPromise(this.publishEvent(queuedEvent));
@@ -1394,6 +1573,7 @@ var EffectRunner = class {
1394
1573
  payload: queuedEvent
1395
1574
  })
1396
1575
  );
1576
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1397
1577
  await effect.Effect.runPromise(
1398
1578
  effect.Queue.offer(this.runQueue, {
1399
1579
  runId,
@@ -1401,7 +1581,8 @@ var EffectRunner = class {
1401
1581
  dataset: dataset.dataset,
1402
1582
  evaluators: selectedEvaluators,
1403
1583
  testCases: selectedTestCases,
1404
- snapshot
1584
+ snapshot,
1585
+ maxConcurrency
1405
1586
  })
1406
1587
  );
1407
1588
  return snapshot;