@m4trix/evals 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -331,15 +331,23 @@ var TestCase = class _TestCase {
331
331
  this._config = config;
332
332
  }
333
333
  static describe(config) {
334
+ const reruns = config.reruns ?? 1;
335
+ if (reruns < 1 || !Number.isInteger(reruns)) {
336
+ throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
337
+ }
334
338
  return new _TestCase({
335
339
  name: config.name,
336
340
  tags: config.tags,
341
+ reruns,
337
342
  inputSchema: config.inputSchema,
338
343
  input: config.input,
339
344
  outputSchema: config.outputSchema,
340
345
  output: config.output
341
346
  });
342
347
  }
348
+ getReruns() {
349
+ return this._config.reruns;
350
+ }
343
351
  getName() {
344
352
  return this._config.name;
345
353
  }
@@ -513,6 +521,7 @@ var Metric = {
513
521
  const def = {
514
522
  id: config.id,
515
523
  name: config.name,
524
+ aggregate: config.aggregate,
516
525
  format: config.format,
517
526
  make: (data) => ({ id: config.id, data })
518
527
  };
@@ -532,6 +541,7 @@ var Score = {
532
541
  id: config.id,
533
542
  name: config.name,
534
543
  displayStrategy: config.displayStrategy,
544
+ aggregate: config.aggregate,
535
545
  format: config.format,
536
546
  make: (data, options) => {
537
547
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
@@ -550,23 +560,62 @@ function getScoreById(id) {
550
560
  return registry2.get(id);
551
561
  }
552
562
 
563
+ // src/evals/aggregators.ts
564
/**
 * Computes the arithmetic mean of the `value` field across `values`.
 *
 * @param values - Array of objects exposing a numeric `value`.
 * @returns `{ value }` holding the mean, or `{ value: 0 }` for an empty array.
 */
function aggregateAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0 };
  }
  let total = 0;
  for (const entry of values) {
    total += entry.value;
  }
  return { value: total / count };
}
571
/**
 * Combines pass/fail entries into a single verdict.
 *
 * @param values - Array of objects exposing a `passed` flag.
 * @returns `{ passed: true }` only when the array is non-empty and every
 *   entry's `passed` is truthy; `{ passed: false }` otherwise.
 */
function aggregateAll(values) {
  if (values.length === 0) {
    return { passed: false };
  }
  const hasFailure = values.some((entry) => !entry.passed);
  return { passed: !hasFailure };
}
574
/**
 * Sums token counters field by field across `values`.
 *
 * @param values - Array of objects with optional `input`, `output`,
 *   `inputCached` and `outputCached` counts; null/undefined fields count as 0.
 * @returns An object with the four summed counters (all 0 for an empty array).
 */
function aggregateTokenCountSum(values) {
  const totals = {
    input: 0,
    output: 0,
    inputCached: 0,
    outputCached: 0
  };
  for (const entry of values) {
    totals.input += entry.input ?? 0;
    totals.output += entry.output ?? 0;
    totals.inputCached += entry.inputCached ?? 0;
    totals.outputCached += entry.outputCached ?? 0;
  }
  return totals;
}
591
/**
 * Computes the arithmetic mean of the `ms` field across `values`.
 *
 * @param values - Array of objects exposing a numeric `ms`.
 * @returns `{ ms }` holding the mean, or `{ ms: 0 }` for an empty array.
 */
function aggregateLatencyAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { ms: 0 };
  }
  let totalMs = 0;
  for (const entry of values) {
    totalMs += entry.ms;
  }
  return { ms: totalMs / count };
}
598
+
553
599
  // src/evals/metrics/standard.ts
554
600
  var tokenCountMetric = Metric.of({
555
601
  id: "token-count",
556
602
  name: "Tokens",
557
- format: (data) => {
603
+ aggregate: aggregateTokenCountSum,
604
+ format: (data, options) => {
558
605
  const input = data.input ?? 0;
559
606
  const output = data.output ?? 0;
560
607
  const inputCached = data.inputCached ?? 0;
561
608
  const outputCached = data.outputCached ?? 0;
562
609
  const cached = inputCached + outputCached;
563
- return `in:${input} out:${output} cached:${cached}`;
610
+ const base = `in:${input} out:${output} cached:${cached}`;
611
+ return options?.isAggregated ? `Total: ${base}` : base;
564
612
  }
565
613
  });
566
614
  var latencyMetric = Metric.of({
567
615
  id: "latency",
568
616
  name: "Latency",
569
- format: (data) => `${data.ms}ms`
617
+ aggregate: aggregateLatencyAverage,
618
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
570
619
  });
571
620
 
572
621
  // src/evals/scores/standard.ts
@@ -574,13 +623,15 @@ var percentScore = Score.of({
574
623
  id: "percent",
575
624
  name: "Score",
576
625
  displayStrategy: "bar",
577
- format: (data) => data.value.toFixed(2)
626
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
627
+ aggregate: aggregateAverage
578
628
  });
579
629
  var binaryScore = Score.of({
580
630
  id: "binary",
581
631
  name: "Result",
582
632
  displayStrategy: "passFail",
583
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
633
+ format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
634
+ aggregate: aggregateAll
584
635
  });
585
636
  function createDiffLogEntry(expected, actual, options) {
586
637
  const diff = jsonDiff.diffString(expected, actual, { color: false });
@@ -621,7 +672,8 @@ var defaultRunnerConfig = {
621
672
  ],
622
673
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
623
674
  },
624
- artifactDirectory: ".eval-results"
675
+ artifactDirectory: ".eval-results",
676
+ maxConcurrency: 1
625
677
  };
626
678
  function toRunnerConfigOverrides(config) {
627
679
  if (!config) {
@@ -654,6 +706,9 @@ function toRunnerConfigOverrides(config) {
654
706
  if (config.artifactDirectory !== void 0) {
655
707
  overrides.artifactDirectory = config.artifactDirectory;
656
708
  }
709
+ if (config.maxConcurrency !== void 0) {
710
+ overrides.maxConcurrency = config.maxConcurrency;
711
+ }
657
712
  if (Object.keys(discovery).length > 0) {
658
713
  overrides.discovery = discovery;
659
714
  }
@@ -927,6 +982,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
927
982
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
928
983
  );
929
984
  }
985
/**
 * Builds an Effect that evaluates one test case with every evaluator of `task`,
 * repeating the evaluation once per rerun reported by `testCase.getReruns()`
 * (1 when the test case has no such method).
 *
 * For each rerun it increments `completedRef`, writes the new count into the
 * run snapshot, publishes a "TestCaseProgress" event and offers the same event
 * to `persistenceQueue`. After the last rerun it increments `passedRef` when
 * every rerun passed and `failedRef` otherwise, then copies both counters into
 * the snapshot.
 *
 * @param task - Run task; `runId`, `evaluators` and `snapshot.artifactPath` are read.
 * @param testCaseItem - Object with the test case `id` and the `testCase` itself.
 * @param totalEvaluations - Value reported as `totalTestCases` in progress events.
 * @param publishEvent - Function returning an Effect that publishes an event.
 * @param persistenceQueue - Effect Queue receiving `{ runId, artifactPath, payload }`.
 * @param updateSnapshot - Called as `updateSnapshot(runId, (snapshot) => nextSnapshot)`.
 * @param completedRef - Effect Ref counting finished evaluations (reruns).
 * @param passedRef - Effect Ref counting test cases whose reruns all passed.
 * @param failedRef - Effect Ref counting test cases with at least one failed rerun.
 * @returns An Effect that performs the work described above.
 */
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
  return effect.Effect.gen(function* () {
    const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
    const rerunPassed = [];
    for (let r = 0; r < reruns; r++) {
      const started = Date.now();
      const evaluatorScores = [];
      let testCaseError;
      const output = readOutput(testCaseItem.testCase);
      for (const { id: evaluatorId, evaluator } of task.evaluators) {
        const evaluateFn = evaluator.getEvaluateFn();
        if (!evaluateFn) {
          continue;
        }
        // The try/catch lives inside the async thunk on purpose: a rejection
        // reaching Effect.promise becomes a defect that aborts the generator
        // instead of being thrown into it, so a try/catch around `yield*`
        // would never see an evaluator failure. Handled here, the thunk always
        // resolves and a failing evaluator is recorded as a failed score.
        const outcome = yield* effect.Effect.promise(async () => {
          const logs = [];
          const logDiff = (expected, actual, options) => {
            logs.push(createDiffLogEntry(expected, actual, options));
          };
          try {
            const ctx = await evaluator.resolveContext();
            const result = await evaluateFn({
              input: testCaseItem.testCase.getInput(),
              ctx,
              output,
              logDiff
            });
            const { scores, metrics } = normalizeResult(result);
            const evaluatorPassed = computeEvaluatorPassed(evaluator, result, scores);
            return {
              failed: false,
              entry: {
                evaluatorId,
                scores,
                passed: evaluatorPassed,
                metrics,
                logs: logs.length > 0 ? logs : void 0
              }
            };
          } catch (error) {
            return {
              failed: true,
              errorMessage: error instanceof Error ? error.message : "Evaluator execution failed",
              entry: {
                evaluatorId,
                scores: [],
                passed: false
              }
            };
          }
        });
        if (outcome.failed) {
          // As before, the message of the last failing evaluator wins.
          testCaseError = outcome.errorMessage;
        }
        evaluatorScores.push(outcome.entry);
      }
      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
      rerunPassed.push(rerunPassedThis);
      // Ref.modify returns the first tuple element and stores the second.
      const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
        n + 1,
        n + 1
      ]);
      const progressEvent = {
        type: "TestCaseProgress",
        runId: task.runId,
        testCaseId: testCaseItem.id,
        testCaseName: testCaseItem.testCase.getName(),
        completedTestCases: completedEvaluations,
        totalTestCases: totalEvaluations,
        rerunIndex: r + 1,
        rerunTotal: reruns,
        passed: rerunPassedThis,
        durationMs: Date.now() - started,
        evaluatorScores,
        output,
        errorMessage: testCaseError
      };
      updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        completedTestCases: completedEvaluations
      }));
      yield* publishEvent(progressEvent);
      yield* effect.Queue.offer(persistenceQueue, {
        runId: task.runId,
        artifactPath: task.snapshot.artifactPath,
        payload: progressEvent
      });
    }
    // A test case passes only when every one of its reruns passed.
    const testCasePassed = rerunPassed.every(Boolean);
    if (testCasePassed) {
      yield* effect.Ref.update(passedRef, (n) => n + 1);
    } else {
      yield* effect.Ref.update(failedRef, (n) => n + 1);
    }
    const [passed, failed] = yield* effect.Effect.all([
      effect.Ref.get(passedRef),
      effect.Ref.get(failedRef)
    ]);
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
930
1084
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
931
1085
  const startedAt = Date.now();
932
1086
  updateSnapshot(task.runId, (snapshot) => ({
@@ -939,104 +1093,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
939
1093
  runId: task.runId,
940
1094
  startedAt
941
1095
  });
942
- let completedTestCases = 0;
943
- let passedTestCases = 0;
944
- let failedTestCases = 0;
945
- for (const testCaseItem of task.testCases) {
946
- const started = Date.now();
947
- const evaluatorScores = [];
948
- let testCaseError;
949
- const output = readOutput(testCaseItem.testCase);
950
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
951
- const evaluateFn = evaluator.getEvaluateFn();
952
- if (!evaluateFn) {
953
- continue;
954
- }
955
- try {
956
- const logs = [];
957
- const logDiff = (expected, actual, options) => {
958
- logs.push(createDiffLogEntry(expected, actual, options));
959
- };
960
- const ctx = yield* effect.Effect.promise(
961
- () => Promise.resolve(evaluator.resolveContext())
962
- );
963
- const result = yield* effect.Effect.promise(
964
- () => Promise.resolve(
965
- evaluateFn({
966
- input: testCaseItem.testCase.getInput(),
967
- ctx,
968
- output,
969
- logDiff
970
- })
971
- )
972
- );
973
- const { scores, metrics } = normalizeResult(result);
974
- const passed = computeEvaluatorPassed(evaluator, result, scores);
975
- evaluatorScores.push({
976
- evaluatorId,
977
- scores,
978
- passed,
979
- metrics,
980
- logs: logs.length > 0 ? logs : void 0
981
- });
982
- } catch (error) {
983
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
984
- evaluatorScores.push({
985
- evaluatorId,
986
- scores: [],
987
- passed: false
988
- });
989
- }
990
- }
991
- const testCasePassed = evaluatorScores.every((s) => s.passed);
992
- completedTestCases += 1;
993
- if (testCasePassed) {
994
- passedTestCases += 1;
995
- } else {
996
- failedTestCases += 1;
997
- }
998
- const progressEvent = {
999
- type: "TestCaseProgress",
1000
- runId: task.runId,
1001
- testCaseId: testCaseItem.id,
1002
- testCaseName: testCaseItem.testCase.getName(),
1003
- completedTestCases,
1004
- totalTestCases: task.testCases.length,
1005
- passed: testCasePassed,
1006
- durationMs: Date.now() - started,
1007
- evaluatorScores,
1008
- output,
1009
- errorMessage: testCaseError
1010
- };
1011
- updateSnapshot(task.runId, (snapshot) => ({
1012
- ...snapshot,
1013
- completedTestCases,
1014
- passedTestCases,
1015
- failedTestCases
1016
- }));
1017
- yield* publishEvent(progressEvent);
1018
- yield* effect.Queue.offer(persistenceQueue, {
1019
- runId: task.runId,
1020
- artifactPath: task.snapshot.artifactPath,
1021
- payload: progressEvent
1022
- });
1023
- }
1096
+ const totalEvaluations = task.testCases.reduce(
1097
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1098
+ 0
1099
+ );
1100
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1101
+ const completedRef = yield* effect.Ref.make(0);
1102
+ const passedRef = yield* effect.Ref.make(0);
1103
+ const failedRef = yield* effect.Ref.make(0);
1104
+ const processTestCase = (testCaseItem) => processOneTestCase(
1105
+ task,
1106
+ testCaseItem,
1107
+ totalEvaluations,
1108
+ publishEvent,
1109
+ persistenceQueue,
1110
+ updateSnapshot,
1111
+ completedRef,
1112
+ passedRef,
1113
+ failedRef
1114
+ );
1115
+ yield* effect.Effect.forEach(
1116
+ task.testCases,
1117
+ processTestCase,
1118
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1119
+ );
1120
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
1121
+ effect.Ref.get(completedRef),
1122
+ effect.Ref.get(passedRef),
1123
+ effect.Ref.get(failedRef)
1124
+ ]);
1024
1125
  const finishedAt = Date.now();
1025
1126
  const completedEvent = {
1026
1127
  type: "RunCompleted",
1027
1128
  runId: task.runId,
1028
1129
  finishedAt,
1029
- passedTestCases,
1030
- failedTestCases,
1130
+ passedTestCases: passedUniqueTestCases,
1131
+ failedTestCases: failedUniqueTestCases,
1031
1132
  totalTestCases: task.testCases.length,
1032
1133
  artifactPath: task.snapshot.artifactPath
1033
1134
  };
1034
1135
  updateSnapshot(task.runId, (snapshot) => ({
1035
1136
  ...snapshot,
1036
1137
  status: "completed",
1037
- completedTestCases,
1038
- passedTestCases,
1039
- failedTestCases,
1138
+ completedTestCases: completedEvaluations,
1139
+ passedTestCases: passedUniqueTestCases,
1140
+ failedTestCases: failedUniqueTestCases,
1040
1141
  finishedAt
1041
1142
  }));
1042
1143
  yield* publishEvent(completedEvent);
@@ -1124,7 +1225,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
1124
1225
  const artifactPath = filePath;
1125
1226
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1126
1227
  const progress = aggregateTestCaseProgress(lines);
1127
- const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
1228
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1128
1229
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1129
1230
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1130
1231
  return {
@@ -1146,23 +1247,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
1146
1247
  }
1147
1248
  function aggregateTestCaseProgress(lines) {
1148
1249
  let completedTestCases = 0;
1149
- let passedTestCases = 0;
1150
- let failedTestCases = 0;
1250
+ const testCasePassedBy = /* @__PURE__ */ new Map();
1151
1251
  for (const line of lines) {
1152
1252
  try {
1153
1253
  const event = JSON.parse(line);
1154
1254
  if (event.type === "TestCaseProgress") {
1155
1255
  const ev = event;
1156
1256
  completedTestCases = ev.completedTestCases ?? completedTestCases;
1157
- if (ev.passed) {
1158
- passedTestCases += 1;
1159
- } else {
1160
- failedTestCases += 1;
1161
- }
1257
+ const id = ev.testCaseId;
1258
+ const current = testCasePassedBy.get(id);
1259
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1162
1260
  }
1163
1261
  } catch {
1164
1262
  }
1165
1263
  }
1264
+ let passedTestCases = 0;
1265
+ let failedTestCases = 0;
1266
+ for (const passed of testCasePassedBy.values()) {
1267
+ if (passed) {
1268
+ passedTestCases += 1;
1269
+ } else {
1270
+ failedTestCases += 1;
1271
+ }
1272
+ }
1166
1273
  return { completedTestCases, passedTestCases, failedTestCases };
1167
1274
  }
1168
1275
  async function appendJsonLine(artifactPath, payload) {
@@ -1357,6 +1464,10 @@ var EffectRunner = class {
1357
1464
  throw new Error("No evaluators selected for run");
1358
1465
  }
1359
1466
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1467
+ const totalEvaluations = selectedTestCases.reduce(
1468
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1469
+ 0
1470
+ );
1360
1471
  const runId = `run-${crypto.randomUUID()}`;
1361
1472
  const artifactPath = createArtifactPath(
1362
1473
  this.config.artifactDirectory,
@@ -1369,7 +1480,7 @@ var EffectRunner = class {
1369
1480
  datasetName: dataset.dataset.getName(),
1370
1481
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1371
1482
  queuedAt: Date.now(),
1372
- totalTestCases: selectedTestCases.length,
1483
+ totalTestCases: totalEvaluations,
1373
1484
  completedTestCases: 0,
1374
1485
  passedTestCases: 0,
1375
1486
  failedTestCases: 0,
@@ -1383,7 +1494,7 @@ var EffectRunner = class {
1383
1494
  datasetId: request.datasetId,
1384
1495
  datasetName: dataset.dataset.getName(),
1385
1496
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1386
- totalTestCases: selectedTestCases.length,
1497
+ totalTestCases: totalEvaluations,
1387
1498
  artifactPath
1388
1499
  };
1389
1500
  await effect.Effect.runPromise(this.publishEvent(queuedEvent));
@@ -1394,6 +1505,7 @@ var EffectRunner = class {
1394
1505
  payload: queuedEvent
1395
1506
  })
1396
1507
  );
1508
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1397
1509
  await effect.Effect.runPromise(
1398
1510
  effect.Queue.offer(this.runQueue, {
1399
1511
  runId,
@@ -1401,7 +1513,8 @@ var EffectRunner = class {
1401
1513
  dataset: dataset.dataset,
1402
1514
  evaluators: selectedEvaluators,
1403
1515
  testCases: selectedTestCases,
1404
- snapshot
1516
+ snapshot,
1517
+ maxConcurrency
1405
1518
  })
1406
1519
  );
1407
1520
  return snapshot;