@m4trix/evals 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -74,6 +74,7 @@ interface CliState {
74
74
  datasetMenuIndex: number;
75
75
  runMenuIndex: number;
76
76
  detailsScrollOffset: number;
77
+ overviewScrollOffset: number;
77
78
  selectedEvaluatorIds: string[];
78
79
  evaluatorMenuIndex: number;
79
80
  searchQuery: string;
@@ -91,6 +92,8 @@ interface RunnerDiscoveryConfig {
91
92
  interface RunnerConfig {
92
93
  discovery: RunnerDiscoveryConfig;
93
94
  artifactDirectory: string;
95
+ /** Max concurrent test cases per run. Default: 1 (sequential). */
96
+ maxConcurrency: number;
94
97
  }
95
98
  type RunnerConfigOverrides = Omit<Partial<RunnerConfig>, 'discovery'> & {
96
99
  discovery?: Partial<RunnerDiscoveryConfig>;
@@ -108,6 +111,8 @@ interface M4trixEvalConfigDiscovery {
108
111
  interface M4trixEvalConfig {
109
112
  discovery?: M4trixEvalConfigDiscovery;
110
113
  artifactDirectory?: string;
114
+ /** Max concurrent test cases per run. Default: 1 (sequential). */
115
+ maxConcurrency?: number;
111
116
  }
112
117
  type ConfigType = M4trixEvalConfig;
113
118
  type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
@@ -124,6 +129,7 @@ type InputOrBuilder<T> = T | (() => T);
124
129
  interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
125
130
  name: string;
126
131
  tags: string[];
132
+ reruns?: number;
127
133
  inputSchema: TI;
128
134
  input: InputOrBuilder<Schema.Schema.Type<TI>>;
129
135
  outputSchema?: TO;
@@ -133,6 +139,7 @@ declare class TestCase<TInput = unknown, TOutput = unknown> {
133
139
  private readonly _config;
134
140
  private constructor();
135
141
  static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
142
+ getReruns(): number;
136
143
  getName(): string;
137
144
  getTags(): string[];
138
145
  getInputSchema(): Schema.Schema.Any;
@@ -225,17 +232,22 @@ interface MetricItem<TData = unknown> {
225
232
  readonly id: string;
226
233
  readonly data: TData;
227
234
  }
235
+ interface FormatMetricOptions {
236
+ isAggregated?: boolean;
237
+ }
228
238
  interface MetricDef<TData = unknown> {
229
239
  readonly id: string;
230
240
  readonly name?: string;
231
- format(data: TData): string;
241
+ readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
242
+ format(data: TData, options?: FormatMetricOptions): string;
232
243
  make(data: TData): MetricItem<TData>;
233
244
  }
234
245
  declare const Metric: {
235
246
  of<TData>(config: {
236
247
  id: string;
237
248
  name?: string | undefined;
238
- format: (data: TData) => string;
249
+ format: (data: TData, options?: FormatMetricOptions) => string;
250
+ aggregate?: ((values: readonly TData[]) => TData) | undefined;
239
251
  }): MetricDef<TData>;
240
252
  };
241
253
  declare function getMetricById(id: string): MetricDef<unknown> | undefined;
@@ -246,11 +258,15 @@ interface ScoreItem<TData = unknown> {
246
258
  readonly data: TData;
247
259
  readonly passed?: boolean;
248
260
  }
261
+ interface FormatScoreOptions {
262
+ isAggregated?: boolean;
263
+ }
249
264
  interface ScoreDef<TData = unknown> {
250
265
  readonly id: string;
251
266
  readonly name?: string;
252
267
  readonly displayStrategy: ScoreDisplayStrategy;
253
- format(data: TData): string;
268
+ readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
269
+ format(data: TData, options?: FormatScoreOptions): string;
254
270
  make(data: TData, options?: {
255
271
  definePassed?: (data: TData) => boolean;
256
272
  }): ScoreItem<TData>;
@@ -260,7 +276,8 @@ declare const Score: {
260
276
  id: string;
261
277
  name?: string | undefined;
262
278
  displayStrategy: ScoreDisplayStrategy;
263
- format: (data: TData) => string;
279
+ format: (data: TData, options?: FormatScoreOptions) => string;
280
+ aggregate?: ((values: readonly TData[]) => TData) | undefined;
264
281
  }): ScoreDef<TData>;
265
282
  };
266
283
  declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
@@ -326,6 +343,8 @@ type RunnerEvent = {
326
343
  testCaseName: string;
327
344
  completedTestCases: number;
328
345
  totalTestCases: number;
346
+ rerunIndex: number;
347
+ rerunTotal: number;
329
348
  passed: boolean;
330
349
  durationMs: number;
331
350
  evaluatorScores: ReadonlyArray<{
@@ -394,11 +413,15 @@ declare const latencyMetric: MetricDef<LatencyData>;
394
413
 
395
414
  interface PercentScoreData {
396
415
  value: number;
416
+ stdDev?: number;
417
+ count?: number;
397
418
  }
398
419
  declare const percentScore: ScoreDef<PercentScoreData>;
399
420
  interface BinaryScoreData {
400
421
  passed: boolean;
422
+ passedCount?: number;
423
+ totalCount?: number;
401
424
  }
402
425
  declare const binaryScore: ScoreDef<BinaryScoreData>;
403
426
 
404
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
427
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
package/dist/index.js CHANGED
@@ -1,6 +1,6 @@
1
- import { Effect, PubSub, Queue, Fiber } from 'effect';
1
+ import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
2
2
  export { Schema as S } from 'effect';
3
- import { diffString } from 'json-diff';
3
+ import { diffLines } from 'diff';
4
4
  import { randomUUID } from 'crypto';
5
5
  import { existsSync } from 'fs';
6
6
  import { resolve as resolve$1, relative, join, dirname } from 'path';
@@ -309,15 +309,23 @@ var TestCase = class _TestCase {
309
309
  this._config = config;
310
310
  }
311
311
  static describe(config) {
312
+ const reruns = config.reruns ?? 1;
313
+ if (reruns < 1 || !Number.isInteger(reruns)) {
314
+ throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
315
+ }
312
316
  return new _TestCase({
313
317
  name: config.name,
314
318
  tags: config.tags,
319
+ reruns,
315
320
  inputSchema: config.inputSchema,
316
321
  input: config.input,
317
322
  outputSchema: config.outputSchema,
318
323
  output: config.output
319
324
  });
320
325
  }
326
+ getReruns() {
327
+ return this._config.reruns;
328
+ }
321
329
  getName() {
322
330
  return this._config.name;
323
331
  }
@@ -491,6 +499,7 @@ var Metric = {
491
499
  const def = {
492
500
  id: config.id,
493
501
  name: config.name,
502
+ aggregate: config.aggregate,
494
503
  format: config.format,
495
504
  make: (data) => ({ id: config.id, data })
496
505
  };
@@ -510,6 +519,7 @@ var Score = {
510
519
  id: config.id,
511
520
  name: config.name,
512
521
  displayStrategy: config.displayStrategy,
522
+ aggregate: config.aggregate,
513
523
  format: config.format,
514
524
  make: (data, options) => {
515
525
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
@@ -528,23 +538,75 @@ function getScoreById(id) {
528
538
  return registry2.get(id);
529
539
  }
530
540
 
541
+ // src/evals/aggregators.ts
542
+ function aggregateAverageWithVariance(values) {
543
+ if (values.length === 0) {
544
+ return { value: 0, count: 0 };
545
+ }
546
+ const sum = values.reduce((s, v) => s + v.value, 0);
547
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
548
+ const mean = sum / values.length;
549
+ let stdDev;
550
+ if (values.length >= 2) {
551
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
552
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
553
+ }
554
+ return { value: mean, stdDev, count: values.length };
555
+ }
556
+ function aggregateAll(values) {
557
+ const total = values.length;
558
+ const passedCount = values.filter((v) => v.passed).length;
559
+ return {
560
+ passed: total > 0 && values.every((v) => v.passed),
561
+ passedCount,
562
+ totalCount: total
563
+ };
564
+ }
565
+ function aggregateTokenCountSum(values) {
566
+ const initial = {
567
+ input: 0,
568
+ output: 0,
569
+ inputCached: 0,
570
+ outputCached: 0
571
+ };
572
+ return values.reduce(
573
+ (acc, v) => ({
574
+ input: acc.input + (v.input ?? 0),
575
+ output: acc.output + (v.output ?? 0),
576
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
577
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
578
+ }),
579
+ initial
580
+ );
581
+ }
582
+ function aggregateLatencyAverage(values) {
583
+ if (values.length === 0) {
584
+ return { ms: 0 };
585
+ }
586
+ const sum = values.reduce((s, v) => s + v.ms, 0);
587
+ return { ms: sum / values.length };
588
+ }
589
+
531
590
  // src/evals/metrics/standard.ts
532
591
  var tokenCountMetric = Metric.of({
533
592
  id: "token-count",
534
593
  name: "Tokens",
535
- format: (data) => {
594
+ aggregate: aggregateTokenCountSum,
595
+ format: (data, options) => {
536
596
  const input = data.input ?? 0;
537
597
  const output = data.output ?? 0;
538
598
  const inputCached = data.inputCached ?? 0;
539
599
  const outputCached = data.outputCached ?? 0;
540
600
  const cached = inputCached + outputCached;
541
- return `in:${input} out:${output} cached:${cached}`;
601
+ const base = `in:${input} out:${output} cached:${cached}`;
602
+ return options?.isAggregated ? `Total: ${base}` : base;
542
603
  }
543
604
  });
544
605
  var latencyMetric = Metric.of({
545
606
  id: "latency",
546
607
  name: "Latency",
547
- format: (data) => `${data.ms}ms`
608
+ aggregate: aggregateLatencyAverage,
609
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
548
610
  });
549
611
 
550
612
  // src/evals/scores/standard.ts
@@ -552,16 +614,59 @@ var percentScore = Score.of({
552
614
  id: "percent",
553
615
  name: "Score",
554
616
  displayStrategy: "bar",
555
- format: (data) => data.value.toFixed(2)
617
+ format: (data, options) => {
618
+ if (options?.isAggregated) {
619
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
620
+ }
621
+ return data.value.toFixed(2);
622
+ },
623
+ aggregate: aggregateAverageWithVariance
556
624
  });
557
625
  var binaryScore = Score.of({
558
626
  id: "binary",
559
627
  name: "Result",
560
628
  displayStrategy: "passFail",
561
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
629
+ format: (data, options) => {
630
+ if (options?.isAggregated) {
631
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
632
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
633
+ return `${base} (${data.passedCount}/${data.totalCount})`;
634
+ }
635
+ return base;
636
+ }
637
+ return data.passed ? "PASSED" : "NOT PASSED";
638
+ },
639
+ aggregate: aggregateAll
562
640
  });
641
+ function toJsonLines(value) {
642
+ try {
643
+ return JSON.stringify(value, null, 2);
644
+ } catch {
645
+ return String(value);
646
+ }
647
+ }
648
+ function formatDiffString(changes) {
649
+ const lines = [];
650
+ for (const part of changes) {
651
+ const prefix = part.added ? "+" : part.removed ? "-" : " ";
652
+ const partLines = part.value.split("\n");
653
+ if (partLines[partLines.length - 1] === "") {
654
+ partLines.pop();
655
+ }
656
+ for (const line of partLines) {
657
+ lines.push(`${prefix} ${line}`);
658
+ }
659
+ }
660
+ return lines.join("\n");
661
+ }
662
+ function createDiffString(expected, actual) {
663
+ const expectedStr = toJsonLines(expected);
664
+ const actualStr = toJsonLines(actual);
665
+ const changes = diffLines(expectedStr, actualStr);
666
+ return formatDiffString(changes);
667
+ }
563
668
  function createDiffLogEntry(expected, actual, options) {
564
- const diff = diffString(expected, actual, { color: false });
669
+ const diff = createDiffString(expected, actual);
565
670
  return {
566
671
  type: "diff",
567
672
  label: options?.label,
@@ -571,8 +676,22 @@ function createDiffLogEntry(expected, actual, options) {
571
676
  };
572
677
  }
573
678
  function printJsonDiff(expected, actual, options = {}) {
574
- const { color = true } = options;
575
- const diff = diffString(expected, actual, { color });
679
+ const diff = createDiffString(expected, actual);
680
+ if (options.color) {
681
+ const lines = diff.split("\n").map((line) => {
682
+ const trimmed = line.trimStart();
683
+ if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
684
+ return `\x1B[31m${line}\x1B[0m`;
685
+ }
686
+ if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
687
+ return `\x1B[32m${line}\x1B[0m`;
688
+ }
689
+ return line;
690
+ });
691
+ const colored = lines.join("\n");
692
+ console.log(colored || "(no differences)");
693
+ return colored;
694
+ }
576
695
  console.log(diff || "(no differences)");
577
696
  return diff;
578
697
  }
@@ -599,7 +718,8 @@ var defaultRunnerConfig = {
599
718
  ],
600
719
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
601
720
  },
602
- artifactDirectory: ".eval-results"
721
+ artifactDirectory: ".eval-results",
722
+ maxConcurrency: 1
603
723
  };
604
724
  function toRunnerConfigOverrides(config) {
605
725
  if (!config) {
@@ -632,6 +752,9 @@ function toRunnerConfigOverrides(config) {
632
752
  if (config.artifactDirectory !== void 0) {
633
753
  overrides.artifactDirectory = config.artifactDirectory;
634
754
  }
755
+ if (config.maxConcurrency !== void 0) {
756
+ overrides.maxConcurrency = config.maxConcurrency;
757
+ }
635
758
  if (Object.keys(discovery).length > 0) {
636
759
  overrides.discovery = discovery;
637
760
  }
@@ -905,6 +1028,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
905
1028
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
906
1029
  );
907
1030
  }
1031
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1032
+ return Effect.gen(function* () {
1033
+ const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1034
+ const rerunPassed = [];
1035
+ for (let r = 0; r < reruns; r++) {
1036
+ const started = Date.now();
1037
+ const evaluatorScores = [];
1038
+ let testCaseError;
1039
+ const output = readOutput(testCaseItem.testCase);
1040
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
1041
+ const evaluateFn = evaluator.getEvaluateFn();
1042
+ if (!evaluateFn) {
1043
+ continue;
1044
+ }
1045
+ try {
1046
+ const logs = [];
1047
+ const logDiff = (expected, actual, options) => {
1048
+ logs.push(createDiffLogEntry(expected, actual, options));
1049
+ };
1050
+ const ctx = yield* Effect.promise(
1051
+ () => Promise.resolve(evaluator.resolveContext())
1052
+ );
1053
+ const result = yield* Effect.promise(
1054
+ () => Promise.resolve(
1055
+ evaluateFn({
1056
+ input: testCaseItem.testCase.getInput(),
1057
+ ctx,
1058
+ output,
1059
+ logDiff
1060
+ })
1061
+ )
1062
+ );
1063
+ const { scores, metrics } = normalizeResult(result);
1064
+ const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1065
+ evaluatorScores.push({
1066
+ evaluatorId,
1067
+ scores,
1068
+ passed: passed2,
1069
+ metrics,
1070
+ logs: logs.length > 0 ? logs : void 0
1071
+ });
1072
+ } catch (error) {
1073
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1074
+ evaluatorScores.push({
1075
+ evaluatorId,
1076
+ scores: [],
1077
+ passed: false
1078
+ });
1079
+ }
1080
+ }
1081
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1082
+ rerunPassed.push(rerunPassedThis);
1083
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1084
+ n + 1,
1085
+ n + 1
1086
+ ]);
1087
+ const progressEvent = {
1088
+ type: "TestCaseProgress",
1089
+ runId: task.runId,
1090
+ testCaseId: testCaseItem.id,
1091
+ testCaseName: testCaseItem.testCase.getName(),
1092
+ completedTestCases: completedEvaluations,
1093
+ totalTestCases: totalEvaluations,
1094
+ rerunIndex: r + 1,
1095
+ rerunTotal: reruns,
1096
+ passed: rerunPassedThis,
1097
+ durationMs: Date.now() - started,
1098
+ evaluatorScores,
1099
+ output,
1100
+ errorMessage: testCaseError
1101
+ };
1102
+ updateSnapshot(task.runId, (snapshot) => ({
1103
+ ...snapshot,
1104
+ completedTestCases: completedEvaluations
1105
+ }));
1106
+ yield* publishEvent(progressEvent);
1107
+ yield* Queue.offer(persistenceQueue, {
1108
+ runId: task.runId,
1109
+ artifactPath: task.snapshot.artifactPath,
1110
+ payload: progressEvent
1111
+ });
1112
+ }
1113
+ const testCasePassed = rerunPassed.every(Boolean);
1114
+ if (testCasePassed) {
1115
+ yield* Ref.update(passedRef, (n) => n + 1);
1116
+ } else {
1117
+ yield* Ref.update(failedRef, (n) => n + 1);
1118
+ }
1119
+ const [passed, failed] = yield* Effect.all([
1120
+ Ref.get(passedRef),
1121
+ Ref.get(failedRef)
1122
+ ]);
1123
+ updateSnapshot(task.runId, (snapshot) => ({
1124
+ ...snapshot,
1125
+ passedTestCases: passed,
1126
+ failedTestCases: failed
1127
+ }));
1128
+ });
1129
+ }
908
1130
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
909
1131
  const startedAt = Date.now();
910
1132
  updateSnapshot(task.runId, (snapshot) => ({
@@ -917,104 +1139,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
917
1139
  runId: task.runId,
918
1140
  startedAt
919
1141
  });
920
- let completedTestCases = 0;
921
- let passedTestCases = 0;
922
- let failedTestCases = 0;
923
- for (const testCaseItem of task.testCases) {
924
- const started = Date.now();
925
- const evaluatorScores = [];
926
- let testCaseError;
927
- const output = readOutput(testCaseItem.testCase);
928
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
929
- const evaluateFn = evaluator.getEvaluateFn();
930
- if (!evaluateFn) {
931
- continue;
932
- }
933
- try {
934
- const logs = [];
935
- const logDiff = (expected, actual, options) => {
936
- logs.push(createDiffLogEntry(expected, actual, options));
937
- };
938
- const ctx = yield* Effect.promise(
939
- () => Promise.resolve(evaluator.resolveContext())
940
- );
941
- const result = yield* Effect.promise(
942
- () => Promise.resolve(
943
- evaluateFn({
944
- input: testCaseItem.testCase.getInput(),
945
- ctx,
946
- output,
947
- logDiff
948
- })
949
- )
950
- );
951
- const { scores, metrics } = normalizeResult(result);
952
- const passed = computeEvaluatorPassed(evaluator, result, scores);
953
- evaluatorScores.push({
954
- evaluatorId,
955
- scores,
956
- passed,
957
- metrics,
958
- logs: logs.length > 0 ? logs : void 0
959
- });
960
- } catch (error) {
961
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
962
- evaluatorScores.push({
963
- evaluatorId,
964
- scores: [],
965
- passed: false
966
- });
967
- }
968
- }
969
- const testCasePassed = evaluatorScores.every((s) => s.passed);
970
- completedTestCases += 1;
971
- if (testCasePassed) {
972
- passedTestCases += 1;
973
- } else {
974
- failedTestCases += 1;
975
- }
976
- const progressEvent = {
977
- type: "TestCaseProgress",
978
- runId: task.runId,
979
- testCaseId: testCaseItem.id,
980
- testCaseName: testCaseItem.testCase.getName(),
981
- completedTestCases,
982
- totalTestCases: task.testCases.length,
983
- passed: testCasePassed,
984
- durationMs: Date.now() - started,
985
- evaluatorScores,
986
- output,
987
- errorMessage: testCaseError
988
- };
989
- updateSnapshot(task.runId, (snapshot) => ({
990
- ...snapshot,
991
- completedTestCases,
992
- passedTestCases,
993
- failedTestCases
994
- }));
995
- yield* publishEvent(progressEvent);
996
- yield* Queue.offer(persistenceQueue, {
997
- runId: task.runId,
998
- artifactPath: task.snapshot.artifactPath,
999
- payload: progressEvent
1000
- });
1001
- }
1142
+ const totalEvaluations = task.testCases.reduce(
1143
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1144
+ 0
1145
+ );
1146
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1147
+ const completedRef = yield* Ref.make(0);
1148
+ const passedRef = yield* Ref.make(0);
1149
+ const failedRef = yield* Ref.make(0);
1150
+ const processTestCase = (testCaseItem) => processOneTestCase(
1151
+ task,
1152
+ testCaseItem,
1153
+ totalEvaluations,
1154
+ publishEvent,
1155
+ persistenceQueue,
1156
+ updateSnapshot,
1157
+ completedRef,
1158
+ passedRef,
1159
+ failedRef
1160
+ );
1161
+ yield* Effect.forEach(
1162
+ task.testCases,
1163
+ processTestCase,
1164
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1165
+ );
1166
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
1167
+ Ref.get(completedRef),
1168
+ Ref.get(passedRef),
1169
+ Ref.get(failedRef)
1170
+ ]);
1002
1171
  const finishedAt = Date.now();
1003
1172
  const completedEvent = {
1004
1173
  type: "RunCompleted",
1005
1174
  runId: task.runId,
1006
1175
  finishedAt,
1007
- passedTestCases,
1008
- failedTestCases,
1176
+ passedTestCases: passedUniqueTestCases,
1177
+ failedTestCases: failedUniqueTestCases,
1009
1178
  totalTestCases: task.testCases.length,
1010
1179
  artifactPath: task.snapshot.artifactPath
1011
1180
  };
1012
1181
  updateSnapshot(task.runId, (snapshot) => ({
1013
1182
  ...snapshot,
1014
1183
  status: "completed",
1015
- completedTestCases,
1016
- passedTestCases,
1017
- failedTestCases,
1184
+ completedTestCases: completedEvaluations,
1185
+ passedTestCases: passedUniqueTestCases,
1186
+ failedTestCases: failedUniqueTestCases,
1018
1187
  finishedAt
1019
1188
  }));
1020
1189
  yield* publishEvent(completedEvent);
@@ -1102,7 +1271,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
1102
1271
  const artifactPath = filePath;
1103
1272
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1104
1273
  const progress = aggregateTestCaseProgress(lines);
1105
- const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
1274
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1106
1275
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1107
1276
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1108
1277
  return {
@@ -1124,23 +1293,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
1124
1293
  }
1125
1294
  function aggregateTestCaseProgress(lines) {
1126
1295
  let completedTestCases = 0;
1127
- let passedTestCases = 0;
1128
- let failedTestCases = 0;
1296
+ const testCasePassedBy = /* @__PURE__ */ new Map();
1129
1297
  for (const line of lines) {
1130
1298
  try {
1131
1299
  const event = JSON.parse(line);
1132
1300
  if (event.type === "TestCaseProgress") {
1133
1301
  const ev = event;
1134
1302
  completedTestCases = ev.completedTestCases ?? completedTestCases;
1135
- if (ev.passed) {
1136
- passedTestCases += 1;
1137
- } else {
1138
- failedTestCases += 1;
1139
- }
1303
+ const id = ev.testCaseId;
1304
+ const current = testCasePassedBy.get(id);
1305
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1140
1306
  }
1141
1307
  } catch {
1142
1308
  }
1143
1309
  }
1310
+ let passedTestCases = 0;
1311
+ let failedTestCases = 0;
1312
+ for (const passed of testCasePassedBy.values()) {
1313
+ if (passed) {
1314
+ passedTestCases += 1;
1315
+ } else {
1316
+ failedTestCases += 1;
1317
+ }
1318
+ }
1144
1319
  return { completedTestCases, passedTestCases, failedTestCases };
1145
1320
  }
1146
1321
  async function appendJsonLine(artifactPath, payload) {
@@ -1335,6 +1510,10 @@ var EffectRunner = class {
1335
1510
  throw new Error("No evaluators selected for run");
1336
1511
  }
1337
1512
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1513
+ const totalEvaluations = selectedTestCases.reduce(
1514
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1515
+ 0
1516
+ );
1338
1517
  const runId = `run-${randomUUID()}`;
1339
1518
  const artifactPath = createArtifactPath(
1340
1519
  this.config.artifactDirectory,
@@ -1347,7 +1526,7 @@ var EffectRunner = class {
1347
1526
  datasetName: dataset.dataset.getName(),
1348
1527
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1349
1528
  queuedAt: Date.now(),
1350
- totalTestCases: selectedTestCases.length,
1529
+ totalTestCases: totalEvaluations,
1351
1530
  completedTestCases: 0,
1352
1531
  passedTestCases: 0,
1353
1532
  failedTestCases: 0,
@@ -1361,7 +1540,7 @@ var EffectRunner = class {
1361
1540
  datasetId: request.datasetId,
1362
1541
  datasetName: dataset.dataset.getName(),
1363
1542
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1364
- totalTestCases: selectedTestCases.length,
1543
+ totalTestCases: totalEvaluations,
1365
1544
  artifactPath
1366
1545
  };
1367
1546
  await Effect.runPromise(this.publishEvent(queuedEvent));
@@ -1372,6 +1551,7 @@ var EffectRunner = class {
1372
1551
  payload: queuedEvent
1373
1552
  })
1374
1553
  );
1554
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1375
1555
  await Effect.runPromise(
1376
1556
  Queue.offer(this.runQueue, {
1377
1557
  runId,
@@ -1379,7 +1559,8 @@ var EffectRunner = class {
1379
1559
  dataset: dataset.dataset,
1380
1560
  evaluators: selectedEvaluators,
1381
1561
  testCases: selectedTestCases,
1382
- snapshot
1562
+ snapshot,
1563
+ maxConcurrency
1383
1564
  })
1384
1565
  );
1385
1566
  return snapshot;