@m4trix/evals 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -91,6 +91,8 @@ interface RunnerDiscoveryConfig {
91
91
  interface RunnerConfig {
92
92
  discovery: RunnerDiscoveryConfig;
93
93
  artifactDirectory: string;
94
+ /** Max concurrent test cases per run. Default: 1 (sequential). */
95
+ maxConcurrency: number;
94
96
  }
95
97
  type RunnerConfigOverrides = Omit<Partial<RunnerConfig>, 'discovery'> & {
96
98
  discovery?: Partial<RunnerDiscoveryConfig>;
@@ -108,6 +110,8 @@ interface M4trixEvalConfigDiscovery {
108
110
  interface M4trixEvalConfig {
109
111
  discovery?: M4trixEvalConfigDiscovery;
110
112
  artifactDirectory?: string;
113
+ /** Max concurrent test cases per run. Default: 1 (sequential). */
114
+ maxConcurrency?: number;
111
115
  }
112
116
  type ConfigType = M4trixEvalConfig;
113
117
  type M4trixEvalConfigFactory<TConfig extends ConfigType = ConfigType> = () => TConfig;
@@ -124,6 +128,7 @@ type InputOrBuilder<T> = T | (() => T);
124
128
  interface TestCaseDescribeConfig<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>> {
125
129
  name: string;
126
130
  tags: string[];
131
+ reruns?: number;
127
132
  inputSchema: TI;
128
133
  input: InputOrBuilder<Schema.Schema.Type<TI>>;
129
134
  outputSchema?: TO;
@@ -133,6 +138,7 @@ declare class TestCase<TInput = unknown, TOutput = unknown> {
133
138
  private readonly _config;
134
139
  private constructor();
135
140
  static describe<TI extends Schema.Schema.Any, TO extends Schema.Schema.Any = Schema.Schema<unknown>>(config: TestCaseDescribeConfig<TI, TO>): TestCase<Schema.Schema.Type<TI>, Schema.Schema.Type<TO>>;
141
+ getReruns(): number;
136
142
  getName(): string;
137
143
  getTags(): string[];
138
144
  getInputSchema(): Schema.Schema.Any;
@@ -225,17 +231,22 @@ interface MetricItem<TData = unknown> {
225
231
  readonly id: string;
226
232
  readonly data: TData;
227
233
  }
234
+ interface FormatMetricOptions {
235
+ isAggregated?: boolean;
236
+ }
228
237
  interface MetricDef<TData = unknown> {
229
238
  readonly id: string;
230
239
  readonly name?: string;
231
- format(data: TData): string;
240
+ readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
241
+ format(data: TData, options?: FormatMetricOptions): string;
232
242
  make(data: TData): MetricItem<TData>;
233
243
  }
234
244
  declare const Metric: {
235
245
  of<TData>(config: {
236
246
  id: string;
237
247
  name?: string | undefined;
238
- format: (data: TData) => string;
248
+ format: (data: TData, options?: FormatMetricOptions) => string;
249
+ aggregate?: ((values: readonly TData[]) => TData) | undefined;
239
250
  }): MetricDef<TData>;
240
251
  };
241
252
  declare function getMetricById(id: string): MetricDef<unknown> | undefined;
@@ -246,11 +257,15 @@ interface ScoreItem<TData = unknown> {
246
257
  readonly data: TData;
247
258
  readonly passed?: boolean;
248
259
  }
260
+ interface FormatScoreOptions {
261
+ isAggregated?: boolean;
262
+ }
249
263
  interface ScoreDef<TData = unknown> {
250
264
  readonly id: string;
251
265
  readonly name?: string;
252
266
  readonly displayStrategy: ScoreDisplayStrategy;
253
- format(data: TData): string;
267
+ readonly aggregate?: (values: ReadonlyArray<TData>) => TData;
268
+ format(data: TData, options?: FormatScoreOptions): string;
254
269
  make(data: TData, options?: {
255
270
  definePassed?: (data: TData) => boolean;
256
271
  }): ScoreItem<TData>;
@@ -260,7 +275,8 @@ declare const Score: {
260
275
  id: string;
261
276
  name?: string | undefined;
262
277
  displayStrategy: ScoreDisplayStrategy;
263
- format: (data: TData) => string;
278
+ format: (data: TData, options?: FormatScoreOptions) => string;
279
+ aggregate?: ((values: readonly TData[]) => TData) | undefined;
264
280
  }): ScoreDef<TData>;
265
281
  };
266
282
  declare function getScoreById(id: string): ScoreDef<unknown> | undefined;
@@ -326,6 +342,8 @@ type RunnerEvent = {
326
342
  testCaseName: string;
327
343
  completedTestCases: number;
328
344
  totalTestCases: number;
345
+ rerunIndex: number;
346
+ rerunTotal: number;
329
347
  passed: boolean;
330
348
  durationMs: number;
331
349
  evaluatorScores: ReadonlyArray<{
@@ -401,4 +419,4 @@ interface BinaryScoreData {
401
419
  }
402
420
  declare const binaryScore: ScoreDef<BinaryScoreData>;
403
421
 
404
- export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
422
+ export { BinaryScoreData, CliState, CollectedDataset, CollectedEvaluator, CollectedTestCase, ConfigType, Dataset, EvalDataset, EvalMiddleware, EvalRun, EvalsData, EvaluateArgs, Evaluator, EvaluatorOption, FormatMetricOptions, FormatScoreOptions, LatencyData, M4trixEvalConfig, M4trixEvalConfigDiscovery, Metric, MetricDef, MetricItem, PathMatcher, PercentScoreData, PrintJsonDiffOptions, RunDatasetRequest, RunSnapshot, RunnerApi, RunnerConfig, RunnerConfigOverrides, RunnerDiscoveryConfig, RunnerEvent, Score, ScoreDef, ScoreDisplayStrategy, ScoreItem, SearchTestCasesQuery, StartupArgs, TagMatcher, TestCase, TokenCountData, ViewLevel, binaryScore, createRunner, defaultRunnerConfig, defineConfig, getMetricById, getScoreById, latencyMetric, loadMockData, loadRunnerData, parseStartupArgs, percentScore, printJsonDiff, tokenCountMetric, withRunnerConfig };
package/dist/index.js CHANGED
@@ -1,4 +1,4 @@
1
- import { Effect, PubSub, Queue, Fiber } from 'effect';
1
+ import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
2
2
  export { Schema as S } from 'effect';
3
3
  import { diffString } from 'json-diff';
4
4
  import { randomUUID } from 'crypto';
@@ -309,15 +309,23 @@ var TestCase = class _TestCase {
309
309
  this._config = config;
310
310
  }
311
311
  static describe(config) {
312
+ const reruns = config.reruns ?? 1;
313
+ if (reruns < 1 || !Number.isInteger(reruns)) {
314
+ throw new Error(`TestCase reruns must be a positive integer, got ${reruns}`);
315
+ }
312
316
  return new _TestCase({
313
317
  name: config.name,
314
318
  tags: config.tags,
319
+ reruns,
315
320
  inputSchema: config.inputSchema,
316
321
  input: config.input,
317
322
  outputSchema: config.outputSchema,
318
323
  output: config.output
319
324
  });
320
325
  }
326
+ getReruns() {
327
+ return this._config.reruns;
328
+ }
321
329
  getName() {
322
330
  return this._config.name;
323
331
  }
@@ -491,6 +499,7 @@ var Metric = {
491
499
  const def = {
492
500
  id: config.id,
493
501
  name: config.name,
502
+ aggregate: config.aggregate,
494
503
  format: config.format,
495
504
  make: (data) => ({ id: config.id, data })
496
505
  };
@@ -510,6 +519,7 @@ var Score = {
510
519
  id: config.id,
511
520
  name: config.name,
512
521
  displayStrategy: config.displayStrategy,
522
+ aggregate: config.aggregate,
513
523
  format: config.format,
514
524
  make: (data, options) => {
515
525
  const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
@@ -528,23 +538,62 @@ function getScoreById(id) {
528
538
  return registry2.get(id);
529
539
  }
530
540
 
541
+ // src/evals/aggregators.ts
542
+ function aggregateAverage(values) {
543
+ if (values.length === 0) {
544
+ return { value: 0 };
545
+ }
546
+ const sum = values.reduce((s, v) => s + v.value, 0);
547
+ return { value: sum / values.length };
548
+ }
549
+ function aggregateAll(values) {
550
+ return { passed: values.length > 0 && values.every((v) => v.passed) };
551
+ }
552
+ function aggregateTokenCountSum(values) {
553
+ const initial = {
554
+ input: 0,
555
+ output: 0,
556
+ inputCached: 0,
557
+ outputCached: 0
558
+ };
559
+ return values.reduce(
560
+ (acc, v) => ({
561
+ input: acc.input + (v.input ?? 0),
562
+ output: acc.output + (v.output ?? 0),
563
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
564
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
565
+ }),
566
+ initial
567
+ );
568
+ }
569
+ function aggregateLatencyAverage(values) {
570
+ if (values.length === 0) {
571
+ return { ms: 0 };
572
+ }
573
+ const sum = values.reduce((s, v) => s + v.ms, 0);
574
+ return { ms: sum / values.length };
575
+ }
576
+
531
577
  // src/evals/metrics/standard.ts
532
578
  var tokenCountMetric = Metric.of({
533
579
  id: "token-count",
534
580
  name: "Tokens",
535
- format: (data) => {
581
+ aggregate: aggregateTokenCountSum,
582
+ format: (data, options) => {
536
583
  const input = data.input ?? 0;
537
584
  const output = data.output ?? 0;
538
585
  const inputCached = data.inputCached ?? 0;
539
586
  const outputCached = data.outputCached ?? 0;
540
587
  const cached = inputCached + outputCached;
541
- return `in:${input} out:${output} cached:${cached}`;
588
+ const base = `in:${input} out:${output} cached:${cached}`;
589
+ return options?.isAggregated ? `Total: ${base}` : base;
542
590
  }
543
591
  });
544
592
  var latencyMetric = Metric.of({
545
593
  id: "latency",
546
594
  name: "Latency",
547
- format: (data) => `${data.ms}ms`
595
+ aggregate: aggregateLatencyAverage,
596
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
548
597
  });
549
598
 
550
599
  // src/evals/scores/standard.ts
@@ -552,13 +601,15 @@ var percentScore = Score.of({
552
601
  id: "percent",
553
602
  name: "Score",
554
603
  displayStrategy: "bar",
555
- format: (data) => data.value.toFixed(2)
604
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
605
+ aggregate: aggregateAverage
556
606
  });
557
607
  var binaryScore = Score.of({
558
608
  id: "binary",
559
609
  name: "Result",
560
610
  displayStrategy: "passFail",
561
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
611
+ format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
612
+ aggregate: aggregateAll
562
613
  });
563
614
  function createDiffLogEntry(expected, actual, options) {
564
615
  const diff = diffString(expected, actual, { color: false });
@@ -599,7 +650,8 @@ var defaultRunnerConfig = {
599
650
  ],
600
651
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
601
652
  },
602
- artifactDirectory: ".eval-results"
653
+ artifactDirectory: ".eval-results",
654
+ maxConcurrency: 1
603
655
  };
604
656
  function toRunnerConfigOverrides(config) {
605
657
  if (!config) {
@@ -632,6 +684,9 @@ function toRunnerConfigOverrides(config) {
632
684
  if (config.artifactDirectory !== void 0) {
633
685
  overrides.artifactDirectory = config.artifactDirectory;
634
686
  }
687
+ if (config.maxConcurrency !== void 0) {
688
+ overrides.maxConcurrency = config.maxConcurrency;
689
+ }
635
690
  if (Object.keys(discovery).length > 0) {
636
691
  overrides.discovery = discovery;
637
692
  }
@@ -905,6 +960,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
905
960
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
906
961
  );
907
962
  }
963
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
964
+ return Effect.gen(function* () {
965
+ const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
966
+ const rerunPassed = [];
967
+ for (let r = 0; r < reruns; r++) {
968
+ const started = Date.now();
969
+ const evaluatorScores = [];
970
+ let testCaseError;
971
+ const output = readOutput(testCaseItem.testCase);
972
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
973
+ const evaluateFn = evaluator.getEvaluateFn();
974
+ if (!evaluateFn) {
975
+ continue;
976
+ }
977
+ try {
978
+ const logs = [];
979
+ const logDiff = (expected, actual, options) => {
980
+ logs.push(createDiffLogEntry(expected, actual, options));
981
+ };
982
+ const ctx = yield* Effect.promise(
983
+ () => Promise.resolve(evaluator.resolveContext())
984
+ );
985
+ const result = yield* Effect.promise(
986
+ () => Promise.resolve(
987
+ evaluateFn({
988
+ input: testCaseItem.testCase.getInput(),
989
+ ctx,
990
+ output,
991
+ logDiff
992
+ })
993
+ )
994
+ );
995
+ const { scores, metrics } = normalizeResult(result);
996
+ const passed2 = computeEvaluatorPassed(evaluator, result, scores);
997
+ evaluatorScores.push({
998
+ evaluatorId,
999
+ scores,
1000
+ passed: passed2,
1001
+ metrics,
1002
+ logs: logs.length > 0 ? logs : void 0
1003
+ });
1004
+ } catch (error) {
1005
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1006
+ evaluatorScores.push({
1007
+ evaluatorId,
1008
+ scores: [],
1009
+ passed: false
1010
+ });
1011
+ }
1012
+ }
1013
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1014
+ rerunPassed.push(rerunPassedThis);
1015
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1016
+ n + 1,
1017
+ n + 1
1018
+ ]);
1019
+ const progressEvent = {
1020
+ type: "TestCaseProgress",
1021
+ runId: task.runId,
1022
+ testCaseId: testCaseItem.id,
1023
+ testCaseName: testCaseItem.testCase.getName(),
1024
+ completedTestCases: completedEvaluations,
1025
+ totalTestCases: totalEvaluations,
1026
+ rerunIndex: r + 1,
1027
+ rerunTotal: reruns,
1028
+ passed: rerunPassedThis,
1029
+ durationMs: Date.now() - started,
1030
+ evaluatorScores,
1031
+ output,
1032
+ errorMessage: testCaseError
1033
+ };
1034
+ updateSnapshot(task.runId, (snapshot) => ({
1035
+ ...snapshot,
1036
+ completedTestCases: completedEvaluations
1037
+ }));
1038
+ yield* publishEvent(progressEvent);
1039
+ yield* Queue.offer(persistenceQueue, {
1040
+ runId: task.runId,
1041
+ artifactPath: task.snapshot.artifactPath,
1042
+ payload: progressEvent
1043
+ });
1044
+ }
1045
+ const testCasePassed = rerunPassed.every(Boolean);
1046
+ if (testCasePassed) {
1047
+ yield* Ref.update(passedRef, (n) => n + 1);
1048
+ } else {
1049
+ yield* Ref.update(failedRef, (n) => n + 1);
1050
+ }
1051
+ const [passed, failed] = yield* Effect.all([
1052
+ Ref.get(passedRef),
1053
+ Ref.get(failedRef)
1054
+ ]);
1055
+ updateSnapshot(task.runId, (snapshot) => ({
1056
+ ...snapshot,
1057
+ passedTestCases: passed,
1058
+ failedTestCases: failed
1059
+ }));
1060
+ });
1061
+ }
908
1062
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
909
1063
  const startedAt = Date.now();
910
1064
  updateSnapshot(task.runId, (snapshot) => ({
@@ -917,104 +1071,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
917
1071
  runId: task.runId,
918
1072
  startedAt
919
1073
  });
920
- let completedTestCases = 0;
921
- let passedTestCases = 0;
922
- let failedTestCases = 0;
923
- for (const testCaseItem of task.testCases) {
924
- const started = Date.now();
925
- const evaluatorScores = [];
926
- let testCaseError;
927
- const output = readOutput(testCaseItem.testCase);
928
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
929
- const evaluateFn = evaluator.getEvaluateFn();
930
- if (!evaluateFn) {
931
- continue;
932
- }
933
- try {
934
- const logs = [];
935
- const logDiff = (expected, actual, options) => {
936
- logs.push(createDiffLogEntry(expected, actual, options));
937
- };
938
- const ctx = yield* Effect.promise(
939
- () => Promise.resolve(evaluator.resolveContext())
940
- );
941
- const result = yield* Effect.promise(
942
- () => Promise.resolve(
943
- evaluateFn({
944
- input: testCaseItem.testCase.getInput(),
945
- ctx,
946
- output,
947
- logDiff
948
- })
949
- )
950
- );
951
- const { scores, metrics } = normalizeResult(result);
952
- const passed = computeEvaluatorPassed(evaluator, result, scores);
953
- evaluatorScores.push({
954
- evaluatorId,
955
- scores,
956
- passed,
957
- metrics,
958
- logs: logs.length > 0 ? logs : void 0
959
- });
960
- } catch (error) {
961
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
962
- evaluatorScores.push({
963
- evaluatorId,
964
- scores: [],
965
- passed: false
966
- });
967
- }
968
- }
969
- const testCasePassed = evaluatorScores.every((s) => s.passed);
970
- completedTestCases += 1;
971
- if (testCasePassed) {
972
- passedTestCases += 1;
973
- } else {
974
- failedTestCases += 1;
975
- }
976
- const progressEvent = {
977
- type: "TestCaseProgress",
978
- runId: task.runId,
979
- testCaseId: testCaseItem.id,
980
- testCaseName: testCaseItem.testCase.getName(),
981
- completedTestCases,
982
- totalTestCases: task.testCases.length,
983
- passed: testCasePassed,
984
- durationMs: Date.now() - started,
985
- evaluatorScores,
986
- output,
987
- errorMessage: testCaseError
988
- };
989
- updateSnapshot(task.runId, (snapshot) => ({
990
- ...snapshot,
991
- completedTestCases,
992
- passedTestCases,
993
- failedTestCases
994
- }));
995
- yield* publishEvent(progressEvent);
996
- yield* Queue.offer(persistenceQueue, {
997
- runId: task.runId,
998
- artifactPath: task.snapshot.artifactPath,
999
- payload: progressEvent
1000
- });
1001
- }
1074
+ const totalEvaluations = task.testCases.reduce(
1075
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1076
+ 0
1077
+ );
1078
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1079
+ const completedRef = yield* Ref.make(0);
1080
+ const passedRef = yield* Ref.make(0);
1081
+ const failedRef = yield* Ref.make(0);
1082
+ const processTestCase = (testCaseItem) => processOneTestCase(
1083
+ task,
1084
+ testCaseItem,
1085
+ totalEvaluations,
1086
+ publishEvent,
1087
+ persistenceQueue,
1088
+ updateSnapshot,
1089
+ completedRef,
1090
+ passedRef,
1091
+ failedRef
1092
+ );
1093
+ yield* Effect.forEach(
1094
+ task.testCases,
1095
+ processTestCase,
1096
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1097
+ );
1098
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
1099
+ Ref.get(completedRef),
1100
+ Ref.get(passedRef),
1101
+ Ref.get(failedRef)
1102
+ ]);
1002
1103
  const finishedAt = Date.now();
1003
1104
  const completedEvent = {
1004
1105
  type: "RunCompleted",
1005
1106
  runId: task.runId,
1006
1107
  finishedAt,
1007
- passedTestCases,
1008
- failedTestCases,
1108
+ passedTestCases: passedUniqueTestCases,
1109
+ failedTestCases: failedUniqueTestCases,
1009
1110
  totalTestCases: task.testCases.length,
1010
1111
  artifactPath: task.snapshot.artifactPath
1011
1112
  };
1012
1113
  updateSnapshot(task.runId, (snapshot) => ({
1013
1114
  ...snapshot,
1014
1115
  status: "completed",
1015
- completedTestCases,
1016
- passedTestCases,
1017
- failedTestCases,
1116
+ completedTestCases: completedEvaluations,
1117
+ passedTestCases: passedUniqueTestCases,
1118
+ failedTestCases: failedUniqueTestCases,
1018
1119
  finishedAt
1019
1120
  }));
1020
1121
  yield* publishEvent(completedEvent);
@@ -1102,7 +1203,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
1102
1203
  const artifactPath = filePath;
1103
1204
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1104
1205
  const progress = aggregateTestCaseProgress(lines);
1105
- const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
1206
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1106
1207
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1107
1208
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1108
1209
  return {
@@ -1124,23 +1225,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
1124
1225
  }
1125
1226
  function aggregateTestCaseProgress(lines) {
1126
1227
  let completedTestCases = 0;
1127
- let passedTestCases = 0;
1128
- let failedTestCases = 0;
1228
+ const testCasePassedBy = /* @__PURE__ */ new Map();
1129
1229
  for (const line of lines) {
1130
1230
  try {
1131
1231
  const event = JSON.parse(line);
1132
1232
  if (event.type === "TestCaseProgress") {
1133
1233
  const ev = event;
1134
1234
  completedTestCases = ev.completedTestCases ?? completedTestCases;
1135
- if (ev.passed) {
1136
- passedTestCases += 1;
1137
- } else {
1138
- failedTestCases += 1;
1139
- }
1235
+ const id = ev.testCaseId;
1236
+ const current = testCasePassedBy.get(id);
1237
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1140
1238
  }
1141
1239
  } catch {
1142
1240
  }
1143
1241
  }
1242
+ let passedTestCases = 0;
1243
+ let failedTestCases = 0;
1244
+ for (const passed of testCasePassedBy.values()) {
1245
+ if (passed) {
1246
+ passedTestCases += 1;
1247
+ } else {
1248
+ failedTestCases += 1;
1249
+ }
1250
+ }
1144
1251
  return { completedTestCases, passedTestCases, failedTestCases };
1145
1252
  }
1146
1253
  async function appendJsonLine(artifactPath, payload) {
@@ -1335,6 +1442,10 @@ var EffectRunner = class {
1335
1442
  throw new Error("No evaluators selected for run");
1336
1443
  }
1337
1444
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1445
+ const totalEvaluations = selectedTestCases.reduce(
1446
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1447
+ 0
1448
+ );
1338
1449
  const runId = `run-${randomUUID()}`;
1339
1450
  const artifactPath = createArtifactPath(
1340
1451
  this.config.artifactDirectory,
@@ -1347,7 +1458,7 @@ var EffectRunner = class {
1347
1458
  datasetName: dataset.dataset.getName(),
1348
1459
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1349
1460
  queuedAt: Date.now(),
1350
- totalTestCases: selectedTestCases.length,
1461
+ totalTestCases: totalEvaluations,
1351
1462
  completedTestCases: 0,
1352
1463
  passedTestCases: 0,
1353
1464
  failedTestCases: 0,
@@ -1361,7 +1472,7 @@ var EffectRunner = class {
1361
1472
  datasetId: request.datasetId,
1362
1473
  datasetName: dataset.dataset.getName(),
1363
1474
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1364
- totalTestCases: selectedTestCases.length,
1475
+ totalTestCases: totalEvaluations,
1365
1476
  artifactPath
1366
1477
  };
1367
1478
  await Effect.runPromise(this.publishEvent(queuedEvent));
@@ -1372,6 +1483,7 @@ var EffectRunner = class {
1372
1483
  payload: queuedEvent
1373
1484
  })
1374
1485
  );
1486
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1375
1487
  await Effect.runPromise(
1376
1488
  Queue.offer(this.runQueue, {
1377
1489
  runId,
@@ -1379,7 +1491,8 @@ var EffectRunner = class {
1379
1491
  dataset: dataset.dataset,
1380
1492
  evaluators: selectedEvaluators,
1381
1493
  testCases: selectedTestCases,
1382
- snapshot
1494
+ snapshot,
1495
+ maxConcurrency
1383
1496
  })
1384
1497
  );
1385
1498
  return snapshot;