@m4trix/evals 0.25.1 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,10 @@
3
3
 
4
4
  var crypto = require('crypto');
5
5
  var effect = require('effect');
6
- var fs = require('fs');
6
+ var promises = require('fs/promises');
7
7
  var path = require('path');
8
+ var fs = require('fs');
8
9
  var jitiModule = require('jiti');
9
- var promises = require('fs/promises');
10
10
  var url = require('url');
11
11
  var diff = require('diff');
12
12
  var stringify = require('fast-json-stable-stringify');
@@ -39,12 +39,179 @@ var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
39
39
  var stringify__default = /*#__PURE__*/_interopDefault(stringify);
40
40
  var React__namespace = /*#__PURE__*/_interopNamespace(React);
41
41
 
42
+ var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
43
+ function makeEntityIdSchema(brand, label) {
44
+ return effect.Schema.String.pipe(
45
+ effect.Schema.trimmed(),
46
+ effect.Schema.minLength(1, {
47
+ message: () => `${label} must be non-empty.`
48
+ }),
49
+ effect.Schema.pattern(ENTITY_ID_PATTERN, {
50
+ message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
51
+ }),
52
+ effect.Schema.brand(brand)
53
+ );
54
+ }
55
+ var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
56
+ makeEntityIdSchema("EvaluatorName", "Evaluator name");
57
+ makeEntityIdSchema("TestCaseName", "Test case name");
58
+ makeEntityIdSchema("DatasetName", "Dataset name");
59
+ function validateWithSchema(schema, raw, context) {
60
+ const trimmed = raw.trim();
61
+ const decode = effect.Schema.decodeUnknownEither(
62
+ schema
63
+ );
64
+ const result = decode(trimmed);
65
+ if (effect.Either.isLeft(result)) {
66
+ throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
67
+ }
68
+ return result.right;
69
+ }
70
+ function validateRunConfigName(raw, context) {
71
+ return validateWithSchema(RunConfigNameSchema, raw, context);
72
+ }
73
+
74
+ // src/evals/evaluator.ts
75
+ function getEvaluatorDisplayLabel(evaluator) {
76
+ if (typeof evaluator.getDisplayLabel === "function") {
77
+ const label = evaluator.getDisplayLabel();
78
+ if (label !== void 0) {
79
+ return label;
80
+ }
81
+ }
82
+ return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
83
+ }
84
+ function getEvaluatorTagList(evaluator) {
85
+ return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
86
+ }
87
+ async function loadRunSnapshotsFromArtifacts(config) {
88
+ const baseDir = path.resolve(config.artifactDirectory);
89
+ let entries;
90
+ try {
91
+ entries = await promises.readdir(baseDir);
92
+ } catch {
93
+ return [];
94
+ }
95
+ const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
96
+ const snapshots = [];
97
+ for (const fileName of jsonlFiles) {
98
+ const filePath = path.join(baseDir, fileName);
99
+ try {
100
+ const snapshot = await parseArtifactToSnapshot(filePath, config);
101
+ if (snapshot) {
102
+ snapshots.push(snapshot);
103
+ }
104
+ } catch {
105
+ }
106
+ }
107
+ return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
108
+ }
109
+ async function parseArtifactToSnapshot(filePath, _config) {
110
+ const content = await promises.readFile(filePath, "utf8");
111
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
112
+ if (lines.length === 0) {
113
+ return null;
114
+ }
115
+ let runQueued = null;
116
+ let runCompleted = null;
117
+ let runFailed = null;
118
+ let runStarted = null;
119
+ for (const line of lines) {
120
+ try {
121
+ const event = JSON.parse(line);
122
+ const type = event.type;
123
+ if (type === "RunQueued") {
124
+ runQueued = {
125
+ runId: event.runId,
126
+ datasetId: event.datasetId,
127
+ datasetName: event.datasetName,
128
+ evaluatorIds: event.evaluatorIds,
129
+ totalTestCases: event.totalTestCases ?? 0,
130
+ artifactPath: event.artifactPath ?? filePath,
131
+ ts: event.ts
132
+ };
133
+ }
134
+ if (type === "RunStarted") {
135
+ runStarted = { startedAt: event.startedAt };
136
+ }
137
+ if (type === "RunCompleted") {
138
+ runCompleted = {
139
+ passedTestCases: event.passedTestCases,
140
+ failedTestCases: event.failedTestCases,
141
+ totalTestCases: event.totalTestCases,
142
+ finishedAt: event.finishedAt
143
+ };
144
+ }
145
+ if (type === "RunFailed") {
146
+ runFailed = {
147
+ finishedAt: event.finishedAt,
148
+ errorMessage: event.errorMessage
149
+ };
150
+ }
151
+ } catch {
152
+ }
153
+ }
154
+ if (!runQueued) {
155
+ return null;
156
+ }
157
+ const artifactPath = filePath;
158
+ const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
159
+ const progress = aggregateTestCaseProgress(lines);
160
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
161
+ const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
162
+ const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
163
+ return {
164
+ runId: runQueued.runId,
165
+ datasetId: runQueued.datasetId,
166
+ datasetName: runQueued.datasetName,
167
+ evaluatorIds: runQueued.evaluatorIds,
168
+ queuedAt: runQueued.ts ?? 0,
169
+ startedAt: runStarted?.startedAt,
170
+ finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
171
+ totalTestCases: runQueued.totalTestCases,
172
+ completedTestCases,
173
+ passedTestCases,
174
+ failedTestCases,
175
+ status,
176
+ artifactPath,
177
+ errorMessage: runFailed?.errorMessage
178
+ };
179
+ }
180
+ function aggregateTestCaseProgress(lines) {
181
+ let completedTestCases = 0;
182
+ const testCasePassedBy = /* @__PURE__ */ new Map();
183
+ for (const line of lines) {
184
+ try {
185
+ const event = JSON.parse(line);
186
+ if (event.type === "TestCaseProgress") {
187
+ const ev = event;
188
+ completedTestCases = ev.completedTestCases ?? completedTestCases;
189
+ const id = ev.testCaseId;
190
+ const current = testCasePassedBy.get(id);
191
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
192
+ }
193
+ } catch {
194
+ }
195
+ }
196
+ let passedTestCases = 0;
197
+ let failedTestCases = 0;
198
+ for (const passed of testCasePassedBy.values()) {
199
+ if (passed) {
200
+ passedTestCases += 1;
201
+ } else {
202
+ failedTestCases += 1;
203
+ }
204
+ }
205
+ return { completedTestCases, passedTestCases, failedTestCases };
206
+ }
207
+
42
208
  // src/runner/config.ts
43
209
  var defaultRunnerConfig = {
44
210
  discovery: {
45
211
  rootDir: process.cwd(),
46
212
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
47
213
  evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
214
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
48
215
  testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
49
216
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
50
217
  },
@@ -70,6 +237,11 @@ function toRunnerConfigOverrides(config) {
70
237
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
71
238
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
72
239
  }
240
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
241
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
242
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
243
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
244
+ }
73
245
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
74
246
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
75
247
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -168,6 +340,9 @@ function isDatasetLike(value) {
168
340
  function isEvaluatorLike(value) {
169
341
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
170
342
  }
343
+ function isRunConfigLike(value) {
344
+ return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
345
+ }
171
346
  function isTestCaseLike(value) {
172
347
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
173
348
  }
@@ -256,6 +431,23 @@ async function collectEvaluatorsFromFiles(config) {
256
431
  );
257
432
  return found.flat();
258
433
  }
434
+ async function collectRunConfigsFromFiles(config) {
435
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
436
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
437
+ const found = await Promise.all(
438
+ matched.map(async (absolutePath) => {
439
+ const exports = await loadModuleExports(absolutePath);
440
+ const runConfigs = exports.filter(isRunConfigLike);
441
+ const relPath = path.relative(config.rootDir, absolutePath);
442
+ return runConfigs.map((runConfig) => ({
443
+ id: runConfig.getName(),
444
+ filePath: relPath,
445
+ runConfig
446
+ }));
447
+ })
448
+ );
449
+ return found.flat();
450
+ }
259
451
  async function collectTestCasesFromFiles(config) {
260
452
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
261
453
  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
@@ -409,6 +601,25 @@ function getDiffLines(entry) {
409
601
  });
410
602
  }
411
603
 
604
+ // src/evals/test-case.ts
605
+ function getTestCaseDisplayLabel(testCase) {
606
+ if (typeof testCase.getDisplayLabel === "function") {
607
+ return testCase.getDisplayLabel();
608
+ }
609
+ return typeof testCase.getName === "function" ? testCase.getName() : "";
610
+ }
611
+ function getTestCaseTagList(testCase) {
612
+ return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
613
+ }
614
+
615
+ // src/evals/dataset.ts
616
+ function getDatasetDisplayLabel(dataset) {
617
+ if (typeof dataset.getDisplayLabel === "function") {
618
+ return dataset.getDisplayLabel();
619
+ }
620
+ return typeof dataset.getName === "function" ? dataset.getName() : "";
621
+ }
622
+
412
623
  // src/evals/metric.ts
413
624
  var registry = /* @__PURE__ */ new Map();
414
625
  var Metric = {
@@ -432,6 +643,54 @@ function getMetricById(id) {
432
643
  return registry.get(id);
433
644
  }
434
645
 
646
+ // src/evals/aggregators.ts
647
+ function aggregateTokenCountSum(values) {
648
+ const initial = {
649
+ input: 0,
650
+ output: 0,
651
+ inputCached: 0,
652
+ outputCached: 0
653
+ };
654
+ return values.reduce(
655
+ (acc, v) => ({
656
+ input: acc.input + (v.input ?? 0),
657
+ output: acc.output + (v.output ?? 0),
658
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
659
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
660
+ }),
661
+ initial
662
+ );
663
+ }
664
+ function aggregateLatencyAverage(values) {
665
+ if (values.length === 0) {
666
+ return { ms: 0 };
667
+ }
668
+ const sum = values.reduce((s, v) => s + v.ms, 0);
669
+ return { ms: sum / values.length };
670
+ }
671
+
672
+ // src/evals/metrics/standard.ts
673
+ Metric.of({
674
+ id: "token-count",
675
+ name: "Tokens",
676
+ aggregate: aggregateTokenCountSum,
677
+ format: (data, options) => {
678
+ const input = data.input ?? 0;
679
+ const output = data.output ?? 0;
680
+ const inputCached = data.inputCached ?? 0;
681
+ const outputCached = data.outputCached ?? 0;
682
+ const cached = inputCached + outputCached;
683
+ const base = `in:${input} out:${output} cached:${cached}`;
684
+ return options?.isAggregated ? `Total: ${base}` : base;
685
+ }
686
+ });
687
+ Metric.of({
688
+ id: "latency",
689
+ name: "Latency",
690
+ aggregate: aggregateLatencyAverage,
691
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
692
+ });
693
+
435
694
  // src/evals/score.ts
436
695
  var registry2 = /* @__PURE__ */ new Map();
437
696
  function formatScoreData(def, data, options) {
@@ -540,54 +799,6 @@ function getScoreById(id) {
540
799
  return registry2.get(id);
541
800
  }
542
801
 
543
- // src/evals/aggregators.ts
544
- function aggregateTokenCountSum(values) {
545
- const initial = {
546
- input: 0,
547
- output: 0,
548
- inputCached: 0,
549
- outputCached: 0
550
- };
551
- return values.reduce(
552
- (acc, v) => ({
553
- input: acc.input + (v.input ?? 0),
554
- output: acc.output + (v.output ?? 0),
555
- inputCached: acc.inputCached + (v.inputCached ?? 0),
556
- outputCached: acc.outputCached + (v.outputCached ?? 0)
557
- }),
558
- initial
559
- );
560
- }
561
- function aggregateLatencyAverage(values) {
562
- if (values.length === 0) {
563
- return { ms: 0 };
564
- }
565
- const sum = values.reduce((s, v) => s + v.ms, 0);
566
- return { ms: sum / values.length };
567
- }
568
-
569
- // src/evals/metrics/standard.ts
570
- Metric.of({
571
- id: "token-count",
572
- name: "Tokens",
573
- aggregate: aggregateTokenCountSum,
574
- format: (data, options) => {
575
- const input = data.input ?? 0;
576
- const output = data.output ?? 0;
577
- const inputCached = data.inputCached ?? 0;
578
- const outputCached = data.outputCached ?? 0;
579
- const cached = inputCached + outputCached;
580
- const base = `in:${input} out:${output} cached:${cached}`;
581
- return options?.isAggregated ? `Total: ${base}` : base;
582
- }
583
- });
584
- Metric.of({
585
- id: "latency",
586
- name: "Latency",
587
- aggregate: aggregateLatencyAverage,
588
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
589
- });
590
-
591
802
  // src/evals/scores/standard.ts
592
803
  Score.of({
593
804
  id: "percent",
@@ -731,15 +942,17 @@ function readOutput(testCase) {
731
942
  }
732
943
  return candidate.getOutput();
733
944
  }
734
- function buildEvaluationUnits(testCases) {
945
+ function buildEvaluationUnits(testCases, repetitionCount) {
946
+ const count = Math.max(1, repetitionCount);
735
947
  const units = [];
736
948
  for (const testCaseItem of testCases) {
737
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
738
- for (let r = 0; r < rerunTotal; r++) {
949
+ const repetitionId = `rep-${crypto.randomUUID()}`;
950
+ for (let r = 0; r < count; r++) {
739
951
  units.push({
740
952
  testCaseItem,
741
- rerunIndex: r + 1,
742
- rerunTotal
953
+ repetitionId,
954
+ repetitionIndex: r + 1,
955
+ repetitionCount: count
743
956
  });
744
957
  }
745
958
  }
@@ -752,7 +965,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
752
965
  return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
753
966
  }
754
967
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
755
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
968
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
756
969
  return effect.Effect.gen(function* () {
757
970
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
758
971
  const started = Date.now();
@@ -761,11 +974,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
761
974
  type: "TestCaseStarted",
762
975
  runId: task.runId,
763
976
  testCaseId: testCaseItem.id,
764
- testCaseName: testCaseItem.testCase.getName(),
977
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
765
978
  startedTestCases: startedEvaluations,
766
979
  totalTestCases: totalEvaluations,
767
- rerunIndex,
768
- rerunTotal
980
+ repetitionId,
981
+ repetitionIndex,
982
+ repetitionCount
769
983
  });
770
984
  const evaluatorScores = [];
771
985
  let testCaseError;
@@ -799,8 +1013,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
799
1013
  meta: {
800
1014
  triggerId: task.triggerId,
801
1015
  runId: evaluatorRunId,
802
- datasetId: task.datasetId
1016
+ datasetName: task.dataset.getDisplayLabel(),
1017
+ repetitionId,
1018
+ repetitionIndex,
1019
+ repetitionCount,
1020
+ runConfigName: task.runConfigName
803
1021
  },
1022
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1023
+ runConfigTags: task.runConfigTags,
1024
+ evaluatorTags: getEvaluatorTagList(evaluator),
804
1025
  logDiff,
805
1026
  log,
806
1027
  createError
@@ -843,18 +1064,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
843
1064
  });
844
1065
  }
845
1066
  }
846
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1067
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
847
1068
  const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
848
1069
  const progressEvent = {
849
1070
  type: "TestCaseProgress",
850
1071
  runId: task.runId,
851
1072
  testCaseId: testCaseItem.id,
852
- testCaseName: testCaseItem.testCase.getName(),
1073
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
853
1074
  completedTestCases: completedEvaluations,
854
1075
  totalTestCases: totalEvaluations,
855
- rerunIndex,
856
- rerunTotal,
857
- passed: rerunPassedThis,
1076
+ repetitionId,
1077
+ repetitionIndex,
1078
+ repetitionCount,
1079
+ passed: repetitionPassedThis,
858
1080
  durationMs: Date.now() - started,
859
1081
  evaluatorScores,
860
1082
  output,
@@ -875,9 +1097,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
875
1097
  (map) => {
876
1098
  const key = testCaseItem.id;
877
1099
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
878
- const newResults = [...existing.results, rerunPassedThis];
1100
+ const newResults = [...existing.results, repetitionPassedThis];
879
1101
  const newCompletedCount = existing.completedCount + 1;
880
- const isLast = newCompletedCount === rerunTotal;
1102
+ const isLast = newCompletedCount === repetitionCount;
881
1103
  const newMap = new Map(map);
882
1104
  newMap.set(key, {
883
1105
  completedCount: newCompletedCount,
@@ -914,10 +1136,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
914
1136
  runId: task.runId,
915
1137
  startedAt
916
1138
  });
917
- const totalEvaluations = task.testCases.reduce(
918
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
919
- 0
920
- );
1139
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
921
1140
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
922
1141
  const completedRef = yield* effect.Ref.make(0);
923
1142
  const startedRef = yield* effect.Ref.make(0);
@@ -926,7 +1145,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
926
1145
  const testCaseResultsRef = yield* effect.Ref.make(
927
1146
  /* @__PURE__ */ new Map()
928
1147
  );
929
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1148
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
930
1149
  const processEvaluation = (unit) => processOneEvaluation(
931
1150
  task,
932
1151
  unit,
@@ -940,11 +1159,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
940
1159
  failedRef,
941
1160
  testCaseResultsRef
942
1161
  );
943
- yield* effect.Effect.forEach(
944
- evaluationUnits,
945
- processEvaluation,
946
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
947
- );
1162
+ const globalSem = task.globalEvaluationSemaphore;
1163
+ if (globalSem !== void 0) {
1164
+ yield* effect.Effect.forEach(
1165
+ evaluationUnits,
1166
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1167
+ { concurrency: "unbounded", discard: true }
1168
+ );
1169
+ } else {
1170
+ yield* effect.Effect.forEach(
1171
+ evaluationUnits,
1172
+ processEvaluation,
1173
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1174
+ );
1175
+ }
948
1176
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
949
1177
  effect.Ref.get(completedRef),
950
1178
  effect.Ref.get(passedRef),
@@ -961,144 +1189,53 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
961
1189
  artifactPath: task.snapshot.artifactPath
962
1190
  };
963
1191
  yield* updateSnapshot(task.runId, (snapshot) => ({
964
- ...snapshot,
965
- status: "completed",
966
- completedTestCases: completedEvaluations,
967
- passedTestCases: passedUniqueTestCases,
968
- failedTestCases: failedUniqueTestCases,
969
- finishedAt
970
- }));
971
- yield* publishEvent(completedEvent);
972
- yield* effect.Queue.offer(persistenceQueue, {
973
- runId: task.runId,
974
- artifactPath: task.snapshot.artifactPath,
975
- payload: completedEvent
976
- });
977
- yield* publishEvent({
978
- type: "ArtifactFlushed",
979
- runId: task.runId,
980
- artifactPath: task.snapshot.artifactPath
981
- });
982
- });
983
- async function loadRunSnapshotsFromArtifacts(config) {
984
- const baseDir = path.resolve(config.artifactDirectory);
985
- let entries;
986
- try {
987
- entries = await promises.readdir(baseDir);
988
- } catch {
989
- return [];
990
- }
991
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
992
- const snapshots = [];
993
- for (const fileName of jsonlFiles) {
994
- const filePath = path.join(baseDir, fileName);
995
- try {
996
- const snapshot = await parseArtifactToSnapshot(filePath, config);
997
- if (snapshot) {
998
- snapshots.push(snapshot);
999
- }
1000
- } catch {
1001
- }
1002
- }
1003
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1004
- }
1005
- async function parseArtifactToSnapshot(filePath, _config) {
1006
- const content = await promises.readFile(filePath, "utf8");
1007
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1008
- if (lines.length === 0) {
1009
- return null;
1010
- }
1011
- let runQueued = null;
1012
- let runCompleted = null;
1013
- let runFailed = null;
1014
- let runStarted = null;
1015
- for (const line of lines) {
1016
- try {
1017
- const event = JSON.parse(line);
1018
- const type = event.type;
1019
- if (type === "RunQueued") {
1020
- runQueued = {
1021
- runId: event.runId,
1022
- datasetId: event.datasetId,
1023
- datasetName: event.datasetName,
1024
- evaluatorIds: event.evaluatorIds,
1025
- totalTestCases: event.totalTestCases ?? 0,
1026
- artifactPath: event.artifactPath ?? filePath,
1027
- ts: event.ts
1028
- };
1029
- }
1030
- if (type === "RunStarted") {
1031
- runStarted = { startedAt: event.startedAt };
1032
- }
1033
- if (type === "RunCompleted") {
1034
- runCompleted = {
1035
- passedTestCases: event.passedTestCases,
1036
- failedTestCases: event.failedTestCases,
1037
- totalTestCases: event.totalTestCases,
1038
- finishedAt: event.finishedAt
1039
- };
1040
- }
1041
- if (type === "RunFailed") {
1042
- runFailed = {
1043
- finishedAt: event.finishedAt,
1044
- errorMessage: event.errorMessage
1045
- };
1046
- }
1047
- } catch {
1048
- }
1192
+ ...snapshot,
1193
+ status: "completed",
1194
+ completedTestCases: completedEvaluations,
1195
+ passedTestCases: passedUniqueTestCases,
1196
+ failedTestCases: failedUniqueTestCases,
1197
+ finishedAt
1198
+ }));
1199
+ yield* publishEvent(completedEvent);
1200
+ yield* effect.Queue.offer(persistenceQueue, {
1201
+ runId: task.runId,
1202
+ artifactPath: task.snapshot.artifactPath,
1203
+ payload: completedEvent
1204
+ });
1205
+ yield* publishEvent({
1206
+ type: "ArtifactFlushed",
1207
+ runId: task.runId,
1208
+ artifactPath: task.snapshot.artifactPath
1209
+ });
1210
+ });
1211
+
1212
+ // src/runner/name-pattern.ts
1213
+ function parseRegexLiteral(pattern) {
1214
+ if (!pattern.startsWith("/")) {
1215
+ return void 0;
1049
1216
  }
1050
- if (!runQueued) {
1051
- return null;
1217
+ const lastSlash = pattern.lastIndexOf("/");
1218
+ if (lastSlash <= 0) {
1219
+ return void 0;
1052
1220
  }
1053
- const artifactPath = filePath;
1054
- const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1055
- const progress = aggregateTestCaseProgress(lines);
1056
- const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1057
- const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1058
- const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1059
1221
  return {
1060
- runId: runQueued.runId,
1061
- datasetId: runQueued.datasetId,
1062
- datasetName: runQueued.datasetName,
1063
- evaluatorIds: runQueued.evaluatorIds,
1064
- queuedAt: runQueued.ts ?? 0,
1065
- startedAt: runStarted?.startedAt,
1066
- finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1067
- totalTestCases: runQueued.totalTestCases,
1068
- completedTestCases,
1069
- passedTestCases,
1070
- failedTestCases,
1071
- status,
1072
- artifactPath,
1073
- errorMessage: runFailed?.errorMessage
1222
+ source: pattern.slice(1, lastSlash),
1223
+ flags: pattern.slice(lastSlash + 1)
1074
1224
  };
1075
1225
  }
1076
- function aggregateTestCaseProgress(lines) {
1077
- let completedTestCases = 0;
1078
- const testCasePassedBy = /* @__PURE__ */ new Map();
1079
- for (const line of lines) {
1080
- try {
1081
- const event = JSON.parse(line);
1082
- if (event.type === "TestCaseProgress") {
1083
- const ev = event;
1084
- completedTestCases = ev.completedTestCases ?? completedTestCases;
1085
- const id = ev.testCaseId;
1086
- const current = testCasePassedBy.get(id);
1087
- testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1088
- }
1089
- } catch {
1090
- }
1226
+ function createNameMatcher(pattern) {
1227
+ const normalizedPattern = pattern.trim();
1228
+ const regexLiteral = parseRegexLiteral(normalizedPattern);
1229
+ if (regexLiteral) {
1230
+ const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1231
+ return (value) => regex.test(value);
1091
1232
  }
1092
- let passedTestCases = 0;
1093
- let failedTestCases = 0;
1094
- for (const passed of testCasePassedBy.values()) {
1095
- if (passed) {
1096
- passedTestCases += 1;
1097
- } else {
1098
- failedTestCases += 1;
1099
- }
1233
+ if (normalizedPattern.includes("*")) {
1234
+ const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1235
+ const regex = new RegExp(`^${escaped}$`, "i");
1236
+ return (value) => regex.test(value);
1100
1237
  }
1101
- return { completedTestCases, passedTestCases, failedTestCases };
1238
+ return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1102
1239
  }
1103
1240
  async function appendJsonLine(artifactPath, payload) {
1104
1241
  await promises.mkdir(path.dirname(artifactPath), { recursive: true });
@@ -1157,32 +1294,12 @@ function searchCollectedTestCases(all, query) {
1157
1294
  }
1158
1295
 
1159
1296
  // src/runner/api.ts
1160
- function parseRegexLiteral(pattern) {
1161
- if (!pattern.startsWith("/")) {
1162
- return void 0;
1163
- }
1164
- const lastSlash = pattern.lastIndexOf("/");
1165
- if (lastSlash <= 0) {
1166
- return void 0;
1167
- }
1168
- return {
1169
- source: pattern.slice(1, lastSlash),
1170
- flags: pattern.slice(lastSlash + 1)
1171
- };
1172
- }
1173
- function createNameMatcher(pattern) {
1174
- const normalizedPattern = pattern.trim();
1175
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1176
- if (regexLiteral) {
1177
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1178
- return (value) => regex.test(value);
1297
+ function normalizeRunRepetitions(value) {
1298
+ const n = value ?? 1;
1299
+ if (!Number.isInteger(n) || n < 1) {
1300
+ throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
1179
1301
  }
1180
- if (normalizedPattern.includes("*")) {
1181
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1182
- const regex = new RegExp(`^${escaped}$`, "i");
1183
- return (value) => regex.test(value);
1184
- }
1185
- return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1302
+ return n;
1186
1303
  }
1187
1304
  function mergeRunnerOverrides(base, next) {
1188
1305
  if (!base) {
@@ -1217,6 +1334,7 @@ var EffectRunner = class {
1217
1334
  this.listeners = /* @__PURE__ */ new Set();
1218
1335
  this.datasetsById = /* @__PURE__ */ new Map();
1219
1336
  this.evaluatorsById = /* @__PURE__ */ new Map();
1337
+ this.runConfigsById = /* @__PURE__ */ new Map();
1220
1338
  this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
1221
1339
  this.persistenceFiber = effect.Effect.runFork(
1222
1340
  createPersistenceWorker(this.persistenceQueue)
@@ -1257,6 +1375,137 @@ var EffectRunner = class {
1257
1375
  (item) => matcher(item.evaluator.getName() ?? "")
1258
1376
  );
1259
1377
  }
1378
+ async collectRunConfigs() {
1379
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
1380
+ this.runConfigsById.clear();
1381
+ const byNameLower = /* @__PURE__ */ new Map();
1382
+ for (const item of runConfigs) {
1383
+ const id = item.runConfig.getName();
1384
+ const lower = id.toLowerCase();
1385
+ const prev = byNameLower.get(lower);
1386
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
1387
+ throw new Error(
1388
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
1389
+ );
1390
+ }
1391
+ byNameLower.set(lower, item);
1392
+ this.runConfigsById.set(id, item);
1393
+ }
1394
+ return runConfigs;
1395
+ }
1396
+ async resolveRunConfigByName(name) {
1397
+ if (this.runConfigsById.size === 0) {
1398
+ await this.collectRunConfigs();
1399
+ }
1400
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
1401
+ const keyLower = key.toLowerCase();
1402
+ const matches = Array.from(this.runConfigsById.values()).filter(
1403
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
1404
+ );
1405
+ if (matches.length === 0) {
1406
+ return void 0;
1407
+ }
1408
+ if (matches.length > 1) {
1409
+ throw new Error(
1410
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
1411
+ );
1412
+ }
1413
+ return matches[0];
1414
+ }
1415
+ async expandRunConfigToJobs(collected) {
1416
+ if (this.datasetsById.size === 0) {
1417
+ await this.collectDatasets();
1418
+ }
1419
+ if (this.evaluatorsById.size === 0) {
1420
+ await this.collectEvaluators();
1421
+ }
1422
+ const rcName = collected.runConfig.getName();
1423
+ const jobs = [];
1424
+ const runs = collected.runConfig.getRuns();
1425
+ for (const [i, row] of runs.entries()) {
1426
+ const dsCollected = Array.from(this.datasetsById.values()).find(
1427
+ (d) => d.dataset === row.dataset
1428
+ );
1429
+ if (!dsCollected) {
1430
+ throw new Error(
1431
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1432
+ );
1433
+ }
1434
+ let evaluatorIds;
1435
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
1436
+ const matcher = createNameMatcher(row.evaluatorPattern);
1437
+ const matched = Array.from(this.evaluatorsById.values()).filter(
1438
+ (item) => matcher(item.evaluator.getName() ?? "")
1439
+ );
1440
+ if (matched.length === 0) {
1441
+ throw new Error(
1442
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
1443
+ );
1444
+ }
1445
+ evaluatorIds = matched.map((item) => item.id);
1446
+ } else {
1447
+ const evaluators = row.evaluators;
1448
+ evaluatorIds = [];
1449
+ for (const ev of evaluators) {
1450
+ const found = Array.from(this.evaluatorsById.values()).find(
1451
+ (item) => item.evaluator === ev
1452
+ );
1453
+ if (!found) {
1454
+ throw new Error(
1455
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
1456
+ );
1457
+ }
1458
+ evaluatorIds.push(found.id);
1459
+ }
1460
+ }
1461
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
1462
+ jobs.push({
1463
+ datasetId: dsCollected.id,
1464
+ evaluatorIds,
1465
+ runConfigName: rcName,
1466
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
1467
+ runConfigTags: collected.runConfig.getTags(),
1468
+ repetitions
1469
+ });
1470
+ }
1471
+ return jobs;
1472
+ }
1473
+ async expandRunConfigNamesToJobs(names) {
1474
+ const jobs = [];
1475
+ for (const name of names) {
1476
+ const collected = await this.resolveRunConfigByName(name);
1477
+ if (!collected) {
1478
+ const known = await this.collectRunConfigs();
1479
+ const available = known.map((r) => r.runConfig.getName()).sort();
1480
+ throw new Error(
1481
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
1482
+ );
1483
+ }
1484
+ jobs.push(...await this.expandRunConfigToJobs(collected));
1485
+ }
1486
+ return jobs;
1487
+ }
1488
+ async runDatasetJobsWithSharedConcurrency(request) {
1489
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
1490
+ const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
1491
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
1492
+ const snapshots = [];
1493
+ for (const job of request.jobs) {
1494
+ snapshots.push(
1495
+ await this.startDatasetRun({
1496
+ datasetId: job.datasetId,
1497
+ evaluatorIds: job.evaluatorIds,
1498
+ triggerId,
1499
+ maxConcurrency: this.config.maxConcurrency ?? 1,
1500
+ globalEvaluationSemaphore: sem,
1501
+ runConfigName: job.runConfigName,
1502
+ runConfigTags: job.runConfigTags,
1503
+ repetitions: job.repetitions
1504
+ })
1505
+ );
1506
+ }
1507
+ return snapshots;
1508
+ }
1260
1509
  async searchTestCases(query) {
1261
1510
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1262
1511
  return searchCollectedTestCases(testCases, query);
@@ -1275,36 +1524,46 @@ var EffectRunner = class {
1275
1524
  );
1276
1525
  }
1277
1526
  async runDatasetWith(request) {
1527
+ const runConfigName = validateRunConfigName(
1528
+ request.runConfigName,
1529
+ "runDatasetWith.runConfigName"
1530
+ );
1531
+ return this.startDatasetRun({
1532
+ datasetId: request.datasetId,
1533
+ evaluatorIds: request.evaluatorIds,
1534
+ triggerId: request.triggerId,
1535
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1536
+ repetitions: request.repetitions,
1537
+ runConfigName,
1538
+ runConfigTags: request.runConfigTags
1539
+ });
1540
+ }
1541
+ async startDatasetRun(params) {
1278
1542
  if (this.datasetsById.size === 0) {
1279
1543
  await this.collectDatasets();
1280
1544
  }
1281
1545
  if (this.evaluatorsById.size === 0) {
1282
1546
  await this.collectEvaluators();
1283
1547
  }
1284
- const dataset = this.datasetsById.get(request.datasetId);
1548
+ const dataset = this.datasetsById.get(params.datasetId);
1285
1549
  if (!dataset) {
1286
- throw new Error(`Unknown dataset: ${request.datasetId}`);
1550
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
1287
1551
  }
1288
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1552
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1289
1553
  if (selectedEvaluators.length === 0) {
1290
1554
  throw new Error("No evaluators selected for run");
1291
1555
  }
1292
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1293
- const totalEvaluations = selectedTestCases.reduce(
1294
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1295
- 0
1296
- );
1297
- const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
1556
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
1557
+ const repetitions = normalizeRunRepetitions(params.repetitions);
1558
+ const totalEvaluations = selectedTestCases.length * repetitions;
1559
+ const runConfigTags = [...params.runConfigTags ?? []];
1560
+ const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
1298
1561
  const runId = `run-${crypto.randomUUID()}`;
1299
- const artifactPath = createArtifactPath(
1300
- this.config.artifactDirectory,
1301
- request.datasetId,
1302
- runId
1303
- );
1562
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1304
1563
  const snapshot = {
1305
1564
  runId,
1306
- datasetId: request.datasetId,
1307
- datasetName: dataset.dataset.getName(),
1565
+ datasetId: params.datasetId,
1566
+ datasetName: dataset.dataset.getDisplayLabel(),
1308
1567
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1309
1568
  queuedAt: Date.now(),
1310
1569
  totalTestCases: totalEvaluations,
@@ -1324,8 +1583,8 @@ var EffectRunner = class {
1324
1583
  const queuedEvent = {
1325
1584
  type: "RunQueued",
1326
1585
  runId,
1327
- datasetId: request.datasetId,
1328
- datasetName: dataset.dataset.getName(),
1586
+ datasetId: params.datasetId,
1587
+ datasetName: dataset.dataset.getDisplayLabel(),
1329
1588
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1330
1589
  totalTestCases: totalEvaluations,
1331
1590
  artifactPath
@@ -1338,17 +1597,20 @@ var EffectRunner = class {
1338
1597
  payload: queuedEvent
1339
1598
  })
1340
1599
  );
1341
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1342
1600
  await effect.Effect.runPromise(
1343
1601
  effect.Queue.offer(this.runQueue, {
1344
1602
  runId,
1345
1603
  triggerId,
1346
- datasetId: request.datasetId,
1604
+ datasetId: params.datasetId,
1347
1605
  dataset: dataset.dataset,
1348
1606
  evaluators: selectedEvaluators,
1349
1607
  testCases: selectedTestCases,
1350
1608
  snapshot,
1351
- maxConcurrency
1609
+ maxConcurrency: params.maxConcurrency,
1610
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
1611
+ runConfigName: params.runConfigName,
1612
+ runConfigTags,
1613
+ repetitions
1352
1614
  })
1353
1615
  );
1354
1616
  return snapshot;
@@ -1427,6 +1689,8 @@ function getDefaultConcurrency() {
1427
1689
  function parseSimpleCliArgs(argv) {
1428
1690
  const args = {
1429
1691
  help: false,
1692
+ ci: false,
1693
+ runConfigNames: [],
1430
1694
  unknownArgs: []
1431
1695
  };
1432
1696
  let index = 0;
@@ -1440,18 +1704,26 @@ function parseSimpleCliArgs(argv) {
1440
1704
  args.help = true;
1441
1705
  continue;
1442
1706
  }
1707
+ if (token === "--ci") {
1708
+ args.ci = true;
1709
+ continue;
1710
+ }
1443
1711
  if ((token === "--dataset" || token === "--datasetName") && argv[index + 1]) {
1444
1712
  args.datasetName = argv[index + 1];
1445
1713
  index += 1;
1446
1714
  continue;
1447
1715
  }
1448
- if ((token === "--evaluator" || token === "--name") && argv[index + 1]) {
1449
- args.evaluatorPattern = argv[index + 1];
1716
+ if ((token === "--run-config" || token === "--runConfig") && argv[index + 1]) {
1717
+ const next = argv[index + 1];
1718
+ if (typeof next === "string") {
1719
+ args.runConfigNames.push(next);
1720
+ }
1450
1721
  index += 1;
1451
1722
  continue;
1452
1723
  }
1453
1724
  if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
1454
- const n = parseInt(argv[index + 1], 10);
1725
+ const nextConc = argv[index + 1];
1726
+ const n = typeof nextConc === "string" ? parseInt(nextConc, 10) : Number.NaN;
1455
1727
  if (!Number.isNaN(n) && n >= 1) {
1456
1728
  args.concurrency = n;
1457
1729
  }
@@ -1465,16 +1737,12 @@ function parseSimpleCliArgs(argv) {
1465
1737
  function getSimpleCliUsage() {
1466
1738
  return [
1467
1739
  "Usage:",
1468
- " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
1469
- " eval-agents-simple generate --dataset <datasetName>",
1740
+ " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
1741
+ " eval-agents-simple generate --dataset <datasetId>",
1470
1742
  "",
1471
1743
  "Options:",
1472
- " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
1473
- "",
1474
- "Pattern examples for --evaluator:",
1475
- " score-evaluator exact name (case-insensitive)",
1476
- ' "*score*" wildcard pattern',
1477
- ' "/score/i" regex literal'
1744
+ " --ci With run: exit with code 1 if any test case fails.",
1745
+ " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
1478
1746
  ].join("\n");
1479
1747
  }
1480
1748
 
@@ -1525,7 +1793,7 @@ function GenerateView({
1525
1793
  const payload = testCases.map((item) => {
1526
1794
  const tc = item.testCase;
1527
1795
  return {
1528
- name: item.testCase.getName(),
1796
+ name: getTestCaseDisplayLabel(item.testCase),
1529
1797
  input: item.testCase.getInput(),
1530
1798
  output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
1531
1799
  };
@@ -1538,7 +1806,7 @@ function GenerateView({
1538
1806
  if (!cancelled) {
1539
1807
  setResult({
1540
1808
  count: payload.length,
1541
- datasetName: dataset.dataset.getName(),
1809
+ datasetName: getDatasetDisplayLabel(dataset.dataset),
1542
1810
  outputPath
1543
1811
  });
1544
1812
  setTimeout(() => onComplete(), 200);
@@ -1591,7 +1859,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1591
1859
  }
1592
1860
  const testCases = await runner.collectDatasetTestCases(dataset.id);
1593
1861
  const payload = testCases.map((item) => ({
1594
- name: item.testCase.getName(),
1862
+ name: getTestCaseDisplayLabel(item.testCase),
1595
1863
  input: item.testCase.getInput(),
1596
1864
  output: readOutput2(item.testCase)
1597
1865
  }));
@@ -1599,7 +1867,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1599
1867
  const outputPath = createOutputPath(absoluteDatasetPath);
1600
1868
  await promises.writeFile(outputPath, `${JSON.stringify(payload, null, 2)}
1601
1869
  `, "utf8");
1602
- console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
1870
+ console.log(`Generated ${payload.length} test cases for dataset "${getDatasetDisplayLabel(dataset.dataset)}".`);
1603
1871
  console.log(`Wrote ${outputPath}`);
1604
1872
  }
1605
1873
  async function generateDatasetJsonCommandInk(runner, datasetName) {
@@ -1749,8 +2017,7 @@ function formatScorePart(item, _scoreToColor, options) {
1749
2017
  }
1750
2018
  function RunView({
1751
2019
  runner,
1752
- datasetName,
1753
- evaluatorPattern,
2020
+ runConfigNames,
1754
2021
  concurrency,
1755
2022
  onComplete
1756
2023
  }) {
@@ -1763,30 +2030,30 @@ function RunView({
1763
2030
  const [summary, setSummary] = React.useState(null);
1764
2031
  const [evaluatorNameById, setEvaluatorNameById] = React.useState(/* @__PURE__ */ new Map());
1765
2032
  const runEval = React.useCallback(async () => {
1766
- const dataset = await runner.resolveDatasetByName(datasetName);
1767
- if (!dataset) {
1768
- const known = await runner.collectDatasets();
1769
- const available = known.map((item) => item.dataset.getName()).sort();
1770
- onComplete(
1771
- new Error(
1772
- available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
1773
- )
1774
- );
2033
+ const rcList = runConfigNames.filter((n) => n.trim().length > 0);
2034
+ if (rcList.length === 0) {
2035
+ onComplete(new Error("At least one RunConfig name is required."));
1775
2036
  return;
1776
2037
  }
1777
- const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
1778
- if (evaluators.length === 0) {
1779
- const known = await runner.collectEvaluators();
1780
- const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
1781
- onComplete(
1782
- new Error(
1783
- available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
1784
- )
1785
- );
2038
+ setStartedEvaluations(0);
2039
+ setCompletedEvaluations(0);
2040
+ setTestCases([]);
2041
+ setRunningEvaluations([]);
2042
+ setSummary(null);
2043
+ let jobs;
2044
+ try {
2045
+ jobs = await runner.expandRunConfigNamesToJobs(rcList);
2046
+ } catch (err) {
2047
+ onComplete(err instanceof Error ? err : new Error(String(err)));
2048
+ return;
2049
+ }
2050
+ if (jobs.length === 0) {
2051
+ onComplete(new Error("No jobs expanded from RunConfigs."));
1786
2052
  return;
1787
2053
  }
2054
+ const allEvaluators = await runner.collectEvaluators();
1788
2055
  const nameById = new Map(
1789
- evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
2056
+ allEvaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
1790
2057
  );
1791
2058
  setEvaluatorNameById(nameById);
1792
2059
  const aggregates = /* @__PURE__ */ new Map();
@@ -1794,21 +2061,30 @@ function RunView({
1794
2061
  let overallScoreTotal = 0;
1795
2062
  let overallScoreSumSq = 0;
1796
2063
  let overallScoreCount = 0;
1797
- const done = new Promise((resolve5) => {
2064
+ const batchPendingRunIds = /* @__PURE__ */ new Set();
2065
+ const runIdToLabel = /* @__PURE__ */ new Map();
2066
+ let batchReady = false;
2067
+ const completedRuns = /* @__PURE__ */ new Map();
2068
+ const done = new Promise((resolve5, reject) => {
1798
2069
  const unsubscribe = runner.subscribeRunEvents((event) => {
2070
+ if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
2071
+ return;
2072
+ }
1799
2073
  if (event.type === "TestCaseStarted") {
1800
- setStartedEvaluations(event.startedTestCases);
2074
+ setStartedEvaluations((c) => c + 1);
1801
2075
  setRunningEvaluations((prev) => {
1802
2076
  const withoutDuplicate = prev.filter(
1803
- (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
2077
+ (item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
1804
2078
  );
1805
2079
  return [
1806
2080
  ...withoutDuplicate,
1807
2081
  {
2082
+ runId: event.runId,
1808
2083
  testCaseId: event.testCaseId,
1809
2084
  name: event.testCaseName,
1810
- rerunIndex: event.rerunIndex,
1811
- rerunTotal: event.rerunTotal,
2085
+ repetitionId: event.repetitionId,
2086
+ repetitionIndex: event.repetitionIndex,
2087
+ repetitionCount: event.repetitionCount,
1812
2088
  startedTestCases: event.startedTestCases,
1813
2089
  totalTestCases: event.totalTestCases
1814
2090
  }
@@ -1844,9 +2120,12 @@ function RunView({
1844
2120
  scoreItemsByEvaluatorScore.set(key, list);
1845
2121
  }
1846
2122
  }
2123
+ const label = runIdToLabel.get(event.runId);
2124
+ const compositeId = `${event.runId}:${event.testCaseId}`;
2125
+ const displayName = label !== void 0 ? `${label} \u203A ${event.testCaseName}` : event.testCaseName;
1847
2126
  setTestCases((prev) => {
1848
2127
  const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
1849
- const existing = byId.get(event.testCaseId);
2128
+ const existing = byId.get(compositeId);
1850
2129
  const newEvent = {
1851
2130
  evaluatorScores: event.evaluatorScores.map((item) => ({
1852
2131
  evaluatorId: item.evaluatorId,
@@ -1863,12 +2142,12 @@ function RunView({
1863
2142
  const isAggregated = events.length > 1;
1864
2143
  const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
1865
2144
  const merged = {
1866
- name: event.testCaseName,
1867
- testCaseId: event.testCaseId,
2145
+ name: displayName,
2146
+ testCaseId: compositeId,
1868
2147
  completedTestCases: event.completedTestCases,
1869
2148
  totalTestCases: event.totalTestCases,
1870
- rerunIndex: event.rerunIndex,
1871
- rerunTotal: event.rerunTotal,
2149
+ repetitionIndex: event.repetitionIndex,
2150
+ repetitionCount: event.repetitionCount,
1872
2151
  durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1873
2152
  passed: events.every((e) => e.passed),
1874
2153
  errorMessage: event.errorMessage,
@@ -1876,84 +2155,118 @@ function RunView({
1876
2155
  aggregatedEvaluatorScores,
1877
2156
  isAggregated
1878
2157
  };
1879
- byId.set(event.testCaseId, merged);
1880
- setCompletedEvaluations(event.completedTestCases);
1881
- setRunningEvaluations(
1882
- (running) => running.filter(
1883
- (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1884
- )
1885
- );
2158
+ byId.set(compositeId, merged);
1886
2159
  return Array.from(byId.values());
1887
2160
  });
2161
+ setCompletedEvaluations((c) => c + 1);
2162
+ setRunningEvaluations(
2163
+ (running) => running.filter(
2164
+ (item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
2165
+ )
2166
+ );
1888
2167
  }
1889
- if (event.type === "RunCompleted" || event.type === "RunFailed") {
2168
+ if (event.type === "RunFailed") {
2169
+ if (batchReady && !batchPendingRunIds.has(event.runId)) {
2170
+ return;
2171
+ }
1890
2172
  unsubscribe();
1891
- resolve5(event);
2173
+ reject(new Error(`Run failed: ${event.errorMessage}`));
2174
+ return;
2175
+ }
2176
+ if (event.type === "RunCompleted") {
2177
+ if (!batchPendingRunIds.has(event.runId)) {
2178
+ return;
2179
+ }
2180
+ completedRuns.set(event.runId, event);
2181
+ batchPendingRunIds.delete(event.runId);
2182
+ if (batchPendingRunIds.size === 0) {
2183
+ unsubscribe();
2184
+ resolve5();
2185
+ }
1892
2186
  }
1893
2187
  });
1894
2188
  });
1895
- const snapshot = await runner.runDatasetWith({
1896
- datasetId: dataset.id,
1897
- evaluatorIds: evaluators.map((item) => item.id),
1898
- concurrency
2189
+ const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2190
+ jobs,
2191
+ globalConcurrency: concurrency
1899
2192
  });
2193
+ for (let i = 0; i < snapshots.length; i += 1) {
2194
+ const snap = snapshots[i];
2195
+ const job = jobs[i];
2196
+ if (snap && job) {
2197
+ runIdToLabel.set(
2198
+ snap.runId,
2199
+ `${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
2200
+ );
2201
+ batchPendingRunIds.add(snap.runId);
2202
+ }
2203
+ }
2204
+ const totalUnits = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
2205
+ batchReady = true;
2206
+ const runConfigLabels = await Promise.all(
2207
+ rcList.map(async (n) => {
2208
+ const collected = await runner.resolveRunConfigByName(n);
2209
+ return collected?.runConfig.getDisplayLabel() ?? n;
2210
+ })
2211
+ );
1900
2212
  setRunInfo({
1901
- runId: snapshot.runId,
1902
- datasetName: snapshot.datasetName,
1903
- evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
1904
- totalTestCases: snapshot.totalTestCases
2213
+ names: runConfigLabels,
2214
+ jobs: jobs.length,
2215
+ totalTestCases: totalUnits
1905
2216
  });
1906
2217
  setPhase("running");
1907
- const finalEvent = await done;
1908
- if (finalEvent.type === "RunFailed") {
1909
- onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
2218
+ try {
2219
+ await done;
2220
+ } catch (err) {
2221
+ onComplete(err instanceof Error ? err : new Error(String(err)));
1910
2222
  return;
1911
2223
  }
1912
- const completed = finalEvent;
2224
+ let passedTestCases = 0;
2225
+ let failedTestCases = 0;
2226
+ let totalTestCases = 0;
2227
+ const artifacts = [];
2228
+ for (const ev of completedRuns.values()) {
2229
+ passedTestCases += ev.passedTestCases;
2230
+ failedTestCases += ev.failedTestCases;
2231
+ totalTestCases += ev.totalTestCases;
2232
+ artifacts.push(ev.artifactPath);
2233
+ }
1913
2234
  setSummary({
1914
- passedTestCases: completed.passedTestCases,
1915
- failedTestCases: completed.failedTestCases,
1916
- totalTestCases: completed.totalTestCases,
2235
+ passedTestCases,
2236
+ failedTestCases,
2237
+ totalTestCases,
1917
2238
  overallScoreTotal,
1918
2239
  overallScoreSumSq,
1919
2240
  overallScoreCount,
1920
2241
  aggregates: new Map(aggregates),
1921
2242
  scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1922
- artifactPath: completed.artifactPath
2243
+ artifactPath: artifacts.join("\n")
1923
2244
  });
1924
2245
  setPhase("completed");
1925
- setTimeout(() => onComplete(), 200);
1926
- }, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
2246
+ const exitCode = failedTestCases > 0 ? 1 : 0;
2247
+ setTimeout(() => onComplete(void 0, exitCode), 200);
2248
+ }, [runner, runConfigNames, concurrency, onComplete]);
1927
2249
  React.useEffect(() => {
1928
2250
  void runEval();
1929
2251
  }, [runEval]);
1930
2252
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
1931
2253
  /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
1932
2254
  runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
1933
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1934
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1935
- "Run",
1936
- " "
1937
- ] }),
1938
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
2255
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
2256
+ "RunConfigs",
2257
+ " "
1939
2258
  ] }),
2259
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.names.join(", ") }),
1940
2260
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1941
2261
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1942
- "Dataset",
2262
+ "Jobs",
1943
2263
  " "
1944
2264
  ] }),
1945
- runInfo.datasetName
2265
+ runInfo.jobs
1946
2266
  ] }),
1947
2267
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1948
2268
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1949
- "Evaluators",
1950
- " "
1951
- ] }),
1952
- runInfo.evaluatorNames.join(", ")
1953
- ] }),
1954
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1955
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1956
- "Test cases",
2269
+ "Evaluation units",
1957
2270
  " "
1958
2271
  ] }),
1959
2272
  runInfo.totalTestCases
@@ -1966,22 +2279,29 @@ function RunView({
1966
2279
  label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
1967
2280
  }
1968
2281
  ),
1969
- runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "yellow", children: [
1970
- "[running ",
1971
- item.startedTestCases,
1972
- "/",
1973
- item.totalTestCases,
1974
- "] ",
1975
- item.name,
1976
- " ",
1977
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1978
- "(",
1979
- item.rerunIndex,
1980
- "/",
1981
- item.rerunTotal,
1982
- ")"
1983
- ] })
1984
- ] }, `${item.testCaseId}:${item.rerunIndex}`)) })
2282
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
2283
+ ink.Text,
2284
+ {
2285
+ color: "yellow",
2286
+ children: [
2287
+ "[running ",
2288
+ item.startedTestCases,
2289
+ "/",
2290
+ item.totalTestCases,
2291
+ "] ",
2292
+ item.name,
2293
+ " ",
2294
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2295
+ "(",
2296
+ item.repetitionIndex,
2297
+ "/",
2298
+ item.repetitionCount,
2299
+ ")"
2300
+ ] })
2301
+ ]
2302
+ },
2303
+ `${item.runId ?? ""}:${item.testCaseId}:${item.repetitionId}:${item.repetitionIndex}`
2304
+ )) })
1985
2305
  ] }),
1986
2306
  testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
1987
2307
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
@@ -1997,9 +2317,9 @@ function RunView({
1997
2317
  " ",
1998
2318
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
1999
2319
  "(",
2000
- tc.rerunIndex,
2320
+ tc.repetitionIndex,
2001
2321
  "/",
2002
- tc.rerunTotal,
2322
+ tc.repetitionCount,
2003
2323
  ")"
2004
2324
  ] }),
2005
2325
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
@@ -2039,7 +2359,7 @@ function RunView({
2039
2359
  })
2040
2360
  ] }) : null
2041
2361
  ] }),
2042
- item.scores.length > 0 ? item.scores.map((s, idx) => {
2362
+ item.scores.length > 0 ? item.scores.map((s) => {
2043
2363
  const def = s.def ?? getScoreById(s.id);
2044
2364
  const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2045
2365
  return /* @__PURE__ */ jsxRuntime.jsxs(
@@ -2056,18 +2376,25 @@ function RunView({
2056
2376
  })
2057
2377
  ]
2058
2378
  },
2059
- `${item.evaluatorId}-${s.id}-${idx}`
2379
+ `${item.evaluatorId}-${s.id}-${scoreLabel}`
2060
2380
  );
2061
2381
  }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
2062
2382
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
2063
- (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
2064
- ink.Text,
2383
+ (log) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(
2384
+ ink.Box,
2065
2385
  {
2066
- color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
2067
- children: line
2386
+ flexDirection: "column",
2387
+ children: getDiffLines(log).map(({ type, line }) => /* @__PURE__ */ jsxRuntime.jsx(
2388
+ ink.Text,
2389
+ {
2390
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
2391
+ children: line
2392
+ },
2393
+ `${type}:${line}`
2394
+ ))
2068
2395
  },
2069
- lineIdx
2070
- )) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
2396
+ `diff:${getDiffLines(log).map((x) => x.line).join("|")}`
2397
+ ) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, line)) }, `log:${getLogLines(log).join("\n")}`) : null
2071
2398
  ) })
2072
2399
  ] }, item.evaluatorId))
2073
2400
  ] }, tc.testCaseId)) }),
@@ -2191,10 +2518,10 @@ function RunView({
2191
2518
  ] }, tc.testCaseId);
2192
2519
  })
2193
2520
  ] }),
2194
- /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2195
- "artifact: ",
2196
- summary.artifactPath
2197
- ] }) })
2521
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
2522
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "artifact(s):" }),
2523
+ summary.artifactPath.split("\n").map((line) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, line))
2524
+ ] })
2198
2525
  ] })
2199
2526
  ] });
2200
2527
  }
@@ -2406,25 +2733,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2406
2733
  }
2407
2734
  return lines;
2408
2735
  }
2409
- async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
2410
- const dataset = await runner.resolveDatasetByName(datasetName);
2411
- if (!dataset) {
2412
- const known = await runner.collectDatasets();
2413
- const available = known.map((item) => item.dataset.getName()).sort();
2414
- throw new Error(
2415
- available.length > 0 ? `Dataset "${datasetName}" not found. Available datasets: ${available.join(", ")}` : `Dataset "${datasetName}" not found and no datasets were discovered.`
2416
- );
2417
- }
2418
- const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
2419
- if (evaluators.length === 0) {
2420
- const known = await runner.collectEvaluators();
2421
- const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
2422
- throw new Error(
2423
- available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available evaluators: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}" and no evaluators were discovered.`
2424
- );
2736
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
2737
+ const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2738
+ if (jobs.length === 0) {
2739
+ throw new Error("No jobs expanded from RunConfigs.");
2425
2740
  }
2741
+ const evaluators = await runner.collectEvaluators();
2426
2742
  const evaluatorNameById = new Map(
2427
- evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
2743
+ evaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
2428
2744
  );
2429
2745
  const aggregates = /* @__PURE__ */ new Map();
2430
2746
  const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
@@ -2432,11 +2748,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2432
2748
  let overallScoreTotal = 0;
2433
2749
  let overallScoreSumSq = 0;
2434
2750
  let overallScoreCount = 0;
2435
- let startedCount = 0;
2436
- let completedCount = 0;
2751
+ let globalStartedUnits = 0;
2752
+ let globalCompletedUnits = 0;
2437
2753
  let totalCount = 0;
2438
2754
  let runFinished = false;
2439
- const inFlightReruns = /* @__PURE__ */ new Set();
2755
+ const inFlightRepetitions = /* @__PURE__ */ new Set();
2440
2756
  const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
2441
2757
  let spinnerIndex = 0;
2442
2758
  function clearLine() {
@@ -2458,33 +2774,46 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2458
2774
  spinnerIndex += 1;
2459
2775
  process.stdout.write(
2460
2776
  `\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
2461
- `${completedCount}/${totalCount}`,
2777
+ `${globalCompletedUnits}/${totalCount}`,
2462
2778
  ansi2.bold
2463
- )} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
2779
+ )} completed ${colorize(`${globalStartedUnits}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightRepetitions.size} running)`, ansi2.dim)}`
2464
2780
  );
2465
2781
  }
2466
2782
  let lastPrintedTestCaseId = null;
2467
2783
  let lastPrintedLineCount = 0;
2468
2784
  let spinnerTimer;
2469
- const done = new Promise((resolve5) => {
2785
+ const batchPendingRunIds = /* @__PURE__ */ new Set();
2786
+ const runIdToLabel = /* @__PURE__ */ new Map();
2787
+ let batchReady = false;
2788
+ const completedRuns = /* @__PURE__ */ new Map();
2789
+ const done = new Promise((resolve5, reject) => {
2470
2790
  const unsubscribe = runner.subscribeRunEvents((event) => {
2791
+ if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
2792
+ return;
2793
+ }
2794
+ const rowPrefix = typeof event.runId === "string" ? runIdToLabel.get(event.runId) : void 0;
2795
+ const pfx = rowPrefix !== void 0 ? `${colorize(`[${rowPrefix}]`, ansi2.dim)} ` : "";
2471
2796
  if (event.type === "TestCaseStarted") {
2472
- startedCount = event.startedTestCases;
2473
- inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
2797
+ globalStartedUnits += 1;
2798
+ inFlightRepetitions.add(
2799
+ `${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
2800
+ );
2474
2801
  clearLine();
2475
2802
  process.stdout.write(
2476
- `${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
2803
+ `${pfx}${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
2477
2804
  `
2478
2805
  );
2479
2806
  drawSpinner();
2480
2807
  }
2481
2808
  if (event.type === "TestCaseProgress") {
2482
- completedCount = event.completedTestCases;
2483
- inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
2809
+ globalCompletedUnits += 1;
2810
+ inFlightRepetitions.delete(
2811
+ `${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
2812
+ );
2484
2813
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
2485
2814
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
2486
- const testCaseId = event.testCaseId;
2487
- const existing = testCaseByTestId.get(testCaseId) ?? {
2815
+ const compositeId = `${event.runId}:${event.testCaseId}`;
2816
+ const existing = testCaseByTestId.get(compositeId) ?? {
2488
2817
  name: event.testCaseName,
2489
2818
  events: []
2490
2819
  };
@@ -2494,7 +2823,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2494
2823
  durationMs: event.durationMs,
2495
2824
  evaluatorScores: event.evaluatorScores
2496
2825
  });
2497
- testCaseByTestId.set(testCaseId, existing);
2826
+ testCaseByTestId.set(compositeId, existing);
2498
2827
  for (const item of event.evaluatorScores) {
2499
2828
  const numeric = toNumericScoreFromScores(item.scores);
2500
2829
  if (numeric !== void 0) {
@@ -2523,10 +2852,10 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2523
2852
  scoreItemsByEvaluatorScore.set(key, list);
2524
2853
  }
2525
2854
  }
2526
- const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2527
- const isLastRerun = event.rerunIndex >= event.rerunTotal;
2855
+ const isSameTestCase = lastPrintedTestCaseId === compositeId;
2856
+ const isLastRepetition = event.repetitionIndex >= event.repetitionCount;
2528
2857
  const isNonTty = !process.stdout.isTTY;
2529
- const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
2858
+ const skipPrintNonTty = isNonTty && event.repetitionCount > 1 && !isLastRepetition;
2530
2859
  if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
2531
2860
  cursorUp(lastPrintedLineCount);
2532
2861
  }
@@ -2537,7 +2866,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2537
2866
  const lines = [];
2538
2867
  const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
2539
2868
  lines.push(
2540
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
2869
+ `${pfx}${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
2541
2870
  );
2542
2871
  if (event.errorMessage) {
2543
2872
  lines.push(colorize(event.errorMessage, ansi2.red));
@@ -2568,64 +2897,102 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2568
2897
  }
2569
2898
  }
2570
2899
  if (!skipPrintNonTty) {
2571
- for (let i = 0; i < lines.length; i++) {
2900
+ for (let i = 0; i < lines.length; i += 1) {
2572
2901
  process.stdout.write(`\r\x1B[2K${lines[i]}
2573
2902
  `);
2574
2903
  }
2575
- lastPrintedTestCaseId = testCaseId;
2904
+ lastPrintedTestCaseId = compositeId;
2576
2905
  lastPrintedLineCount = lines.length;
2577
2906
  }
2578
2907
  drawSpinner();
2579
2908
  }
2580
- if (event.type === "RunCompleted" || event.type === "RunFailed") {
2909
+ if (event.type === "RunFailed") {
2910
+ if (batchReady && !batchPendingRunIds.has(event.runId)) {
2911
+ return;
2912
+ }
2581
2913
  runFinished = true;
2582
2914
  clearLine();
2583
2915
  unsubscribe();
2584
- resolve5(event);
2916
+ reject(new Error(`Run failed: ${event.errorMessage}`));
2917
+ return;
2918
+ }
2919
+ if (event.type === "RunCompleted") {
2920
+ if (!batchPendingRunIds.has(event.runId)) {
2921
+ return;
2922
+ }
2923
+ completedRuns.set(event.runId, event);
2924
+ batchPendingRunIds.delete(event.runId);
2925
+ if (batchPendingRunIds.size === 0) {
2926
+ runFinished = true;
2927
+ clearLine();
2928
+ unsubscribe();
2929
+ resolve5();
2930
+ }
2585
2931
  }
2586
2932
  });
2587
2933
  });
2588
- const snapshot = await runner.runDatasetWith({
2589
- datasetId: dataset.id,
2590
- evaluatorIds: evaluators.map((item) => item.id),
2591
- concurrency
2934
+ console.log(colorize("=== Eval Run Started (RunConfigs) ===", `${ansi2.bold}${ansi2.cyan}`));
2935
+ for (const name of runConfigNames) {
2936
+ const collected = await runner.resolveRunConfigByName(name);
2937
+ const label = collected?.runConfig.getDisplayLabel() ?? name;
2938
+ console.log(`RunConfig: ${colorize(label, ansi2.bold)}`);
2939
+ }
2940
+ console.log(`Jobs: ${colorize(String(jobs.length), ansi2.bold)}`);
2941
+ console.log(`Shared concurrency: ${colorize(String(concurrency), ansi2.bold)}`);
2942
+ console.log("");
2943
+ const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2944
+ jobs,
2945
+ globalConcurrency: concurrency
2592
2946
  });
2593
- totalCount = snapshot.totalTestCases;
2594
- console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
2595
- console.log(`Run: ${colorize(snapshot.runId, ansi2.cyan)}`);
2596
- console.log(`Dataset: ${colorize(snapshot.datasetName, ansi2.bold)}`);
2597
- console.log(
2598
- `Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
2599
- );
2600
- console.log(`Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`);
2947
+ for (let i = 0; i < snapshots.length; i += 1) {
2948
+ const snap = snapshots[i];
2949
+ const job = jobs[i];
2950
+ if (snap && job) {
2951
+ runIdToLabel.set(
2952
+ snap.runId,
2953
+ `${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
2954
+ );
2955
+ batchPendingRunIds.add(snap.runId);
2956
+ }
2957
+ }
2958
+ totalCount = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
2959
+ console.log(`Total evaluation units: ${colorize(String(totalCount), ansi2.bold)}`);
2601
2960
  console.log("");
2961
+ batchReady = true;
2602
2962
  drawSpinner();
2603
2963
  spinnerTimer = setInterval(drawSpinner, 100);
2604
- const finalEvent = await done;
2964
+ await done;
2605
2965
  if (spinnerTimer) {
2606
2966
  clearInterval(spinnerTimer);
2607
2967
  }
2608
- if (finalEvent.type === "RunFailed") {
2609
- throw new Error(`Run failed: ${finalEvent.errorMessage}`);
2610
- }
2611
- const completed = finalEvent;
2612
2968
  console.log("");
2613
- console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
2614
- console.log(
2615
- `- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
2616
- );
2617
- console.log(
2618
- `- failed: ${colorize(
2619
- `${completed.failedTestCases}/${completed.totalTestCases}`,
2620
- completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
2621
- )}`
2622
- );
2969
+ console.log(colorize("=== Run Summary (all jobs) ===", `${ansi2.bold}${ansi2.cyan}`));
2970
+ for (const snap of snapshots) {
2971
+ const completed = completedRuns.get(snap.runId);
2972
+ if (!completed) {
2973
+ continue;
2974
+ }
2975
+ const label = runIdToLabel.get(snap.runId) ?? snap.runId;
2976
+ console.log("");
2977
+ console.log(colorize(`\u2014 ${label}`, ansi2.magenta));
2978
+ console.log(
2979
+ `- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
2980
+ );
2981
+ console.log(
2982
+ `- failed: ${colorize(
2983
+ `${completed.failedTestCases}/${completed.totalTestCases}`,
2984
+ completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
2985
+ )}`
2986
+ );
2987
+ console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2988
+ }
2623
2989
  if (overallScoreCount > 0) {
2624
2990
  const overallAverage = overallScoreTotal / overallScoreCount;
2625
2991
  const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
2626
2992
  const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
2993
+ console.log("");
2627
2994
  console.log(
2628
- `- overall avg score: ${colorize(
2995
+ `- overall avg score (all jobs): ${colorize(
2629
2996
  avgStr,
2630
2997
  scoreToColor(overallAverage)
2631
2998
  )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
@@ -2666,22 +3033,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2666
3033
  );
2667
3034
  }
2668
3035
  }
2669
- console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
3036
+ let failedTestCasesTotal = 0;
3037
+ for (const snap of snapshots) {
3038
+ const completed = completedRuns.get(snap.runId);
3039
+ if (completed) {
3040
+ failedTestCasesTotal += completed.failedTestCases;
3041
+ }
3042
+ }
3043
+ return failedTestCasesTotal > 0 ? 1 : 0;
2670
3044
  }
2671
- async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
3045
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
2672
3046
  return new Promise((resolve5, reject) => {
2673
3047
  const app = ink.render(
2674
3048
  React__namespace.createElement(RunView, {
2675
3049
  runner,
2676
- datasetName,
2677
- evaluatorPattern,
3050
+ runConfigNames,
2678
3051
  concurrency,
2679
- onComplete: (err) => {
3052
+ onComplete: (err, exitCode) => {
2680
3053
  app.unmount();
2681
3054
  if (err) {
2682
3055
  reject(err);
2683
3056
  } else {
2684
- resolve5();
3057
+ resolve5(exitCode ?? 0);
2685
3058
  }
2686
3059
  }
2687
3060
  })
@@ -2707,12 +3080,22 @@ async function main() {
2707
3080
  if (!args.command) {
2708
3081
  printUsageAndExit(1);
2709
3082
  }
2710
- if (!args.datasetName) {
2711
- console.error("Missing required --dataset <datasetName> argument.");
2712
- printUsageAndExit(1);
3083
+ if (args.command === "run") {
3084
+ if (args.runConfigNames.length === 0) {
3085
+ console.error(
3086
+ "Missing required --run-config <name> (repeat the flag to queue multiple RunConfigs)."
3087
+ );
3088
+ printUsageAndExit(1);
3089
+ }
3090
+ if (args.datasetName !== void 0) {
3091
+ console.error(
3092
+ "The run command no longer accepts --dataset; use --run-config <RunConfig name>."
3093
+ );
3094
+ printUsageAndExit(1);
3095
+ }
2713
3096
  }
2714
- if (args.command === "run" && !args.evaluatorPattern) {
2715
- console.error("Missing required --evaluator <name-or-pattern> argument.");
3097
+ if (args.command === "generate" && args.runConfigNames.length > 0) {
3098
+ console.error("generate does not accept --run-config.");
2716
3099
  printUsageAndExit(1);
2717
3100
  }
2718
3101
  const useInk = process.stdout.isTTY === true;
@@ -2723,17 +3106,24 @@ async function main() {
2723
3106
  try {
2724
3107
  if (args.command === "run") {
2725
3108
  const concurrency = args.concurrency ?? getDefaultConcurrency();
2726
- await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
3109
+ const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
2727
3110
  runner,
2728
- args.datasetName,
2729
- args.evaluatorPattern,
3111
+ args.runConfigNames,
2730
3112
  concurrency
2731
3113
  );
3114
+ if (args.ci && exitCode !== 0) {
3115
+ process.exit(1);
3116
+ }
2732
3117
  return;
2733
3118
  }
3119
+ const genDataset = args.datasetName;
3120
+ if (!genDataset) {
3121
+ console.error("Missing required --dataset <datasetId> argument.");
3122
+ printUsageAndExit(1);
3123
+ }
2734
3124
  await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
2735
3125
  runner,
2736
- args.datasetName
3126
+ genDataset
2737
3127
  );
2738
3128
  } finally {
2739
3129
  await runner.shutdown();