@m4trix/evals 0.25.0 → 0.26.0

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -3,14 +3,14 @@
3
3
 
4
4
  var crypto = require('crypto');
5
5
  var effect = require('effect');
6
- var fs = require('fs');
6
+ var promises = require('fs/promises');
7
7
  var path = require('path');
8
+ var fs = require('fs');
8
9
  var jitiModule = require('jiti');
9
- var promises = require('fs/promises');
10
10
  var url = require('url');
11
11
  var diff = require('diff');
12
12
  var stringify = require('fast-json-stable-stringify');
13
- var React2 = require('react');
13
+ var React = require('react');
14
14
  var ink = require('ink');
15
15
  var jsxRuntime = require('react/jsx-runtime');
16
16
 
@@ -37,25 +37,181 @@ function _interopNamespace(e) {
37
37
 
38
38
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
39
39
  var stringify__default = /*#__PURE__*/_interopDefault(stringify);
40
- var React2__namespace = /*#__PURE__*/_interopNamespace(React2);
40
+ var React__namespace = /*#__PURE__*/_interopNamespace(React);
41
+
42
+ var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
43
+ function makeEntityIdSchema(brand, label) {
44
+ return effect.Schema.String.pipe(
45
+ effect.Schema.trimmed(),
46
+ effect.Schema.minLength(1, {
47
+ message: () => `${label} must be non-empty.`
48
+ }),
49
+ effect.Schema.pattern(ENTITY_ID_PATTERN, {
50
+ message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
51
+ }),
52
+ effect.Schema.brand(brand)
53
+ );
54
+ }
55
+ var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
56
+ makeEntityIdSchema("EvaluatorName", "Evaluator name");
57
+ makeEntityIdSchema("TestCaseName", "Test case name");
58
+ function validateWithSchema(schema, raw, context) {
59
+ const trimmed = raw.trim();
60
+ const decode = effect.Schema.decodeUnknownEither(
61
+ schema
62
+ );
63
+ const result = decode(trimmed);
64
+ if (effect.Either.isLeft(result)) {
65
+ throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
66
+ }
67
+ return result.right;
68
+ }
69
+ function validateRunConfigName(raw, context) {
70
+ return validateWithSchema(RunConfigNameSchema, raw, context);
71
+ }
72
+
73
+ // src/evals/evaluator.ts
74
+ function getEvaluatorDisplayLabel(evaluator) {
75
+ if (typeof evaluator.getDisplayLabel === "function") {
76
+ const label = evaluator.getDisplayLabel();
77
+ if (label !== void 0) {
78
+ return label;
79
+ }
80
+ }
81
+ return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
82
+ }
83
+ function getEvaluatorTagList(evaluator) {
84
+ return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
85
+ }
86
+ async function loadRunSnapshotsFromArtifacts(config) {
87
+ const baseDir = path.resolve(config.artifactDirectory);
88
+ let entries;
89
+ try {
90
+ entries = await promises.readdir(baseDir);
91
+ } catch {
92
+ return [];
93
+ }
94
+ const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
95
+ const snapshots = [];
96
+ for (const fileName of jsonlFiles) {
97
+ const filePath = path.join(baseDir, fileName);
98
+ try {
99
+ const snapshot = await parseArtifactToSnapshot(filePath, config);
100
+ if (snapshot) {
101
+ snapshots.push(snapshot);
102
+ }
103
+ } catch {
104
+ }
105
+ }
106
+ return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
107
+ }
108
+ async function parseArtifactToSnapshot(filePath, _config) {
109
+ const content = await promises.readFile(filePath, "utf8");
110
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
111
+ if (lines.length === 0) {
112
+ return null;
113
+ }
114
+ let runQueued = null;
115
+ let runCompleted = null;
116
+ let runFailed = null;
117
+ let runStarted = null;
118
+ for (const line of lines) {
119
+ try {
120
+ const event = JSON.parse(line);
121
+ const type = event.type;
122
+ if (type === "RunQueued") {
123
+ runQueued = {
124
+ runId: event.runId,
125
+ datasetId: event.datasetId,
126
+ datasetName: event.datasetName,
127
+ evaluatorIds: event.evaluatorIds,
128
+ totalTestCases: event.totalTestCases ?? 0,
129
+ artifactPath: event.artifactPath ?? filePath,
130
+ ts: event.ts
131
+ };
132
+ }
133
+ if (type === "RunStarted") {
134
+ runStarted = { startedAt: event.startedAt };
135
+ }
136
+ if (type === "RunCompleted") {
137
+ runCompleted = {
138
+ passedTestCases: event.passedTestCases,
139
+ failedTestCases: event.failedTestCases,
140
+ totalTestCases: event.totalTestCases,
141
+ finishedAt: event.finishedAt
142
+ };
143
+ }
144
+ if (type === "RunFailed") {
145
+ runFailed = {
146
+ finishedAt: event.finishedAt,
147
+ errorMessage: event.errorMessage
148
+ };
149
+ }
150
+ } catch {
151
+ }
152
+ }
153
+ if (!runQueued) {
154
+ return null;
155
+ }
156
+ const artifactPath = filePath;
157
+ const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
158
+ const progress = aggregateTestCaseProgress(lines);
159
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
160
+ const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
161
+ const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
162
+ return {
163
+ runId: runQueued.runId,
164
+ datasetId: runQueued.datasetId,
165
+ datasetName: runQueued.datasetName,
166
+ evaluatorIds: runQueued.evaluatorIds,
167
+ queuedAt: runQueued.ts ?? 0,
168
+ startedAt: runStarted?.startedAt,
169
+ finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
170
+ totalTestCases: runQueued.totalTestCases,
171
+ completedTestCases,
172
+ passedTestCases,
173
+ failedTestCases,
174
+ status,
175
+ artifactPath,
176
+ errorMessage: runFailed?.errorMessage
177
+ };
178
+ }
179
+ function aggregateTestCaseProgress(lines) {
180
+ let completedTestCases = 0;
181
+ const testCasePassedBy = /* @__PURE__ */ new Map();
182
+ for (const line of lines) {
183
+ try {
184
+ const event = JSON.parse(line);
185
+ if (event.type === "TestCaseProgress") {
186
+ const ev = event;
187
+ completedTestCases = ev.completedTestCases ?? completedTestCases;
188
+ const id = ev.testCaseId;
189
+ const current = testCasePassedBy.get(id);
190
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
191
+ }
192
+ } catch {
193
+ }
194
+ }
195
+ let passedTestCases = 0;
196
+ let failedTestCases = 0;
197
+ for (const passed of testCasePassedBy.values()) {
198
+ if (passed) {
199
+ passedTestCases += 1;
200
+ } else {
201
+ failedTestCases += 1;
202
+ }
203
+ }
204
+ return { completedTestCases, passedTestCases, failedTestCases };
205
+ }
41
206
 
42
207
  // src/runner/config.ts
43
208
  var defaultRunnerConfig = {
44
209
  discovery: {
45
210
  rootDir: process.cwd(),
46
211
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
47
- evaluatorSuffixes: [
48
- ".evaluator.ts",
49
- ".evaluator.tsx",
50
- ".evaluator.js",
51
- ".evaluator.mjs"
52
- ],
53
- testCaseSuffixes: [
54
- ".test-case.ts",
55
- ".test-case.tsx",
56
- ".test-case.js",
57
- ".test-case.mjs"
58
- ],
212
+ evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
213
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
214
+ testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
59
215
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
60
216
  },
61
217
  artifactDirectory: ".eval-results",
@@ -80,6 +236,11 @@ function toRunnerConfigOverrides(config) {
80
236
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
81
237
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
82
238
  }
239
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
240
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
241
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
242
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
243
+ }
83
244
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
84
245
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
85
246
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -122,14 +283,15 @@ function getJitiLoader() {
122
283
  }
123
284
  const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
124
285
  if (typeof createJiti2 !== "function") {
125
- throw new Error(
126
- "Failed to initialize jiti for m4trix eval config loading."
127
- );
286
+ throw new Error("Failed to initialize jiti for m4trix eval config loading.");
128
287
  }
129
- cachedLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
130
- interopDefault: true,
131
- moduleCache: true
132
- });
288
+ cachedLoader = createJiti2(
289
+ (typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
290
+ {
291
+ interopDefault: true,
292
+ moduleCache: true
293
+ }
294
+ );
133
295
  return cachedLoader;
134
296
  }
135
297
  function resolveConfigModuleExport(loadedModule) {
@@ -177,6 +339,9 @@ function isDatasetLike(value) {
177
339
  function isEvaluatorLike(value) {
178
340
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
179
341
  }
342
+ function isRunConfigLike(value) {
343
+ return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
344
+ }
180
345
  function isTestCaseLike(value) {
181
346
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
182
347
  }
@@ -233,9 +398,7 @@ async function loadModuleExports(filePath) {
233
398
  }
234
399
  async function collectDatasetsFromFiles(config) {
235
400
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
236
- const matched = files.filter(
237
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
238
- );
401
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
239
402
  const found = await Promise.all(
240
403
  matched.map(async (absolutePath) => {
241
404
  const exports = await loadModuleExports(absolutePath);
@@ -252,9 +415,7 @@ async function collectDatasetsFromFiles(config) {
252
415
  }
253
416
  async function collectEvaluatorsFromFiles(config) {
254
417
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
255
- const matched = files.filter(
256
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
257
- );
418
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
258
419
  const found = await Promise.all(
259
420
  matched.map(async (absolutePath) => {
260
421
  const exports = await loadModuleExports(absolutePath);
@@ -269,11 +430,26 @@ async function collectEvaluatorsFromFiles(config) {
269
430
  );
270
431
  return found.flat();
271
432
  }
272
- async function collectTestCasesFromFiles(config) {
433
+ async function collectRunConfigsFromFiles(config) {
273
434
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
274
- const matched = files.filter(
275
- (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
435
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
436
+ const found = await Promise.all(
437
+ matched.map(async (absolutePath) => {
438
+ const exports = await loadModuleExports(absolutePath);
439
+ const runConfigs = exports.filter(isRunConfigLike);
440
+ const relPath = path.relative(config.rootDir, absolutePath);
441
+ return runConfigs.map((runConfig) => ({
442
+ id: runConfig.getName(),
443
+ filePath: relPath,
444
+ runConfig
445
+ }));
446
+ })
276
447
  );
448
+ return found.flat();
449
+ }
450
+ async function collectTestCasesFromFiles(config) {
451
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
452
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
277
453
  const found = await Promise.all(
278
454
  matched.map(async (absolutePath) => {
279
455
  const exports = await loadModuleExports(absolutePath);
@@ -345,16 +521,8 @@ function createDiffString(expected, actual, diffOptions) {
345
521
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
346
522
  const actualProcessed = preprocessForDiff(actual, diffOptions);
347
523
  if (diffOptions?.keysOnly) {
348
- const expectedKeys = JSON.stringify(
349
- extractKeys(expectedProcessed),
350
- null,
351
- 2
352
- );
353
- const actualKeys = JSON.stringify(
354
- extractKeys(actualProcessed),
355
- null,
356
- 2
357
- );
524
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
525
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
358
526
  const parts2 = diff.diffLines(expectedKeys, actualKeys);
359
527
  return formatDiffParts(parts2);
360
528
  }
@@ -365,9 +533,7 @@ function createDiffString(expected, actual, diffOptions) {
365
533
  }
366
534
  const parts = diff.diffLines(expectedStr, actualStr);
367
535
  if (diffOptions?.outputNewOnly) {
368
- const filtered = parts.filter(
369
- (p) => p.added === true
370
- );
536
+ const filtered = parts.filter((p) => p.added === true);
371
537
  return formatDiffParts(filtered);
372
538
  }
373
539
  return formatDiffParts(parts);
@@ -434,6 +600,17 @@ function getDiffLines(entry) {
434
600
  });
435
601
  }
436
602
 
603
+ // src/evals/test-case.ts
604
+ function getTestCaseDisplayLabel(testCase) {
605
+ if (typeof testCase.getDisplayLabel === "function") {
606
+ return testCase.getDisplayLabel();
607
+ }
608
+ return typeof testCase.getName === "function" ? testCase.getName() : "";
609
+ }
610
+ function getTestCaseTagList(testCase) {
611
+ return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
612
+ }
613
+
437
614
  // src/evals/metric.ts
438
615
  var registry = /* @__PURE__ */ new Map();
439
616
  var Metric = {
@@ -457,6 +634,54 @@ function getMetricById(id) {
457
634
  return registry.get(id);
458
635
  }
459
636
 
637
+ // src/evals/aggregators.ts
638
+ function aggregateTokenCountSum(values) {
639
+ const initial = {
640
+ input: 0,
641
+ output: 0,
642
+ inputCached: 0,
643
+ outputCached: 0
644
+ };
645
+ return values.reduce(
646
+ (acc, v) => ({
647
+ input: acc.input + (v.input ?? 0),
648
+ output: acc.output + (v.output ?? 0),
649
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
650
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
651
+ }),
652
+ initial
653
+ );
654
+ }
655
+ function aggregateLatencyAverage(values) {
656
+ if (values.length === 0) {
657
+ return { ms: 0 };
658
+ }
659
+ const sum = values.reduce((s, v) => s + v.ms, 0);
660
+ return { ms: sum / values.length };
661
+ }
662
+
663
+ // src/evals/metrics/standard.ts
664
+ Metric.of({
665
+ id: "token-count",
666
+ name: "Tokens",
667
+ aggregate: aggregateTokenCountSum,
668
+ format: (data, options) => {
669
+ const input = data.input ?? 0;
670
+ const output = data.output ?? 0;
671
+ const inputCached = data.inputCached ?? 0;
672
+ const outputCached = data.outputCached ?? 0;
673
+ const cached = inputCached + outputCached;
674
+ const base = `in:${input} out:${output} cached:${cached}`;
675
+ return options?.isAggregated ? `Total: ${base}` : base;
676
+ }
677
+ });
678
+ Metric.of({
679
+ id: "latency",
680
+ name: "Latency",
681
+ aggregate: aggregateLatencyAverage,
682
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
683
+ });
684
+
460
685
  // src/evals/score.ts
461
686
  var registry2 = /* @__PURE__ */ new Map();
462
687
  function formatScoreData(def, data, options) {
@@ -469,10 +694,7 @@ var ScoreAggregate = {
469
694
  const count = values.length || 1;
470
695
  const result = {};
471
696
  for (const field of fields) {
472
- result[field] = values.reduce(
473
- (s, v) => s + (v[field] ?? 0),
474
- 0
475
- ) / count;
697
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
476
698
  }
477
699
  return result;
478
700
  };
@@ -506,13 +728,10 @@ var ScoreAggregate = {
506
728
  (s, v) => s + (v[valueField] ?? 0),
507
729
  0
508
730
  );
509
- const sumSq = values.reduce(
510
- (s, v) => {
511
- const value = v[valueField] ?? 0;
512
- return s + value * value;
513
- },
514
- 0
515
- );
731
+ const sumSq = values.reduce((s, v) => {
732
+ const value = v[valueField] ?? 0;
733
+ return s + value * value;
734
+ }, 0);
516
735
  const mean = sum / count;
517
736
  const variance = (sumSq - count * mean * mean) / (count - 1);
518
737
  stdDev = variance > 0 ? Math.sqrt(variance) : 0;
@@ -571,54 +790,6 @@ function getScoreById(id) {
571
790
  return registry2.get(id);
572
791
  }
573
792
 
574
- // src/evals/aggregators.ts
575
- function aggregateTokenCountSum(values) {
576
- const initial = {
577
- input: 0,
578
- output: 0,
579
- inputCached: 0,
580
- outputCached: 0
581
- };
582
- return values.reduce(
583
- (acc, v) => ({
584
- input: acc.input + (v.input ?? 0),
585
- output: acc.output + (v.output ?? 0),
586
- inputCached: acc.inputCached + (v.inputCached ?? 0),
587
- outputCached: acc.outputCached + (v.outputCached ?? 0)
588
- }),
589
- initial
590
- );
591
- }
592
- function aggregateLatencyAverage(values) {
593
- if (values.length === 0) {
594
- return { ms: 0 };
595
- }
596
- const sum = values.reduce((s, v) => s + v.ms, 0);
597
- return { ms: sum / values.length };
598
- }
599
-
600
- // src/evals/metrics/standard.ts
601
- Metric.of({
602
- id: "token-count",
603
- name: "Tokens",
604
- aggregate: aggregateTokenCountSum,
605
- format: (data, options) => {
606
- const input = data.input ?? 0;
607
- const output = data.output ?? 0;
608
- const inputCached = data.inputCached ?? 0;
609
- const outputCached = data.outputCached ?? 0;
610
- const cached = inputCached + outputCached;
611
- const base = `in:${input} out:${output} cached:${cached}`;
612
- return options?.isAggregated ? `Total: ${base}` : base;
613
- }
614
- });
615
- Metric.of({
616
- id: "latency",
617
- name: "Latency",
618
- aggregate: aggregateLatencyAverage,
619
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
620
- });
621
-
622
793
  // src/evals/scores/standard.ts
623
794
  Score.of({
624
795
  id: "percent",
@@ -762,15 +933,17 @@ function readOutput(testCase) {
762
933
  }
763
934
  return candidate.getOutput();
764
935
  }
765
- function buildEvaluationUnits(testCases) {
936
+ function buildEvaluationUnits(testCases, repetitionCount) {
937
+ const count = Math.max(1, repetitionCount);
766
938
  const units = [];
767
939
  for (const testCaseItem of testCases) {
768
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
769
- for (let r = 0; r < rerunTotal; r++) {
940
+ const repetitionId = `rep-${crypto.randomUUID()}`;
941
+ for (let r = 0; r < count; r++) {
770
942
  units.push({
771
943
  testCaseItem,
772
- rerunIndex: r + 1,
773
- rerunTotal
944
+ repetitionId,
945
+ repetitionIndex: r + 1,
946
+ repetitionCount: count
774
947
  });
775
948
  }
776
949
  }
@@ -780,29 +953,24 @@ function nowIsoForFile() {
780
953
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
781
954
  }
782
955
  function createArtifactPath(artifactDirectory, datasetId, runId) {
783
- return path.join(
784
- artifactDirectory,
785
- `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
786
- );
956
+ return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
787
957
  }
788
958
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
789
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
959
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
790
960
  return effect.Effect.gen(function* () {
791
961
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
792
962
  const started = Date.now();
793
- const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
794
- n + 1,
795
- n + 1
796
- ]);
963
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
797
964
  yield* publishEvent({
798
965
  type: "TestCaseStarted",
799
966
  runId: task.runId,
800
967
  testCaseId: testCaseItem.id,
801
- testCaseName: testCaseItem.testCase.getName(),
968
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
802
969
  startedTestCases: startedEvaluations,
803
970
  totalTestCases: totalEvaluations,
804
- rerunIndex,
805
- rerunTotal
971
+ repetitionId,
972
+ repetitionIndex,
973
+ repetitionCount
806
974
  });
807
975
  const evaluatorScores = [];
808
976
  let testCaseError;
@@ -826,9 +994,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
826
994
  return error;
827
995
  };
828
996
  try {
829
- const ctx = yield* effect.Effect.promise(
830
- () => Promise.resolve(evaluator.resolveContext())
831
- );
997
+ const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
832
998
  const result = yield* effect.Effect.promise(
833
999
  () => Promise.resolve().then(
834
1000
  () => evaluateFn({
@@ -838,8 +1004,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
838
1004
  meta: {
839
1005
  triggerId: task.triggerId,
840
1006
  runId: evaluatorRunId,
841
- datasetId: task.datasetId
1007
+ datasetId: task.datasetId,
1008
+ repetitionId,
1009
+ repetitionIndex,
1010
+ repetitionCount,
1011
+ runConfigName: task.runConfigName
842
1012
  },
1013
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1014
+ runConfigTags: task.runConfigTags,
1015
+ evaluatorTags: getEvaluatorTagList(evaluator),
843
1016
  logDiff,
844
1017
  log,
845
1018
  createError
@@ -882,21 +1055,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
882
1055
  });
883
1056
  }
884
1057
  }
885
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
886
- const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
887
- n + 1,
888
- n + 1
889
- ]);
1058
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1059
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
890
1060
  const progressEvent = {
891
1061
  type: "TestCaseProgress",
892
1062
  runId: task.runId,
893
1063
  testCaseId: testCaseItem.id,
894
- testCaseName: testCaseItem.testCase.getName(),
1064
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
895
1065
  completedTestCases: completedEvaluations,
896
1066
  totalTestCases: totalEvaluations,
897
- rerunIndex,
898
- rerunTotal,
899
- passed: rerunPassedThis,
1067
+ repetitionId,
1068
+ repetitionIndex,
1069
+ repetitionCount,
1070
+ passed: repetitionPassedThis,
900
1071
  durationMs: Date.now() - started,
901
1072
  evaluatorScores,
902
1073
  output,
@@ -917,9 +1088,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
917
1088
  (map) => {
918
1089
  const key = testCaseItem.id;
919
1090
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
920
- const newResults = [...existing.results, rerunPassedThis];
1091
+ const newResults = [...existing.results, repetitionPassedThis];
921
1092
  const newCompletedCount = existing.completedCount + 1;
922
- const isLast = newCompletedCount === rerunTotal;
1093
+ const isLast = newCompletedCount === repetitionCount;
923
1094
  const newMap = new Map(map);
924
1095
  newMap.set(key, {
925
1096
  completedCount: newCompletedCount,
@@ -935,10 +1106,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
935
1106
  } else {
936
1107
  yield* effect.Ref.update(failedRef, (n) => n + 1);
937
1108
  }
938
- const [passed, failed] = yield* effect.Effect.all([
939
- effect.Ref.get(passedRef),
940
- effect.Ref.get(failedRef)
941
- ]);
1109
+ const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
942
1110
  yield* updateSnapshot(task.runId, (snapshot) => ({
943
1111
  ...snapshot,
944
1112
  passedTestCases: passed,
@@ -959,10 +1127,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
959
1127
  runId: task.runId,
960
1128
  startedAt
961
1129
  });
962
- const totalEvaluations = task.testCases.reduce(
963
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
964
- 0
965
- );
1130
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
966
1131
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
967
1132
  const completedRef = yield* effect.Ref.make(0);
968
1133
  const startedRef = yield* effect.Ref.make(0);
@@ -971,7 +1136,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
971
1136
  const testCaseResultsRef = yield* effect.Ref.make(
972
1137
  /* @__PURE__ */ new Map()
973
1138
  );
974
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1139
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
975
1140
  const processEvaluation = (unit) => processOneEvaluation(
976
1141
  task,
977
1142
  unit,
@@ -985,11 +1150,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
985
1150
  failedRef,
986
1151
  testCaseResultsRef
987
1152
  );
988
- yield* effect.Effect.forEach(
989
- evaluationUnits,
990
- processEvaluation,
991
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
992
- );
1153
+ const globalSem = task.globalEvaluationSemaphore;
1154
+ if (globalSem !== void 0) {
1155
+ yield* effect.Effect.forEach(
1156
+ evaluationUnits,
1157
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1158
+ { concurrency: "unbounded", discard: true }
1159
+ );
1160
+ } else {
1161
+ yield* effect.Effect.forEach(
1162
+ evaluationUnits,
1163
+ processEvaluation,
1164
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1165
+ );
1166
+ }
993
1167
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
994
1168
  effect.Ref.get(completedRef),
995
1169
  effect.Ref.get(passedRef),
@@ -1025,125 +1199,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1025
1199
  artifactPath: task.snapshot.artifactPath
1026
1200
  });
1027
1201
  });
1028
- async function loadRunSnapshotsFromArtifacts(config) {
1029
- const baseDir = path.resolve(config.artifactDirectory);
1030
- let entries;
1031
- try {
1032
- entries = await promises.readdir(baseDir);
1033
- } catch {
1034
- return [];
1035
- }
1036
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1037
- const snapshots = [];
1038
- for (const fileName of jsonlFiles) {
1039
- const filePath = path.join(baseDir, fileName);
1040
- try {
1041
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1042
- if (snapshot) {
1043
- snapshots.push(snapshot);
1044
- }
1045
- } catch {
1046
- }
1047
- }
1048
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1049
- }
1050
- async function parseArtifactToSnapshot(filePath, _config) {
1051
- const content = await promises.readFile(filePath, "utf8");
1052
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1053
- if (lines.length === 0) {
1054
- return null;
1055
- }
1056
- let runQueued = null;
1057
- let runCompleted = null;
1058
- let runFailed = null;
1059
- let runStarted = null;
1060
- for (const line of lines) {
1061
- try {
1062
- const event = JSON.parse(line);
1063
- const type = event.type;
1064
- if (type === "RunQueued") {
1065
- runQueued = {
1066
- runId: event.runId,
1067
- datasetId: event.datasetId,
1068
- datasetName: event.datasetName,
1069
- evaluatorIds: event.evaluatorIds,
1070
- totalTestCases: event.totalTestCases ?? 0,
1071
- artifactPath: event.artifactPath ?? filePath,
1072
- ts: event.ts
1073
- };
1074
- }
1075
- if (type === "RunStarted") {
1076
- runStarted = { startedAt: event.startedAt };
1077
- }
1078
- if (type === "RunCompleted") {
1079
- runCompleted = {
1080
- passedTestCases: event.passedTestCases,
1081
- failedTestCases: event.failedTestCases,
1082
- totalTestCases: event.totalTestCases,
1083
- finishedAt: event.finishedAt
1084
- };
1085
- }
1086
- if (type === "RunFailed") {
1087
- runFailed = {
1088
- finishedAt: event.finishedAt,
1089
- errorMessage: event.errorMessage
1090
- };
1091
- }
1092
- } catch {
1093
- }
1202
+
1203
+ // src/runner/name-pattern.ts
1204
+ function parseRegexLiteral(pattern) {
1205
+ if (!pattern.startsWith("/")) {
1206
+ return void 0;
1094
1207
  }
1095
- if (!runQueued) {
1096
- return null;
1208
+ const lastSlash = pattern.lastIndexOf("/");
1209
+ if (lastSlash <= 0) {
1210
+ return void 0;
1097
1211
  }
1098
- const artifactPath = filePath;
1099
- const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1100
- const progress = aggregateTestCaseProgress(lines);
1101
- const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1102
- const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1103
- const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1104
1212
  return {
1105
- runId: runQueued.runId,
1106
- datasetId: runQueued.datasetId,
1107
- datasetName: runQueued.datasetName,
1108
- evaluatorIds: runQueued.evaluatorIds,
1109
- queuedAt: runQueued.ts ?? 0,
1110
- startedAt: runStarted?.startedAt,
1111
- finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1112
- totalTestCases: runQueued.totalTestCases,
1113
- completedTestCases,
1114
- passedTestCases,
1115
- failedTestCases,
1116
- status,
1117
- artifactPath,
1118
- errorMessage: runFailed?.errorMessage
1213
+ source: pattern.slice(1, lastSlash),
1214
+ flags: pattern.slice(lastSlash + 1)
1119
1215
  };
1120
1216
  }
1121
- function aggregateTestCaseProgress(lines) {
1122
- let completedTestCases = 0;
1123
- const testCasePassedBy = /* @__PURE__ */ new Map();
1124
- for (const line of lines) {
1125
- try {
1126
- const event = JSON.parse(line);
1127
- if (event.type === "TestCaseProgress") {
1128
- const ev = event;
1129
- completedTestCases = ev.completedTestCases ?? completedTestCases;
1130
- const id = ev.testCaseId;
1131
- const current = testCasePassedBy.get(id);
1132
- testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1133
- }
1134
- } catch {
1135
- }
1217
+ function createNameMatcher(pattern) {
1218
+ const normalizedPattern = pattern.trim();
1219
+ const regexLiteral = parseRegexLiteral(normalizedPattern);
1220
+ if (regexLiteral) {
1221
+ const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1222
+ return (value) => regex.test(value);
1136
1223
  }
1137
- let passedTestCases = 0;
1138
- let failedTestCases = 0;
1139
- for (const passed of testCasePassedBy.values()) {
1140
- if (passed) {
1141
- passedTestCases += 1;
1142
- } else {
1143
- failedTestCases += 1;
1144
- }
1224
+ if (normalizedPattern.includes("*")) {
1225
+ const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1226
+ const regex = new RegExp(`^${escaped}$`, "i");
1227
+ return (value) => regex.test(value);
1145
1228
  }
1146
- return { completedTestCases, passedTestCases, failedTestCases };
1229
+ return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1147
1230
  }
1148
1231
  async function appendJsonLine(artifactPath, payload) {
1149
1232
  await promises.mkdir(path.dirname(artifactPath), { recursive: true });
@@ -1202,32 +1285,12 @@ function searchCollectedTestCases(all, query) {
1202
1285
  }
1203
1286
 
1204
1287
  // src/runner/api.ts
1205
- function parseRegexLiteral(pattern) {
1206
- if (!pattern.startsWith("/")) {
1207
- return void 0;
1208
- }
1209
- const lastSlash = pattern.lastIndexOf("/");
1210
- if (lastSlash <= 0) {
1211
- return void 0;
1212
- }
1213
- return {
1214
- source: pattern.slice(1, lastSlash),
1215
- flags: pattern.slice(lastSlash + 1)
1216
- };
1217
- }
1218
- function createNameMatcher(pattern) {
1219
- const normalizedPattern = pattern.trim();
1220
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1221
- if (regexLiteral) {
1222
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1223
- return (value) => regex.test(value);
1224
- }
1225
- if (normalizedPattern.includes("*")) {
1226
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1227
- const regex = new RegExp(`^${escaped}$`, "i");
1228
- return (value) => regex.test(value);
1288
+ function normalizeRunRepetitions(value) {
1289
+ const n = value ?? 1;
1290
+ if (!Number.isInteger(n) || n < 1) {
1291
+ throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
1229
1292
  }
1230
- return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1293
+ return n;
1231
1294
  }
1232
1295
  function mergeRunnerOverrides(base, next) {
1233
1296
  if (!base) {
@@ -1258,15 +1321,12 @@ var EffectRunner = class {
1258
1321
  this.persistenceQueue = effect.Effect.runSync(
1259
1322
  effect.Queue.unbounded()
1260
1323
  );
1261
- this.snapshotsRef = effect.Effect.runSync(
1262
- effect.Ref.make(/* @__PURE__ */ new Map())
1263
- );
1324
+ this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
1264
1325
  this.listeners = /* @__PURE__ */ new Set();
1265
1326
  this.datasetsById = /* @__PURE__ */ new Map();
1266
1327
  this.evaluatorsById = /* @__PURE__ */ new Map();
1267
- this.schedulerFiber = effect.Effect.runFork(
1268
- this.createSchedulerEffect()
1269
- );
1328
+ this.runConfigsById = /* @__PURE__ */ new Map();
1329
+ this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
1270
1330
  this.persistenceFiber = effect.Effect.runFork(
1271
1331
  createPersistenceWorker(this.persistenceQueue)
1272
1332
  );
@@ -1306,6 +1366,137 @@ var EffectRunner = class {
1306
1366
  (item) => matcher(item.evaluator.getName() ?? "")
1307
1367
  );
1308
1368
  }
1369
+ async collectRunConfigs() {
1370
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
1371
+ this.runConfigsById.clear();
1372
+ const byNameLower = /* @__PURE__ */ new Map();
1373
+ for (const item of runConfigs) {
1374
+ const id = item.runConfig.getName();
1375
+ const lower = id.toLowerCase();
1376
+ const prev = byNameLower.get(lower);
1377
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
1378
+ throw new Error(
1379
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
1380
+ );
1381
+ }
1382
+ byNameLower.set(lower, item);
1383
+ this.runConfigsById.set(id, item);
1384
+ }
1385
+ return runConfigs;
1386
+ }
1387
+ async resolveRunConfigByName(name) {
1388
+ if (this.runConfigsById.size === 0) {
1389
+ await this.collectRunConfigs();
1390
+ }
1391
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
1392
+ const keyLower = key.toLowerCase();
1393
+ const matches = Array.from(this.runConfigsById.values()).filter(
1394
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
1395
+ );
1396
+ if (matches.length === 0) {
1397
+ return void 0;
1398
+ }
1399
+ if (matches.length > 1) {
1400
+ throw new Error(
1401
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
1402
+ );
1403
+ }
1404
+ return matches[0];
1405
+ }
1406
+ async expandRunConfigToJobs(collected) {
1407
+ if (this.datasetsById.size === 0) {
1408
+ await this.collectDatasets();
1409
+ }
1410
+ if (this.evaluatorsById.size === 0) {
1411
+ await this.collectEvaluators();
1412
+ }
1413
+ const rcName = collected.runConfig.getName();
1414
+ const jobs = [];
1415
+ const runs = collected.runConfig.getRuns();
1416
+ for (const [i, row] of runs.entries()) {
1417
+ const dsCollected = Array.from(this.datasetsById.values()).find(
1418
+ (d) => d.dataset === row.dataset
1419
+ );
1420
+ if (!dsCollected) {
1421
+ throw new Error(
1422
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1423
+ );
1424
+ }
1425
+ let evaluatorIds;
1426
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
1427
+ const matcher = createNameMatcher(row.evaluatorPattern);
1428
+ const matched = Array.from(this.evaluatorsById.values()).filter(
1429
+ (item) => matcher(item.evaluator.getName() ?? "")
1430
+ );
1431
+ if (matched.length === 0) {
1432
+ throw new Error(
1433
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
1434
+ );
1435
+ }
1436
+ evaluatorIds = matched.map((item) => item.id);
1437
+ } else {
1438
+ const evaluators = row.evaluators;
1439
+ evaluatorIds = [];
1440
+ for (const ev of evaluators) {
1441
+ const found = Array.from(this.evaluatorsById.values()).find(
1442
+ (item) => item.evaluator === ev
1443
+ );
1444
+ if (!found) {
1445
+ throw new Error(
1446
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
1447
+ );
1448
+ }
1449
+ evaluatorIds.push(found.id);
1450
+ }
1451
+ }
1452
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
1453
+ jobs.push({
1454
+ datasetId: dsCollected.id,
1455
+ evaluatorIds,
1456
+ runConfigName: rcName,
1457
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
1458
+ runConfigTags: collected.runConfig.getTags(),
1459
+ repetitions
1460
+ });
1461
+ }
1462
+ return jobs;
1463
+ }
1464
+ async expandRunConfigNamesToJobs(names) {
1465
+ const jobs = [];
1466
+ for (const name of names) {
1467
+ const collected = await this.resolveRunConfigByName(name);
1468
+ if (!collected) {
1469
+ const known = await this.collectRunConfigs();
1470
+ const available = known.map((r) => r.runConfig.getName()).sort();
1471
+ throw new Error(
1472
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
1473
+ );
1474
+ }
1475
+ jobs.push(...await this.expandRunConfigToJobs(collected));
1476
+ }
1477
+ return jobs;
1478
+ }
1479
+ async runDatasetJobsWithSharedConcurrency(request) {
1480
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
1481
+ const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
1482
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
1483
+ const snapshots = [];
1484
+ for (const job of request.jobs) {
1485
+ snapshots.push(
1486
+ await this.startDatasetRun({
1487
+ datasetId: job.datasetId,
1488
+ evaluatorIds: job.evaluatorIds,
1489
+ triggerId,
1490
+ maxConcurrency: this.config.maxConcurrency ?? 1,
1491
+ globalEvaluationSemaphore: sem,
1492
+ runConfigName: job.runConfigName,
1493
+ runConfigTags: job.runConfigTags,
1494
+ repetitions: job.repetitions
1495
+ })
1496
+ );
1497
+ }
1498
+ return snapshots;
1499
+ }
1309
1500
  async searchTestCases(query) {
1310
1501
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1311
1502
  return searchCollectedTestCases(testCases, query);
@@ -1324,35 +1515,45 @@ var EffectRunner = class {
1324
1515
  );
1325
1516
  }
1326
1517
  async runDatasetWith(request) {
1518
+ const runConfigName = validateRunConfigName(
1519
+ request.runConfigName,
1520
+ "runDatasetWith.runConfigName"
1521
+ );
1522
+ return this.startDatasetRun({
1523
+ datasetId: request.datasetId,
1524
+ evaluatorIds: request.evaluatorIds,
1525
+ triggerId: request.triggerId,
1526
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1527
+ repetitions: request.repetitions,
1528
+ runConfigName,
1529
+ runConfigTags: request.runConfigTags
1530
+ });
1531
+ }
1532
+ async startDatasetRun(params) {
1327
1533
  if (this.datasetsById.size === 0) {
1328
1534
  await this.collectDatasets();
1329
1535
  }
1330
1536
  if (this.evaluatorsById.size === 0) {
1331
1537
  await this.collectEvaluators();
1332
1538
  }
1333
- const dataset = this.datasetsById.get(request.datasetId);
1539
+ const dataset = this.datasetsById.get(params.datasetId);
1334
1540
  if (!dataset) {
1335
- throw new Error(`Unknown dataset: ${request.datasetId}`);
1541
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
1336
1542
  }
1337
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1543
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1338
1544
  if (selectedEvaluators.length === 0) {
1339
1545
  throw new Error("No evaluators selected for run");
1340
1546
  }
1341
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1342
- const totalEvaluations = selectedTestCases.reduce(
1343
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1344
- 0
1345
- );
1346
- const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
1547
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
1548
+ const repetitions = normalizeRunRepetitions(params.repetitions);
1549
+ const totalEvaluations = selectedTestCases.length * repetitions;
1550
+ const runConfigTags = [...params.runConfigTags ?? []];
1551
+ const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
1347
1552
  const runId = `run-${crypto.randomUUID()}`;
1348
- const artifactPath = createArtifactPath(
1349
- this.config.artifactDirectory,
1350
- request.datasetId,
1351
- runId
1352
- );
1553
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1353
1554
  const snapshot = {
1354
1555
  runId,
1355
- datasetId: request.datasetId,
1556
+ datasetId: params.datasetId,
1356
1557
  datasetName: dataset.dataset.getName(),
1357
1558
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1358
1559
  queuedAt: Date.now(),
@@ -1373,7 +1574,7 @@ var EffectRunner = class {
1373
1574
  const queuedEvent = {
1374
1575
  type: "RunQueued",
1375
1576
  runId,
1376
- datasetId: request.datasetId,
1577
+ datasetId: params.datasetId,
1377
1578
  datasetName: dataset.dataset.getName(),
1378
1579
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1379
1580
  totalTestCases: totalEvaluations,
@@ -1387,17 +1588,20 @@ var EffectRunner = class {
1387
1588
  payload: queuedEvent
1388
1589
  })
1389
1590
  );
1390
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1391
1591
  await effect.Effect.runPromise(
1392
1592
  effect.Queue.offer(this.runQueue, {
1393
1593
  runId,
1394
1594
  triggerId,
1395
- datasetId: request.datasetId,
1595
+ datasetId: params.datasetId,
1396
1596
  dataset: dataset.dataset,
1397
1597
  evaluators: selectedEvaluators,
1398
1598
  testCases: selectedTestCases,
1399
1599
  snapshot,
1400
- maxConcurrency
1600
+ maxConcurrency: params.maxConcurrency,
1601
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
1602
+ runConfigName: params.runConfigName,
1603
+ runConfigTags,
1604
+ repetitions
1401
1605
  })
1402
1606
  );
1403
1607
  return snapshot;
@@ -1413,9 +1617,9 @@ var EffectRunner = class {
1413
1617
  return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
1414
1618
  }
1415
1619
  getAllRunSnapshots() {
1416
- return Array.from(
1417
- effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
1418
- ).sort((a, b) => b.queuedAt - a.queuedAt);
1620
+ return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
1621
+ (a, b) => b.queuedAt - a.queuedAt
1622
+ );
1419
1623
  }
1420
1624
  async loadRunSnapshotsFromArtifacts() {
1421
1625
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1476,6 +1680,8 @@ function getDefaultConcurrency() {
1476
1680
  function parseSimpleCliArgs(argv) {
1477
1681
  const args = {
1478
1682
  help: false,
1683
+ ci: false,
1684
+ runConfigNames: [],
1479
1685
  unknownArgs: []
1480
1686
  };
1481
1687
  let index = 0;
@@ -1489,18 +1695,26 @@ function parseSimpleCliArgs(argv) {
1489
1695
  args.help = true;
1490
1696
  continue;
1491
1697
  }
1698
+ if (token === "--ci") {
1699
+ args.ci = true;
1700
+ continue;
1701
+ }
1492
1702
  if ((token === "--dataset" || token === "--datasetName") && argv[index + 1]) {
1493
1703
  args.datasetName = argv[index + 1];
1494
1704
  index += 1;
1495
1705
  continue;
1496
1706
  }
1497
- if ((token === "--evaluator" || token === "--name") && argv[index + 1]) {
1498
- args.evaluatorPattern = argv[index + 1];
1707
+ if ((token === "--run-config" || token === "--runConfig") && argv[index + 1]) {
1708
+ const next = argv[index + 1];
1709
+ if (typeof next === "string") {
1710
+ args.runConfigNames.push(next);
1711
+ }
1499
1712
  index += 1;
1500
1713
  continue;
1501
1714
  }
1502
1715
  if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
1503
- const n = parseInt(argv[index + 1], 10);
1716
+ const nextConc = argv[index + 1];
1717
+ const n = typeof nextConc === "string" ? parseInt(nextConc, 10) : Number.NaN;
1504
1718
  if (!Number.isNaN(n) && n >= 1) {
1505
1719
  args.concurrency = n;
1506
1720
  }
@@ -1514,16 +1728,12 @@ function parseSimpleCliArgs(argv) {
1514
1728
  function getSimpleCliUsage() {
1515
1729
  return [
1516
1730
  "Usage:",
1517
- " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
1731
+ " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
1518
1732
  " eval-agents-simple generate --dataset <datasetName>",
1519
1733
  "",
1520
1734
  "Options:",
1521
- " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
1522
- "",
1523
- "Pattern examples for --evaluator:",
1524
- " score-evaluator exact name (case-insensitive)",
1525
- ' "*score*" wildcard pattern',
1526
- ' "/score/i" regex literal'
1735
+ " --ci With run: exit with code 1 if any test case fails.",
1736
+ " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
1527
1737
  ].join("\n");
1528
1738
  }
1529
1739
 
@@ -1557,9 +1767,9 @@ function GenerateView({
1557
1767
  datasetName,
1558
1768
  onComplete
1559
1769
  }) {
1560
- const [result, setResult] = React2.useState(null);
1561
- const [error, setError] = React2.useState(null);
1562
- React2.useEffect(() => {
1770
+ const [result, setResult] = React.useState(null);
1771
+ const [error, setError] = React.useState(null);
1772
+ React.useEffect(() => {
1563
1773
  let cancelled = false;
1564
1774
  async function run() {
1565
1775
  const dataset = await runner.resolveDatasetByName(datasetName);
@@ -1574,7 +1784,7 @@ function GenerateView({
1574
1784
  const payload = testCases.map((item) => {
1575
1785
  const tc = item.testCase;
1576
1786
  return {
1577
- name: item.testCase.getName(),
1787
+ name: getTestCaseDisplayLabel(item.testCase),
1578
1788
  input: item.testCase.getInput(),
1579
1789
  output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
1580
1790
  };
@@ -1582,12 +1792,8 @@ function GenerateView({
1582
1792
  const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
1583
1793
  const parsed = parse2(absoluteDatasetPath);
1584
1794
  const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
1585
- await writeFile2(
1586
- outputPath,
1587
- `${JSON.stringify(payload, null, 2)}
1588
- `,
1589
- "utf8"
1590
- );
1795
+ await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}
1796
+ `, "utf8");
1591
1797
  if (!cancelled) {
1592
1798
  setResult({
1593
1799
  count: payload.length,
@@ -1644,7 +1850,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1644
1850
  }
1645
1851
  const testCases = await runner.collectDatasetTestCases(dataset.id);
1646
1852
  const payload = testCases.map((item) => ({
1647
- name: item.testCase.getName(),
1853
+ name: getTestCaseDisplayLabel(item.testCase),
1648
1854
  input: item.testCase.getInput(),
1649
1855
  output: readOutput2(item.testCase)
1650
1856
  }));
@@ -1658,7 +1864,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1658
1864
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1659
1865
  return new Promise((resolve5, reject) => {
1660
1866
  const app = ink.render(
1661
- React2__namespace.default.createElement(GenerateView, {
1867
+ React__namespace.default.createElement(GenerateView, {
1662
1868
  runner,
1663
1869
  datasetName,
1664
1870
  onComplete: (err) => {
@@ -1708,8 +1914,8 @@ function TextBar({
1708
1914
  }
1709
1915
  var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
1710
1916
  function Spinner({ label = "Running" }) {
1711
- const [frame, setFrame] = React2.useState(0);
1712
- React2.useEffect(() => {
1917
+ const [frame, setFrame] = React.useState(0);
1918
+ React.useEffect(() => {
1713
1919
  const timer = setInterval(() => {
1714
1920
  setFrame((f) => (f + 1) % FRAMES.length);
1715
1921
  }, 100);
@@ -1743,9 +1949,7 @@ function createBar(value, max = 100, width = 20) {
1743
1949
  function aggregateEvaluatorScores(events, nameById) {
1744
1950
  if (events.length === 0)
1745
1951
  return [];
1746
- const evaluatorIds = new Set(
1747
- events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
1748
- );
1952
+ const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
1749
1953
  const result = [];
1750
1954
  for (const evaluatorId of evaluatorIds) {
1751
1955
  const scoreIdToItems = /* @__PURE__ */ new Map();
@@ -1775,9 +1979,7 @@ function aggregateEvaluatorScores(events, nameById) {
1775
1979
  return es?.passed ?? false;
1776
1980
  });
1777
1981
  const lastEvent = events[events.length - 1];
1778
- const lastEs = lastEvent?.evaluatorScores.find(
1779
- (x) => x.evaluatorId === evaluatorId
1780
- );
1982
+ const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1781
1983
  result.push({
1782
1984
  evaluatorId,
1783
1985
  evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
@@ -1806,46 +2008,43 @@ function formatScorePart(item, _scoreToColor, options) {
1806
2008
  }
1807
2009
  function RunView({
1808
2010
  runner,
1809
- datasetName,
1810
- evaluatorPattern,
2011
+ runConfigNames,
1811
2012
  concurrency,
1812
2013
  onComplete
1813
2014
  }) {
1814
- const [phase, setPhase] = React2.useState(
1815
- "loading"
1816
- );
1817
- const [runInfo, setRunInfo] = React2.useState(null);
1818
- const [testCases, setTestCases] = React2.useState([]);
1819
- const [startedEvaluations, setStartedEvaluations] = React2.useState(0);
1820
- const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
1821
- const [runningEvaluations, setRunningEvaluations] = React2.useState([]);
1822
- const [summary, setSummary] = React2.useState(null);
1823
- const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
1824
- const runEval = React2.useCallback(async () => {
1825
- const dataset = await runner.resolveDatasetByName(datasetName);
1826
- if (!dataset) {
1827
- const known = await runner.collectDatasets();
1828
- const available = known.map((item) => item.dataset.getName()).sort();
1829
- onComplete(
1830
- new Error(
1831
- available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
1832
- )
1833
- );
2015
+ const [phase, setPhase] = React.useState("loading");
2016
+ const [runInfo, setRunInfo] = React.useState(null);
2017
+ const [testCases, setTestCases] = React.useState([]);
2018
+ const [startedEvaluations, setStartedEvaluations] = React.useState(0);
2019
+ const [completedEvaluations, setCompletedEvaluations] = React.useState(0);
2020
+ const [runningEvaluations, setRunningEvaluations] = React.useState([]);
2021
+ const [summary, setSummary] = React.useState(null);
2022
+ const [evaluatorNameById, setEvaluatorNameById] = React.useState(/* @__PURE__ */ new Map());
2023
+ const runEval = React.useCallback(async () => {
2024
+ const rcList = runConfigNames.filter((n) => n.trim().length > 0);
2025
+ if (rcList.length === 0) {
2026
+ onComplete(new Error("At least one RunConfig name is required."));
1834
2027
  return;
1835
2028
  }
1836
- const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
1837
- if (evaluators.length === 0) {
1838
- const known = await runner.collectEvaluators();
1839
- const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
1840
- onComplete(
1841
- new Error(
1842
- available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
1843
- )
1844
- );
2029
+ setStartedEvaluations(0);
2030
+ setCompletedEvaluations(0);
2031
+ setTestCases([]);
2032
+ setRunningEvaluations([]);
2033
+ setSummary(null);
2034
+ let jobs;
2035
+ try {
2036
+ jobs = await runner.expandRunConfigNamesToJobs(rcList);
2037
+ } catch (err) {
2038
+ onComplete(err instanceof Error ? err : new Error(String(err)));
2039
+ return;
2040
+ }
2041
+ if (jobs.length === 0) {
2042
+ onComplete(new Error("No jobs expanded from RunConfigs."));
1845
2043
  return;
1846
2044
  }
2045
+ const allEvaluators = await runner.collectEvaluators();
1847
2046
  const nameById = new Map(
1848
- evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
2047
+ allEvaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
1849
2048
  );
1850
2049
  setEvaluatorNameById(nameById);
1851
2050
  const aggregates = /* @__PURE__ */ new Map();
@@ -1853,21 +2052,30 @@ function RunView({
1853
2052
  let overallScoreTotal = 0;
1854
2053
  let overallScoreSumSq = 0;
1855
2054
  let overallScoreCount = 0;
1856
- const done = new Promise((resolve5) => {
2055
+ const batchPendingRunIds = /* @__PURE__ */ new Set();
2056
+ const runIdToLabel = /* @__PURE__ */ new Map();
2057
+ let batchReady = false;
2058
+ const completedRuns = /* @__PURE__ */ new Map();
2059
+ const done = new Promise((resolve5, reject) => {
1857
2060
  const unsubscribe = runner.subscribeRunEvents((event) => {
2061
+ if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
2062
+ return;
2063
+ }
1858
2064
  if (event.type === "TestCaseStarted") {
1859
- setStartedEvaluations(event.startedTestCases);
2065
+ setStartedEvaluations((c) => c + 1);
1860
2066
  setRunningEvaluations((prev) => {
1861
2067
  const withoutDuplicate = prev.filter(
1862
- (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
2068
+ (item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
1863
2069
  );
1864
2070
  return [
1865
2071
  ...withoutDuplicate,
1866
2072
  {
2073
+ runId: event.runId,
1867
2074
  testCaseId: event.testCaseId,
1868
2075
  name: event.testCaseName,
1869
- rerunIndex: event.rerunIndex,
1870
- rerunTotal: event.rerunTotal,
2076
+ repetitionId: event.repetitionId,
2077
+ repetitionIndex: event.repetitionIndex,
2078
+ repetitionCount: event.repetitionCount,
1871
2079
  startedTestCases: event.startedTestCases,
1872
2080
  totalTestCases: event.totalTestCases
1873
2081
  }
@@ -1903,9 +2111,12 @@ function RunView({
1903
2111
  scoreItemsByEvaluatorScore.set(key, list);
1904
2112
  }
1905
2113
  }
2114
+ const label = runIdToLabel.get(event.runId);
2115
+ const compositeId = `${event.runId}:${event.testCaseId}`;
2116
+ const displayName = label !== void 0 ? `${label} \u203A ${event.testCaseName}` : event.testCaseName;
1906
2117
  setTestCases((prev) => {
1907
2118
  const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
1908
- const existing = byId.get(event.testCaseId);
2119
+ const existing = byId.get(compositeId);
1909
2120
  const newEvent = {
1910
2121
  evaluatorScores: event.evaluatorScores.map((item) => ({
1911
2122
  evaluatorId: item.evaluatorId,
@@ -1920,17 +2131,14 @@ function RunView({
1920
2131
  };
1921
2132
  const events = existing ? [...existing.events, newEvent] : [newEvent];
1922
2133
  const isAggregated = events.length > 1;
1923
- const aggregatedEvaluatorScores = aggregateEvaluatorScores(
1924
- events,
1925
- nameById
1926
- );
2134
+ const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
1927
2135
  const merged = {
1928
- name: event.testCaseName,
1929
- testCaseId: event.testCaseId,
2136
+ name: displayName,
2137
+ testCaseId: compositeId,
1930
2138
  completedTestCases: event.completedTestCases,
1931
2139
  totalTestCases: event.totalTestCases,
1932
- rerunIndex: event.rerunIndex,
1933
- rerunTotal: event.rerunTotal,
2140
+ repetitionIndex: event.repetitionIndex,
2141
+ repetitionCount: event.repetitionCount,
1934
2142
  durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1935
2143
  passed: events.every((e) => e.passed),
1936
2144
  errorMessage: event.errorMessage,
@@ -1938,84 +2146,118 @@ function RunView({
1938
2146
  aggregatedEvaluatorScores,
1939
2147
  isAggregated
1940
2148
  };
1941
- byId.set(event.testCaseId, merged);
1942
- setCompletedEvaluations(event.completedTestCases);
1943
- setRunningEvaluations(
1944
- (running) => running.filter(
1945
- (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1946
- )
1947
- );
2149
+ byId.set(compositeId, merged);
1948
2150
  return Array.from(byId.values());
1949
2151
  });
2152
+ setCompletedEvaluations((c) => c + 1);
2153
+ setRunningEvaluations(
2154
+ (running) => running.filter(
2155
+ (item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
2156
+ )
2157
+ );
1950
2158
  }
1951
- if (event.type === "RunCompleted" || event.type === "RunFailed") {
2159
+ if (event.type === "RunFailed") {
2160
+ if (batchReady && !batchPendingRunIds.has(event.runId)) {
2161
+ return;
2162
+ }
1952
2163
  unsubscribe();
1953
- resolve5(event);
2164
+ reject(new Error(`Run failed: ${event.errorMessage}`));
2165
+ return;
2166
+ }
2167
+ if (event.type === "RunCompleted") {
2168
+ if (!batchPendingRunIds.has(event.runId)) {
2169
+ return;
2170
+ }
2171
+ completedRuns.set(event.runId, event);
2172
+ batchPendingRunIds.delete(event.runId);
2173
+ if (batchPendingRunIds.size === 0) {
2174
+ unsubscribe();
2175
+ resolve5();
2176
+ }
1954
2177
  }
1955
2178
  });
1956
2179
  });
1957
- const snapshot = await runner.runDatasetWith({
1958
- datasetId: dataset.id,
1959
- evaluatorIds: evaluators.map((item) => item.id),
1960
- concurrency
2180
+ const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2181
+ jobs,
2182
+ globalConcurrency: concurrency
1961
2183
  });
2184
+ for (let i = 0; i < snapshots.length; i += 1) {
2185
+ const snap = snapshots[i];
2186
+ const job = jobs[i];
2187
+ if (snap && job) {
2188
+ runIdToLabel.set(
2189
+ snap.runId,
2190
+ `${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
2191
+ );
2192
+ batchPendingRunIds.add(snap.runId);
2193
+ }
2194
+ }
2195
+ const totalUnits = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
2196
+ batchReady = true;
2197
+ const runConfigLabels = await Promise.all(
2198
+ rcList.map(async (n) => {
2199
+ const collected = await runner.resolveRunConfigByName(n);
2200
+ return collected?.runConfig.getDisplayLabel() ?? n;
2201
+ })
2202
+ );
1962
2203
  setRunInfo({
1963
- runId: snapshot.runId,
1964
- datasetName: snapshot.datasetName,
1965
- evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
1966
- totalTestCases: snapshot.totalTestCases
2204
+ names: runConfigLabels,
2205
+ jobs: jobs.length,
2206
+ totalTestCases: totalUnits
1967
2207
  });
1968
2208
  setPhase("running");
1969
- const finalEvent = await done;
1970
- if (finalEvent.type === "RunFailed") {
1971
- onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
2209
+ try {
2210
+ await done;
2211
+ } catch (err) {
2212
+ onComplete(err instanceof Error ? err : new Error(String(err)));
1972
2213
  return;
1973
2214
  }
1974
- const completed = finalEvent;
2215
+ let passedTestCases = 0;
2216
+ let failedTestCases = 0;
2217
+ let totalTestCases = 0;
2218
+ const artifacts = [];
2219
+ for (const ev of completedRuns.values()) {
2220
+ passedTestCases += ev.passedTestCases;
2221
+ failedTestCases += ev.failedTestCases;
2222
+ totalTestCases += ev.totalTestCases;
2223
+ artifacts.push(ev.artifactPath);
2224
+ }
1975
2225
  setSummary({
1976
- passedTestCases: completed.passedTestCases,
1977
- failedTestCases: completed.failedTestCases,
1978
- totalTestCases: completed.totalTestCases,
2226
+ passedTestCases,
2227
+ failedTestCases,
2228
+ totalTestCases,
1979
2229
  overallScoreTotal,
1980
2230
  overallScoreSumSq,
1981
2231
  overallScoreCount,
1982
2232
  aggregates: new Map(aggregates),
1983
2233
  scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1984
- artifactPath: completed.artifactPath
2234
+ artifactPath: artifacts.join("\n")
1985
2235
  });
1986
2236
  setPhase("completed");
1987
- setTimeout(() => onComplete(), 200);
1988
- }, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
1989
- React2.useEffect(() => {
2237
+ const exitCode = failedTestCases > 0 ? 1 : 0;
2238
+ setTimeout(() => onComplete(void 0, exitCode), 200);
2239
+ }, [runner, runConfigNames, concurrency, onComplete]);
2240
+ React.useEffect(() => {
1990
2241
  void runEval();
1991
2242
  }, [runEval]);
1992
2243
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
1993
2244
  /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
1994
2245
  runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
1995
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1996
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
1997
- "Run",
1998
- " "
1999
- ] }),
2000
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
2001
- ] }),
2002
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
2003
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
2004
- "Dataset",
2005
- " "
2006
- ] }),
2007
- runInfo.datasetName
2246
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
2247
+ "RunConfigs",
2248
+ " "
2008
2249
  ] }),
2250
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.names.join(", ") }),
2009
2251
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
2010
2252
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
2011
- "Evaluators",
2253
+ "Jobs",
2012
2254
  " "
2013
2255
  ] }),
2014
- runInfo.evaluatorNames.join(", ")
2256
+ runInfo.jobs
2015
2257
  ] }),
2016
2258
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
2017
2259
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", bold: true, children: [
2018
- "Test cases",
2260
+ "Evaluation units",
2019
2261
  " "
2020
2262
  ] }),
2021
2263
  runInfo.totalTestCases
@@ -2037,20 +2279,19 @@ function RunView({
2037
2279
  item.startedTestCases,
2038
2280
  "/",
2039
2281
  item.totalTestCases,
2040
- "]",
2041
- " ",
2282
+ "] ",
2042
2283
  item.name,
2043
2284
  " ",
2044
2285
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2045
2286
  "(",
2046
- item.rerunIndex,
2287
+ item.repetitionIndex,
2047
2288
  "/",
2048
- item.rerunTotal,
2289
+ item.repetitionCount,
2049
2290
  ")"
2050
2291
  ] })
2051
2292
  ]
2052
2293
  },
2053
- `${item.testCaseId}:${item.rerunIndex}`
2294
+ `${item.runId ?? ""}:${item.testCaseId}:${item.repetitionId}:${item.repetitionIndex}`
2054
2295
  )) })
2055
2296
  ] }),
2056
2297
  testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
@@ -2067,9 +2308,9 @@ function RunView({
2067
2308
  " ",
2068
2309
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
2069
2310
  "(",
2070
- tc.rerunIndex,
2311
+ tc.repetitionIndex,
2071
2312
  "/",
2072
- tc.rerunTotal,
2313
+ tc.repetitionCount,
2073
2314
  ")"
2074
2315
  ] }),
2075
2316
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
@@ -2083,73 +2324,70 @@ function RunView({
2083
2324
  ] }) : null
2084
2325
  ] }),
2085
2326
  tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: tc.errorMessage }) : null,
2086
- tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
2087
- ink.Box,
2088
- {
2089
- flexDirection: "column",
2090
- marginLeft: 2,
2091
- children: [
2092
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
2093
- item.evaluatorName,
2094
- ":",
2095
- " ",
2096
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
2097
- item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2327
+ tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
2328
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
2329
+ item.evaluatorName,
2330
+ ":",
2331
+ " ",
2332
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
2333
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2334
+ " ",
2335
+ item.metrics.map((m) => {
2336
+ const def = getMetricById(m.id);
2337
+ if (!def)
2338
+ return null;
2339
+ const formatted = def.format(m.data, {
2340
+ isAggregated: tc.isAggregated
2341
+ });
2342
+ const label = m.name ?? def.name;
2343
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2344
+ "[",
2345
+ label ? `${label}: ` : "",
2346
+ formatted,
2347
+ "]",
2348
+ " "
2349
+ ] }, m.id);
2350
+ })
2351
+ ] }) : null
2352
+ ] }),
2353
+ item.scores.length > 0 ? item.scores.map((s) => {
2354
+ const def = s.def ?? getScoreById(s.id);
2355
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2356
+ return /* @__PURE__ */ jsxRuntime.jsxs(
2357
+ ink.Text,
2358
+ {
2359
+ color: scoreColor(toNumericScore(s.data) ?? 0),
2360
+ children: [
2361
+ " ",
2362
+ scoreLabel,
2363
+ ":",
2098
2364
  " ",
2099
- item.metrics.map((m) => {
2100
- const def = getMetricById(m.id);
2101
- if (!def)
2102
- return null;
2103
- const formatted = def.format(m.data, {
2104
- isAggregated: tc.isAggregated
2105
- });
2106
- const label = m.name ?? def.name;
2107
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2108
- "[",
2109
- label ? `${label}: ` : "",
2110
- formatted,
2111
- "]",
2112
- " "
2113
- ] }, m.id);
2365
+ formatScorePart(s, scoreColor, {
2366
+ isAggregated: tc.isAggregated
2114
2367
  })
2115
- ] }) : null
2116
- ] }),
2117
- item.scores.length > 0 ? item.scores.map((s, idx) => {
2118
- const def = s.def ?? getScoreById(s.id);
2119
- const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2120
- return /* @__PURE__ */ jsxRuntime.jsxs(
2368
+ ]
2369
+ },
2370
+ `${item.evaluatorId}-${s.id}-${scoreLabel}`
2371
+ );
2372
+ }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
2373
+ !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
2374
+ (log) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(
2375
+ ink.Box,
2376
+ {
2377
+ flexDirection: "column",
2378
+ children: getDiffLines(log).map(({ type, line }) => /* @__PURE__ */ jsxRuntime.jsx(
2121
2379
  ink.Text,
2122
2380
  {
2123
- color: scoreColor(toNumericScore(s.data) ?? 0),
2124
- children: [
2125
- " ",
2126
- scoreLabel,
2127
- ":",
2128
- " ",
2129
- formatScorePart(s, scoreColor, {
2130
- isAggregated: tc.isAggregated
2131
- })
2132
- ]
2381
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
2382
+ children: line
2133
2383
  },
2134
- `${item.evaluatorId}-${s.id}-${idx}`
2135
- );
2136
- }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
2137
- !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
2138
- (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
2139
- ({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
2140
- ink.Text,
2141
- {
2142
- color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
2143
- children: line
2144
- },
2145
- lineIdx
2146
- )
2147
- ) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
2148
- ) })
2149
- ]
2150
- },
2151
- item.evaluatorId
2152
- ))
2384
+ `${type}:${line}`
2385
+ ))
2386
+ },
2387
+ `diff:${getDiffLines(log).map((x) => x.line).join("|")}`
2388
+ ) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, line)) }, `log:${getLogLines(log).join("\n")}`) : null
2389
+ ) })
2390
+ ] }, item.evaluatorId))
2153
2391
  ] }, tc.testCaseId)) }),
2154
2392
  phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
2155
2393
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
@@ -2191,9 +2429,9 @@ function RunView({
2191
2429
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
2192
2430
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
2193
2431
  const agg = summary.aggregates.get(id);
2194
- const scoreKeys = [
2195
- ...summary.scoreItemsByEvaluatorScore?.keys() ?? []
2196
- ].filter((k) => k.startsWith(`${id}:`));
2432
+ const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
2433
+ (k) => k.startsWith(`${id}:`)
2434
+ );
2197
2435
  if (scoreKeys.length === 0) {
2198
2436
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2199
2437
  "- ",
@@ -2223,19 +2461,12 @@ function RunView({
2223
2461
  const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
2224
2462
  const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
2225
2463
  const numeric = toNumericScore(aggregated.data);
2226
- return /* @__PURE__ */ jsxRuntime.jsxs(
2227
- ink.Text,
2228
- {
2229
- color: numeric !== void 0 ? scoreColor(numeric) : "gray",
2230
- children: [
2231
- " ",
2232
- label,
2233
- ": ",
2234
- formatted
2235
- ]
2236
- },
2237
- key
2238
- );
2464
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: numeric !== void 0 ? scoreColor(numeric) : "gray", children: [
2465
+ " ",
2466
+ label,
2467
+ ": ",
2468
+ formatted
2469
+ ] }, key);
2239
2470
  })
2240
2471
  ] }, id);
2241
2472
  })
@@ -2278,10 +2509,10 @@ function RunView({
2278
2509
  ] }, tc.testCaseId);
2279
2510
  })
2280
2511
  ] }),
2281
- /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2282
- "artifact: ",
2283
- summary.artifactPath
2284
- ] }) })
2512
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
2513
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "artifact(s):" }),
2514
+ summary.artifactPath.split("\n").map((line) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, line))
2515
+ ] })
2285
2516
  ] })
2286
2517
  ] });
2287
2518
  }
@@ -2311,9 +2542,7 @@ function buildTestCaseSummaries(byId) {
2311
2542
  for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
2312
2543
  const scoreIdToItems = /* @__PURE__ */ new Map();
2313
2544
  for (const ev of events) {
2314
- const es = ev.evaluatorScores.find(
2315
- (x) => x.evaluatorId === evaluatorScores.evaluatorId
2316
- );
2545
+ const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorScores.evaluatorId);
2317
2546
  for (const s of es?.scores ?? []) {
2318
2547
  const list = scoreIdToItems.get(s.id) ?? [];
2319
2548
  list.push(s);
@@ -2366,9 +2595,7 @@ function scoreToColor(score) {
2366
2595
  }
2367
2596
  function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
2368
2597
  const lines = [];
2369
- const scoreKeys = [...scoreItemsByKey.keys()].filter(
2370
- (k) => k.startsWith(`${evaluatorId}:`)
2371
- );
2598
+ const scoreKeys = [...scoreItemsByKey.keys()].filter((k) => k.startsWith(`${evaluatorId}:`));
2372
2599
  if (scoreKeys.length === 0) {
2373
2600
  lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
2374
2601
  return lines;
@@ -2403,9 +2630,7 @@ function createBar2(value, max = 100, width = 20) {
2403
2630
  function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
2404
2631
  if (events.length === 0)
2405
2632
  return [];
2406
- const evaluatorIds = new Set(
2407
- events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
2408
- );
2633
+ const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
2409
2634
  const result = [];
2410
2635
  for (const evaluatorId of evaluatorIds) {
2411
2636
  const scoreIdToItems = /* @__PURE__ */ new Map();
@@ -2452,9 +2677,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2452
2677
  if (def) {
2453
2678
  const formatted = def.format(m.data, options);
2454
2679
  const label = m.name ?? def.name;
2455
- metricParts.push(
2456
- label ? `[${label}: ${formatted}]` : `[${formatted}]`
2457
- );
2680
+ metricParts.push(label ? `[${label}: ${formatted}]` : `[${formatted}]`);
2458
2681
  }
2459
2682
  }
2460
2683
  }
@@ -2501,25 +2724,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2501
2724
  }
2502
2725
  return lines;
2503
2726
  }
2504
- async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
2505
- const dataset = await runner.resolveDatasetByName(datasetName);
2506
- if (!dataset) {
2507
- const known = await runner.collectDatasets();
2508
- const available = known.map((item) => item.dataset.getName()).sort();
2509
- throw new Error(
2510
- available.length > 0 ? `Dataset "${datasetName}" not found. Available datasets: ${available.join(", ")}` : `Dataset "${datasetName}" not found and no datasets were discovered.`
2511
- );
2512
- }
2513
- const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
2514
- if (evaluators.length === 0) {
2515
- const known = await runner.collectEvaluators();
2516
- const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
2517
- throw new Error(
2518
- available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available evaluators: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}" and no evaluators were discovered.`
2519
- );
2727
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
2728
+ const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2729
+ if (jobs.length === 0) {
2730
+ throw new Error("No jobs expanded from RunConfigs.");
2520
2731
  }
2732
+ const evaluators = await runner.collectEvaluators();
2521
2733
  const evaluatorNameById = new Map(
2522
- evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
2734
+ evaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
2523
2735
  );
2524
2736
  const aggregates = /* @__PURE__ */ new Map();
2525
2737
  const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
@@ -2527,11 +2739,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2527
2739
  let overallScoreTotal = 0;
2528
2740
  let overallScoreSumSq = 0;
2529
2741
  let overallScoreCount = 0;
2530
- let startedCount = 0;
2531
- let completedCount = 0;
2742
+ let globalStartedUnits = 0;
2743
+ let globalCompletedUnits = 0;
2532
2744
  let totalCount = 0;
2533
2745
  let runFinished = false;
2534
- const inFlightReruns = /* @__PURE__ */ new Set();
2746
+ const inFlightRepetitions = /* @__PURE__ */ new Set();
2535
2747
  const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
2536
2748
  let spinnerIndex = 0;
2537
2749
  function clearLine() {
@@ -2553,33 +2765,46 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2553
2765
  spinnerIndex += 1;
2554
2766
  process.stdout.write(
2555
2767
  `\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
2556
- `${completedCount}/${totalCount}`,
2768
+ `${globalCompletedUnits}/${totalCount}`,
2557
2769
  ansi2.bold
2558
- )} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
2770
+ )} completed ${colorize(`${globalStartedUnits}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightRepetitions.size} running)`, ansi2.dim)}`
2559
2771
  );
2560
2772
  }
2561
2773
  let lastPrintedTestCaseId = null;
2562
2774
  let lastPrintedLineCount = 0;
2563
2775
  let spinnerTimer;
2564
- const done = new Promise((resolve5) => {
2776
+ const batchPendingRunIds = /* @__PURE__ */ new Set();
2777
+ const runIdToLabel = /* @__PURE__ */ new Map();
2778
+ let batchReady = false;
2779
+ const completedRuns = /* @__PURE__ */ new Map();
2780
+ const done = new Promise((resolve5, reject) => {
2565
2781
  const unsubscribe = runner.subscribeRunEvents((event) => {
2782
+ if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
2783
+ return;
2784
+ }
2785
+ const rowPrefix = typeof event.runId === "string" ? runIdToLabel.get(event.runId) : void 0;
2786
+ const pfx = rowPrefix !== void 0 ? `${colorize(`[${rowPrefix}]`, ansi2.dim)} ` : "";
2566
2787
  if (event.type === "TestCaseStarted") {
2567
- startedCount = event.startedTestCases;
2568
- inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
2788
+ globalStartedUnits += 1;
2789
+ inFlightRepetitions.add(
2790
+ `${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
2791
+ );
2569
2792
  clearLine();
2570
2793
  process.stdout.write(
2571
- `${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
2794
+ `${pfx}${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
2572
2795
  `
2573
2796
  );
2574
2797
  drawSpinner();
2575
2798
  }
2576
2799
  if (event.type === "TestCaseProgress") {
2577
- completedCount = event.completedTestCases;
2578
- inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
2800
+ globalCompletedUnits += 1;
2801
+ inFlightRepetitions.delete(
2802
+ `${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
2803
+ );
2579
2804
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
2580
2805
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
2581
- const testCaseId = event.testCaseId;
2582
- const existing = testCaseByTestId.get(testCaseId) ?? {
2806
+ const compositeId = `${event.runId}:${event.testCaseId}`;
2807
+ const existing = testCaseByTestId.get(compositeId) ?? {
2583
2808
  name: event.testCaseName,
2584
2809
  events: []
2585
2810
  };
@@ -2589,7 +2814,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2589
2814
  durationMs: event.durationMs,
2590
2815
  evaluatorScores: event.evaluatorScores
2591
2816
  });
2592
- testCaseByTestId.set(testCaseId, existing);
2817
+ testCaseByTestId.set(compositeId, existing);
2593
2818
  for (const item of event.evaluatorScores) {
2594
2819
  const numeric = toNumericScoreFromScores(item.scores);
2595
2820
  if (numeric !== void 0) {
@@ -2618,24 +2843,21 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2618
2843
  scoreItemsByEvaluatorScore.set(key, list);
2619
2844
  }
2620
2845
  }
2621
- const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2622
- const isLastRerun = event.rerunIndex >= event.rerunTotal;
2846
+ const isSameTestCase = lastPrintedTestCaseId === compositeId;
2847
+ const isLastRepetition = event.repetitionIndex >= event.repetitionCount;
2623
2848
  const isNonTty = !process.stdout.isTTY;
2624
- const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
2849
+ const skipPrintNonTty = isNonTty && event.repetitionCount > 1 && !isLastRepetition;
2625
2850
  if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
2626
2851
  cursorUp(lastPrintedLineCount);
2627
2852
  }
2628
2853
  const aggregatedScores = aggregateEvaluatorScoresFromEvents(
2629
2854
  existing.events);
2630
2855
  const isAggregated = existing.events.length > 1;
2631
- const durationMs = existing.events.reduce(
2632
- (s, e) => s + e.durationMs,
2633
- 0
2634
- );
2856
+ const durationMs = existing.events.reduce((s, e) => s + e.durationMs, 0);
2635
2857
  const lines = [];
2636
2858
  const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
2637
2859
  lines.push(
2638
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
2860
+ `${pfx}${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
2639
2861
  );
2640
2862
  if (event.errorMessage) {
2641
2863
  lines.push(colorize(event.errorMessage, ansi2.red));
@@ -2643,18 +2865,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2643
2865
  for (const item of aggregatedScores) {
2644
2866
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2645
2867
  lines.push(
2646
- ...formatEvaluatorScoreLine(
2647
- name,
2648
- item.scores,
2649
- item.passed,
2650
- item.metrics,
2651
- { isAggregated }
2652
- )
2868
+ ...formatEvaluatorScoreLine(name, item.scores, item.passed, item.metrics, {
2869
+ isAggregated
2870
+ })
2653
2871
  );
2654
2872
  const lastEvent = existing.events[existing.events.length - 1];
2655
- const lastEs = lastEvent?.evaluatorScores.find(
2656
- (x) => x.evaluatorId === item.evaluatorId
2657
- );
2873
+ const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === item.evaluatorId);
2658
2874
  if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
2659
2875
  for (const log of lastEs.logs) {
2660
2876
  if (log.type === "diff") {
@@ -2672,73 +2888,102 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2672
2888
  }
2673
2889
  }
2674
2890
  if (!skipPrintNonTty) {
2675
- for (let i = 0; i < lines.length; i++) {
2891
+ for (let i = 0; i < lines.length; i += 1) {
2676
2892
  process.stdout.write(`\r\x1B[2K${lines[i]}
2677
2893
  `);
2678
2894
  }
2679
- lastPrintedTestCaseId = testCaseId;
2895
+ lastPrintedTestCaseId = compositeId;
2680
2896
  lastPrintedLineCount = lines.length;
2681
2897
  }
2682
2898
  drawSpinner();
2683
2899
  }
2684
- if (event.type === "RunCompleted" || event.type === "RunFailed") {
2900
+ if (event.type === "RunFailed") {
2901
+ if (batchReady && !batchPendingRunIds.has(event.runId)) {
2902
+ return;
2903
+ }
2685
2904
  runFinished = true;
2686
2905
  clearLine();
2687
2906
  unsubscribe();
2688
- resolve5(event);
2907
+ reject(new Error(`Run failed: ${event.errorMessage}`));
2908
+ return;
2909
+ }
2910
+ if (event.type === "RunCompleted") {
2911
+ if (!batchPendingRunIds.has(event.runId)) {
2912
+ return;
2913
+ }
2914
+ completedRuns.set(event.runId, event);
2915
+ batchPendingRunIds.delete(event.runId);
2916
+ if (batchPendingRunIds.size === 0) {
2917
+ runFinished = true;
2918
+ clearLine();
2919
+ unsubscribe();
2920
+ resolve5();
2921
+ }
2689
2922
  }
2690
2923
  });
2691
2924
  });
2692
- const snapshot = await runner.runDatasetWith({
2693
- datasetId: dataset.id,
2694
- evaluatorIds: evaluators.map((item) => item.id),
2695
- concurrency
2925
+ console.log(colorize("=== Eval Run Started (RunConfigs) ===", `${ansi2.bold}${ansi2.cyan}`));
2926
+ for (const name of runConfigNames) {
2927
+ const collected = await runner.resolveRunConfigByName(name);
2928
+ const label = collected?.runConfig.getDisplayLabel() ?? name;
2929
+ console.log(`RunConfig: ${colorize(label, ansi2.bold)}`);
2930
+ }
2931
+ console.log(`Jobs: ${colorize(String(jobs.length), ansi2.bold)}`);
2932
+ console.log(`Shared concurrency: ${colorize(String(concurrency), ansi2.bold)}`);
2933
+ console.log("");
2934
+ const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2935
+ jobs,
2936
+ globalConcurrency: concurrency
2696
2937
  });
2697
- totalCount = snapshot.totalTestCases;
2698
- console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
2699
- console.log(`Run: ${colorize(snapshot.runId, ansi2.cyan)}`);
2700
- console.log(`Dataset: ${colorize(snapshot.datasetName, ansi2.bold)}`);
2701
- console.log(
2702
- `Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
2703
- );
2704
- console.log(
2705
- `Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`
2706
- );
2938
+ for (let i = 0; i < snapshots.length; i += 1) {
2939
+ const snap = snapshots[i];
2940
+ const job = jobs[i];
2941
+ if (snap && job) {
2942
+ runIdToLabel.set(
2943
+ snap.runId,
2944
+ `${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
2945
+ );
2946
+ batchPendingRunIds.add(snap.runId);
2947
+ }
2948
+ }
2949
+ totalCount = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
2950
+ console.log(`Total evaluation units: ${colorize(String(totalCount), ansi2.bold)}`);
2707
2951
  console.log("");
2952
+ batchReady = true;
2708
2953
  drawSpinner();
2709
2954
  spinnerTimer = setInterval(drawSpinner, 100);
2710
- const finalEvent = await done;
2955
+ await done;
2711
2956
  if (spinnerTimer) {
2712
2957
  clearInterval(spinnerTimer);
2713
2958
  }
2714
- if (finalEvent.type === "RunFailed") {
2715
- throw new Error(`Run failed: ${finalEvent.errorMessage}`);
2716
- }
2717
- const completed = finalEvent;
2718
2959
  console.log("");
2719
- console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
2720
- console.log(
2721
- `- passed: ${colorize(
2722
- `${completed.passedTestCases}/${completed.totalTestCases}`,
2723
- ansi2.green
2724
- )}`
2725
- );
2726
- console.log(
2727
- `- failed: ${colorize(
2728
- `${completed.failedTestCases}/${completed.totalTestCases}`,
2729
- completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
2730
- )}`
2731
- );
2960
+ console.log(colorize("=== Run Summary (all jobs) ===", `${ansi2.bold}${ansi2.cyan}`));
2961
+ for (const snap of snapshots) {
2962
+ const completed = completedRuns.get(snap.runId);
2963
+ if (!completed) {
2964
+ continue;
2965
+ }
2966
+ const label = runIdToLabel.get(snap.runId) ?? snap.runId;
2967
+ console.log("");
2968
+ console.log(colorize(`\u2014 ${label}`, ansi2.magenta));
2969
+ console.log(
2970
+ `- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
2971
+ );
2972
+ console.log(
2973
+ `- failed: ${colorize(
2974
+ `${completed.failedTestCases}/${completed.totalTestCases}`,
2975
+ completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
2976
+ )}`
2977
+ );
2978
+ console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2979
+ }
2732
2980
  if (overallScoreCount > 0) {
2733
2981
  const overallAverage = overallScoreTotal / overallScoreCount;
2734
- const overallSd = sampleStdDev2(
2735
- overallScoreTotal,
2736
- overallScoreSumSq,
2737
- overallScoreCount
2738
- );
2982
+ const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
2739
2983
  const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
2984
+ console.log("");
2740
2985
  console.log(
2741
- `- overall avg score: ${colorize(
2986
+ `- overall avg score (all jobs): ${colorize(
2742
2987
  avgStr,
2743
2988
  scoreToColor(overallAverage)
2744
2989
  )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
@@ -2779,22 +3024,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2779
3024
  );
2780
3025
  }
2781
3026
  }
2782
- console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
3027
+ let failedTestCasesTotal = 0;
3028
+ for (const snap of snapshots) {
3029
+ const completed = completedRuns.get(snap.runId);
3030
+ if (completed) {
3031
+ failedTestCasesTotal += completed.failedTestCases;
3032
+ }
3033
+ }
3034
+ return failedTestCasesTotal > 0 ? 1 : 0;
2783
3035
  }
2784
- async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
3036
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
2785
3037
  return new Promise((resolve5, reject) => {
2786
3038
  const app = ink.render(
2787
- React2__namespace.createElement(RunView, {
3039
+ React__namespace.createElement(RunView, {
2788
3040
  runner,
2789
- datasetName,
2790
- evaluatorPattern,
3041
+ runConfigNames,
2791
3042
  concurrency,
2792
- onComplete: (err) => {
3043
+ onComplete: (err, exitCode) => {
2793
3044
  app.unmount();
2794
3045
  if (err) {
2795
3046
  reject(err);
2796
3047
  } else {
2797
- resolve5();
3048
+ resolve5(exitCode ?? 0);
2798
3049
  }
2799
3050
  }
2800
3051
  })
@@ -2820,12 +3071,22 @@ async function main() {
2820
3071
  if (!args.command) {
2821
3072
  printUsageAndExit(1);
2822
3073
  }
2823
- if (!args.datasetName) {
2824
- console.error("Missing required --dataset <datasetName> argument.");
2825
- printUsageAndExit(1);
3074
+ if (args.command === "run") {
3075
+ if (args.runConfigNames.length === 0) {
3076
+ console.error(
3077
+ "Missing required --run-config <name> (repeat the flag to queue multiple RunConfigs)."
3078
+ );
3079
+ printUsageAndExit(1);
3080
+ }
3081
+ if (args.datasetName !== void 0) {
3082
+ console.error(
3083
+ "The run command no longer accepts --dataset; use --run-config <RunConfig name>."
3084
+ );
3085
+ printUsageAndExit(1);
3086
+ }
2826
3087
  }
2827
- if (args.command === "run" && !args.evaluatorPattern) {
2828
- console.error("Missing required --evaluator <name-or-pattern> argument.");
3088
+ if (args.command === "generate" && args.runConfigNames.length > 0) {
3089
+ console.error("generate does not accept --run-config.");
2829
3090
  printUsageAndExit(1);
2830
3091
  }
2831
3092
  const useInk = process.stdout.isTTY === true;
@@ -2836,17 +3097,24 @@ async function main() {
2836
3097
  try {
2837
3098
  if (args.command === "run") {
2838
3099
  const concurrency = args.concurrency ?? getDefaultConcurrency();
2839
- await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
3100
+ const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
2840
3101
  runner,
2841
- args.datasetName,
2842
- args.evaluatorPattern,
3102
+ args.runConfigNames,
2843
3103
  concurrency
2844
3104
  );
3105
+ if (args.ci && exitCode !== 0) {
3106
+ process.exit(1);
3107
+ }
2845
3108
  return;
2846
3109
  }
3110
+ const genDataset = args.datasetName;
3111
+ if (!genDataset) {
3112
+ console.error("Missing required --dataset <datasetName> argument.");
3113
+ printUsageAndExit(1);
3114
+ }
2847
3115
  await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
2848
3116
  runner,
2849
- args.datasetName
3117
+ genDataset
2850
3118
  );
2851
3119
  } finally {
2852
3120
  await runner.shutdown();