@m4trix/evals 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,35 +1,191 @@
1
1
  #!/usr/bin/env node
2
2
  import { randomUUID } from 'crypto';
3
- import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
3
+ import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
4
+ import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
5
+ import { resolve, join, relative, parse, dirname } from 'path';
4
6
  import { existsSync } from 'fs';
5
- import { resolve, relative, join, parse, dirname } from 'path';
6
7
  import * as jitiModule from 'jiti';
7
- import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
9
  import { diffLines } from 'diff';
10
10
  import stringify from 'fast-json-stable-stringify';
11
- import * as React2 from 'react';
12
- import React2__default, { useState, useEffect, useCallback } from 'react';
11
+ import * as React from 'react';
12
+ import React__default, { useState, useEffect, useCallback } from 'react';
13
13
  import { render, Box, Text } from 'ink';
14
14
  import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
15
15
 
16
// Character set accepted for entity identifiers: ASCII letters, digits, "_" and "-".
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;

/**
 * Builds a branded string schema for an entity identifier.
 *
 * The schema pipes `Schema.String` through, in order: `Schema.trimmed()`,
 * a minimum length of 1, the `ENTITY_ID_PATTERN` check, and `Schema.brand(brand)`.
 *
 * @param {string} brand - Brand name handed to `Schema.brand`.
 * @param {string} label - Prefix used in the two custom validation messages.
 * @returns The piped schema.
 */
function makeEntityIdSchema(brand, label) {
  // Messages are thunks so they are only rendered when a check actually fails.
  const describeEmpty = () => `${label} must be non-empty.`;
  const describeBadCharacters = () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`;
  return Schema.String.pipe(
    Schema.trimmed(),
    Schema.minLength(1, { message: describeEmpty }),
    Schema.pattern(ENTITY_ID_PATTERN, { message: describeBadCharacters }),
    Schema.brand(brand)
  );
}

// Schema consumed by validateRunConfigName.
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
// The evaluator and test-case schemas are constructed here but not bound to a name.
makeEntityIdSchema("EvaluatorName", "Evaluator name");
makeEntityIdSchema("TestCaseName", "Test case name");
32
/**
 * Trims `raw`, decodes it against `schema`, and returns the decoded value.
 *
 * @param schema - Schema passed to `Schema.decodeUnknownEither`.
 * @param {string} raw - Candidate value; surrounding whitespace is trimmed before decoding.
 * @param {string} context - Prefix for every error message thrown from here.
 * @returns The decoded (right) value.
 * @throws {Error} `${context}: …` when `raw` is not a string or fails the schema.
 */
function validateWithSchema(schema, raw, context) {
  // Guard first: calling .trim() on a non-string would raise a TypeError
  // that carries none of the caller-supplied context.
  if (typeof raw !== "string") {
    const received = raw === null ? "null" : typeof raw;
    throw new Error(`${context}: expected a string, got ${received}`);
  }
  const decode = Schema.decodeUnknownEither(schema);
  const result = decode(raw.trim());
  if (Either.isLeft(result)) {
    throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
  }
  return result.right;
}
43
/**
 * Validates a RunConfig name by delegating to validateWithSchema with
 * RunConfigNameSchema.
 *
 * @param raw - Candidate name.
 * @param context - Prefix used in the error message on failure.
 * @returns Whatever validateWithSchema returns for this schema.
 */
function validateRunConfigName(raw, context) {
  const validatedName = validateWithSchema(RunConfigNameSchema, raw, context);
  return validatedName;
}
46
+
47
+ // src/evals/evaluator.ts
48
/**
 * Picks the label to show for an evaluator.
 *
 * Uses `getDisplayLabel()` when that method exists and returns anything other
 * than `undefined`; otherwise uses `getName()` when that method exists.
 *
 * @param evaluator - Object that may expose `getDisplayLabel` and/or `getName`.
 * @returns The chosen label, or `undefined` when neither source yields one.
 */
function getEvaluatorDisplayLabel(evaluator) {
  const hasLabelGetter = typeof evaluator.getDisplayLabel === "function";
  const explicitLabel = hasLabelGetter ? evaluator.getDisplayLabel() : void 0;
  if (explicitLabel !== void 0) {
    return explicitLabel;
  }
  if (typeof evaluator.getName !== "function") {
    return void 0;
  }
  return evaluator.getName();
}
57
/**
 * Returns a fresh array copy of an evaluator's tags.
 *
 * @param evaluator - Object that may expose `getTags()` returning an iterable.
 * @returns A new array with the tags, or `[]` when `getTags` is not a function.
 */
function getEvaluatorTagList(evaluator) {
  if (typeof evaluator.getTags !== "function") {
    return [];
  }
  const tags = evaluator.getTags();
  return [...tags];
}
60
/**
 * Rebuilds run snapshots from the `.jsonl` files in the artifact directory.
 *
 * Best-effort: an unreadable directory yields `[]`, and any file whose parsing
 * throws is skipped. Files are parsed one after another.
 *
 * @param config - Runner config; `config.artifactDirectory` is resolved to an absolute path.
 * @returns Snapshots sorted by `queuedAt`, highest first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = resolve(config.artifactDirectory);
  let directoryListing;
  try {
    directoryListing = await readdir(artifactRoot);
  } catch {
    // The directory cannot be listed: report no runs.
    return [];
  }
  const collected = [];
  for (const entryName of directoryListing) {
    if (!entryName.endsWith(".jsonl")) {
      continue;
    }
    try {
      const parsed = await parseArtifactToSnapshot(join(artifactRoot, entryName), config);
      if (parsed) {
        collected.push(parsed);
      }
    } catch {
      // An artifact that fails to parse must not hide the others.
    }
  }
  collected.sort((left, right) => right.queuedAt - left.queuedAt);
  return collected;
}
82
/**
 * Reads one JSONL artifact and folds its run-level events into a snapshot.
 *
 * Blank lines are dropped and lines that fail to parse are ignored. For each
 * of the event types "RunQueued", "RunStarted", "RunCompleted" and "RunFailed",
 * the last occurrence in the file wins.
 *
 * @param {string} filePath - Path of the artifact; also reported as `artifactPath`.
 * @param _config - Unused.
 * @returns The snapshot, or `null` when the file has no non-blank lines or no
 *   "RunQueued" event.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const rawText = await readFile(filePath, "utf8");
  const eventLines = rawText.split("\n").filter((candidate) => candidate.trim().length > 0);
  if (eventLines.length === 0) {
    return null;
  }
  let runQueued = null;
  let runStarted = null;
  let runCompleted = null;
  let runFailed = null;
  for (const eventLine of eventLines) {
    try {
      const event = JSON.parse(eventLine);
      switch (event.type) {
        case "RunQueued":
          runQueued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            ts: event.ts
          };
          break;
        case "RunStarted":
          runStarted = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          runCompleted = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          runFailed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Unparseable line (or a JSON `null`): skip it and keep scanning.
    }
  }
  if (!runQueued) {
    return null;
  }
  // Precedence: failed > completed > running > queued.
  let status = "queued";
  if (runFailed) {
    status = "failed";
  } else if (runCompleted) {
    status = "completed";
  } else if (runStarted) {
    status = "running";
  }
  const progress = aggregateTestCaseProgress(eventLines);
  // A completed run reports the queued total as completed; otherwise use the progress events.
  const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
  return {
    runId: runQueued.runId,
    datasetId: runQueued.datasetId,
    datasetName: runQueued.datasetName,
    evaluatorIds: runQueued.evaluatorIds,
    queuedAt: runQueued.ts ?? 0,
    startedAt: runStarted?.startedAt,
    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
    totalTestCases: runQueued.totalTestCases,
    completedTestCases,
    passedTestCases: runCompleted?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: runCompleted?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: runFailed?.errorMessage
  };
}
153
/**
 * Derives progress counters from the "TestCaseProgress" events in an artifact.
 *
 * A test case counts as passed only if every progress event recorded for its
 * `testCaseId` was truthy in `passed`. `completedTestCases` is taken from the
 * last progress event that carries that field.
 *
 * @param {string[]} lines - Raw JSONL lines; unparseable lines are ignored.
 * @returns {{completedTestCases: number, passedTestCases: number, failedTestCases: number}}
 */
function aggregateTestCaseProgress(lines) {
  const verdictByTestCase = /* @__PURE__ */ new Map();
  let completedTestCases = 0;
  for (const rawLine of lines) {
    try {
      const parsed = JSON.parse(rawLine);
      if (parsed.type !== "TestCaseProgress") {
        continue;
      }
      completedTestCases = parsed.completedTestCases ?? completedTestCases;
      const previous = verdictByTestCase.get(parsed.testCaseId);
      let combined;
      if (previous === void 0) {
        combined = parsed.passed;
      } else {
        combined = previous && parsed.passed;
      }
      verdictByTestCase.set(parsed.testCaseId, combined);
    } catch {
      // Not valid JSON (or a JSON `null`): ignore the line.
    }
  }
  const verdicts = [...verdictByTestCase.values()];
  const passedTestCases = verdicts.filter(Boolean).length;
  const failedTestCases = verdicts.length - passedTestCases;
  return { completedTestCases, passedTestCases, failedTestCases };
}
180
+
16
181
  // src/runner/config.ts
17
182
  var defaultRunnerConfig = {
18
183
  discovery: {
19
184
  rootDir: process.cwd(),
20
185
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
21
- evaluatorSuffixes: [
22
- ".evaluator.ts",
23
- ".evaluator.tsx",
24
- ".evaluator.js",
25
- ".evaluator.mjs"
26
- ],
27
- testCaseSuffixes: [
28
- ".test-case.ts",
29
- ".test-case.tsx",
30
- ".test-case.js",
31
- ".test-case.mjs"
32
- ],
186
+ evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
187
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
188
+ testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
33
189
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
34
190
  },
35
191
  artifactDirectory: ".eval-results",
@@ -54,6 +210,11 @@ function toRunnerConfigOverrides(config) {
54
210
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
55
211
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
56
212
  }
213
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
214
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
215
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
216
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
217
+ }
57
218
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
58
219
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
59
220
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -96,14 +257,15 @@ function getJitiLoader() {
96
257
  }
97
258
  const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
98
259
  if (typeof createJiti2 !== "function") {
99
- throw new Error(
100
- "Failed to initialize jiti for m4trix eval config loading."
101
- );
260
+ throw new Error("Failed to initialize jiti for m4trix eval config loading.");
102
261
  }
103
- cachedLoader = createJiti2(import.meta.url, {
104
- interopDefault: true,
105
- moduleCache: true
106
- });
262
+ cachedLoader = createJiti2(
263
+ import.meta.url,
264
+ {
265
+ interopDefault: true,
266
+ moduleCache: true
267
+ }
268
+ );
107
269
  return cachedLoader;
108
270
  }
109
271
  function resolveConfigModuleExport(loadedModule) {
@@ -151,6 +313,9 @@ function isDatasetLike(value) {
151
313
  function isEvaluatorLike(value) {
152
314
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
153
315
  }
316
// Duck-type check for run-config exports: requires hasMethod(value, …) for
// "getName" and "getRuns", then re-checks that value.getRuns is a function.
// NOTE(review): the trailing typeof test presumably repeats what hasMethod
// already verifies — confirm against hasMethod before simplifying.
+ function isRunConfigLike(value) {
317
+ return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
318
+ }
154
319
  function isTestCaseLike(value) {
155
320
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
156
321
  }
@@ -207,9 +372,7 @@ async function loadModuleExports(filePath) {
207
372
  }
208
373
  async function collectDatasetsFromFiles(config) {
209
374
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
210
- const matched = files.filter(
211
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
212
- );
375
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
213
376
  const found = await Promise.all(
214
377
  matched.map(async (absolutePath) => {
215
378
  const exports = await loadModuleExports(absolutePath);
@@ -226,9 +389,7 @@ async function collectDatasetsFromFiles(config) {
226
389
  }
227
390
  async function collectEvaluatorsFromFiles(config) {
228
391
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
229
- const matched = files.filter(
230
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
231
- );
392
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
232
393
  const found = await Promise.all(
233
394
  matched.map(async (absolutePath) => {
234
395
  const exports = await loadModuleExports(absolutePath);
@@ -243,11 +404,26 @@ async function collectEvaluatorsFromFiles(config) {
243
404
  );
244
405
  return found.flat();
245
406
  }
246
- async function collectTestCasesFromFiles(config) {
407
/**
 * Discovers run-config exports under `config.rootDir`.
 *
 * Walks the directory tree (minus `config.excludeDirectories`), keeps files
 * matching one of `config.runConfigSuffixes`, loads each module, and keeps
 * every export accepted by isRunConfigLike. Modules are loaded concurrently.
 *
 * @param config - Discovery settings: `rootDir`, `excludeDirectories`, `runConfigSuffixes`.
 * @returns A flat list of `{ id, filePath, runConfig }`, where `id` is
 *   `runConfig.getName()` and `filePath` is relative to `rootDir`.
 */
async function collectRunConfigsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const candidateFiles = allFiles.filter((candidate) => hasOneSuffix(candidate, config.runConfigSuffixes));
  const loadOne = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const matchingExports = moduleExports.filter(isRunConfigLike);
    const filePath = relative(config.rootDir, absolutePath);
    return matchingExports.map((runConfig) => ({
      id: runConfig.getName(),
      filePath,
      runConfig
    }));
  };
  const perFile = await Promise.all(candidateFiles.map((absolutePath) => loadOne(absolutePath)));
  return perFile.flat();
}
424
+ async function collectTestCasesFromFiles(config) {
425
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
426
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
251
427
  const found = await Promise.all(
252
428
  matched.map(async (absolutePath) => {
253
429
  const exports = await loadModuleExports(absolutePath);
@@ -319,16 +495,8 @@ function createDiffString(expected, actual, diffOptions) {
319
495
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
320
496
  const actualProcessed = preprocessForDiff(actual, diffOptions);
321
497
  if (diffOptions?.keysOnly) {
322
- const expectedKeys = JSON.stringify(
323
- extractKeys(expectedProcessed),
324
- null,
325
- 2
326
- );
327
- const actualKeys = JSON.stringify(
328
- extractKeys(actualProcessed),
329
- null,
330
- 2
331
- );
498
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
499
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
332
500
  const parts2 = diffLines(expectedKeys, actualKeys);
333
501
  return formatDiffParts(parts2);
334
502
  }
@@ -339,9 +507,7 @@ function createDiffString(expected, actual, diffOptions) {
339
507
  }
340
508
  const parts = diffLines(expectedStr, actualStr);
341
509
  if (diffOptions?.outputNewOnly) {
342
- const filtered = parts.filter(
343
- (p) => p.added === true
344
- );
510
+ const filtered = parts.filter((p) => p.added === true);
345
511
  return formatDiffParts(filtered);
346
512
  }
347
513
  return formatDiffParts(parts);
@@ -408,6 +574,17 @@ function getDiffLines(entry) {
408
574
  });
409
575
  }
410
576
 
577
+ // src/evals/test-case.ts
578
/**
 * Picks the label to show for a test case.
 *
 * Uses `getDisplayLabel()` when that method exists and returns anything other
 * than `undefined`; otherwise falls back to `getName()`, and finally to `""`.
 * The `undefined` fallback mirrors getEvaluatorDisplayLabel, so a test case
 * without an explicit label is shown under its name rather than as `undefined`.
 *
 * @param testCase - Object that may expose `getDisplayLabel` and/or `getName`.
 * @returns The chosen label.
 */
function getTestCaseDisplayLabel(testCase) {
  if (typeof testCase.getDisplayLabel === "function") {
    const label = testCase.getDisplayLabel();
    if (label !== void 0) {
      return label;
    }
  }
  return typeof testCase.getName === "function" ? testCase.getName() : "";
}
584
/**
 * Returns a fresh array copy of a test case's tags.
 *
 * @param testCase - Object that may expose `getTags()` returning an iterable.
 * @returns A new array with the tags, or `[]` when `getTags` is not a function.
 */
function getTestCaseTagList(testCase) {
  if (typeof testCase.getTags !== "function") {
    return [];
  }
  const tags = testCase.getTags();
  return [...tags];
}
587
+
411
588
  // src/evals/metric.ts
412
589
  var registry = /* @__PURE__ */ new Map();
413
590
  var Metric = {
@@ -431,6 +608,54 @@ function getMetricById(id) {
431
608
  return registry.get(id);
432
609
  }
433
610
 
611
+ // src/evals/aggregators.ts
612
/**
 * Sums token-count records field by field.
 *
 * @param {Array<{input?: number, output?: number, inputCached?: number, outputCached?: number}>} values
 *   Records to add up; a nullish field counts as 0.
 * @returns {{input: number, output: number, inputCached: number, outputCached: number}}
 *   The per-field totals (all zeros for an empty list).
 */
function aggregateTokenCountSum(values) {
  let input = 0;
  let output = 0;
  let inputCached = 0;
  let outputCached = 0;
  values.forEach((record) => {
    input += record.input ?? 0;
    output += record.output ?? 0;
    inputCached += record.inputCached ?? 0;
    outputCached += record.outputCached ?? 0;
  });
  return { input, output, inputCached, outputCached };
}
629
/**
 * Averages the `ms` field of latency records.
 *
 * @param {Array<{ms: number}>} values - Latency samples.
 * @returns {{ms: number}} The arithmetic mean, or `{ ms: 0 }` for an empty list.
 */
function aggregateLatencyAverage(values) {
  const sampleCount = values.length;
  if (sampleCount === 0) {
    return { ms: 0 };
  }
  let totalMs = 0;
  values.forEach((sample) => {
    totalMs += sample.ms;
  });
  return { ms: totalMs / sampleCount };
}
636
+
637
+ // src/evals/metrics/standard.ts
638
// Registers the "token-count" metric: values are summed by aggregateTokenCountSum
// and rendered as "in:… out:… cached:…", prefixed with "Total: " when aggregated.
Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    // Cached input and output tokens are reported as a single figure.
    const cachedTotal = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const summary = `in:${data.input ?? 0} out:${data.output ?? 0} cached:${cachedTotal}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});

// Registers the "latency" metric: values are averaged by aggregateLatencyAverage
// and rendered as "<ms>ms", prefixed with "Avg: " when aggregated.
Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    const reading = `${data.ms}ms`;
    return options?.isAggregated ? `Avg: ${reading}` : reading;
  }
});
658
+
434
659
  // src/evals/score.ts
435
660
  var registry2 = /* @__PURE__ */ new Map();
436
661
  function formatScoreData(def, data, options) {
@@ -443,10 +668,7 @@ var ScoreAggregate = {
443
668
  const count = values.length || 1;
444
669
  const result = {};
445
670
  for (const field of fields) {
446
- result[field] = values.reduce(
447
- (s, v) => s + (v[field] ?? 0),
448
- 0
449
- ) / count;
671
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
450
672
  }
451
673
  return result;
452
674
  };
@@ -480,13 +702,10 @@ var ScoreAggregate = {
480
702
  (s, v) => s + (v[valueField] ?? 0),
481
703
  0
482
704
  );
483
- const sumSq = values.reduce(
484
- (s, v) => {
485
- const value = v[valueField] ?? 0;
486
- return s + value * value;
487
- },
488
- 0
489
- );
705
+ const sumSq = values.reduce((s, v) => {
706
+ const value = v[valueField] ?? 0;
707
+ return s + value * value;
708
+ }, 0);
490
709
  const mean = sum / count;
491
710
  const variance = (sumSq - count * mean * mean) / (count - 1);
492
711
  stdDev = variance > 0 ? Math.sqrt(variance) : 0;
@@ -545,54 +764,6 @@ function getScoreById(id) {
545
764
  return registry2.get(id);
546
765
  }
547
766
 
548
- // src/evals/aggregators.ts
549
- function aggregateTokenCountSum(values) {
550
- const initial = {
551
- input: 0,
552
- output: 0,
553
- inputCached: 0,
554
- outputCached: 0
555
- };
556
- return values.reduce(
557
- (acc, v) => ({
558
- input: acc.input + (v.input ?? 0),
559
- output: acc.output + (v.output ?? 0),
560
- inputCached: acc.inputCached + (v.inputCached ?? 0),
561
- outputCached: acc.outputCached + (v.outputCached ?? 0)
562
- }),
563
- initial
564
- );
565
- }
566
- function aggregateLatencyAverage(values) {
567
- if (values.length === 0) {
568
- return { ms: 0 };
569
- }
570
- const sum = values.reduce((s, v) => s + v.ms, 0);
571
- return { ms: sum / values.length };
572
- }
573
-
574
- // src/evals/metrics/standard.ts
575
- Metric.of({
576
- id: "token-count",
577
- name: "Tokens",
578
- aggregate: aggregateTokenCountSum,
579
- format: (data, options) => {
580
- const input = data.input ?? 0;
581
- const output = data.output ?? 0;
582
- const inputCached = data.inputCached ?? 0;
583
- const outputCached = data.outputCached ?? 0;
584
- const cached = inputCached + outputCached;
585
- const base = `in:${input} out:${output} cached:${cached}`;
586
- return options?.isAggregated ? `Total: ${base}` : base;
587
- }
588
- });
589
- Metric.of({
590
- id: "latency",
591
- name: "Latency",
592
- aggregate: aggregateLatencyAverage,
593
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
594
- });
595
-
596
767
  // src/evals/scores/standard.ts
597
768
  Score.of({
598
769
  id: "percent",
@@ -736,15 +907,17 @@ function readOutput(testCase) {
736
907
  }
737
908
  return candidate.getOutput();
738
909
  }
739
- function buildEvaluationUnits(testCases) {
910
+ function buildEvaluationUnits(testCases, repetitionCount) {
911
+ const count = Math.max(1, repetitionCount);
740
912
  const units = [];
741
913
  for (const testCaseItem of testCases) {
742
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
743
- for (let r = 0; r < rerunTotal; r++) {
914
+ const repetitionId = `rep-${randomUUID()}`;
915
+ for (let r = 0; r < count; r++) {
744
916
  units.push({
745
917
  testCaseItem,
746
- rerunIndex: r + 1,
747
- rerunTotal
918
+ repetitionId,
919
+ repetitionIndex: r + 1,
920
+ repetitionCount: count
748
921
  });
749
922
  }
750
923
  }
@@ -754,29 +927,24 @@ function nowIsoForFile() {
754
927
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
755
928
  }
756
929
  function createArtifactPath(artifactDirectory, datasetId, runId) {
757
- return join(
758
- artifactDirectory,
759
- `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
760
- );
930
+ return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
761
931
  }
762
932
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
763
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
933
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
764
934
  return Effect.gen(function* () {
765
935
  const evaluatorRunId = `run-${randomUUID()}`;
766
936
  const started = Date.now();
767
- const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
768
- n + 1,
769
- n + 1
770
- ]);
937
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [n + 1, n + 1]);
771
938
  yield* publishEvent({
772
939
  type: "TestCaseStarted",
773
940
  runId: task.runId,
774
941
  testCaseId: testCaseItem.id,
775
- testCaseName: testCaseItem.testCase.getName(),
942
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
776
943
  startedTestCases: startedEvaluations,
777
944
  totalTestCases: totalEvaluations,
778
- rerunIndex,
779
- rerunTotal
945
+ repetitionId,
946
+ repetitionIndex,
947
+ repetitionCount
780
948
  });
781
949
  const evaluatorScores = [];
782
950
  let testCaseError;
@@ -800,9 +968,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
800
968
  return error;
801
969
  };
802
970
  try {
803
- const ctx = yield* Effect.promise(
804
- () => Promise.resolve(evaluator.resolveContext())
805
- );
971
+ const ctx = yield* Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
806
972
  const result = yield* Effect.promise(
807
973
  () => Promise.resolve().then(
808
974
  () => evaluateFn({
@@ -812,8 +978,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
812
978
  meta: {
813
979
  triggerId: task.triggerId,
814
980
  runId: evaluatorRunId,
815
- datasetId: task.datasetId
981
+ datasetId: task.datasetId,
982
+ repetitionId,
983
+ repetitionIndex,
984
+ repetitionCount,
985
+ runConfigName: task.runConfigName
816
986
  },
987
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
988
+ runConfigTags: task.runConfigTags,
989
+ evaluatorTags: getEvaluatorTagList(evaluator),
817
990
  logDiff,
818
991
  log,
819
992
  createError
@@ -856,21 +1029,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
856
1029
  });
857
1030
  }
858
1031
  }
859
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
860
- const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
861
- n + 1,
862
- n + 1
863
- ]);
1032
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1033
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
864
1034
  const progressEvent = {
865
1035
  type: "TestCaseProgress",
866
1036
  runId: task.runId,
867
1037
  testCaseId: testCaseItem.id,
868
- testCaseName: testCaseItem.testCase.getName(),
1038
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
869
1039
  completedTestCases: completedEvaluations,
870
1040
  totalTestCases: totalEvaluations,
871
- rerunIndex,
872
- rerunTotal,
873
- passed: rerunPassedThis,
1041
+ repetitionId,
1042
+ repetitionIndex,
1043
+ repetitionCount,
1044
+ passed: repetitionPassedThis,
874
1045
  durationMs: Date.now() - started,
875
1046
  evaluatorScores,
876
1047
  output,
@@ -891,9 +1062,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
891
1062
  (map) => {
892
1063
  const key = testCaseItem.id;
893
1064
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
894
- const newResults = [...existing.results, rerunPassedThis];
1065
+ const newResults = [...existing.results, repetitionPassedThis];
895
1066
  const newCompletedCount = existing.completedCount + 1;
896
- const isLast = newCompletedCount === rerunTotal;
1067
+ const isLast = newCompletedCount === repetitionCount;
897
1068
  const newMap = new Map(map);
898
1069
  newMap.set(key, {
899
1070
  completedCount: newCompletedCount,
@@ -909,10 +1080,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
909
1080
  } else {
910
1081
  yield* Ref.update(failedRef, (n) => n + 1);
911
1082
  }
912
- const [passed, failed] = yield* Effect.all([
913
- Ref.get(passedRef),
914
- Ref.get(failedRef)
915
- ]);
1083
+ const [passed, failed] = yield* Effect.all([Ref.get(passedRef), Ref.get(failedRef)]);
916
1084
  yield* updateSnapshot(task.runId, (snapshot) => ({
917
1085
  ...snapshot,
918
1086
  passedTestCases: passed,
@@ -933,10 +1101,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
933
1101
  runId: task.runId,
934
1102
  startedAt
935
1103
  });
936
- const totalEvaluations = task.testCases.reduce(
937
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
938
- 0
939
- );
1104
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
940
1105
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
941
1106
  const completedRef = yield* Ref.make(0);
942
1107
  const startedRef = yield* Ref.make(0);
@@ -945,7 +1110,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
945
1110
  const testCaseResultsRef = yield* Ref.make(
946
1111
  /* @__PURE__ */ new Map()
947
1112
  );
948
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1113
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
949
1114
  const processEvaluation = (unit) => processOneEvaluation(
950
1115
  task,
951
1116
  unit,
@@ -959,11 +1124,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
959
1124
  failedRef,
960
1125
  testCaseResultsRef
961
1126
  );
962
- yield* Effect.forEach(
963
- evaluationUnits,
964
- processEvaluation,
965
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
966
- );
1127
+ const globalSem = task.globalEvaluationSemaphore;
1128
+ if (globalSem !== void 0) {
1129
+ yield* Effect.forEach(
1130
+ evaluationUnits,
1131
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1132
+ { concurrency: "unbounded", discard: true }
1133
+ );
1134
+ } else {
1135
+ yield* Effect.forEach(
1136
+ evaluationUnits,
1137
+ processEvaluation,
1138
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1139
+ );
1140
+ }
967
1141
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
968
1142
  Ref.get(completedRef),
969
1143
  Ref.get(passedRef),
@@ -999,125 +1173,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
999
1173
  artifactPath: task.snapshot.artifactPath
1000
1174
  });
1001
1175
  });
1002
- async function loadRunSnapshotsFromArtifacts(config) {
1003
- const baseDir = resolve(config.artifactDirectory);
1004
- let entries;
1005
- try {
1006
- entries = await readdir(baseDir);
1007
- } catch {
1008
- return [];
1009
- }
1010
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1011
- const snapshots = [];
1012
- for (const fileName of jsonlFiles) {
1013
- const filePath = join(baseDir, fileName);
1014
- try {
1015
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1016
- if (snapshot) {
1017
- snapshots.push(snapshot);
1018
- }
1019
- } catch {
1020
- }
1021
- }
1022
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1023
- }
1024
- async function parseArtifactToSnapshot(filePath, _config) {
1025
- const content = await readFile(filePath, "utf8");
1026
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1027
- if (lines.length === 0) {
1028
- return null;
1029
- }
1030
- let runQueued = null;
1031
- let runCompleted = null;
1032
- let runFailed = null;
1033
- let runStarted = null;
1034
- for (const line of lines) {
1035
- try {
1036
- const event = JSON.parse(line);
1037
- const type = event.type;
1038
- if (type === "RunQueued") {
1039
- runQueued = {
1040
- runId: event.runId,
1041
- datasetId: event.datasetId,
1042
- datasetName: event.datasetName,
1043
- evaluatorIds: event.evaluatorIds,
1044
- totalTestCases: event.totalTestCases ?? 0,
1045
- artifactPath: event.artifactPath ?? filePath,
1046
- ts: event.ts
1047
- };
1048
- }
1049
- if (type === "RunStarted") {
1050
- runStarted = { startedAt: event.startedAt };
1051
- }
1052
- if (type === "RunCompleted") {
1053
- runCompleted = {
1054
- passedTestCases: event.passedTestCases,
1055
- failedTestCases: event.failedTestCases,
1056
- totalTestCases: event.totalTestCases,
1057
- finishedAt: event.finishedAt
1058
- };
1059
- }
1060
- if (type === "RunFailed") {
1061
- runFailed = {
1062
- finishedAt: event.finishedAt,
1063
- errorMessage: event.errorMessage
1064
- };
1065
- }
1066
- } catch {
1067
- }
1176
+
1177
+ // src/runner/name-pattern.ts
1178
/**
 * Splits a `/source/flags` string into its parts.
 *
 * @param {string} pattern - Candidate regex literal.
 * @returns {{source: string, flags: string} | undefined} The text between the
 *   first and last slash plus everything after the last slash, or `undefined`
 *   when the pattern does not start with "/" or has no second slash.
 */
function parseRegexLiteral(pattern) {
  if (!pattern.startsWith("/")) {
    return void 0;
  }
  const lastSlash = pattern.lastIndexOf("/");
  if (lastSlash <= 0) {
    return void 0;
  }
  return {
    source: pattern.slice(1, lastSlash),
    flags: pattern.slice(lastSlash + 1)
  };
}

/**
 * Builds a predicate that tests names against a user-supplied pattern.
 *
 * The trimmed pattern is interpreted as, in order:
 *  1. a regex literal (`/source/flags`), used as written;
 *  2. a glob when it contains `*` — `*` matches any run of characters, every
 *     other character is literal, and the match is anchored and case-insensitive;
 *  3. otherwise a case-insensitive exact match.
 *
 * @param {string} pattern - Regex literal, glob, or plain name.
 * @returns {(value: string) => boolean} A matcher that can be called repeatedly.
 * @throws {SyntaxError} From `RegExp` when a regex literal has an invalid source or flags.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
    return (value) => {
      // With the g or y flag, test() resumes from lastIndex; reset it so the
      // matcher gives the same answer every time it is called.
      regex.lastIndex = 0;
      return regex.test(value);
    };
  }
  if (normalizedPattern.includes("*")) {
    // Escape every regex metacharacter except "*" (including "?"), then expand "*".
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    const regex = new RegExp(`^${escaped}$`, "i");
    return (value) => regex.test(value);
  }
  const loweredPattern = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === loweredPattern;
}
1122
1205
  async function appendJsonLine(artifactPath, payload) {
1123
1206
  await mkdir(dirname(artifactPath), { recursive: true });
@@ -1176,32 +1259,12 @@ function searchCollectedTestCases(all, query) {
1176
1259
  }
1177
1260
 
1178
1261
  // src/runner/api.ts
1179
- function parseRegexLiteral(pattern) {
1180
- if (!pattern.startsWith("/")) {
1181
- return void 0;
1182
- }
1183
- const lastSlash = pattern.lastIndexOf("/");
1184
- if (lastSlash <= 0) {
1185
- return void 0;
1186
- }
1187
- return {
1188
- source: pattern.slice(1, lastSlash),
1189
- flags: pattern.slice(lastSlash + 1)
1190
- };
1191
- }
1192
- function createNameMatcher(pattern) {
1193
- const normalizedPattern = pattern.trim();
1194
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1195
- if (regexLiteral) {
1196
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1197
- return (value) => regex.test(value);
1198
- }
1199
- if (normalizedPattern.includes("*")) {
1200
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1201
- const regex = new RegExp(`^${escaped}$`, "i");
1202
- return (value) => regex.test(value);
1262
+ function normalizeRunRepetitions(value) {
1263
+ const n = value ?? 1;
1264
+ if (!Number.isInteger(n) || n < 1) {
1265
+ throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
1203
1266
  }
1204
- return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1267
+ return n;
1205
1268
  }
1206
1269
  function mergeRunnerOverrides(base, next) {
1207
1270
  if (!base) {
@@ -1232,15 +1295,12 @@ var EffectRunner = class {
1232
1295
  this.persistenceQueue = Effect.runSync(
1233
1296
  Queue.unbounded()
1234
1297
  );
1235
- this.snapshotsRef = Effect.runSync(
1236
- Ref.make(/* @__PURE__ */ new Map())
1237
- );
1298
+ this.snapshotsRef = Effect.runSync(Ref.make(/* @__PURE__ */ new Map()));
1238
1299
  this.listeners = /* @__PURE__ */ new Set();
1239
1300
  this.datasetsById = /* @__PURE__ */ new Map();
1240
1301
  this.evaluatorsById = /* @__PURE__ */ new Map();
1241
- this.schedulerFiber = Effect.runFork(
1242
- this.createSchedulerEffect()
1243
- );
1302
+ this.runConfigsById = /* @__PURE__ */ new Map();
1303
+ this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
1244
1304
  this.persistenceFiber = Effect.runFork(
1245
1305
  createPersistenceWorker(this.persistenceQueue)
1246
1306
  );
@@ -1280,6 +1340,137 @@ var EffectRunner = class {
1280
1340
  (item) => matcher(item.evaluator.getName() ?? "")
1281
1341
  );
1282
1342
  }
1343
+ async collectRunConfigs() {
1344
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
1345
+ this.runConfigsById.clear();
1346
+ const byNameLower = /* @__PURE__ */ new Map();
1347
+ for (const item of runConfigs) {
1348
+ const id = item.runConfig.getName();
1349
+ const lower = id.toLowerCase();
1350
+ const prev = byNameLower.get(lower);
1351
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
1352
+ throw new Error(
1353
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
1354
+ );
1355
+ }
1356
+ byNameLower.set(lower, item);
1357
+ this.runConfigsById.set(id, item);
1358
+ }
1359
+ return runConfigs;
1360
+ }
1361
+ async resolveRunConfigByName(name) {
1362
+ if (this.runConfigsById.size === 0) {
1363
+ await this.collectRunConfigs();
1364
+ }
1365
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
1366
+ const keyLower = key.toLowerCase();
1367
+ const matches = Array.from(this.runConfigsById.values()).filter(
1368
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
1369
+ );
1370
+ if (matches.length === 0) {
1371
+ return void 0;
1372
+ }
1373
+ if (matches.length > 1) {
1374
+ throw new Error(
1375
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
1376
+ );
1377
+ }
1378
+ return matches[0];
1379
+ }
1380
+ async expandRunConfigToJobs(collected) {
1381
+ if (this.datasetsById.size === 0) {
1382
+ await this.collectDatasets();
1383
+ }
1384
+ if (this.evaluatorsById.size === 0) {
1385
+ await this.collectEvaluators();
1386
+ }
1387
+ const rcName = collected.runConfig.getName();
1388
+ const jobs = [];
1389
+ const runs = collected.runConfig.getRuns();
1390
+ for (const [i, row] of runs.entries()) {
1391
+ const dsCollected = Array.from(this.datasetsById.values()).find(
1392
+ (d) => d.dataset === row.dataset
1393
+ );
1394
+ if (!dsCollected) {
1395
+ throw new Error(
1396
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
1397
+ );
1398
+ }
1399
+ let evaluatorIds;
1400
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
1401
+ const matcher = createNameMatcher(row.evaluatorPattern);
1402
+ const matched = Array.from(this.evaluatorsById.values()).filter(
1403
+ (item) => matcher(item.evaluator.getName() ?? "")
1404
+ );
1405
+ if (matched.length === 0) {
1406
+ throw new Error(
1407
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
1408
+ );
1409
+ }
1410
+ evaluatorIds = matched.map((item) => item.id);
1411
+ } else {
1412
+ const evaluators = row.evaluators;
1413
+ evaluatorIds = [];
1414
+ for (const ev of evaluators) {
1415
+ const found = Array.from(this.evaluatorsById.values()).find(
1416
+ (item) => item.evaluator === ev
1417
+ );
1418
+ if (!found) {
1419
+ throw new Error(
1420
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
1421
+ );
1422
+ }
1423
+ evaluatorIds.push(found.id);
1424
+ }
1425
+ }
1426
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
1427
+ jobs.push({
1428
+ datasetId: dsCollected.id,
1429
+ evaluatorIds,
1430
+ runConfigName: rcName,
1431
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
1432
+ runConfigTags: collected.runConfig.getTags(),
1433
+ repetitions
1434
+ });
1435
+ }
1436
+ return jobs;
1437
+ }
1438
+ async expandRunConfigNamesToJobs(names) {
1439
+ const jobs = [];
1440
+ for (const name of names) {
1441
+ const collected = await this.resolveRunConfigByName(name);
1442
+ if (!collected) {
1443
+ const known = await this.collectRunConfigs();
1444
+ const available = known.map((r) => r.runConfig.getName()).sort();
1445
+ throw new Error(
1446
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
1447
+ );
1448
+ }
1449
+ jobs.push(...await this.expandRunConfigToJobs(collected));
1450
+ }
1451
+ return jobs;
1452
+ }
1453
+ async runDatasetJobsWithSharedConcurrency(request) {
1454
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
1455
+ const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
1456
+ const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
1457
+ const snapshots = [];
1458
+ for (const job of request.jobs) {
1459
+ snapshots.push(
1460
+ await this.startDatasetRun({
1461
+ datasetId: job.datasetId,
1462
+ evaluatorIds: job.evaluatorIds,
1463
+ triggerId,
1464
+ maxConcurrency: this.config.maxConcurrency ?? 1,
1465
+ globalEvaluationSemaphore: sem,
1466
+ runConfigName: job.runConfigName,
1467
+ runConfigTags: job.runConfigTags,
1468
+ repetitions: job.repetitions
1469
+ })
1470
+ );
1471
+ }
1472
+ return snapshots;
1473
+ }
1283
1474
  async searchTestCases(query) {
1284
1475
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1285
1476
  return searchCollectedTestCases(testCases, query);
@@ -1298,35 +1489,45 @@ var EffectRunner = class {
1298
1489
  );
1299
1490
  }
1300
1491
  async runDatasetWith(request) {
1492
+ const runConfigName = validateRunConfigName(
1493
+ request.runConfigName,
1494
+ "runDatasetWith.runConfigName"
1495
+ );
1496
+ return this.startDatasetRun({
1497
+ datasetId: request.datasetId,
1498
+ evaluatorIds: request.evaluatorIds,
1499
+ triggerId: request.triggerId,
1500
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
1501
+ repetitions: request.repetitions,
1502
+ runConfigName,
1503
+ runConfigTags: request.runConfigTags
1504
+ });
1505
+ }
1506
+ async startDatasetRun(params) {
1301
1507
  if (this.datasetsById.size === 0) {
1302
1508
  await this.collectDatasets();
1303
1509
  }
1304
1510
  if (this.evaluatorsById.size === 0) {
1305
1511
  await this.collectEvaluators();
1306
1512
  }
1307
- const dataset = this.datasetsById.get(request.datasetId);
1513
+ const dataset = this.datasetsById.get(params.datasetId);
1308
1514
  if (!dataset) {
1309
- throw new Error(`Unknown dataset: ${request.datasetId}`);
1515
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
1310
1516
  }
1311
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1517
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1312
1518
  if (selectedEvaluators.length === 0) {
1313
1519
  throw new Error("No evaluators selected for run");
1314
1520
  }
1315
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1316
- const totalEvaluations = selectedTestCases.reduce(
1317
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1318
- 0
1319
- );
1320
- const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
1521
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
1522
+ const repetitions = normalizeRunRepetitions(params.repetitions);
1523
+ const totalEvaluations = selectedTestCases.length * repetitions;
1524
+ const runConfigTags = [...params.runConfigTags ?? []];
1525
+ const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
1321
1526
  const runId = `run-${randomUUID()}`;
1322
- const artifactPath = createArtifactPath(
1323
- this.config.artifactDirectory,
1324
- request.datasetId,
1325
- runId
1326
- );
1527
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1327
1528
  const snapshot = {
1328
1529
  runId,
1329
- datasetId: request.datasetId,
1530
+ datasetId: params.datasetId,
1330
1531
  datasetName: dataset.dataset.getName(),
1331
1532
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1332
1533
  queuedAt: Date.now(),
@@ -1347,7 +1548,7 @@ var EffectRunner = class {
1347
1548
  const queuedEvent = {
1348
1549
  type: "RunQueued",
1349
1550
  runId,
1350
- datasetId: request.datasetId,
1551
+ datasetId: params.datasetId,
1351
1552
  datasetName: dataset.dataset.getName(),
1352
1553
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1353
1554
  totalTestCases: totalEvaluations,
@@ -1361,17 +1562,20 @@ var EffectRunner = class {
1361
1562
  payload: queuedEvent
1362
1563
  })
1363
1564
  );
1364
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1365
1565
  await Effect.runPromise(
1366
1566
  Queue.offer(this.runQueue, {
1367
1567
  runId,
1368
1568
  triggerId,
1369
- datasetId: request.datasetId,
1569
+ datasetId: params.datasetId,
1370
1570
  dataset: dataset.dataset,
1371
1571
  evaluators: selectedEvaluators,
1372
1572
  testCases: selectedTestCases,
1373
1573
  snapshot,
1374
- maxConcurrency
1574
+ maxConcurrency: params.maxConcurrency,
1575
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
1576
+ runConfigName: params.runConfigName,
1577
+ runConfigTags,
1578
+ repetitions
1375
1579
  })
1376
1580
  );
1377
1581
  return snapshot;
@@ -1387,9 +1591,9 @@ var EffectRunner = class {
1387
1591
  return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
1388
1592
  }
1389
1593
  getAllRunSnapshots() {
1390
- return Array.from(
1391
- Effect.runSync(Ref.get(this.snapshotsRef)).values()
1392
- ).sort((a, b) => b.queuedAt - a.queuedAt);
1594
+ return Array.from(Effect.runSync(Ref.get(this.snapshotsRef)).values()).sort(
1595
+ (a, b) => b.queuedAt - a.queuedAt
1596
+ );
1393
1597
  }
1394
1598
  async loadRunSnapshotsFromArtifacts() {
1395
1599
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1450,6 +1654,8 @@ function getDefaultConcurrency() {
1450
1654
  function parseSimpleCliArgs(argv) {
1451
1655
  const args = {
1452
1656
  help: false,
1657
+ ci: false,
1658
+ runConfigNames: [],
1453
1659
  unknownArgs: []
1454
1660
  };
1455
1661
  let index = 0;
@@ -1463,18 +1669,26 @@ function parseSimpleCliArgs(argv) {
1463
1669
  args.help = true;
1464
1670
  continue;
1465
1671
  }
1672
+ if (token === "--ci") {
1673
+ args.ci = true;
1674
+ continue;
1675
+ }
1466
1676
  if ((token === "--dataset" || token === "--datasetName") && argv[index + 1]) {
1467
1677
  args.datasetName = argv[index + 1];
1468
1678
  index += 1;
1469
1679
  continue;
1470
1680
  }
1471
- if ((token === "--evaluator" || token === "--name") && argv[index + 1]) {
1472
- args.evaluatorPattern = argv[index + 1];
1681
+ if ((token === "--run-config" || token === "--runConfig") && argv[index + 1]) {
1682
+ const next = argv[index + 1];
1683
+ if (typeof next === "string") {
1684
+ args.runConfigNames.push(next);
1685
+ }
1473
1686
  index += 1;
1474
1687
  continue;
1475
1688
  }
1476
1689
  if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
1477
- const n = parseInt(argv[index + 1], 10);
1690
+ const nextConc = argv[index + 1];
1691
+ const n = typeof nextConc === "string" ? parseInt(nextConc, 10) : Number.NaN;
1478
1692
  if (!Number.isNaN(n) && n >= 1) {
1479
1693
  args.concurrency = n;
1480
1694
  }
@@ -1488,16 +1702,12 @@ function parseSimpleCliArgs(argv) {
1488
1702
  function getSimpleCliUsage() {
1489
1703
  return [
1490
1704
  "Usage:",
1491
- " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
1705
+ " eval-agents-simple run --run-config <name> [--run-config <name> ...] [--concurrency N] [--ci]",
1492
1706
  " eval-agents-simple generate --dataset <datasetName>",
1493
1707
  "",
1494
1708
  "Options:",
1495
- " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
1496
- "",
1497
- "Pattern examples for --evaluator:",
1498
- " score-evaluator exact name (case-insensitive)",
1499
- ' "*score*" wildcard pattern',
1500
- ' "/score/i" regex literal'
1709
+ " --ci With run: exit with code 1 if any test case fails.",
1710
+ " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential."
1501
1711
  ].join("\n");
1502
1712
  }
1503
1713
 
@@ -1548,7 +1758,7 @@ function GenerateView({
1548
1758
  const payload = testCases.map((item) => {
1549
1759
  const tc = item.testCase;
1550
1760
  return {
1551
- name: item.testCase.getName(),
1761
+ name: getTestCaseDisplayLabel(item.testCase),
1552
1762
  input: item.testCase.getInput(),
1553
1763
  output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
1554
1764
  };
@@ -1556,12 +1766,8 @@ function GenerateView({
1556
1766
  const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
1557
1767
  const parsed = parse2(absoluteDatasetPath);
1558
1768
  const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
1559
- await writeFile2(
1560
- outputPath,
1561
- `${JSON.stringify(payload, null, 2)}
1562
- `,
1563
- "utf8"
1564
- );
1769
+ await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}
1770
+ `, "utf8");
1565
1771
  if (!cancelled) {
1566
1772
  setResult({
1567
1773
  count: payload.length,
@@ -1618,7 +1824,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1618
1824
  }
1619
1825
  const testCases = await runner.collectDatasetTestCases(dataset.id);
1620
1826
  const payload = testCases.map((item) => ({
1621
- name: item.testCase.getName(),
1827
+ name: getTestCaseDisplayLabel(item.testCase),
1622
1828
  input: item.testCase.getInput(),
1623
1829
  output: readOutput2(item.testCase)
1624
1830
  }));
@@ -1632,7 +1838,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1632
1838
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1633
1839
  return new Promise((resolve5, reject) => {
1634
1840
  const app = render(
1635
- React2__default.createElement(GenerateView, {
1841
+ React__default.createElement(GenerateView, {
1636
1842
  runner,
1637
1843
  datasetName,
1638
1844
  onComplete: (err) => {
@@ -1717,9 +1923,7 @@ function createBar(value, max = 100, width = 20) {
1717
1923
  function aggregateEvaluatorScores(events, nameById) {
1718
1924
  if (events.length === 0)
1719
1925
  return [];
1720
- const evaluatorIds = new Set(
1721
- events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
1722
- );
1926
+ const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
1723
1927
  const result = [];
1724
1928
  for (const evaluatorId of evaluatorIds) {
1725
1929
  const scoreIdToItems = /* @__PURE__ */ new Map();
@@ -1749,9 +1953,7 @@ function aggregateEvaluatorScores(events, nameById) {
1749
1953
  return es?.passed ?? false;
1750
1954
  });
1751
1955
  const lastEvent = events[events.length - 1];
1752
- const lastEs = lastEvent?.evaluatorScores.find(
1753
- (x) => x.evaluatorId === evaluatorId
1754
- );
1956
+ const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1755
1957
  result.push({
1756
1958
  evaluatorId,
1757
1959
  evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
@@ -1780,14 +1982,11 @@ function formatScorePart(item, _scoreToColor, options) {
1780
1982
  }
1781
1983
  function RunView({
1782
1984
  runner,
1783
- datasetName,
1784
- evaluatorPattern,
1985
+ runConfigNames,
1785
1986
  concurrency,
1786
1987
  onComplete
1787
1988
  }) {
1788
- const [phase, setPhase] = useState(
1789
- "loading"
1790
- );
1989
+ const [phase, setPhase] = useState("loading");
1791
1990
  const [runInfo, setRunInfo] = useState(null);
1792
1991
  const [testCases, setTestCases] = useState([]);
1793
1992
  const [startedEvaluations, setStartedEvaluations] = useState(0);
@@ -1796,30 +1995,30 @@ function RunView({
1796
1995
  const [summary, setSummary] = useState(null);
1797
1996
  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
1798
1997
  const runEval = useCallback(async () => {
1799
- const dataset = await runner.resolveDatasetByName(datasetName);
1800
- if (!dataset) {
1801
- const known = await runner.collectDatasets();
1802
- const available = known.map((item) => item.dataset.getName()).sort();
1803
- onComplete(
1804
- new Error(
1805
- available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
1806
- )
1807
- );
1998
+ const rcList = runConfigNames.filter((n) => n.trim().length > 0);
1999
+ if (rcList.length === 0) {
2000
+ onComplete(new Error("At least one RunConfig name is required."));
1808
2001
  return;
1809
2002
  }
1810
- const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
1811
- if (evaluators.length === 0) {
1812
- const known = await runner.collectEvaluators();
1813
- const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
1814
- onComplete(
1815
- new Error(
1816
- available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
1817
- )
1818
- );
2003
+ setStartedEvaluations(0);
2004
+ setCompletedEvaluations(0);
2005
+ setTestCases([]);
2006
+ setRunningEvaluations([]);
2007
+ setSummary(null);
2008
+ let jobs;
2009
+ try {
2010
+ jobs = await runner.expandRunConfigNamesToJobs(rcList);
2011
+ } catch (err) {
2012
+ onComplete(err instanceof Error ? err : new Error(String(err)));
2013
+ return;
2014
+ }
2015
+ if (jobs.length === 0) {
2016
+ onComplete(new Error("No jobs expanded from RunConfigs."));
1819
2017
  return;
1820
2018
  }
2019
+ const allEvaluators = await runner.collectEvaluators();
1821
2020
  const nameById = new Map(
1822
- evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
2021
+ allEvaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
1823
2022
  );
1824
2023
  setEvaluatorNameById(nameById);
1825
2024
  const aggregates = /* @__PURE__ */ new Map();
@@ -1827,21 +2026,30 @@ function RunView({
1827
2026
  let overallScoreTotal = 0;
1828
2027
  let overallScoreSumSq = 0;
1829
2028
  let overallScoreCount = 0;
1830
- const done = new Promise((resolve5) => {
2029
+ const batchPendingRunIds = /* @__PURE__ */ new Set();
2030
+ const runIdToLabel = /* @__PURE__ */ new Map();
2031
+ let batchReady = false;
2032
+ const completedRuns = /* @__PURE__ */ new Map();
2033
+ const done = new Promise((resolve5, reject) => {
1831
2034
  const unsubscribe = runner.subscribeRunEvents((event) => {
2035
+ if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
2036
+ return;
2037
+ }
1832
2038
  if (event.type === "TestCaseStarted") {
1833
- setStartedEvaluations(event.startedTestCases);
2039
+ setStartedEvaluations((c) => c + 1);
1834
2040
  setRunningEvaluations((prev) => {
1835
2041
  const withoutDuplicate = prev.filter(
1836
- (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
2042
+ (item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
1837
2043
  );
1838
2044
  return [
1839
2045
  ...withoutDuplicate,
1840
2046
  {
2047
+ runId: event.runId,
1841
2048
  testCaseId: event.testCaseId,
1842
2049
  name: event.testCaseName,
1843
- rerunIndex: event.rerunIndex,
1844
- rerunTotal: event.rerunTotal,
2050
+ repetitionId: event.repetitionId,
2051
+ repetitionIndex: event.repetitionIndex,
2052
+ repetitionCount: event.repetitionCount,
1845
2053
  startedTestCases: event.startedTestCases,
1846
2054
  totalTestCases: event.totalTestCases
1847
2055
  }
@@ -1877,9 +2085,12 @@ function RunView({
1877
2085
  scoreItemsByEvaluatorScore.set(key, list);
1878
2086
  }
1879
2087
  }
2088
+ const label = runIdToLabel.get(event.runId);
2089
+ const compositeId = `${event.runId}:${event.testCaseId}`;
2090
+ const displayName = label !== void 0 ? `${label} \u203A ${event.testCaseName}` : event.testCaseName;
1880
2091
  setTestCases((prev) => {
1881
2092
  const byId = new Map(prev.map((tc) => [tc.testCaseId, tc]));
1882
- const existing = byId.get(event.testCaseId);
2093
+ const existing = byId.get(compositeId);
1883
2094
  const newEvent = {
1884
2095
  evaluatorScores: event.evaluatorScores.map((item) => ({
1885
2096
  evaluatorId: item.evaluatorId,
@@ -1894,17 +2105,14 @@ function RunView({
1894
2105
  };
1895
2106
  const events = existing ? [...existing.events, newEvent] : [newEvent];
1896
2107
  const isAggregated = events.length > 1;
1897
- const aggregatedEvaluatorScores = aggregateEvaluatorScores(
1898
- events,
1899
- nameById
1900
- );
2108
+ const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
1901
2109
  const merged = {
1902
- name: event.testCaseName,
1903
- testCaseId: event.testCaseId,
2110
+ name: displayName,
2111
+ testCaseId: compositeId,
1904
2112
  completedTestCases: event.completedTestCases,
1905
2113
  totalTestCases: event.totalTestCases,
1906
- rerunIndex: event.rerunIndex,
1907
- rerunTotal: event.rerunTotal,
2114
+ repetitionIndex: event.repetitionIndex,
2115
+ repetitionCount: event.repetitionCount,
1908
2116
  durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1909
2117
  passed: events.every((e) => e.passed),
1910
2118
  errorMessage: event.errorMessage,
@@ -1912,84 +2120,118 @@ function RunView({
1912
2120
  aggregatedEvaluatorScores,
1913
2121
  isAggregated
1914
2122
  };
1915
- byId.set(event.testCaseId, merged);
1916
- setCompletedEvaluations(event.completedTestCases);
1917
- setRunningEvaluations(
1918
- (running) => running.filter(
1919
- (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1920
- )
1921
- );
2123
+ byId.set(compositeId, merged);
1922
2124
  return Array.from(byId.values());
1923
2125
  });
2126
+ setCompletedEvaluations((c) => c + 1);
2127
+ setRunningEvaluations(
2128
+ (running) => running.filter(
2129
+ (item) => !(item.testCaseId === event.testCaseId && item.repetitionIndex === event.repetitionIndex && item.runId === event.runId)
2130
+ )
2131
+ );
1924
2132
  }
1925
- if (event.type === "RunCompleted" || event.type === "RunFailed") {
2133
+ if (event.type === "RunFailed") {
2134
+ if (batchReady && !batchPendingRunIds.has(event.runId)) {
2135
+ return;
2136
+ }
1926
2137
  unsubscribe();
1927
- resolve5(event);
2138
+ reject(new Error(`Run failed: ${event.errorMessage}`));
2139
+ return;
2140
+ }
2141
+ if (event.type === "RunCompleted") {
2142
+ if (!batchPendingRunIds.has(event.runId)) {
2143
+ return;
2144
+ }
2145
+ completedRuns.set(event.runId, event);
2146
+ batchPendingRunIds.delete(event.runId);
2147
+ if (batchPendingRunIds.size === 0) {
2148
+ unsubscribe();
2149
+ resolve5();
2150
+ }
1928
2151
  }
1929
2152
  });
1930
2153
  });
1931
- const snapshot = await runner.runDatasetWith({
1932
- datasetId: dataset.id,
1933
- evaluatorIds: evaluators.map((item) => item.id),
1934
- concurrency
2154
+ const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2155
+ jobs,
2156
+ globalConcurrency: concurrency
1935
2157
  });
2158
+ for (let i = 0; i < snapshots.length; i += 1) {
2159
+ const snap = snapshots[i];
2160
+ const job = jobs[i];
2161
+ if (snap && job) {
2162
+ runIdToLabel.set(
2163
+ snap.runId,
2164
+ `${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
2165
+ );
2166
+ batchPendingRunIds.add(snap.runId);
2167
+ }
2168
+ }
2169
+ const totalUnits = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
2170
+ batchReady = true;
2171
+ const runConfigLabels = await Promise.all(
2172
+ rcList.map(async (n) => {
2173
+ const collected = await runner.resolveRunConfigByName(n);
2174
+ return collected?.runConfig.getDisplayLabel() ?? n;
2175
+ })
2176
+ );
1936
2177
  setRunInfo({
1937
- runId: snapshot.runId,
1938
- datasetName: snapshot.datasetName,
1939
- evaluatorNames: evaluators.map((e) => e.evaluator.getName() ?? e.id),
1940
- totalTestCases: snapshot.totalTestCases
2178
+ names: runConfigLabels,
2179
+ jobs: jobs.length,
2180
+ totalTestCases: totalUnits
1941
2181
  });
1942
2182
  setPhase("running");
1943
- const finalEvent = await done;
1944
- if (finalEvent.type === "RunFailed") {
1945
- onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
2183
+ try {
2184
+ await done;
2185
+ } catch (err) {
2186
+ onComplete(err instanceof Error ? err : new Error(String(err)));
1946
2187
  return;
1947
2188
  }
1948
- const completed = finalEvent;
2189
+ let passedTestCases = 0;
2190
+ let failedTestCases = 0;
2191
+ let totalTestCases = 0;
2192
+ const artifacts = [];
2193
+ for (const ev of completedRuns.values()) {
2194
+ passedTestCases += ev.passedTestCases;
2195
+ failedTestCases += ev.failedTestCases;
2196
+ totalTestCases += ev.totalTestCases;
2197
+ artifacts.push(ev.artifactPath);
2198
+ }
1949
2199
  setSummary({
1950
- passedTestCases: completed.passedTestCases,
1951
- failedTestCases: completed.failedTestCases,
1952
- totalTestCases: completed.totalTestCases,
2200
+ passedTestCases,
2201
+ failedTestCases,
2202
+ totalTestCases,
1953
2203
  overallScoreTotal,
1954
2204
  overallScoreSumSq,
1955
2205
  overallScoreCount,
1956
2206
  aggregates: new Map(aggregates),
1957
2207
  scoreItemsByEvaluatorScore: new Map(scoreItemsByEvaluatorScore),
1958
- artifactPath: completed.artifactPath
2208
+ artifactPath: artifacts.join("\n")
1959
2209
  });
1960
2210
  setPhase("completed");
1961
- setTimeout(() => onComplete(), 200);
1962
- }, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
2211
+ const exitCode = failedTestCases > 0 ? 1 : 0;
2212
+ setTimeout(() => onComplete(void 0, exitCode), 200);
2213
+ }, [runner, runConfigNames, concurrency, onComplete]);
1963
2214
  useEffect(() => {
1964
2215
  void runEval();
1965
2216
  }, [runEval]);
1966
2217
  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
1967
2218
  /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
1968
2219
  runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
1969
- /* @__PURE__ */ jsxs(Text, { children: [
1970
- /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
1971
- "Run",
1972
- " "
1973
- ] }),
1974
- /* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
1975
- ] }),
1976
- /* @__PURE__ */ jsxs(Text, { children: [
1977
- /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
1978
- "Dataset",
1979
- " "
1980
- ] }),
1981
- runInfo.datasetName
2220
+ /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
2221
+ "RunConfigs",
2222
+ " "
1982
2223
  ] }),
2224
+ /* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.names.join(", ") }),
1983
2225
  /* @__PURE__ */ jsxs(Text, { children: [
1984
2226
  /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
1985
- "Evaluators",
2227
+ "Jobs",
1986
2228
  " "
1987
2229
  ] }),
1988
- runInfo.evaluatorNames.join(", ")
2230
+ runInfo.jobs
1989
2231
  ] }),
1990
2232
  /* @__PURE__ */ jsxs(Text, { children: [
1991
2233
  /* @__PURE__ */ jsxs(Text, { color: "cyan", bold: true, children: [
1992
- "Test cases",
2234
+ "Evaluation units",
1993
2235
  " "
1994
2236
  ] }),
1995
2237
  runInfo.totalTestCases
@@ -2011,20 +2253,19 @@ function RunView({
2011
2253
  item.startedTestCases,
2012
2254
  "/",
2013
2255
  item.totalTestCases,
2014
- "]",
2015
- " ",
2256
+ "] ",
2016
2257
  item.name,
2017
2258
  " ",
2018
2259
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2019
2260
  "(",
2020
- item.rerunIndex,
2261
+ item.repetitionIndex,
2021
2262
  "/",
2022
- item.rerunTotal,
2263
+ item.repetitionCount,
2023
2264
  ")"
2024
2265
  ] })
2025
2266
  ]
2026
2267
  },
2027
- `${item.testCaseId}:${item.rerunIndex}`
2268
+ `${item.runId ?? ""}:${item.testCaseId}:${item.repetitionId}:${item.repetitionIndex}`
2028
2269
  )) })
2029
2270
  ] }),
2030
2271
  testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
@@ -2041,9 +2282,9 @@ function RunView({
2041
2282
  " ",
2042
2283
  /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
2043
2284
  "(",
2044
- tc.rerunIndex,
2285
+ tc.repetitionIndex,
2045
2286
  "/",
2046
- tc.rerunTotal,
2287
+ tc.repetitionCount,
2047
2288
  ")"
2048
2289
  ] }),
2049
2290
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
@@ -2057,73 +2298,70 @@ function RunView({
2057
2298
  ] }) : null
2058
2299
  ] }),
2059
2300
  tc.errorMessage ? /* @__PURE__ */ jsx(Text, { color: "red", children: tc.errorMessage }) : null,
2060
- tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
2061
- Box,
2062
- {
2063
- flexDirection: "column",
2064
- marginLeft: 2,
2065
- children: [
2066
- /* @__PURE__ */ jsxs(Text, { children: [
2067
- item.evaluatorName,
2068
- ":",
2069
- " ",
2070
- /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
2071
- item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
2301
+ tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [
2302
+ /* @__PURE__ */ jsxs(Text, { children: [
2303
+ item.evaluatorName,
2304
+ ":",
2305
+ " ",
2306
+ /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
2307
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
2308
+ " ",
2309
+ item.metrics.map((m) => {
2310
+ const def = getMetricById(m.id);
2311
+ if (!def)
2312
+ return null;
2313
+ const formatted = def.format(m.data, {
2314
+ isAggregated: tc.isAggregated
2315
+ });
2316
+ const label = m.name ?? def.name;
2317
+ return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2318
+ "[",
2319
+ label ? `${label}: ` : "",
2320
+ formatted,
2321
+ "]",
2322
+ " "
2323
+ ] }, m.id);
2324
+ })
2325
+ ] }) : null
2326
+ ] }),
2327
+ item.scores.length > 0 ? item.scores.map((s) => {
2328
+ const def = s.def ?? getScoreById(s.id);
2329
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2330
+ return /* @__PURE__ */ jsxs(
2331
+ Text,
2332
+ {
2333
+ color: scoreColor(toNumericScore(s.data) ?? 0),
2334
+ children: [
2335
+ " ",
2336
+ scoreLabel,
2337
+ ":",
2072
2338
  " ",
2073
- item.metrics.map((m) => {
2074
- const def = getMetricById(m.id);
2075
- if (!def)
2076
- return null;
2077
- const formatted = def.format(m.data, {
2078
- isAggregated: tc.isAggregated
2079
- });
2080
- const label = m.name ?? def.name;
2081
- return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2082
- "[",
2083
- label ? `${label}: ` : "",
2084
- formatted,
2085
- "]",
2086
- " "
2087
- ] }, m.id);
2339
+ formatScorePart(s, scoreColor, {
2340
+ isAggregated: tc.isAggregated
2088
2341
  })
2089
- ] }) : null
2090
- ] }),
2091
- item.scores.length > 0 ? item.scores.map((s, idx) => {
2092
- const def = s.def ?? getScoreById(s.id);
2093
- const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2094
- return /* @__PURE__ */ jsxs(
2342
+ ]
2343
+ },
2344
+ `${item.evaluatorId}-${s.id}-${scoreLabel}`
2345
+ );
2346
+ }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
2347
+ !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
2348
+ (log) => log.type === "diff" ? /* @__PURE__ */ jsx(
2349
+ Box,
2350
+ {
2351
+ flexDirection: "column",
2352
+ children: getDiffLines(log).map(({ type, line }) => /* @__PURE__ */ jsx(
2095
2353
  Text,
2096
2354
  {
2097
- color: scoreColor(toNumericScore(s.data) ?? 0),
2098
- children: [
2099
- " ",
2100
- scoreLabel,
2101
- ":",
2102
- " ",
2103
- formatScorePart(s, scoreColor, {
2104
- isAggregated: tc.isAggregated
2105
- })
2106
- ]
2355
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
2356
+ children: line
2107
2357
  },
2108
- `${item.evaluatorId}-${s.id}-${idx}`
2109
- );
2110
- }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
2111
- !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
2112
- (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
2113
- ({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
2114
- Text,
2115
- {
2116
- color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
2117
- children: line
2118
- },
2119
- lineIdx
2120
- )
2121
- ) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
2122
- ) })
2123
- ]
2124
- },
2125
- item.evaluatorId
2126
- ))
2358
+ `${type}:${line}`
2359
+ ))
2360
+ },
2361
+ `diff:${getDiffLines(log).map((x) => x.line).join("|")}`
2362
+ ) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, line)) }, `log:${getLogLines(log).join("\n")}`) : null
2363
+ ) })
2364
+ ] }, item.evaluatorId))
2127
2365
  ] }, tc.testCaseId)) }),
2128
2366
  phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
2129
2367
  /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
@@ -2165,9 +2403,9 @@ function RunView({
2165
2403
  /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
2166
2404
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
2167
2405
  const agg = summary.aggregates.get(id);
2168
- const scoreKeys = [
2169
- ...summary.scoreItemsByEvaluatorScore?.keys() ?? []
2170
- ].filter((k) => k.startsWith(`${id}:`));
2406
+ const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
2407
+ (k) => k.startsWith(`${id}:`)
2408
+ );
2171
2409
  if (scoreKeys.length === 0) {
2172
2410
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2173
2411
  "- ",
@@ -2197,19 +2435,12 @@ function RunView({
2197
2435
  const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
2198
2436
  const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
2199
2437
  const numeric = toNumericScore(aggregated.data);
2200
- return /* @__PURE__ */ jsxs(
2201
- Text,
2202
- {
2203
- color: numeric !== void 0 ? scoreColor(numeric) : "gray",
2204
- children: [
2205
- " ",
2206
- label,
2207
- ": ",
2208
- formatted
2209
- ]
2210
- },
2211
- key
2212
- );
2438
+ return /* @__PURE__ */ jsxs(Text, { color: numeric !== void 0 ? scoreColor(numeric) : "gray", children: [
2439
+ " ",
2440
+ label,
2441
+ ": ",
2442
+ formatted
2443
+ ] }, key);
2213
2444
  })
2214
2445
  ] }, id);
2215
2446
  })
@@ -2252,10 +2483,10 @@ function RunView({
2252
2483
  ] }, tc.testCaseId);
2253
2484
  })
2254
2485
  ] }),
2255
- /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2256
- "artifact: ",
2257
- summary.artifactPath
2258
- ] }) })
2486
+ /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
2487
+ /* @__PURE__ */ jsx(Text, { color: "gray", children: "artifact(s):" }),
2488
+ summary.artifactPath.split("\n").map((line) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, line))
2489
+ ] })
2259
2490
  ] })
2260
2491
  ] });
2261
2492
  }
@@ -2285,9 +2516,7 @@ function buildTestCaseSummaries(byId) {
2285
2516
  for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
2286
2517
  const scoreIdToItems = /* @__PURE__ */ new Map();
2287
2518
  for (const ev of events) {
2288
- const es = ev.evaluatorScores.find(
2289
- (x) => x.evaluatorId === evaluatorScores.evaluatorId
2290
- );
2519
+ const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorScores.evaluatorId);
2291
2520
  for (const s of es?.scores ?? []) {
2292
2521
  const list = scoreIdToItems.get(s.id) ?? [];
2293
2522
  list.push(s);
@@ -2340,9 +2569,7 @@ function scoreToColor(score) {
2340
2569
  }
2341
2570
  function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
2342
2571
  const lines = [];
2343
- const scoreKeys = [...scoreItemsByKey.keys()].filter(
2344
- (k) => k.startsWith(`${evaluatorId}:`)
2345
- );
2572
+ const scoreKeys = [...scoreItemsByKey.keys()].filter((k) => k.startsWith(`${evaluatorId}:`));
2346
2573
  if (scoreKeys.length === 0) {
2347
2574
  lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
2348
2575
  return lines;
@@ -2377,9 +2604,7 @@ function createBar2(value, max = 100, width = 20) {
2377
2604
  function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
2378
2605
  if (events.length === 0)
2379
2606
  return [];
2380
- const evaluatorIds = new Set(
2381
- events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
2382
- );
2607
+ const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
2383
2608
  const result = [];
2384
2609
  for (const evaluatorId of evaluatorIds) {
2385
2610
  const scoreIdToItems = /* @__PURE__ */ new Map();
@@ -2426,9 +2651,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2426
2651
  if (def) {
2427
2652
  const formatted = def.format(m.data, options);
2428
2653
  const label = m.name ?? def.name;
2429
- metricParts.push(
2430
- label ? `[${label}: ${formatted}]` : `[${formatted}]`
2431
- );
2654
+ metricParts.push(label ? `[${label}: ${formatted}]` : `[${formatted}]`);
2432
2655
  }
2433
2656
  }
2434
2657
  }
@@ -2475,25 +2698,14 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2475
2698
  }
2476
2699
  return lines;
2477
2700
  }
2478
- async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
2479
- const dataset = await runner.resolveDatasetByName(datasetName);
2480
- if (!dataset) {
2481
- const known = await runner.collectDatasets();
2482
- const available = known.map((item) => item.dataset.getName()).sort();
2483
- throw new Error(
2484
- available.length > 0 ? `Dataset "${datasetName}" not found. Available datasets: ${available.join(", ")}` : `Dataset "${datasetName}" not found and no datasets were discovered.`
2485
- );
2486
- }
2487
- const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
2488
- if (evaluators.length === 0) {
2489
- const known = await runner.collectEvaluators();
2490
- const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
2491
- throw new Error(
2492
- available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available evaluators: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}" and no evaluators were discovered.`
2493
- );
2701
+ async function runSimpleEvalRunConfigsPlain(runner, runConfigNames, concurrency) {
2702
+ const jobs = await runner.expandRunConfigNamesToJobs(runConfigNames);
2703
+ if (jobs.length === 0) {
2704
+ throw new Error("No jobs expanded from RunConfigs.");
2494
2705
  }
2706
+ const evaluators = await runner.collectEvaluators();
2495
2707
  const evaluatorNameById = new Map(
2496
- evaluators.map((item) => [item.id, item.evaluator.getName() ?? item.id])
2708
+ evaluators.map((item) => [item.id, getEvaluatorDisplayLabel(item.evaluator) ?? item.id])
2497
2709
  );
2498
2710
  const aggregates = /* @__PURE__ */ new Map();
2499
2711
  const scoreItemsByEvaluatorScore = /* @__PURE__ */ new Map();
@@ -2501,11 +2713,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2501
2713
  let overallScoreTotal = 0;
2502
2714
  let overallScoreSumSq = 0;
2503
2715
  let overallScoreCount = 0;
2504
- let startedCount = 0;
2505
- let completedCount = 0;
2716
+ let globalStartedUnits = 0;
2717
+ let globalCompletedUnits = 0;
2506
2718
  let totalCount = 0;
2507
2719
  let runFinished = false;
2508
- const inFlightReruns = /* @__PURE__ */ new Set();
2720
+ const inFlightRepetitions = /* @__PURE__ */ new Set();
2509
2721
  const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
2510
2722
  let spinnerIndex = 0;
2511
2723
  function clearLine() {
@@ -2527,33 +2739,46 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2527
2739
  spinnerIndex += 1;
2528
2740
  process.stdout.write(
2529
2741
  `\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
2530
- `${completedCount}/${totalCount}`,
2742
+ `${globalCompletedUnits}/${totalCount}`,
2531
2743
  ansi2.bold
2532
- )} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
2744
+ )} completed ${colorize(`${globalStartedUnits}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightRepetitions.size} running)`, ansi2.dim)}`
2533
2745
  );
2534
2746
  }
2535
2747
  let lastPrintedTestCaseId = null;
2536
2748
  let lastPrintedLineCount = 0;
2537
2749
  let spinnerTimer;
2538
- const done = new Promise((resolve5) => {
2750
+ const batchPendingRunIds = /* @__PURE__ */ new Set();
2751
+ const runIdToLabel = /* @__PURE__ */ new Map();
2752
+ let batchReady = false;
2753
+ const completedRuns = /* @__PURE__ */ new Map();
2754
+ const done = new Promise((resolve5, reject) => {
2539
2755
  const unsubscribe = runner.subscribeRunEvents((event) => {
2756
+ if (batchReady && "runId" in event && typeof event.runId === "string" && !batchPendingRunIds.has(event.runId)) {
2757
+ return;
2758
+ }
2759
+ const rowPrefix = typeof event.runId === "string" ? runIdToLabel.get(event.runId) : void 0;
2760
+ const pfx = rowPrefix !== void 0 ? `${colorize(`[${rowPrefix}]`, ansi2.dim)} ` : "";
2540
2761
  if (event.type === "TestCaseStarted") {
2541
- startedCount = event.startedTestCases;
2542
- inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
2762
+ globalStartedUnits += 1;
2763
+ inFlightRepetitions.add(
2764
+ `${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
2765
+ );
2543
2766
  clearLine();
2544
2767
  process.stdout.write(
2545
- `${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
2768
+ `${pfx}${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
2546
2769
  `
2547
2770
  );
2548
2771
  drawSpinner();
2549
2772
  }
2550
2773
  if (event.type === "TestCaseProgress") {
2551
- completedCount = event.completedTestCases;
2552
- inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
2774
+ globalCompletedUnits += 1;
2775
+ inFlightRepetitions.delete(
2776
+ `${event.runId}:${event.testCaseId}:${event.repetitionId}:${event.repetitionIndex}`
2777
+ );
2553
2778
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
2554
2779
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
2555
- const testCaseId = event.testCaseId;
2556
- const existing = testCaseByTestId.get(testCaseId) ?? {
2780
+ const compositeId = `${event.runId}:${event.testCaseId}`;
2781
+ const existing = testCaseByTestId.get(compositeId) ?? {
2557
2782
  name: event.testCaseName,
2558
2783
  events: []
2559
2784
  };
@@ -2563,7 +2788,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2563
2788
  durationMs: event.durationMs,
2564
2789
  evaluatorScores: event.evaluatorScores
2565
2790
  });
2566
- testCaseByTestId.set(testCaseId, existing);
2791
+ testCaseByTestId.set(compositeId, existing);
2567
2792
  for (const item of event.evaluatorScores) {
2568
2793
  const numeric = toNumericScoreFromScores(item.scores);
2569
2794
  if (numeric !== void 0) {
@@ -2592,24 +2817,21 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2592
2817
  scoreItemsByEvaluatorScore.set(key, list);
2593
2818
  }
2594
2819
  }
2595
- const isSameTestCase = lastPrintedTestCaseId === testCaseId;
2596
- const isLastRerun = event.rerunIndex >= event.rerunTotal;
2820
+ const isSameTestCase = lastPrintedTestCaseId === compositeId;
2821
+ const isLastRepetition = event.repetitionIndex >= event.repetitionCount;
2597
2822
  const isNonTty = !process.stdout.isTTY;
2598
- const skipPrintNonTty = isNonTty && event.rerunTotal > 1 && !isLastRerun;
2823
+ const skipPrintNonTty = isNonTty && event.repetitionCount > 1 && !isLastRepetition;
2599
2824
  if (isSameTestCase && lastPrintedLineCount > 0 && !skipPrintNonTty) {
2600
2825
  cursorUp(lastPrintedLineCount);
2601
2826
  }
2602
2827
  const aggregatedScores = aggregateEvaluatorScoresFromEvents(
2603
2828
  existing.events);
2604
2829
  const isAggregated = existing.events.length > 1;
2605
- const durationMs = existing.events.reduce(
2606
- (s, e) => s + e.durationMs,
2607
- 0
2608
- );
2830
+ const durationMs = existing.events.reduce((s, e) => s + e.durationMs, 0);
2609
2831
  const lines = [];
2610
2832
  const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
2611
2833
  lines.push(
2612
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
2834
+ `${pfx}${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.repetitionIndex}/${event.repetitionCount})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
2613
2835
  );
2614
2836
  if (event.errorMessage) {
2615
2837
  lines.push(colorize(event.errorMessage, ansi2.red));
@@ -2617,18 +2839,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2617
2839
  for (const item of aggregatedScores) {
2618
2840
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2619
2841
  lines.push(
2620
- ...formatEvaluatorScoreLine(
2621
- name,
2622
- item.scores,
2623
- item.passed,
2624
- item.metrics,
2625
- { isAggregated }
2626
- )
2842
+ ...formatEvaluatorScoreLine(name, item.scores, item.passed, item.metrics, {
2843
+ isAggregated
2844
+ })
2627
2845
  );
2628
2846
  const lastEvent = existing.events[existing.events.length - 1];
2629
- const lastEs = lastEvent?.evaluatorScores.find(
2630
- (x) => x.evaluatorId === item.evaluatorId
2631
- );
2847
+ const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === item.evaluatorId);
2632
2848
  if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
2633
2849
  for (const log of lastEs.logs) {
2634
2850
  if (log.type === "diff") {
@@ -2646,73 +2862,102 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2646
2862
  }
2647
2863
  }
2648
2864
  if (!skipPrintNonTty) {
2649
- for (let i = 0; i < lines.length; i++) {
2865
+ for (let i = 0; i < lines.length; i += 1) {
2650
2866
  process.stdout.write(`\r\x1B[2K${lines[i]}
2651
2867
  `);
2652
2868
  }
2653
- lastPrintedTestCaseId = testCaseId;
2869
+ lastPrintedTestCaseId = compositeId;
2654
2870
  lastPrintedLineCount = lines.length;
2655
2871
  }
2656
2872
  drawSpinner();
2657
2873
  }
2658
- if (event.type === "RunCompleted" || event.type === "RunFailed") {
2874
+ if (event.type === "RunFailed") {
2875
+ if (batchReady && !batchPendingRunIds.has(event.runId)) {
2876
+ return;
2877
+ }
2659
2878
  runFinished = true;
2660
2879
  clearLine();
2661
2880
  unsubscribe();
2662
- resolve5(event);
2881
+ reject(new Error(`Run failed: ${event.errorMessage}`));
2882
+ return;
2883
+ }
2884
+ if (event.type === "RunCompleted") {
2885
+ if (!batchPendingRunIds.has(event.runId)) {
2886
+ return;
2887
+ }
2888
+ completedRuns.set(event.runId, event);
2889
+ batchPendingRunIds.delete(event.runId);
2890
+ if (batchPendingRunIds.size === 0) {
2891
+ runFinished = true;
2892
+ clearLine();
2893
+ unsubscribe();
2894
+ resolve5();
2895
+ }
2663
2896
  }
2664
2897
  });
2665
2898
  });
2666
- const snapshot = await runner.runDatasetWith({
2667
- datasetId: dataset.id,
2668
- evaluatorIds: evaluators.map((item) => item.id),
2669
- concurrency
2899
+ console.log(colorize("=== Eval Run Started (RunConfigs) ===", `${ansi2.bold}${ansi2.cyan}`));
2900
+ for (const name of runConfigNames) {
2901
+ const collected = await runner.resolveRunConfigByName(name);
2902
+ const label = collected?.runConfig.getDisplayLabel() ?? name;
2903
+ console.log(`RunConfig: ${colorize(label, ansi2.bold)}`);
2904
+ }
2905
+ console.log(`Jobs: ${colorize(String(jobs.length), ansi2.bold)}`);
2906
+ console.log(`Shared concurrency: ${colorize(String(concurrency), ansi2.bold)}`);
2907
+ console.log("");
2908
+ const snapshots = await runner.runDatasetJobsWithSharedConcurrency({
2909
+ jobs,
2910
+ globalConcurrency: concurrency
2670
2911
  });
2671
- totalCount = snapshot.totalTestCases;
2672
- console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
2673
- console.log(`Run: ${colorize(snapshot.runId, ansi2.cyan)}`);
2674
- console.log(`Dataset: ${colorize(snapshot.datasetName, ansi2.bold)}`);
2675
- console.log(
2676
- `Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
2677
- );
2678
- console.log(
2679
- `Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`
2680
- );
2912
+ for (let i = 0; i < snapshots.length; i += 1) {
2913
+ const snap = snapshots[i];
2914
+ const job = jobs[i];
2915
+ if (snap && job) {
2916
+ runIdToLabel.set(
2917
+ snap.runId,
2918
+ `${job.runConfigDisplayLabel ?? job.runConfigName} \xB7 ${snap.datasetName}`
2919
+ );
2920
+ batchPendingRunIds.add(snap.runId);
2921
+ }
2922
+ }
2923
+ totalCount = snapshots.reduce((sum, s) => sum + s.totalTestCases, 0);
2924
+ console.log(`Total evaluation units: ${colorize(String(totalCount), ansi2.bold)}`);
2681
2925
  console.log("");
2926
+ batchReady = true;
2682
2927
  drawSpinner();
2683
2928
  spinnerTimer = setInterval(drawSpinner, 100);
2684
- const finalEvent = await done;
2929
+ await done;
2685
2930
  if (spinnerTimer) {
2686
2931
  clearInterval(spinnerTimer);
2687
2932
  }
2688
- if (finalEvent.type === "RunFailed") {
2689
- throw new Error(`Run failed: ${finalEvent.errorMessage}`);
2690
- }
2691
- const completed = finalEvent;
2692
2933
  console.log("");
2693
- console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
2694
- console.log(
2695
- `- passed: ${colorize(
2696
- `${completed.passedTestCases}/${completed.totalTestCases}`,
2697
- ansi2.green
2698
- )}`
2699
- );
2700
- console.log(
2701
- `- failed: ${colorize(
2702
- `${completed.failedTestCases}/${completed.totalTestCases}`,
2703
- completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
2704
- )}`
2705
- );
2934
+ console.log(colorize("=== Run Summary (all jobs) ===", `${ansi2.bold}${ansi2.cyan}`));
2935
+ for (const snap of snapshots) {
2936
+ const completed = completedRuns.get(snap.runId);
2937
+ if (!completed) {
2938
+ continue;
2939
+ }
2940
+ const label = runIdToLabel.get(snap.runId) ?? snap.runId;
2941
+ console.log("");
2942
+ console.log(colorize(`\u2014 ${label}`, ansi2.magenta));
2943
+ console.log(
2944
+ `- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
2945
+ );
2946
+ console.log(
2947
+ `- failed: ${colorize(
2948
+ `${completed.failedTestCases}/${completed.totalTestCases}`,
2949
+ completed.failedTestCases > 0 ? ansi2.red : ansi2.dim
2950
+ )}`
2951
+ );
2952
+ console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2953
+ }
2706
2954
  if (overallScoreCount > 0) {
2707
2955
  const overallAverage = overallScoreTotal / overallScoreCount;
2708
- const overallSd = sampleStdDev2(
2709
- overallScoreTotal,
2710
- overallScoreSumSq,
2711
- overallScoreCount
2712
- );
2956
+ const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
2713
2957
  const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
2958
+ console.log("");
2714
2959
  console.log(
2715
- `- overall avg score: ${colorize(
2960
+ `- overall avg score (all jobs): ${colorize(
2716
2961
  avgStr,
2717
2962
  scoreToColor(overallAverage)
2718
2963
  )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
@@ -2753,22 +2998,28 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2753
2998
  );
2754
2999
  }
2755
3000
  }
2756
- console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
3001
+ let failedTestCasesTotal = 0;
3002
+ for (const snap of snapshots) {
3003
+ const completed = completedRuns.get(snap.runId);
3004
+ if (completed) {
3005
+ failedTestCasesTotal += completed.failedTestCases;
3006
+ }
3007
+ }
3008
+ return failedTestCasesTotal > 0 ? 1 : 0;
2757
3009
  }
2758
- async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
3010
+ async function runSimpleEvalRunConfigsInk(runner, runConfigNames, concurrency) {
2759
3011
  return new Promise((resolve5, reject) => {
2760
3012
  const app = render(
2761
- React2.createElement(RunView, {
3013
+ React.createElement(RunView, {
2762
3014
  runner,
2763
- datasetName,
2764
- evaluatorPattern,
3015
+ runConfigNames,
2765
3016
  concurrency,
2766
- onComplete: (err) => {
3017
+ onComplete: (err, exitCode) => {
2767
3018
  app.unmount();
2768
3019
  if (err) {
2769
3020
  reject(err);
2770
3021
  } else {
2771
- resolve5();
3022
+ resolve5(exitCode ?? 0);
2772
3023
  }
2773
3024
  }
2774
3025
  })
@@ -2794,12 +3045,22 @@ async function main() {
2794
3045
  if (!args.command) {
2795
3046
  printUsageAndExit(1);
2796
3047
  }
2797
- if (!args.datasetName) {
2798
- console.error("Missing required --dataset <datasetName> argument.");
2799
- printUsageAndExit(1);
3048
+ if (args.command === "run") {
3049
+ if (args.runConfigNames.length === 0) {
3050
+ console.error(
3051
+ "Missing required --run-config <name> (repeat the flag to queue multiple RunConfigs)."
3052
+ );
3053
+ printUsageAndExit(1);
3054
+ }
3055
+ if (args.datasetName !== void 0) {
3056
+ console.error(
3057
+ "The run command no longer accepts --dataset; use --run-config <RunConfig name>."
3058
+ );
3059
+ printUsageAndExit(1);
3060
+ }
2800
3061
  }
2801
- if (args.command === "run" && !args.evaluatorPattern) {
2802
- console.error("Missing required --evaluator <name-or-pattern> argument.");
3062
+ if (args.command === "generate" && args.runConfigNames.length > 0) {
3063
+ console.error("generate does not accept --run-config.");
2803
3064
  printUsageAndExit(1);
2804
3065
  }
2805
3066
  const useInk = process.stdout.isTTY === true;
@@ -2810,17 +3071,24 @@ async function main() {
2810
3071
  try {
2811
3072
  if (args.command === "run") {
2812
3073
  const concurrency = args.concurrency ?? getDefaultConcurrency();
2813
- await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
3074
+ const exitCode = await (useInk ? runSimpleEvalRunConfigsInk : runSimpleEvalRunConfigsPlain)(
2814
3075
  runner,
2815
- args.datasetName,
2816
- args.evaluatorPattern,
3076
+ args.runConfigNames,
2817
3077
  concurrency
2818
3078
  );
3079
+ if (args.ci && exitCode !== 0) {
3080
+ process.exit(1);
3081
+ }
2819
3082
  return;
2820
3083
  }
3084
+ const genDataset = args.datasetName;
3085
+ if (!genDataset) {
3086
+ console.error("Missing required --dataset <datasetName> argument.");
3087
+ printUsageAndExit(1);
3088
+ }
2821
3089
  await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
2822
3090
  runner,
2823
- args.datasetName
3091
+ genDataset
2824
3092
  );
2825
3093
  } finally {
2826
3094
  await runner.shutdown();