@m4trix/evals 0.25.1 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -5,16 +5,16 @@ var fullscreenInk = require('fullscreen-ink');
5
5
  var React = require('react');
6
6
  var ink = require('ink');
7
7
  var jsxRuntime = require('react/jsx-runtime');
8
- var path = require('path');
9
- var inkChart = require('@pppp606/ink-chart');
10
- var crypto = require('crypto');
11
8
  var effect = require('effect');
9
+ var crypto = require('crypto');
10
+ var promises = require('fs/promises');
11
+ var path = require('path');
12
12
  var fs = require('fs');
13
13
  var jitiModule = require('jiti');
14
- var promises = require('fs/promises');
15
14
  var url = require('url');
16
15
  var diff = require('diff');
17
16
  var stringify = require('fast-json-stable-stringify');
17
+ var inkChart = require('@pppp606/ink-chart');
18
18
 
19
19
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
20
20
  function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
@@ -264,6 +264,50 @@ function isPrintableCharacter(input) {
264
264
  function isBackKey(key) {
265
265
  return key.backspace || key.delete;
266
266
  }
267
/** Characters allowed in an entity identifier: ASCII letters, digits, "_" and "-". */
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;

/**
 * Builds a branded string schema for an entity identifier.
 *
 * The schema requires a trimmed, non-empty string made only of the characters
 * in ENTITY_ID_PATTERN, and brands the decoded value with `brand`.
 *
 * @param brand Brand applied to the decoded string.
 * @param label Human-readable entity name used at the start of error messages.
 * @returns The composed Effect schema.
 */
function makeEntityIdSchema(brand, label) {
  const requireNonEmpty = effect.Schema.minLength(1, {
    message: () => `${label} must be non-empty.`
  });
  const requireAllowedCharacters = effect.Schema.pattern(ENTITY_ID_PATTERN, {
    message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
  });
  return effect.Schema.String.pipe(
    effect.Schema.trimmed(),
    requireNonEmpty,
    requireAllowedCharacters,
    effect.Schema.brand(brand)
  );
}
280
+ var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
281
+ makeEntityIdSchema("EvaluatorName", "Evaluator name");
282
+ makeEntityIdSchema("TestCaseName", "Test case name");
283
/**
 * Decodes `raw` with an Effect schema and returns the decoded value.
 *
 * String input is trimmed before decoding. Non-string input is passed to the
 * decoder untouched so it is rejected with the same context-prefixed error as
 * any other invalid value, instead of crashing on `raw.trim()`.
 *
 * @param schema  Effect schema to decode with.
 * @param raw     Value to validate.
 * @param context Text placed before the formatted parse error.
 * @returns The decoded value (`result.right`).
 * @throws {Error} `${context}: <formatted parse error>` when decoding fails.
 */
function validateWithSchema(schema, raw, context) {
  const candidate = typeof raw === "string" ? raw.trim() : raw;
  const decode = effect.Schema.decodeUnknownEither(schema);
  const result = decode(candidate);
  if (effect.Either.isLeft(result)) {
    throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
  }
  return result.right;
}
294
/**
 * Validates a RunConfig name against RunConfigNameSchema.
 *
 * @param raw     Candidate name.
 * @param context Prefix for the error message raised on invalid input.
 * @returns The validated name as returned by validateWithSchema.
 */
function validateRunConfigName(raw, context) {
  const validatedName = validateWithSchema(RunConfigNameSchema, raw, context);
  return validatedName;
}
297
+
298
+ // src/evals/evaluator.ts
299
/**
 * Picks the text used to present an evaluator.
 *
 * Uses `getDisplayLabel()` when that method exists and returns something other
 * than `undefined`; otherwise uses `getName()` when that method exists.
 *
 * @param evaluator Evaluator-like object; both methods are optional.
 * @returns The label, the name, or `undefined` when neither is available.
 */
function getEvaluatorDisplayLabel(evaluator) {
  const hasLabelGetter = typeof evaluator.getDisplayLabel === "function";
  const displayLabel = hasLabelGetter ? evaluator.getDisplayLabel() : undefined;
  if (displayLabel !== undefined) {
    return displayLabel;
  }
  if (typeof evaluator.getName !== "function") {
    return undefined;
  }
  return evaluator.getName();
}
308
/**
 * Returns a fresh array holding the evaluator's tags.
 *
 * An evaluator without a `getTags` method, or whose `getTags()` returns
 * `null`/`undefined`, yields an empty list rather than throwing on the spread.
 *
 * @param evaluator Evaluator-like object; `getTags` is optional.
 * @returns A new array of tags (never the array returned by `getTags()` itself).
 */
function getEvaluatorTagList(evaluator) {
  if (typeof evaluator.getTags !== "function") {
    return [];
  }
  return [...(evaluator.getTags() ?? [])];
}
267
311
 
268
312
  // src/cli/data.mock.json
269
313
  var data_mock_default = {
@@ -519,7 +563,7 @@ function toEvalDataset(item, snapshots) {
519
563
  function toEvaluatorOption(item) {
520
564
  return {
521
565
  id: item.id,
522
- name: item.evaluator.getName() ?? toSlug(item.id),
566
+ name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
523
567
  configPreview: `Source: ${item.filePath}`
524
568
  };
525
569
  }
@@ -762,6 +806,159 @@ function reduceCliState(state, action) {
762
806
  }
763
807
  return state;
764
808
  }
809
/**
 * Builds run snapshots from every `.jsonl` file in the artifact directory.
 *
 * Best effort: a directory that cannot be listed yields an empty list, and a
 * file that fails to parse is left out of the result.
 *
 * @param config Object providing `artifactDirectory`; also forwarded to the parser.
 * @returns Snapshots ordered by `queuedAt`, highest first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = path.resolve(config.artifactDirectory);
  let directoryEntries;
  try {
    directoryEntries = await promises.readdir(artifactRoot);
  } catch {
    return [];
  }
  const collected = [];
  for (const entryName of directoryEntries) {
    if (!entryName.endsWith(".jsonl")) {
      continue;
    }
    const artifactFile = path.join(artifactRoot, entryName);
    try {
      const parsed = await parseArtifactToSnapshot(artifactFile, config);
      if (parsed) {
        collected.push(parsed);
      }
    } catch {
      // Deliberately ignored: one bad artifact must not hide the others.
    }
  }
  collected.sort((left, right) => right.queuedAt - left.queuedAt);
  return collected;
}
831
/**
 * Reads one JSONL artifact and condenses its run-level events into a snapshot.
 *
 * Each non-blank line is parsed as JSON; lines that fail to parse are ignored.
 * The last event of each run-level type wins. Status is derived in the order
 * failed, completed, running, queued.
 *
 * @param filePath Artifact file to read; also reported as `artifactPath`.
 * @param _config  Unused.
 * @returns The snapshot, or `null` when the file has no non-blank lines or no
 *          `RunQueued` event.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const text = await promises.readFile(filePath, "utf8");
  const lines = text.split("\n").filter((line) => line.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }
  let queued = null;
  let started = null;
  let completed = null;
  let failed = null;
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      switch (event.type) {
        case "RunQueued":
          queued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            ts: event.ts
          };
          break;
        case "RunStarted":
          started = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          completed = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            totalTestCases: event.totalTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          failed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Malformed line: skip it and keep scanning.
    }
  }
  if (!queued) {
    return null;
  }
  let status = "queued";
  if (failed) {
    status = "failed";
  } else if (completed) {
    status = "completed";
  } else if (started) {
    status = "running";
  }
  const progress = aggregateTestCaseProgress(lines);
  return {
    runId: queued.runId,
    datasetId: queued.datasetId,
    datasetName: queued.datasetName,
    evaluatorIds: queued.evaluatorIds,
    queuedAt: queued.ts ?? 0,
    startedAt: started?.startedAt,
    finishedAt: completed?.finishedAt ?? failed?.finishedAt,
    totalTestCases: queued.totalTestCases,
    // A completed run reports the queued total as completed.
    completedTestCases: completed ? queued.totalTestCases : progress.completedTestCases,
    passedTestCases: completed?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: completed?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: failed?.errorMessage
  };
}
902
/**
 * Summarises the `TestCaseProgress` events found in raw JSONL lines.
 *
 * `completedTestCases` is taken from the last progress event that carries the
 * field. A test case counts as passed only if every progress event recorded
 * for its id passed; otherwise it counts as failed. Lines that are not valid
 * JSON, or are other event types, are ignored.
 *
 * @param lines Raw JSONL lines.
 * @returns `{ completedTestCases, passedTestCases, failedTestCases }`.
 */
function aggregateTestCaseProgress(lines) {
  const verdictByTestCase = /* @__PURE__ */ new Map();
  let completedTestCases = 0;
  for (const rawLine of lines) {
    let progress;
    try {
      progress = JSON.parse(rawLine);
      if (progress.type !== "TestCaseProgress") {
        continue;
      }
    } catch {
      continue;
    }
    completedTestCases = progress.completedTestCases ?? completedTestCases;
    const earlier = verdictByTestCase.get(progress.testCaseId);
    const combined = earlier === undefined ? progress.passed : earlier && progress.passed;
    verdictByTestCase.set(progress.testCaseId, combined);
  }
  let passedTestCases = 0;
  for (const verdict of verdictByTestCase.values()) {
    if (verdict) {
      passedTestCases += 1;
    }
  }
  const failedTestCases = verdictByTestCase.size - passedTestCases;
  return { completedTestCases, passedTestCases, failedTestCases };
}
929
/**
 * Extracts every `TestCaseProgress` event from a JSONL artifact.
 *
 * Repetition fields fall back to the older `rerunIndex` / `rerunTotal` keys
 * when `repetitionIndex` / `repetitionCount` are absent. Unparseable lines and
 * other event types are skipped; a file that cannot be read yields `[]`.
 *
 * @param artifactPath Artifact file to read.
 * @returns One result object per progress event, in file order.
 */
async function parseArtifactFile(artifactPath) {
  let text;
  try {
    text = await promises.readFile(artifactPath, "utf8");
  } catch {
    return [];
  }
  const progressResults = [];
  for (const rawLine of text.split("\n")) {
    if (rawLine.trim().length === 0) {
      continue;
    }
    let event;
    try {
      event = JSON.parse(rawLine);
      if (event.type !== "TestCaseProgress") {
        continue;
      }
    } catch {
      continue;
    }
    progressResults.push({
      testCaseId: event.testCaseId,
      testCaseName: event.testCaseName,
      completedTestCases: event.completedTestCases,
      totalTestCases: event.totalTestCases,
      repetitionId: event.repetitionId,
      repetitionIndex: event.repetitionIndex ?? event.rerunIndex,
      repetitionCount: event.repetitionCount ?? event.rerunTotal,
      passed: event.passed,
      durationMs: event.durationMs,
      evaluatorScores: event.evaluatorScores ?? []
    });
  }
  return progressResults;
}
765
962
 
766
963
  // src/runner/config.ts
767
964
  var defaultRunnerConfig = {
@@ -769,6 +966,7 @@ var defaultRunnerConfig = {
769
966
  rootDir: process.cwd(),
770
967
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
771
968
  evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
969
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
772
970
  testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
773
971
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
774
972
  },
@@ -794,6 +992,11 @@ function toRunnerConfigOverrides(config) {
794
992
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
795
993
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
796
994
  }
995
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
996
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
997
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
998
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
999
+ }
797
1000
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
798
1001
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
799
1002
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -892,6 +1095,9 @@ function isDatasetLike(value) {
892
1095
  function isEvaluatorLike(value) {
893
1096
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
894
1097
  }
1098
/**
 * Duck-type check for a RunConfig export: the value must pass `hasMethod` for
 * both `getName` and `getRuns`, and `getRuns` must be a function.
 *
 * @param value Candidate module export.
 * @returns `true` when all checks hold.
 */
function isRunConfigLike(value) {
  if (!hasMethod(value, "getName")) {
    return false;
  }
  if (!hasMethod(value, "getRuns")) {
    return false;
  }
  return typeof value.getRuns === "function";
}
895
1101
  function isTestCaseLike(value) {
896
1102
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
897
1103
  }
@@ -980,6 +1186,23 @@ async function collectEvaluatorsFromFiles(config) {
980
1186
  );
981
1187
  return found.flat();
982
1188
  }
1189
/**
 * Finds RunConfig exports in files under the discovery root.
 *
 * Walks `config.rootDir` (honouring `config.excludeDirectories`), keeps files
 * matching one of `config.runConfigSuffixes`, loads them concurrently and keeps
 * the exports accepted by `isRunConfigLike`.
 *
 * @param config Discovery configuration.
 * @returns One `{ id, filePath, runConfig }` record per export; `id` is the
 *          RunConfig name and `filePath` is relative to `config.rootDir`.
 */
async function collectRunConfigsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const candidateFiles = allFiles.filter(
    (candidate) => hasOneSuffix(candidate, config.runConfigSuffixes)
  );
  const loadFromFile = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const filePath = path.relative(config.rootDir, absolutePath);
    const records = [];
    for (const runConfig of moduleExports.filter(isRunConfigLike)) {
      records.push({ id: runConfig.getName(), filePath, runConfig });
    }
    return records;
  };
  const recordsPerFile = await Promise.all(candidateFiles.map(loadFromFile));
  return recordsPerFile.flat();
}
983
1206
  async function collectTestCasesFromFiles(config) {
984
1207
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
985
1208
  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
@@ -1133,6 +1356,17 @@ function getDiffLines(entry) {
1133
1356
  });
1134
1357
  }
1135
1358
 
1359
+ // src/evals/test-case.ts
1360
/**
 * Picks the text used to present a test case.
 *
 * Uses `getDisplayLabel()` when that method exists and returns something other
 * than `undefined`. Otherwise falls back to `getName()`, the same fallback
 * `getEvaluatorDisplayLabel` applies, so an unset label does not surface as
 * `undefined` in emitted events.
 *
 * @param testCase Test-case-like object; both methods are optional.
 * @returns The label, the name, or `""` when neither is available.
 */
function getTestCaseDisplayLabel(testCase) {
  if (typeof testCase.getDisplayLabel === "function") {
    const label = testCase.getDisplayLabel();
    if (label !== undefined) {
      return label;
    }
  }
  return typeof testCase.getName === "function" ? testCase.getName() : "";
}
1366
/**
 * Returns a fresh array holding the test case's tags.
 *
 * A test case without a `getTags` method, or whose `getTags()` returns
 * `null`/`undefined`, yields an empty list rather than throwing on the spread.
 *
 * @param testCase Test-case-like object; `getTags` is optional.
 * @returns A new array of tags (never the array returned by `getTags()` itself).
 */
function getTestCaseTagList(testCase) {
  if (typeof testCase.getTags !== "function") {
    return [];
  }
  return [...(testCase.getTags() ?? [])];
}
1369
+
1136
1370
  // src/evals/metric.ts
1137
1371
  var registry = /* @__PURE__ */ new Map();
1138
1372
  var Metric = {
@@ -1156,6 +1390,54 @@ function getMetricById(id) {
1156
1390
  return registry.get(id);
1157
1391
  }
1158
1392
 
1393
+ // src/evals/aggregators.ts
1394
/**
 * Adds up token counts field by field.
 *
 * A field missing (`null`/`undefined`) on a sample contributes 0.
 *
 * @param values Array of samples with optional `input`, `output`,
 *               `inputCached` and `outputCached` counts.
 * @returns Totals for the four fields; all zero for an empty array.
 */
function aggregateTokenCountSum(values) {
  let input = 0;
  let output = 0;
  let inputCached = 0;
  let outputCached = 0;
  values.forEach((sample) => {
    input += sample.input ?? 0;
    output += sample.output ?? 0;
    inputCached += sample.inputCached ?? 0;
    outputCached += sample.outputCached ?? 0;
  });
  return { input, output, inputCached, outputCached };
}
1411
/**
 * Averages latency samples.
 *
 * Samples whose `ms` is not a finite number are left out of the average, so a
 * single incomplete sample cannot turn the result into `NaN`.
 *
 * @param values Array of samples shaped like `{ ms }`.
 * @returns `{ ms }` holding the mean of the usable samples, or `{ ms: 0 }`
 *          when there are none.
 */
function aggregateLatencyAverage(values) {
  const usable = values.map((sample) => sample.ms).filter((ms) => Number.isFinite(ms));
  if (usable.length === 0) {
    return { ms: 0 };
  }
  const total = usable.reduce((sum, ms) => sum + ms, 0);
  return { ms: total / usable.length };
}
1418
+
1419
+ // src/evals/metrics/standard.ts
1420
+ Metric.of({
1421
+ id: "token-count",
1422
+ name: "Tokens",
1423
+ aggregate: aggregateTokenCountSum,
1424
+ format: (data, options) => {
1425
+ const input = data.input ?? 0;
1426
+ const output = data.output ?? 0;
1427
+ const inputCached = data.inputCached ?? 0;
1428
+ const outputCached = data.outputCached ?? 0;
1429
+ const cached = inputCached + outputCached;
1430
+ const base = `in:${input} out:${output} cached:${cached}`;
1431
+ return options?.isAggregated ? `Total: ${base}` : base;
1432
+ }
1433
+ });
1434
+ Metric.of({
1435
+ id: "latency",
1436
+ name: "Latency",
1437
+ aggregate: aggregateLatencyAverage,
1438
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1439
+ });
1440
+
1159
1441
  // src/evals/score.ts
1160
1442
  var registry2 = /* @__PURE__ */ new Map();
1161
1443
  function formatScoreData(def, data, options) {
@@ -1264,54 +1546,6 @@ function getScoreById(id) {
1264
1546
  return registry2.get(id);
1265
1547
  }
1266
1548
 
1267
- // src/evals/aggregators.ts
1268
- function aggregateTokenCountSum(values) {
1269
- const initial = {
1270
- input: 0,
1271
- output: 0,
1272
- inputCached: 0,
1273
- outputCached: 0
1274
- };
1275
- return values.reduce(
1276
- (acc, v) => ({
1277
- input: acc.input + (v.input ?? 0),
1278
- output: acc.output + (v.output ?? 0),
1279
- inputCached: acc.inputCached + (v.inputCached ?? 0),
1280
- outputCached: acc.outputCached + (v.outputCached ?? 0)
1281
- }),
1282
- initial
1283
- );
1284
- }
1285
- function aggregateLatencyAverage(values) {
1286
- if (values.length === 0) {
1287
- return { ms: 0 };
1288
- }
1289
- const sum = values.reduce((s, v) => s + v.ms, 0);
1290
- return { ms: sum / values.length };
1291
- }
1292
-
1293
- // src/evals/metrics/standard.ts
1294
- Metric.of({
1295
- id: "token-count",
1296
- name: "Tokens",
1297
- aggregate: aggregateTokenCountSum,
1298
- format: (data, options) => {
1299
- const input = data.input ?? 0;
1300
- const output = data.output ?? 0;
1301
- const inputCached = data.inputCached ?? 0;
1302
- const outputCached = data.outputCached ?? 0;
1303
- const cached = inputCached + outputCached;
1304
- const base = `in:${input} out:${output} cached:${cached}`;
1305
- return options?.isAggregated ? `Total: ${base}` : base;
1306
- }
1307
- });
1308
- Metric.of({
1309
- id: "latency",
1310
- name: "Latency",
1311
- aggregate: aggregateLatencyAverage,
1312
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1313
- });
1314
-
1315
1549
  // src/evals/scores/standard.ts
1316
1550
  Score.of({
1317
1551
  id: "percent",
@@ -1418,15 +1652,17 @@ function readOutput(testCase) {
1418
1652
  }
1419
1653
  return candidate.getOutput();
1420
1654
  }
1421
- function buildEvaluationUnits(testCases) {
1655
+ function buildEvaluationUnits(testCases, repetitionCount) {
1656
+ const count = Math.max(1, repetitionCount);
1422
1657
  const units = [];
1423
1658
  for (const testCaseItem of testCases) {
1424
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1425
- for (let r = 0; r < rerunTotal; r++) {
1659
+ const repetitionId = `rep-${crypto.randomUUID()}`;
1660
+ for (let r = 0; r < count; r++) {
1426
1661
  units.push({
1427
1662
  testCaseItem,
1428
- rerunIndex: r + 1,
1429
- rerunTotal
1663
+ repetitionId,
1664
+ repetitionIndex: r + 1,
1665
+ repetitionCount: count
1430
1666
  });
1431
1667
  }
1432
1668
  }
@@ -1439,7 +1675,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1439
1675
  return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1440
1676
  }
1441
1677
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1442
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
1678
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
1443
1679
  return effect.Effect.gen(function* () {
1444
1680
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
1445
1681
  const started = Date.now();
@@ -1448,11 +1684,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1448
1684
  type: "TestCaseStarted",
1449
1685
  runId: task.runId,
1450
1686
  testCaseId: testCaseItem.id,
1451
- testCaseName: testCaseItem.testCase.getName(),
1687
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1452
1688
  startedTestCases: startedEvaluations,
1453
1689
  totalTestCases: totalEvaluations,
1454
- rerunIndex,
1455
- rerunTotal
1690
+ repetitionId,
1691
+ repetitionIndex,
1692
+ repetitionCount
1456
1693
  });
1457
1694
  const evaluatorScores = [];
1458
1695
  let testCaseError;
@@ -1486,8 +1723,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1486
1723
  meta: {
1487
1724
  triggerId: task.triggerId,
1488
1725
  runId: evaluatorRunId,
1489
- datasetId: task.datasetId
1726
+ datasetId: task.datasetId,
1727
+ repetitionId,
1728
+ repetitionIndex,
1729
+ repetitionCount,
1730
+ runConfigName: task.runConfigName
1490
1731
  },
1732
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1733
+ runConfigTags: task.runConfigTags,
1734
+ evaluatorTags: getEvaluatorTagList(evaluator),
1491
1735
  logDiff,
1492
1736
  log,
1493
1737
  createError
@@ -1530,18 +1774,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1530
1774
  });
1531
1775
  }
1532
1776
  }
1533
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1777
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1534
1778
  const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1535
1779
  const progressEvent = {
1536
1780
  type: "TestCaseProgress",
1537
1781
  runId: task.runId,
1538
1782
  testCaseId: testCaseItem.id,
1539
- testCaseName: testCaseItem.testCase.getName(),
1783
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1540
1784
  completedTestCases: completedEvaluations,
1541
1785
  totalTestCases: totalEvaluations,
1542
- rerunIndex,
1543
- rerunTotal,
1544
- passed: rerunPassedThis,
1786
+ repetitionId,
1787
+ repetitionIndex,
1788
+ repetitionCount,
1789
+ passed: repetitionPassedThis,
1545
1790
  durationMs: Date.now() - started,
1546
1791
  evaluatorScores,
1547
1792
  output,
@@ -1562,9 +1807,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1562
1807
  (map) => {
1563
1808
  const key = testCaseItem.id;
1564
1809
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
1565
- const newResults = [...existing.results, rerunPassedThis];
1810
+ const newResults = [...existing.results, repetitionPassedThis];
1566
1811
  const newCompletedCount = existing.completedCount + 1;
1567
- const isLast = newCompletedCount === rerunTotal;
1812
+ const isLast = newCompletedCount === repetitionCount;
1568
1813
  const newMap = new Map(map);
1569
1814
  newMap.set(key, {
1570
1815
  completedCount: newCompletedCount,
@@ -1601,10 +1846,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1601
1846
  runId: task.runId,
1602
1847
  startedAt
1603
1848
  });
1604
- const totalEvaluations = task.testCases.reduce(
1605
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1606
- 0
1607
- );
1849
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
1608
1850
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1609
1851
  const completedRef = yield* effect.Ref.make(0);
1610
1852
  const startedRef = yield* effect.Ref.make(0);
@@ -1613,7 +1855,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1613
1855
  const testCaseResultsRef = yield* effect.Ref.make(
1614
1856
  /* @__PURE__ */ new Map()
1615
1857
  );
1616
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1858
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
1617
1859
  const processEvaluation = (unit) => processOneEvaluation(
1618
1860
  task,
1619
1861
  unit,
@@ -1627,11 +1869,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1627
1869
  failedRef,
1628
1870
  testCaseResultsRef
1629
1871
  );
1630
- yield* effect.Effect.forEach(
1631
- evaluationUnits,
1632
- processEvaluation,
1633
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1634
- );
1872
+ const globalSem = task.globalEvaluationSemaphore;
1873
+ if (globalSem !== void 0) {
1874
+ yield* effect.Effect.forEach(
1875
+ evaluationUnits,
1876
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1877
+ { concurrency: "unbounded", discard: true }
1878
+ );
1879
+ } else {
1880
+ yield* effect.Effect.forEach(
1881
+ evaluationUnits,
1882
+ processEvaluation,
1883
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1884
+ );
1885
+ }
1635
1886
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
1636
1887
  effect.Ref.get(completedRef),
1637
1888
  effect.Ref.get(passedRef),
@@ -1667,155 +1918,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1667
1918
  artifactPath: task.snapshot.artifactPath
1668
1919
  });
1669
1920
  });
1670
- async function loadRunSnapshotsFromArtifacts(config) {
1671
- const baseDir = path.resolve(config.artifactDirectory);
1672
- let entries;
1673
- try {
1674
- entries = await promises.readdir(baseDir);
1675
- } catch {
1676
- return [];
1677
- }
1678
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1679
- const snapshots = [];
1680
- for (const fileName of jsonlFiles) {
1681
- const filePath = path.join(baseDir, fileName);
1682
- try {
1683
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1684
- if (snapshot) {
1685
- snapshots.push(snapshot);
1686
- }
1687
- } catch {
1688
- }
1689
- }
1690
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1691
- }
1692
- async function parseArtifactToSnapshot(filePath, _config) {
1693
- const content = await promises.readFile(filePath, "utf8");
1694
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1695
- if (lines.length === 0) {
1696
- return null;
1697
- }
1698
- let runQueued = null;
1699
- let runCompleted = null;
1700
- let runFailed = null;
1701
- let runStarted = null;
1702
- for (const line of lines) {
1703
- try {
1704
- const event = JSON.parse(line);
1705
- const type = event.type;
1706
- if (type === "RunQueued") {
1707
- runQueued = {
1708
- runId: event.runId,
1709
- datasetId: event.datasetId,
1710
- datasetName: event.datasetName,
1711
- evaluatorIds: event.evaluatorIds,
1712
- totalTestCases: event.totalTestCases ?? 0,
1713
- artifactPath: event.artifactPath ?? filePath,
1714
- ts: event.ts
1715
- };
1716
- }
1717
- if (type === "RunStarted") {
1718
- runStarted = { startedAt: event.startedAt };
1719
- }
1720
- if (type === "RunCompleted") {
1721
- runCompleted = {
1722
- passedTestCases: event.passedTestCases,
1723
- failedTestCases: event.failedTestCases,
1724
- totalTestCases: event.totalTestCases,
1725
- finishedAt: event.finishedAt
1726
- };
1727
- }
1728
- if (type === "RunFailed") {
1729
- runFailed = {
1730
- finishedAt: event.finishedAt,
1731
- errorMessage: event.errorMessage
1732
- };
1733
- }
1734
- } catch {
1735
- }
1921
+
1922
+ // src/runner/name-pattern.ts
1923
/**
 * Splits a `/source/flags` string into its parts.
 *
 * @param pattern Candidate pattern text.
 * @returns `{ source, flags }`, or `undefined` when the text does not start
 *          with "/" or has no second "/".
 */
function parseRegexLiteral(pattern) {
  if (!pattern.startsWith("/")) {
    return void 0;
  }
  const lastSlash = pattern.lastIndexOf("/");
  if (lastSlash <= 0) {
    return void 0;
  }
  return {
    source: pattern.slice(1, lastSlash),
    flags: pattern.slice(lastSlash + 1)
  };
}

/**
 * Builds a predicate that tests names against a user-supplied pattern.
 *
 * The trimmed pattern is interpreted as, in order:
 *  1. a `/source/flags` regular expression literal;
 *  2. a wildcard pattern when it contains `*` (each `*` matches any run of
 *     characters; the whole name must match, case-insensitively);
 *  3. otherwise an exact, case-insensitive name.
 *
 * @param pattern Pattern text.
 * @returns Function taking a name and returning whether it matches.
 * @throws {SyntaxError} When a regex literal has an invalid source or flags.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
    return (value) => {
      // With the g or y flag, test() resumes from lastIndex; reset it so the
      // matcher gives the same answer every time it is reused.
      regex.lastIndex = 0;
      return regex.test(value);
    };
  }
  if (normalizedPattern.includes("*")) {
    // Escape every regex metacharacter except "*" (including "?"), then turn
    // each "*" into ".*".
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    const regex = new RegExp(`^${escaped}$`, "i");
    return (value) => regex.test(value);
  }
  const expected = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === expected;
}
1820
1950
  async function appendJsonLine(artifactPath, payload) {
1821
1951
  await promises.mkdir(path.dirname(artifactPath), { recursive: true });
@@ -1874,32 +2004,12 @@ function searchCollectedTestCases(all, query) {
1874
2004
  }
1875
2005
 
1876
2006
  // src/runner/api.ts
1877
- function parseRegexLiteral(pattern) {
1878
- if (!pattern.startsWith("/")) {
1879
- return void 0;
1880
- }
1881
- const lastSlash = pattern.lastIndexOf("/");
1882
- if (lastSlash <= 0) {
1883
- return void 0;
2007
/**
 * Resolves the repetition count for a run.
 *
 * @param value Requested repetitions; `null`/`undefined` means 1.
 * @returns The repetition count.
 * @throws {Error} When the value is not an integer greater than or equal to 1.
 */
function normalizeRunRepetitions(value) {
  const repetitions = value === undefined || value === null ? 1 : value;
  const isPositiveInteger = Number.isInteger(repetitions) && repetitions >= 1;
  if (isPositiveInteger) {
    return repetitions;
  }
  throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
}
1904
2014
  function mergeRunnerOverrides(base, next) {
1905
2015
  if (!base) {
@@ -1934,6 +2044,7 @@ var EffectRunner = class {
1934
2044
  this.listeners = /* @__PURE__ */ new Set();
1935
2045
  this.datasetsById = /* @__PURE__ */ new Map();
1936
2046
  this.evaluatorsById = /* @__PURE__ */ new Map();
2047
+ this.runConfigsById = /* @__PURE__ */ new Map();
1937
2048
  this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
1938
2049
  this.persistenceFiber = effect.Effect.runFork(
1939
2050
  createPersistenceWorker(this.persistenceQueue)
@@ -1974,6 +2085,137 @@ var EffectRunner = class {
1974
2085
  (item) => matcher(item.evaluator.getName() ?? "")
1975
2086
  );
1976
2087
  }
2088
+ async collectRunConfigs() {
2089
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
2090
+ this.runConfigsById.clear();
2091
+ const byNameLower = /* @__PURE__ */ new Map();
2092
+ for (const item of runConfigs) {
2093
+ const id = item.runConfig.getName();
2094
+ const lower = id.toLowerCase();
2095
+ const prev = byNameLower.get(lower);
2096
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
2097
+ throw new Error(
2098
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
2099
+ );
2100
+ }
2101
+ byNameLower.set(lower, item);
2102
+ this.runConfigsById.set(id, item);
2103
+ }
2104
+ return runConfigs;
2105
+ }
2106
+ async resolveRunConfigByName(name) {
2107
+ if (this.runConfigsById.size === 0) {
2108
+ await this.collectRunConfigs();
2109
+ }
2110
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
2111
+ const keyLower = key.toLowerCase();
2112
+ const matches = Array.from(this.runConfigsById.values()).filter(
2113
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
2114
+ );
2115
+ if (matches.length === 0) {
2116
+ return void 0;
2117
+ }
2118
+ if (matches.length > 1) {
2119
+ throw new Error(
2120
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
2121
+ );
2122
+ }
2123
+ return matches[0];
2124
+ }
2125
+ async expandRunConfigToJobs(collected) {
2126
+ if (this.datasetsById.size === 0) {
2127
+ await this.collectDatasets();
2128
+ }
2129
+ if (this.evaluatorsById.size === 0) {
2130
+ await this.collectEvaluators();
2131
+ }
2132
+ const rcName = collected.runConfig.getName();
2133
+ const jobs = [];
2134
+ const runs = collected.runConfig.getRuns();
2135
+ for (const [i, row] of runs.entries()) {
2136
+ const dsCollected = Array.from(this.datasetsById.values()).find(
2137
+ (d) => d.dataset === row.dataset
2138
+ );
2139
+ if (!dsCollected) {
2140
+ throw new Error(
2141
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2142
+ );
2143
+ }
2144
+ let evaluatorIds;
2145
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
2146
+ const matcher = createNameMatcher(row.evaluatorPattern);
2147
+ const matched = Array.from(this.evaluatorsById.values()).filter(
2148
+ (item) => matcher(item.evaluator.getName() ?? "")
2149
+ );
2150
+ if (matched.length === 0) {
2151
+ throw new Error(
2152
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
2153
+ );
2154
+ }
2155
+ evaluatorIds = matched.map((item) => item.id);
2156
+ } else {
2157
+ const evaluators = row.evaluators;
2158
+ evaluatorIds = [];
2159
+ for (const ev of evaluators) {
2160
+ const found = Array.from(this.evaluatorsById.values()).find(
2161
+ (item) => item.evaluator === ev
2162
+ );
2163
+ if (!found) {
2164
+ throw new Error(
2165
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
2166
+ );
2167
+ }
2168
+ evaluatorIds.push(found.id);
2169
+ }
2170
+ }
2171
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
2172
+ jobs.push({
2173
+ datasetId: dsCollected.id,
2174
+ evaluatorIds,
2175
+ runConfigName: rcName,
2176
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
2177
+ runConfigTags: collected.runConfig.getTags(),
2178
+ repetitions
2179
+ });
2180
+ }
2181
+ return jobs;
2182
+ }
2183
+ async expandRunConfigNamesToJobs(names) {
2184
+ const jobs = [];
2185
+ for (const name of names) {
2186
+ const collected = await this.resolveRunConfigByName(name);
2187
+ if (!collected) {
2188
+ const known = await this.collectRunConfigs();
2189
+ const available = known.map((r) => r.runConfig.getName()).sort();
2190
+ throw new Error(
2191
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
2192
+ );
2193
+ }
2194
+ jobs.push(...await this.expandRunConfigToJobs(collected));
2195
+ }
2196
+ return jobs;
2197
+ }
2198
+ async runDatasetJobsWithSharedConcurrency(request) {
2199
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
2200
+ const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
2201
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
2202
+ const snapshots = [];
2203
+ for (const job of request.jobs) {
2204
+ snapshots.push(
2205
+ await this.startDatasetRun({
2206
+ datasetId: job.datasetId,
2207
+ evaluatorIds: job.evaluatorIds,
2208
+ triggerId,
2209
+ maxConcurrency: this.config.maxConcurrency ?? 1,
2210
+ globalEvaluationSemaphore: sem,
2211
+ runConfigName: job.runConfigName,
2212
+ runConfigTags: job.runConfigTags,
2213
+ repetitions: job.repetitions
2214
+ })
2215
+ );
2216
+ }
2217
+ return snapshots;
2218
+ }
1977
2219
  async searchTestCases(query) {
1978
2220
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1979
2221
  return searchCollectedTestCases(testCases, query);
@@ -1992,35 +2234,45 @@ var EffectRunner = class {
1992
2234
  );
1993
2235
  }
1994
2236
  async runDatasetWith(request) {
2237
+ const runConfigName = validateRunConfigName(
2238
+ request.runConfigName,
2239
+ "runDatasetWith.runConfigName"
2240
+ );
2241
+ return this.startDatasetRun({
2242
+ datasetId: request.datasetId,
2243
+ evaluatorIds: request.evaluatorIds,
2244
+ triggerId: request.triggerId,
2245
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2246
+ repetitions: request.repetitions,
2247
+ runConfigName,
2248
+ runConfigTags: request.runConfigTags
2249
+ });
2250
+ }
2251
+ async startDatasetRun(params) {
1995
2252
  if (this.datasetsById.size === 0) {
1996
2253
  await this.collectDatasets();
1997
2254
  }
1998
2255
  if (this.evaluatorsById.size === 0) {
1999
2256
  await this.collectEvaluators();
2000
2257
  }
2001
- const dataset = this.datasetsById.get(request.datasetId);
2258
+ const dataset = this.datasetsById.get(params.datasetId);
2002
2259
  if (!dataset) {
2003
- throw new Error(`Unknown dataset: ${request.datasetId}`);
2260
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
2004
2261
  }
2005
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2262
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2006
2263
  if (selectedEvaluators.length === 0) {
2007
2264
  throw new Error("No evaluators selected for run");
2008
2265
  }
2009
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
2010
- const totalEvaluations = selectedTestCases.reduce(
2011
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
2012
- 0
2013
- );
2014
- const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
2266
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
2267
+ const repetitions = normalizeRunRepetitions(params.repetitions);
2268
+ const totalEvaluations = selectedTestCases.length * repetitions;
2269
+ const runConfigTags = [...params.runConfigTags ?? []];
2270
+ const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
2015
2271
  const runId = `run-${crypto.randomUUID()}`;
2016
- const artifactPath = createArtifactPath(
2017
- this.config.artifactDirectory,
2018
- request.datasetId,
2019
- runId
2020
- );
2272
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
2021
2273
  const snapshot = {
2022
2274
  runId,
2023
- datasetId: request.datasetId,
2275
+ datasetId: params.datasetId,
2024
2276
  datasetName: dataset.dataset.getName(),
2025
2277
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2026
2278
  queuedAt: Date.now(),
@@ -2041,7 +2293,7 @@ var EffectRunner = class {
2041
2293
  const queuedEvent = {
2042
2294
  type: "RunQueued",
2043
2295
  runId,
2044
- datasetId: request.datasetId,
2296
+ datasetId: params.datasetId,
2045
2297
  datasetName: dataset.dataset.getName(),
2046
2298
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2047
2299
  totalTestCases: totalEvaluations,
@@ -2055,17 +2307,20 @@ var EffectRunner = class {
2055
2307
  payload: queuedEvent
2056
2308
  })
2057
2309
  );
2058
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
2059
2310
  await effect.Effect.runPromise(
2060
2311
  effect.Queue.offer(this.runQueue, {
2061
2312
  runId,
2062
2313
  triggerId,
2063
- datasetId: request.datasetId,
2314
+ datasetId: params.datasetId,
2064
2315
  dataset: dataset.dataset,
2065
2316
  evaluators: selectedEvaluators,
2066
2317
  testCases: selectedTestCases,
2067
2318
  snapshot,
2068
- maxConcurrency
2319
+ maxConcurrency: params.maxConcurrency,
2320
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2321
+ runConfigName: params.runConfigName,
2322
+ runConfigTags,
2323
+ repetitions
2069
2324
  })
2070
2325
  );
2071
2326
  return snapshot;
@@ -2136,6 +2391,11 @@ var EffectRunner = class {
2136
2391
  );
2137
2392
  }
2138
2393
  };
2394
+
2395
+ // src/runner/events.ts
2396
+ var PROGRAMMATIC_RUN_CONFIG = {
2397
+ runConfigName: "programmatic"
2398
+ };
2139
2399
  var LEFT_PANE_WIDTH2 = 44;
2140
2400
  var MAX_RUNS_FOR_CHART = 12;
2141
2401
  var MAX_RUNS_FOR_TREND = 20;
@@ -2483,7 +2743,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2483
2743
  rows.push(/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp6"));
2484
2744
  rows.push(/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
2485
2745
  for (const tc of testCases) {
2486
- const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
2746
+ const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
2487
2747
  rows.push(
2488
2748
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
2489
2749
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
@@ -2495,13 +2755,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2495
2755
  ] }),
2496
2756
  " ",
2497
2757
  tc.testCaseName,
2498
- rerunPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: rerunPart }) : null,
2758
+ repetitionPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: repetitionPart }) : null,
2499
2759
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2500
2760
  " (",
2501
2761
  tc.durationMs,
2502
2762
  "ms)"
2503
2763
  ] })
2504
- ] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
2764
+ ] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
2505
2765
  );
2506
2766
  for (const item of tc.evaluatorScores) {
2507
2767
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
@@ -2827,7 +3087,8 @@ function EvalsCliApp({ data, args, runner }) {
2827
3087
  }
2828
3088
  void runner.runDatasetWith({
2829
3089
  datasetId: selectedDataset.id,
2830
- evaluatorIds: clampedState.selectedEvaluatorIds
3090
+ evaluatorIds: clampedState.selectedEvaluatorIds,
3091
+ ...PROGRAMMATIC_RUN_CONFIG
2831
3092
  }).then((snapshot) => {
2832
3093
  setRuntimeMessage(
2833
3094
  `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`