@m4trix/evals 0.25.1 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -3,16 +3,16 @@ import { withFullScreen, useScreenSize } from 'fullscreen-ink';
3
3
  import React, { useState, useRef, useReducer, useEffect, useMemo } from 'react';
4
4
  import { useApp, useInput, Box, Text } from 'ink';
5
5
  import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
6
- import { resolve, relative, join, dirname } from 'path';
7
- import { LineGraph } from '@pppp606/ink-chart';
6
+ import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
8
7
  import { randomUUID } from 'crypto';
9
- import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
8
+ import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
9
+ import { resolve, join, relative, dirname } from 'path';
10
10
  import { existsSync } from 'fs';
11
11
  import * as jitiModule from 'jiti';
12
- import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
13
12
  import { pathToFileURL } from 'url';
14
13
  import { diffLines } from 'diff';
15
14
  import stringify from 'fast-json-stable-stringify';
15
+ import { LineGraph } from '@pppp606/ink-chart';
16
16
 
17
17
  var SEP = " ";
18
18
  var ARROW = "\u203A";
@@ -237,6 +237,50 @@ function isPrintableCharacter(input) {
237
237
  function isBackKey(key) {
238
238
  return key.backspace || key.delete;
239
239
  }
240
+ var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
241
+ function makeEntityIdSchema(brand, label) {
242
+ return Schema.String.pipe(
243
+ Schema.trimmed(),
244
+ Schema.minLength(1, {
245
+ message: () => `${label} must be non-empty.`
246
+ }),
247
+ Schema.pattern(ENTITY_ID_PATTERN, {
248
+ message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
249
+ }),
250
+ Schema.brand(brand)
251
+ );
252
+ }
253
+ var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
254
+ makeEntityIdSchema("EvaluatorName", "Evaluator name");
255
+ makeEntityIdSchema("TestCaseName", "Test case name");
256
+ function validateWithSchema(schema, raw, context) {
257
+ const trimmed = raw.trim();
258
+ const decode = Schema.decodeUnknownEither(
259
+ schema
260
+ );
261
+ const result = decode(trimmed);
262
+ if (Either.isLeft(result)) {
263
+ throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
264
+ }
265
+ return result.right;
266
+ }
267
+ function validateRunConfigName(raw, context) {
268
+ return validateWithSchema(RunConfigNameSchema, raw, context);
269
+ }
270
+
271
+ // src/evals/evaluator.ts
272
+ function getEvaluatorDisplayLabel(evaluator) {
273
+ if (typeof evaluator.getDisplayLabel === "function") {
274
+ const label = evaluator.getDisplayLabel();
275
+ if (label !== void 0) {
276
+ return label;
277
+ }
278
+ }
279
+ return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
280
+ }
281
+ function getEvaluatorTagList(evaluator) {
282
+ return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
283
+ }
240
284
 
241
285
  // src/cli/data.mock.json
242
286
  var data_mock_default = {
@@ -492,7 +536,7 @@ function toEvalDataset(item, snapshots) {
492
536
  function toEvaluatorOption(item) {
493
537
  return {
494
538
  id: item.id,
495
- name: item.evaluator.getName() ?? toSlug(item.id),
539
+ name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
496
540
  configPreview: `Source: ${item.filePath}`
497
541
  };
498
542
  }
@@ -735,6 +779,159 @@ function reduceCliState(state, action) {
735
779
  }
736
780
  return state;
737
781
  }
782
+ async function loadRunSnapshotsFromArtifacts(config) {
783
+ const baseDir = resolve(config.artifactDirectory);
784
+ let entries;
785
+ try {
786
+ entries = await readdir(baseDir);
787
+ } catch {
788
+ return [];
789
+ }
790
+ const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
791
+ const snapshots = [];
792
+ for (const fileName of jsonlFiles) {
793
+ const filePath = join(baseDir, fileName);
794
+ try {
795
+ const snapshot = await parseArtifactToSnapshot(filePath, config);
796
+ if (snapshot) {
797
+ snapshots.push(snapshot);
798
+ }
799
+ } catch {
800
+ }
801
+ }
802
+ return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
803
+ }
804
+ async function parseArtifactToSnapshot(filePath, _config) {
805
+ const content = await readFile(filePath, "utf8");
806
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
807
+ if (lines.length === 0) {
808
+ return null;
809
+ }
810
+ let runQueued = null;
811
+ let runCompleted = null;
812
+ let runFailed = null;
813
+ let runStarted = null;
814
+ for (const line of lines) {
815
+ try {
816
+ const event = JSON.parse(line);
817
+ const type = event.type;
818
+ if (type === "RunQueued") {
819
+ runQueued = {
820
+ runId: event.runId,
821
+ datasetId: event.datasetId,
822
+ datasetName: event.datasetName,
823
+ evaluatorIds: event.evaluatorIds,
824
+ totalTestCases: event.totalTestCases ?? 0,
825
+ artifactPath: event.artifactPath ?? filePath,
826
+ ts: event.ts
827
+ };
828
+ }
829
+ if (type === "RunStarted") {
830
+ runStarted = { startedAt: event.startedAt };
831
+ }
832
+ if (type === "RunCompleted") {
833
+ runCompleted = {
834
+ passedTestCases: event.passedTestCases,
835
+ failedTestCases: event.failedTestCases,
836
+ totalTestCases: event.totalTestCases,
837
+ finishedAt: event.finishedAt
838
+ };
839
+ }
840
+ if (type === "RunFailed") {
841
+ runFailed = {
842
+ finishedAt: event.finishedAt,
843
+ errorMessage: event.errorMessage
844
+ };
845
+ }
846
+ } catch {
847
+ }
848
+ }
849
+ if (!runQueued) {
850
+ return null;
851
+ }
852
+ const artifactPath = filePath;
853
+ const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
854
+ const progress = aggregateTestCaseProgress(lines);
855
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
856
+ const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
857
+ const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
858
+ return {
859
+ runId: runQueued.runId,
860
+ datasetId: runQueued.datasetId,
861
+ datasetName: runQueued.datasetName,
862
+ evaluatorIds: runQueued.evaluatorIds,
863
+ queuedAt: runQueued.ts ?? 0,
864
+ startedAt: runStarted?.startedAt,
865
+ finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
866
+ totalTestCases: runQueued.totalTestCases,
867
+ completedTestCases,
868
+ passedTestCases,
869
+ failedTestCases,
870
+ status,
871
+ artifactPath,
872
+ errorMessage: runFailed?.errorMessage
873
+ };
874
+ }
875
+ function aggregateTestCaseProgress(lines) {
876
+ let completedTestCases = 0;
877
+ const testCasePassedBy = /* @__PURE__ */ new Map();
878
+ for (const line of lines) {
879
+ try {
880
+ const event = JSON.parse(line);
881
+ if (event.type === "TestCaseProgress") {
882
+ const ev = event;
883
+ completedTestCases = ev.completedTestCases ?? completedTestCases;
884
+ const id = ev.testCaseId;
885
+ const current = testCasePassedBy.get(id);
886
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
887
+ }
888
+ } catch {
889
+ }
890
+ }
891
+ let passedTestCases = 0;
892
+ let failedTestCases = 0;
893
+ for (const passed of testCasePassedBy.values()) {
894
+ if (passed) {
895
+ passedTestCases += 1;
896
+ } else {
897
+ failedTestCases += 1;
898
+ }
899
+ }
900
+ return { completedTestCases, passedTestCases, failedTestCases };
901
+ }
902
+ async function parseArtifactFile(artifactPath) {
903
+ try {
904
+ const content = await readFile(artifactPath, "utf8");
905
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
906
+ const results = [];
907
+ for (const line of lines) {
908
+ try {
909
+ const event = JSON.parse(line);
910
+ if (event.type === "TestCaseProgress") {
911
+ const ev = event;
912
+ const repetitionIndex = ev.repetitionIndex ?? ev.rerunIndex;
913
+ const repetitionCount = ev.repetitionCount ?? ev.rerunTotal;
914
+ results.push({
915
+ testCaseId: ev.testCaseId,
916
+ testCaseName: ev.testCaseName,
917
+ completedTestCases: ev.completedTestCases,
918
+ totalTestCases: ev.totalTestCases,
919
+ repetitionId: ev.repetitionId,
920
+ repetitionIndex,
921
+ repetitionCount,
922
+ passed: ev.passed,
923
+ durationMs: ev.durationMs,
924
+ evaluatorScores: ev.evaluatorScores ?? []
925
+ });
926
+ }
927
+ } catch {
928
+ }
929
+ }
930
+ return results;
931
+ } catch {
932
+ return [];
933
+ }
934
+ }
738
935
 
739
936
  // src/runner/config.ts
740
937
  var defaultRunnerConfig = {
@@ -742,6 +939,7 @@ var defaultRunnerConfig = {
742
939
  rootDir: process.cwd(),
743
940
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
744
941
  evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
942
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
745
943
  testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
746
944
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
747
945
  },
@@ -767,6 +965,11 @@ function toRunnerConfigOverrides(config) {
767
965
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
768
966
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
769
967
  }
968
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
969
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
970
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
971
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
972
+ }
770
973
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
771
974
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
772
975
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -865,6 +1068,9 @@ function isDatasetLike(value) {
865
1068
  function isEvaluatorLike(value) {
866
1069
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
867
1070
  }
1071
+ function isRunConfigLike(value) {
1072
+ return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
1073
+ }
868
1074
  function isTestCaseLike(value) {
869
1075
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
870
1076
  }
@@ -953,6 +1159,23 @@ async function collectEvaluatorsFromFiles(config) {
953
1159
  );
954
1160
  return found.flat();
955
1161
  }
1162
+ async function collectRunConfigsFromFiles(config) {
1163
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1164
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
1165
+ const found = await Promise.all(
1166
+ matched.map(async (absolutePath) => {
1167
+ const exports = await loadModuleExports(absolutePath);
1168
+ const runConfigs = exports.filter(isRunConfigLike);
1169
+ const relPath = relative(config.rootDir, absolutePath);
1170
+ return runConfigs.map((runConfig) => ({
1171
+ id: runConfig.getName(),
1172
+ filePath: relPath,
1173
+ runConfig
1174
+ }));
1175
+ })
1176
+ );
1177
+ return found.flat();
1178
+ }
956
1179
  async function collectTestCasesFromFiles(config) {
957
1180
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
958
1181
  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
@@ -1106,6 +1329,17 @@ function getDiffLines(entry) {
1106
1329
  });
1107
1330
  }
1108
1331
 
1332
+ // src/evals/test-case.ts
1333
+ function getTestCaseDisplayLabel(testCase) {
1334
+ if (typeof testCase.getDisplayLabel === "function") {
1335
+ return testCase.getDisplayLabel();
1336
+ }
1337
+ return typeof testCase.getName === "function" ? testCase.getName() : "";
1338
+ }
1339
+ function getTestCaseTagList(testCase) {
1340
+ return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
1341
+ }
1342
+
1109
1343
  // src/evals/metric.ts
1110
1344
  var registry = /* @__PURE__ */ new Map();
1111
1345
  var Metric = {
@@ -1129,6 +1363,54 @@ function getMetricById(id) {
1129
1363
  return registry.get(id);
1130
1364
  }
1131
1365
 
1366
+ // src/evals/aggregators.ts
1367
+ function aggregateTokenCountSum(values) {
1368
+ const initial = {
1369
+ input: 0,
1370
+ output: 0,
1371
+ inputCached: 0,
1372
+ outputCached: 0
1373
+ };
1374
+ return values.reduce(
1375
+ (acc, v) => ({
1376
+ input: acc.input + (v.input ?? 0),
1377
+ output: acc.output + (v.output ?? 0),
1378
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
1379
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
1380
+ }),
1381
+ initial
1382
+ );
1383
+ }
1384
+ function aggregateLatencyAverage(values) {
1385
+ if (values.length === 0) {
1386
+ return { ms: 0 };
1387
+ }
1388
+ const sum = values.reduce((s, v) => s + v.ms, 0);
1389
+ return { ms: sum / values.length };
1390
+ }
1391
+
1392
+ // src/evals/metrics/standard.ts
1393
+ Metric.of({
1394
+ id: "token-count",
1395
+ name: "Tokens",
1396
+ aggregate: aggregateTokenCountSum,
1397
+ format: (data, options) => {
1398
+ const input = data.input ?? 0;
1399
+ const output = data.output ?? 0;
1400
+ const inputCached = data.inputCached ?? 0;
1401
+ const outputCached = data.outputCached ?? 0;
1402
+ const cached = inputCached + outputCached;
1403
+ const base = `in:${input} out:${output} cached:${cached}`;
1404
+ return options?.isAggregated ? `Total: ${base}` : base;
1405
+ }
1406
+ });
1407
+ Metric.of({
1408
+ id: "latency",
1409
+ name: "Latency",
1410
+ aggregate: aggregateLatencyAverage,
1411
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1412
+ });
1413
+
1132
1414
  // src/evals/score.ts
1133
1415
  var registry2 = /* @__PURE__ */ new Map();
1134
1416
  function formatScoreData(def, data, options) {
@@ -1237,54 +1519,6 @@ function getScoreById(id) {
1237
1519
  return registry2.get(id);
1238
1520
  }
1239
1521
 
1240
- // src/evals/aggregators.ts
1241
- function aggregateTokenCountSum(values) {
1242
- const initial = {
1243
- input: 0,
1244
- output: 0,
1245
- inputCached: 0,
1246
- outputCached: 0
1247
- };
1248
- return values.reduce(
1249
- (acc, v) => ({
1250
- input: acc.input + (v.input ?? 0),
1251
- output: acc.output + (v.output ?? 0),
1252
- inputCached: acc.inputCached + (v.inputCached ?? 0),
1253
- outputCached: acc.outputCached + (v.outputCached ?? 0)
1254
- }),
1255
- initial
1256
- );
1257
- }
1258
- function aggregateLatencyAverage(values) {
1259
- if (values.length === 0) {
1260
- return { ms: 0 };
1261
- }
1262
- const sum = values.reduce((s, v) => s + v.ms, 0);
1263
- return { ms: sum / values.length };
1264
- }
1265
-
1266
- // src/evals/metrics/standard.ts
1267
- Metric.of({
1268
- id: "token-count",
1269
- name: "Tokens",
1270
- aggregate: aggregateTokenCountSum,
1271
- format: (data, options) => {
1272
- const input = data.input ?? 0;
1273
- const output = data.output ?? 0;
1274
- const inputCached = data.inputCached ?? 0;
1275
- const outputCached = data.outputCached ?? 0;
1276
- const cached = inputCached + outputCached;
1277
- const base = `in:${input} out:${output} cached:${cached}`;
1278
- return options?.isAggregated ? `Total: ${base}` : base;
1279
- }
1280
- });
1281
- Metric.of({
1282
- id: "latency",
1283
- name: "Latency",
1284
- aggregate: aggregateLatencyAverage,
1285
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1286
- });
1287
-
1288
1522
  // src/evals/scores/standard.ts
1289
1523
  Score.of({
1290
1524
  id: "percent",
@@ -1391,15 +1625,17 @@ function readOutput(testCase) {
1391
1625
  }
1392
1626
  return candidate.getOutput();
1393
1627
  }
1394
- function buildEvaluationUnits(testCases) {
1628
+ function buildEvaluationUnits(testCases, repetitionCount) {
1629
+ const count = Math.max(1, repetitionCount);
1395
1630
  const units = [];
1396
1631
  for (const testCaseItem of testCases) {
1397
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1398
- for (let r = 0; r < rerunTotal; r++) {
1632
+ const repetitionId = `rep-${randomUUID()}`;
1633
+ for (let r = 0; r < count; r++) {
1399
1634
  units.push({
1400
1635
  testCaseItem,
1401
- rerunIndex: r + 1,
1402
- rerunTotal
1636
+ repetitionId,
1637
+ repetitionIndex: r + 1,
1638
+ repetitionCount: count
1403
1639
  });
1404
1640
  }
1405
1641
  }
@@ -1412,7 +1648,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1412
1648
  return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1413
1649
  }
1414
1650
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1415
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
1651
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
1416
1652
  return Effect.gen(function* () {
1417
1653
  const evaluatorRunId = `run-${randomUUID()}`;
1418
1654
  const started = Date.now();
@@ -1421,11 +1657,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1421
1657
  type: "TestCaseStarted",
1422
1658
  runId: task.runId,
1423
1659
  testCaseId: testCaseItem.id,
1424
- testCaseName: testCaseItem.testCase.getName(),
1660
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1425
1661
  startedTestCases: startedEvaluations,
1426
1662
  totalTestCases: totalEvaluations,
1427
- rerunIndex,
1428
- rerunTotal
1663
+ repetitionId,
1664
+ repetitionIndex,
1665
+ repetitionCount
1429
1666
  });
1430
1667
  const evaluatorScores = [];
1431
1668
  let testCaseError;
@@ -1459,8 +1696,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1459
1696
  meta: {
1460
1697
  triggerId: task.triggerId,
1461
1698
  runId: evaluatorRunId,
1462
- datasetId: task.datasetId
1699
+ datasetId: task.datasetId,
1700
+ repetitionId,
1701
+ repetitionIndex,
1702
+ repetitionCount,
1703
+ runConfigName: task.runConfigName
1463
1704
  },
1705
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1706
+ runConfigTags: task.runConfigTags,
1707
+ evaluatorTags: getEvaluatorTagList(evaluator),
1464
1708
  logDiff,
1465
1709
  log,
1466
1710
  createError
@@ -1503,18 +1747,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1503
1747
  });
1504
1748
  }
1505
1749
  }
1506
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1750
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1507
1751
  const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1508
1752
  const progressEvent = {
1509
1753
  type: "TestCaseProgress",
1510
1754
  runId: task.runId,
1511
1755
  testCaseId: testCaseItem.id,
1512
- testCaseName: testCaseItem.testCase.getName(),
1756
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1513
1757
  completedTestCases: completedEvaluations,
1514
1758
  totalTestCases: totalEvaluations,
1515
- rerunIndex,
1516
- rerunTotal,
1517
- passed: rerunPassedThis,
1759
+ repetitionId,
1760
+ repetitionIndex,
1761
+ repetitionCount,
1762
+ passed: repetitionPassedThis,
1518
1763
  durationMs: Date.now() - started,
1519
1764
  evaluatorScores,
1520
1765
  output,
@@ -1535,9 +1780,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1535
1780
  (map) => {
1536
1781
  const key = testCaseItem.id;
1537
1782
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
1538
- const newResults = [...existing.results, rerunPassedThis];
1783
+ const newResults = [...existing.results, repetitionPassedThis];
1539
1784
  const newCompletedCount = existing.completedCount + 1;
1540
- const isLast = newCompletedCount === rerunTotal;
1785
+ const isLast = newCompletedCount === repetitionCount;
1541
1786
  const newMap = new Map(map);
1542
1787
  newMap.set(key, {
1543
1788
  completedCount: newCompletedCount,
@@ -1574,10 +1819,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1574
1819
  runId: task.runId,
1575
1820
  startedAt
1576
1821
  });
1577
- const totalEvaluations = task.testCases.reduce(
1578
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1579
- 0
1580
- );
1822
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
1581
1823
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1582
1824
  const completedRef = yield* Ref.make(0);
1583
1825
  const startedRef = yield* Ref.make(0);
@@ -1586,7 +1828,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1586
1828
  const testCaseResultsRef = yield* Ref.make(
1587
1829
  /* @__PURE__ */ new Map()
1588
1830
  );
1589
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1831
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
1590
1832
  const processEvaluation = (unit) => processOneEvaluation(
1591
1833
  task,
1592
1834
  unit,
@@ -1600,11 +1842,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1600
1842
  failedRef,
1601
1843
  testCaseResultsRef
1602
1844
  );
1603
- yield* Effect.forEach(
1604
- evaluationUnits,
1605
- processEvaluation,
1606
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1607
- );
1845
+ const globalSem = task.globalEvaluationSemaphore;
1846
+ if (globalSem !== void 0) {
1847
+ yield* Effect.forEach(
1848
+ evaluationUnits,
1849
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1850
+ { concurrency: "unbounded", discard: true }
1851
+ );
1852
+ } else {
1853
+ yield* Effect.forEach(
1854
+ evaluationUnits,
1855
+ processEvaluation,
1856
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1857
+ );
1858
+ }
1608
1859
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
1609
1860
  Ref.get(completedRef),
1610
1861
  Ref.get(passedRef),
@@ -1640,155 +1891,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1640
1891
  artifactPath: task.snapshot.artifactPath
1641
1892
  });
1642
1893
  });
1643
- async function loadRunSnapshotsFromArtifacts(config) {
1644
- const baseDir = resolve(config.artifactDirectory);
1645
- let entries;
1646
- try {
1647
- entries = await readdir(baseDir);
1648
- } catch {
1649
- return [];
1650
- }
1651
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1652
- const snapshots = [];
1653
- for (const fileName of jsonlFiles) {
1654
- const filePath = join(baseDir, fileName);
1655
- try {
1656
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1657
- if (snapshot) {
1658
- snapshots.push(snapshot);
1659
- }
1660
- } catch {
1661
- }
1662
- }
1663
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1664
- }
1665
- async function parseArtifactToSnapshot(filePath, _config) {
1666
- const content = await readFile(filePath, "utf8");
1667
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1668
- if (lines.length === 0) {
1669
- return null;
1670
- }
1671
- let runQueued = null;
1672
- let runCompleted = null;
1673
- let runFailed = null;
1674
- let runStarted = null;
1675
- for (const line of lines) {
1676
- try {
1677
- const event = JSON.parse(line);
1678
- const type = event.type;
1679
- if (type === "RunQueued") {
1680
- runQueued = {
1681
- runId: event.runId,
1682
- datasetId: event.datasetId,
1683
- datasetName: event.datasetName,
1684
- evaluatorIds: event.evaluatorIds,
1685
- totalTestCases: event.totalTestCases ?? 0,
1686
- artifactPath: event.artifactPath ?? filePath,
1687
- ts: event.ts
1688
- };
1689
- }
1690
- if (type === "RunStarted") {
1691
- runStarted = { startedAt: event.startedAt };
1692
- }
1693
- if (type === "RunCompleted") {
1694
- runCompleted = {
1695
- passedTestCases: event.passedTestCases,
1696
- failedTestCases: event.failedTestCases,
1697
- totalTestCases: event.totalTestCases,
1698
- finishedAt: event.finishedAt
1699
- };
1700
- }
1701
- if (type === "RunFailed") {
1702
- runFailed = {
1703
- finishedAt: event.finishedAt,
1704
- errorMessage: event.errorMessage
1705
- };
1706
- }
1707
- } catch {
1708
- }
1894
+
1895
+ // src/runner/name-pattern.ts
1896
+ function parseRegexLiteral(pattern) {
1897
+ if (!pattern.startsWith("/")) {
1898
+ return void 0;
1709
1899
  }
1710
- if (!runQueued) {
1711
- return null;
1900
+ const lastSlash = pattern.lastIndexOf("/");
1901
+ if (lastSlash <= 0) {
1902
+ return void 0;
1712
1903
  }
1713
- const artifactPath = filePath;
1714
- const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1715
- const progress = aggregateTestCaseProgress(lines);
1716
- const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1717
- const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1718
- const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1719
1904
  return {
1720
- runId: runQueued.runId,
1721
- datasetId: runQueued.datasetId,
1722
- datasetName: runQueued.datasetName,
1723
- evaluatorIds: runQueued.evaluatorIds,
1724
- queuedAt: runQueued.ts ?? 0,
1725
- startedAt: runStarted?.startedAt,
1726
- finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1727
- totalTestCases: runQueued.totalTestCases,
1728
- completedTestCases,
1729
- passedTestCases,
1730
- failedTestCases,
1731
- status,
1732
- artifactPath,
1733
- errorMessage: runFailed?.errorMessage
1905
+ source: pattern.slice(1, lastSlash),
1906
+ flags: pattern.slice(lastSlash + 1)
1734
1907
  };
1735
1908
  }
1736
- function aggregateTestCaseProgress(lines) {
1737
- let completedTestCases = 0;
1738
- const testCasePassedBy = /* @__PURE__ */ new Map();
1739
- for (const line of lines) {
1740
- try {
1741
- const event = JSON.parse(line);
1742
- if (event.type === "TestCaseProgress") {
1743
- const ev = event;
1744
- completedTestCases = ev.completedTestCases ?? completedTestCases;
1745
- const id = ev.testCaseId;
1746
- const current = testCasePassedBy.get(id);
1747
- testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1748
- }
1749
- } catch {
1750
- }
1751
- }
1752
- let passedTestCases = 0;
1753
- let failedTestCases = 0;
1754
- for (const passed of testCasePassedBy.values()) {
1755
- if (passed) {
1756
- passedTestCases += 1;
1757
- } else {
1758
- failedTestCases += 1;
1759
- }
1909
+ function createNameMatcher(pattern) {
1910
+ const normalizedPattern = pattern.trim();
1911
+ const regexLiteral = parseRegexLiteral(normalizedPattern);
1912
+ if (regexLiteral) {
1913
+ const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1914
+ return (value) => regex.test(value);
1760
1915
  }
1761
- return { completedTestCases, passedTestCases, failedTestCases };
1762
- }
1763
- async function parseArtifactFile(artifactPath) {
1764
- try {
1765
- const content = await readFile(artifactPath, "utf8");
1766
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1767
- const results = [];
1768
- for (const line of lines) {
1769
- try {
1770
- const event = JSON.parse(line);
1771
- if (event.type === "TestCaseProgress") {
1772
- const ev = event;
1773
- results.push({
1774
- testCaseId: ev.testCaseId,
1775
- testCaseName: ev.testCaseName,
1776
- completedTestCases: ev.completedTestCases,
1777
- totalTestCases: ev.totalTestCases,
1778
- rerunIndex: ev.rerunIndex,
1779
- rerunTotal: ev.rerunTotal,
1780
- passed: ev.passed,
1781
- durationMs: ev.durationMs,
1782
- evaluatorScores: ev.evaluatorScores ?? []
1783
- });
1784
- }
1785
- } catch {
1786
- }
1787
- }
1788
- return results;
1789
- } catch {
1790
- return [];
1916
+ if (normalizedPattern.includes("*")) {
1917
+ const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1918
+ const regex = new RegExp(`^${escaped}$`, "i");
1919
+ return (value) => regex.test(value);
1791
1920
  }
1921
+ return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1792
1922
  }
1793
1923
  async function appendJsonLine(artifactPath, payload) {
1794
1924
  await mkdir(dirname(artifactPath), { recursive: true });
@@ -1847,32 +1977,12 @@ function searchCollectedTestCases(all, query) {
1847
1977
  }
1848
1978
 
1849
1979
  // src/runner/api.ts
1850
- function parseRegexLiteral(pattern) {
1851
- if (!pattern.startsWith("/")) {
1852
- return void 0;
1853
- }
1854
- const lastSlash = pattern.lastIndexOf("/");
1855
- if (lastSlash <= 0) {
1856
- return void 0;
1980
+ function normalizeRunRepetitions(value) {
1981
+ const n = value ?? 1;
1982
+ if (!Number.isInteger(n) || n < 1) {
1983
+ throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
1857
1984
  }
1858
- return {
1859
- source: pattern.slice(1, lastSlash),
1860
- flags: pattern.slice(lastSlash + 1)
1861
- };
1862
- }
1863
- function createNameMatcher(pattern) {
1864
- const normalizedPattern = pattern.trim();
1865
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1866
- if (regexLiteral) {
1867
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1868
- return (value) => regex.test(value);
1869
- }
1870
- if (normalizedPattern.includes("*")) {
1871
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1872
- const regex = new RegExp(`^${escaped}$`, "i");
1873
- return (value) => regex.test(value);
1874
- }
1875
- return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1985
+ return n;
1876
1986
  }
1877
1987
  function mergeRunnerOverrides(base, next) {
1878
1988
  if (!base) {
@@ -1907,6 +2017,7 @@ var EffectRunner = class {
1907
2017
  this.listeners = /* @__PURE__ */ new Set();
1908
2018
  this.datasetsById = /* @__PURE__ */ new Map();
1909
2019
  this.evaluatorsById = /* @__PURE__ */ new Map();
2020
+ this.runConfigsById = /* @__PURE__ */ new Map();
1910
2021
  this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
1911
2022
  this.persistenceFiber = Effect.runFork(
1912
2023
  createPersistenceWorker(this.persistenceQueue)
@@ -1947,6 +2058,137 @@ var EffectRunner = class {
1947
2058
  (item) => matcher(item.evaluator.getName() ?? "")
1948
2059
  );
1949
2060
  }
2061
+ async collectRunConfigs() {
2062
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
2063
+ this.runConfigsById.clear();
2064
+ const byNameLower = /* @__PURE__ */ new Map();
2065
+ for (const item of runConfigs) {
2066
+ const id = item.runConfig.getName();
2067
+ const lower = id.toLowerCase();
2068
+ const prev = byNameLower.get(lower);
2069
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
2070
+ throw new Error(
2071
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
2072
+ );
2073
+ }
2074
+ byNameLower.set(lower, item);
2075
+ this.runConfigsById.set(id, item);
2076
+ }
2077
+ return runConfigs;
2078
+ }
2079
+ async resolveRunConfigByName(name) {
2080
+ if (this.runConfigsById.size === 0) {
2081
+ await this.collectRunConfigs();
2082
+ }
2083
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
2084
+ const keyLower = key.toLowerCase();
2085
+ const matches = Array.from(this.runConfigsById.values()).filter(
2086
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
2087
+ );
2088
+ if (matches.length === 0) {
2089
+ return void 0;
2090
+ }
2091
+ if (matches.length > 1) {
2092
+ throw new Error(
2093
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
2094
+ );
2095
+ }
2096
+ return matches[0];
2097
+ }
2098
+ async expandRunConfigToJobs(collected) {
2099
+ if (this.datasetsById.size === 0) {
2100
+ await this.collectDatasets();
2101
+ }
2102
+ if (this.evaluatorsById.size === 0) {
2103
+ await this.collectEvaluators();
2104
+ }
2105
+ const rcName = collected.runConfig.getName();
2106
+ const jobs = [];
2107
+ const runs = collected.runConfig.getRuns();
2108
+ for (const [i, row] of runs.entries()) {
2109
+ const dsCollected = Array.from(this.datasetsById.values()).find(
2110
+ (d) => d.dataset === row.dataset
2111
+ );
2112
+ if (!dsCollected) {
2113
+ throw new Error(
2114
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2115
+ );
2116
+ }
2117
+ let evaluatorIds;
2118
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
2119
+ const matcher = createNameMatcher(row.evaluatorPattern);
2120
+ const matched = Array.from(this.evaluatorsById.values()).filter(
2121
+ (item) => matcher(item.evaluator.getName() ?? "")
2122
+ );
2123
+ if (matched.length === 0) {
2124
+ throw new Error(
2125
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
2126
+ );
2127
+ }
2128
+ evaluatorIds = matched.map((item) => item.id);
2129
+ } else {
2130
+ const evaluators = row.evaluators;
2131
+ evaluatorIds = [];
2132
+ for (const ev of evaluators) {
2133
+ const found = Array.from(this.evaluatorsById.values()).find(
2134
+ (item) => item.evaluator === ev
2135
+ );
2136
+ if (!found) {
2137
+ throw new Error(
2138
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
2139
+ );
2140
+ }
2141
+ evaluatorIds.push(found.id);
2142
+ }
2143
+ }
2144
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
2145
+ jobs.push({
2146
+ datasetId: dsCollected.id,
2147
+ evaluatorIds,
2148
+ runConfigName: rcName,
2149
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
2150
+ runConfigTags: collected.runConfig.getTags(),
2151
+ repetitions
2152
+ });
2153
+ }
2154
+ return jobs;
2155
+ }
2156
+ async expandRunConfigNamesToJobs(names) {
2157
+ const jobs = [];
2158
+ for (const name of names) {
2159
+ const collected = await this.resolveRunConfigByName(name);
2160
+ if (!collected) {
2161
+ const known = await this.collectRunConfigs();
2162
+ const available = known.map((r) => r.runConfig.getName()).sort();
2163
+ throw new Error(
2164
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
2165
+ );
2166
+ }
2167
+ jobs.push(...await this.expandRunConfigToJobs(collected));
2168
+ }
2169
+ return jobs;
2170
+ }
2171
+ async runDatasetJobsWithSharedConcurrency(request) {
2172
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
2173
+ const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
2174
+ const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
2175
+ const snapshots = [];
2176
+ for (const job of request.jobs) {
2177
+ snapshots.push(
2178
+ await this.startDatasetRun({
2179
+ datasetId: job.datasetId,
2180
+ evaluatorIds: job.evaluatorIds,
2181
+ triggerId,
2182
+ maxConcurrency: this.config.maxConcurrency ?? 1,
2183
+ globalEvaluationSemaphore: sem,
2184
+ runConfigName: job.runConfigName,
2185
+ runConfigTags: job.runConfigTags,
2186
+ repetitions: job.repetitions
2187
+ })
2188
+ );
2189
+ }
2190
+ return snapshots;
2191
+ }
1950
2192
  async searchTestCases(query) {
1951
2193
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1952
2194
  return searchCollectedTestCases(testCases, query);
@@ -1965,35 +2207,45 @@ var EffectRunner = class {
1965
2207
  );
1966
2208
  }
1967
2209
  async runDatasetWith(request) {
2210
+ const runConfigName = validateRunConfigName(
2211
+ request.runConfigName,
2212
+ "runDatasetWith.runConfigName"
2213
+ );
2214
+ return this.startDatasetRun({
2215
+ datasetId: request.datasetId,
2216
+ evaluatorIds: request.evaluatorIds,
2217
+ triggerId: request.triggerId,
2218
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2219
+ repetitions: request.repetitions,
2220
+ runConfigName,
2221
+ runConfigTags: request.runConfigTags
2222
+ });
2223
+ }
2224
+ async startDatasetRun(params) {
1968
2225
  if (this.datasetsById.size === 0) {
1969
2226
  await this.collectDatasets();
1970
2227
  }
1971
2228
  if (this.evaluatorsById.size === 0) {
1972
2229
  await this.collectEvaluators();
1973
2230
  }
1974
- const dataset = this.datasetsById.get(request.datasetId);
2231
+ const dataset = this.datasetsById.get(params.datasetId);
1975
2232
  if (!dataset) {
1976
- throw new Error(`Unknown dataset: ${request.datasetId}`);
2233
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
1977
2234
  }
1978
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2235
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1979
2236
  if (selectedEvaluators.length === 0) {
1980
2237
  throw new Error("No evaluators selected for run");
1981
2238
  }
1982
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1983
- const totalEvaluations = selectedTestCases.reduce(
1984
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1985
- 0
1986
- );
1987
- const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
2239
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
2240
+ const repetitions = normalizeRunRepetitions(params.repetitions);
2241
+ const totalEvaluations = selectedTestCases.length * repetitions;
2242
+ const runConfigTags = [...params.runConfigTags ?? []];
2243
+ const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
1988
2244
  const runId = `run-${randomUUID()}`;
1989
- const artifactPath = createArtifactPath(
1990
- this.config.artifactDirectory,
1991
- request.datasetId,
1992
- runId
1993
- );
2245
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1994
2246
  const snapshot = {
1995
2247
  runId,
1996
- datasetId: request.datasetId,
2248
+ datasetId: params.datasetId,
1997
2249
  datasetName: dataset.dataset.getName(),
1998
2250
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1999
2251
  queuedAt: Date.now(),
@@ -2014,7 +2266,7 @@ var EffectRunner = class {
2014
2266
  const queuedEvent = {
2015
2267
  type: "RunQueued",
2016
2268
  runId,
2017
- datasetId: request.datasetId,
2269
+ datasetId: params.datasetId,
2018
2270
  datasetName: dataset.dataset.getName(),
2019
2271
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2020
2272
  totalTestCases: totalEvaluations,
@@ -2028,17 +2280,20 @@ var EffectRunner = class {
2028
2280
  payload: queuedEvent
2029
2281
  })
2030
2282
  );
2031
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
2032
2283
  await Effect.runPromise(
2033
2284
  Queue.offer(this.runQueue, {
2034
2285
  runId,
2035
2286
  triggerId,
2036
- datasetId: request.datasetId,
2287
+ datasetId: params.datasetId,
2037
2288
  dataset: dataset.dataset,
2038
2289
  evaluators: selectedEvaluators,
2039
2290
  testCases: selectedTestCases,
2040
2291
  snapshot,
2041
- maxConcurrency
2292
+ maxConcurrency: params.maxConcurrency,
2293
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2294
+ runConfigName: params.runConfigName,
2295
+ runConfigTags,
2296
+ repetitions
2042
2297
  })
2043
2298
  );
2044
2299
  return snapshot;
@@ -2109,6 +2364,11 @@ var EffectRunner = class {
2109
2364
  );
2110
2365
  }
2111
2366
  };
2367
+
2368
+ // src/runner/events.ts
2369
+ var PROGRAMMATIC_RUN_CONFIG = {
2370
+ runConfigName: "programmatic"
2371
+ };
2112
2372
  var LEFT_PANE_WIDTH2 = 44;
2113
2373
  var MAX_RUNS_FOR_CHART = 12;
2114
2374
  var MAX_RUNS_FOR_TREND = 20;
@@ -2456,7 +2716,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2456
2716
  rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp6"));
2457
2717
  rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
2458
2718
  for (const tc of testCases) {
2459
- const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
2719
+ const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
2460
2720
  rows.push(
2461
2721
  /* @__PURE__ */ jsxs(Text, { children: [
2462
2722
  /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
@@ -2468,13 +2728,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2468
2728
  ] }),
2469
2729
  " ",
2470
2730
  tc.testCaseName,
2471
- rerunPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: rerunPart }) : null,
2731
+ repetitionPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: repetitionPart }) : null,
2472
2732
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2473
2733
  " (",
2474
2734
  tc.durationMs,
2475
2735
  "ms)"
2476
2736
  ] })
2477
- ] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
2737
+ ] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
2478
2738
  );
2479
2739
  for (const item of tc.evaluatorScores) {
2480
2740
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
@@ -2800,7 +3060,8 @@ function EvalsCliApp({ data, args, runner }) {
2800
3060
  }
2801
3061
  void runner.runDatasetWith({
2802
3062
  datasetId: selectedDataset.id,
2803
- evaluatorIds: clampedState.selectedEvaluatorIds
3063
+ evaluatorIds: clampedState.selectedEvaluatorIds,
3064
+ ...PROGRAMMATIC_RUN_CONFIG
2804
3065
  }).then((snapshot) => {
2805
3066
  setRuntimeMessage(
2806
3067
  `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`