@m4trix/evals 0.25.1 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -5,16 +5,16 @@ var fullscreenInk = require('fullscreen-ink');
5
5
  var React = require('react');
6
6
  var ink = require('ink');
7
7
  var jsxRuntime = require('react/jsx-runtime');
8
- var path = require('path');
9
- var inkChart = require('@pppp606/ink-chart');
10
- var crypto = require('crypto');
11
8
  var effect = require('effect');
9
+ var crypto = require('crypto');
10
+ var promises = require('fs/promises');
11
+ var path = require('path');
12
12
  var fs = require('fs');
13
13
  var jitiModule = require('jiti');
14
- var promises = require('fs/promises');
15
14
  var url = require('url');
16
15
  var diff = require('diff');
17
16
  var stringify = require('fast-json-stable-stringify');
17
+ var inkChart = require('@pppp606/ink-chart');
18
18
 
19
19
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
20
20
  function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
@@ -264,6 +264,59 @@ function isPrintableCharacter(input) {
264
264
  function isBackKey(key) {
265
265
  return key.backspace || key.delete;
266
266
  }
267
+ var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
268
+ function makeEntityIdSchema(brand, label) {
269
+ return effect.Schema.String.pipe(
270
+ effect.Schema.trimmed(),
271
+ effect.Schema.minLength(1, {
272
+ message: () => `${label} must be non-empty.`
273
+ }),
274
+ effect.Schema.pattern(ENTITY_ID_PATTERN, {
275
+ message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
276
+ }),
277
+ effect.Schema.brand(brand)
278
+ );
279
+ }
280
+ var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
281
+ makeEntityIdSchema("EvaluatorName", "Evaluator name");
282
+ makeEntityIdSchema("TestCaseName", "Test case name");
283
+ makeEntityIdSchema("DatasetName", "Dataset name");
284
+ function validateWithSchema(schema, raw, context) {
285
+ const trimmed = raw.trim();
286
+ const decode = effect.Schema.decodeUnknownEither(
287
+ schema
288
+ );
289
+ const result = decode(trimmed);
290
+ if (effect.Either.isLeft(result)) {
291
+ throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
292
+ }
293
+ return result.right;
294
+ }
295
+ function validateRunConfigName(raw, context) {
296
+ return validateWithSchema(RunConfigNameSchema, raw, context);
297
+ }
298
+
299
+ // src/evals/dataset.ts
300
+ function getDatasetDisplayLabel(dataset) {
301
+ if (typeof dataset.getDisplayLabel === "function") {
302
+ return dataset.getDisplayLabel();
303
+ }
304
+ return typeof dataset.getName === "function" ? dataset.getName() : "";
305
+ }
306
+
307
+ // src/evals/evaluator.ts
308
+ function getEvaluatorDisplayLabel(evaluator) {
309
+ if (typeof evaluator.getDisplayLabel === "function") {
310
+ const label = evaluator.getDisplayLabel();
311
+ if (label !== void 0) {
312
+ return label;
313
+ }
314
+ }
315
+ return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
316
+ }
317
+ function getEvaluatorTagList(evaluator) {
318
+ return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
319
+ }
267
320
 
268
321
  // src/cli/data.mock.json
269
322
  var data_mock_default = {
@@ -511,7 +564,7 @@ function toEvalDataset(item, snapshots) {
511
564
  const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
512
565
  return {
513
566
  id: item.id,
514
- name: item.dataset.getName(),
567
+ name: getDatasetDisplayLabel(item.dataset),
515
568
  overview: `Discovered from ${item.filePath}`,
516
569
  runs
517
570
  };
@@ -519,7 +572,7 @@ function toEvalDataset(item, snapshots) {
519
572
  function toEvaluatorOption(item) {
520
573
  return {
521
574
  id: item.id,
522
- name: item.evaluator.getName() ?? toSlug(item.id),
575
+ name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
523
576
  configPreview: `Source: ${item.filePath}`
524
577
  };
525
578
  }
@@ -762,6 +815,159 @@ function reduceCliState(state, action) {
762
815
  }
763
816
  return state;
764
817
  }
818
+ async function loadRunSnapshotsFromArtifacts(config) {
819
+ const baseDir = path.resolve(config.artifactDirectory);
820
+ let entries;
821
+ try {
822
+ entries = await promises.readdir(baseDir);
823
+ } catch {
824
+ return [];
825
+ }
826
+ const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
827
+ const snapshots = [];
828
+ for (const fileName of jsonlFiles) {
829
+ const filePath = path.join(baseDir, fileName);
830
+ try {
831
+ const snapshot = await parseArtifactToSnapshot(filePath, config);
832
+ if (snapshot) {
833
+ snapshots.push(snapshot);
834
+ }
835
+ } catch {
836
+ }
837
+ }
838
+ return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
839
+ }
840
+ async function parseArtifactToSnapshot(filePath, _config) {
841
+ const content = await promises.readFile(filePath, "utf8");
842
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
843
+ if (lines.length === 0) {
844
+ return null;
845
+ }
846
+ let runQueued = null;
847
+ let runCompleted = null;
848
+ let runFailed = null;
849
+ let runStarted = null;
850
+ for (const line of lines) {
851
+ try {
852
+ const event = JSON.parse(line);
853
+ const type = event.type;
854
+ if (type === "RunQueued") {
855
+ runQueued = {
856
+ runId: event.runId,
857
+ datasetId: event.datasetId,
858
+ datasetName: event.datasetName,
859
+ evaluatorIds: event.evaluatorIds,
860
+ totalTestCases: event.totalTestCases ?? 0,
861
+ artifactPath: event.artifactPath ?? filePath,
862
+ ts: event.ts
863
+ };
864
+ }
865
+ if (type === "RunStarted") {
866
+ runStarted = { startedAt: event.startedAt };
867
+ }
868
+ if (type === "RunCompleted") {
869
+ runCompleted = {
870
+ passedTestCases: event.passedTestCases,
871
+ failedTestCases: event.failedTestCases,
872
+ totalTestCases: event.totalTestCases,
873
+ finishedAt: event.finishedAt
874
+ };
875
+ }
876
+ if (type === "RunFailed") {
877
+ runFailed = {
878
+ finishedAt: event.finishedAt,
879
+ errorMessage: event.errorMessage
880
+ };
881
+ }
882
+ } catch {
883
+ }
884
+ }
885
+ if (!runQueued) {
886
+ return null;
887
+ }
888
+ const artifactPath = filePath;
889
+ const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
890
+ const progress = aggregateTestCaseProgress(lines);
891
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
892
+ const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
893
+ const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
894
+ return {
895
+ runId: runQueued.runId,
896
+ datasetId: runQueued.datasetId,
897
+ datasetName: runQueued.datasetName,
898
+ evaluatorIds: runQueued.evaluatorIds,
899
+ queuedAt: runQueued.ts ?? 0,
900
+ startedAt: runStarted?.startedAt,
901
+ finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
902
+ totalTestCases: runQueued.totalTestCases,
903
+ completedTestCases,
904
+ passedTestCases,
905
+ failedTestCases,
906
+ status,
907
+ artifactPath,
908
+ errorMessage: runFailed?.errorMessage
909
+ };
910
+ }
911
+ function aggregateTestCaseProgress(lines) {
912
+ let completedTestCases = 0;
913
+ const testCasePassedBy = /* @__PURE__ */ new Map();
914
+ for (const line of lines) {
915
+ try {
916
+ const event = JSON.parse(line);
917
+ if (event.type === "TestCaseProgress") {
918
+ const ev = event;
919
+ completedTestCases = ev.completedTestCases ?? completedTestCases;
920
+ const id = ev.testCaseId;
921
+ const current = testCasePassedBy.get(id);
922
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
923
+ }
924
+ } catch {
925
+ }
926
+ }
927
+ let passedTestCases = 0;
928
+ let failedTestCases = 0;
929
+ for (const passed of testCasePassedBy.values()) {
930
+ if (passed) {
931
+ passedTestCases += 1;
932
+ } else {
933
+ failedTestCases += 1;
934
+ }
935
+ }
936
+ return { completedTestCases, passedTestCases, failedTestCases };
937
+ }
938
+ async function parseArtifactFile(artifactPath) {
939
+ try {
940
+ const content = await promises.readFile(artifactPath, "utf8");
941
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
942
+ const results = [];
943
+ for (const line of lines) {
944
+ try {
945
+ const event = JSON.parse(line);
946
+ if (event.type === "TestCaseProgress") {
947
+ const ev = event;
948
+ const repetitionIndex = ev.repetitionIndex ?? ev.rerunIndex;
949
+ const repetitionCount = ev.repetitionCount ?? ev.rerunTotal;
950
+ results.push({
951
+ testCaseId: ev.testCaseId,
952
+ testCaseName: ev.testCaseName,
953
+ completedTestCases: ev.completedTestCases,
954
+ totalTestCases: ev.totalTestCases,
955
+ repetitionId: ev.repetitionId,
956
+ repetitionIndex,
957
+ repetitionCount,
958
+ passed: ev.passed,
959
+ durationMs: ev.durationMs,
960
+ evaluatorScores: ev.evaluatorScores ?? []
961
+ });
962
+ }
963
+ } catch {
964
+ }
965
+ }
966
+ return results;
967
+ } catch {
968
+ return [];
969
+ }
970
+ }
765
971
 
766
972
  // src/runner/config.ts
767
973
  var defaultRunnerConfig = {
@@ -769,6 +975,7 @@ var defaultRunnerConfig = {
769
975
  rootDir: process.cwd(),
770
976
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
771
977
  evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
978
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
772
979
  testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
773
980
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
774
981
  },
@@ -794,6 +1001,11 @@ function toRunnerConfigOverrides(config) {
794
1001
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
795
1002
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
796
1003
  }
1004
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
1005
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
1006
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
1007
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
1008
+ }
797
1009
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
798
1010
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
799
1011
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -892,6 +1104,9 @@ function isDatasetLike(value) {
892
1104
  function isEvaluatorLike(value) {
893
1105
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
894
1106
  }
1107
+ function isRunConfigLike(value) {
1108
+ return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
1109
+ }
895
1110
  function isTestCaseLike(value) {
896
1111
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
897
1112
  }
@@ -980,6 +1195,23 @@ async function collectEvaluatorsFromFiles(config) {
980
1195
  );
981
1196
  return found.flat();
982
1197
  }
1198
+ async function collectRunConfigsFromFiles(config) {
1199
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1200
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
1201
+ const found = await Promise.all(
1202
+ matched.map(async (absolutePath) => {
1203
+ const exports = await loadModuleExports(absolutePath);
1204
+ const runConfigs = exports.filter(isRunConfigLike);
1205
+ const relPath = path.relative(config.rootDir, absolutePath);
1206
+ return runConfigs.map((runConfig) => ({
1207
+ id: runConfig.getName(),
1208
+ filePath: relPath,
1209
+ runConfig
1210
+ }));
1211
+ })
1212
+ );
1213
+ return found.flat();
1214
+ }
983
1215
  async function collectTestCasesFromFiles(config) {
984
1216
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
985
1217
  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
@@ -1133,6 +1365,17 @@ function getDiffLines(entry) {
1133
1365
  });
1134
1366
  }
1135
1367
 
1368
+ // src/evals/test-case.ts
1369
+ function getTestCaseDisplayLabel(testCase) {
1370
+ if (typeof testCase.getDisplayLabel === "function") {
1371
+ return testCase.getDisplayLabel();
1372
+ }
1373
+ return typeof testCase.getName === "function" ? testCase.getName() : "";
1374
+ }
1375
+ function getTestCaseTagList(testCase) {
1376
+ return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
1377
+ }
1378
+
1136
1379
  // src/evals/metric.ts
1137
1380
  var registry = /* @__PURE__ */ new Map();
1138
1381
  var Metric = {
@@ -1156,6 +1399,54 @@ function getMetricById(id) {
1156
1399
  return registry.get(id);
1157
1400
  }
1158
1401
 
1402
+ // src/evals/aggregators.ts
1403
+ function aggregateTokenCountSum(values) {
1404
+ const initial = {
1405
+ input: 0,
1406
+ output: 0,
1407
+ inputCached: 0,
1408
+ outputCached: 0
1409
+ };
1410
+ return values.reduce(
1411
+ (acc, v) => ({
1412
+ input: acc.input + (v.input ?? 0),
1413
+ output: acc.output + (v.output ?? 0),
1414
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
1415
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
1416
+ }),
1417
+ initial
1418
+ );
1419
+ }
1420
+ function aggregateLatencyAverage(values) {
1421
+ if (values.length === 0) {
1422
+ return { ms: 0 };
1423
+ }
1424
+ const sum = values.reduce((s, v) => s + v.ms, 0);
1425
+ return { ms: sum / values.length };
1426
+ }
1427
+
1428
+ // src/evals/metrics/standard.ts
1429
+ Metric.of({
1430
+ id: "token-count",
1431
+ name: "Tokens",
1432
+ aggregate: aggregateTokenCountSum,
1433
+ format: (data, options) => {
1434
+ const input = data.input ?? 0;
1435
+ const output = data.output ?? 0;
1436
+ const inputCached = data.inputCached ?? 0;
1437
+ const outputCached = data.outputCached ?? 0;
1438
+ const cached = inputCached + outputCached;
1439
+ const base = `in:${input} out:${output} cached:${cached}`;
1440
+ return options?.isAggregated ? `Total: ${base}` : base;
1441
+ }
1442
+ });
1443
+ Metric.of({
1444
+ id: "latency",
1445
+ name: "Latency",
1446
+ aggregate: aggregateLatencyAverage,
1447
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1448
+ });
1449
+
1159
1450
  // src/evals/score.ts
1160
1451
  var registry2 = /* @__PURE__ */ new Map();
1161
1452
  function formatScoreData(def, data, options) {
@@ -1264,54 +1555,6 @@ function getScoreById(id) {
1264
1555
  return registry2.get(id);
1265
1556
  }
1266
1557
 
1267
- // src/evals/aggregators.ts
1268
- function aggregateTokenCountSum(values) {
1269
- const initial = {
1270
- input: 0,
1271
- output: 0,
1272
- inputCached: 0,
1273
- outputCached: 0
1274
- };
1275
- return values.reduce(
1276
- (acc, v) => ({
1277
- input: acc.input + (v.input ?? 0),
1278
- output: acc.output + (v.output ?? 0),
1279
- inputCached: acc.inputCached + (v.inputCached ?? 0),
1280
- outputCached: acc.outputCached + (v.outputCached ?? 0)
1281
- }),
1282
- initial
1283
- );
1284
- }
1285
- function aggregateLatencyAverage(values) {
1286
- if (values.length === 0) {
1287
- return { ms: 0 };
1288
- }
1289
- const sum = values.reduce((s, v) => s + v.ms, 0);
1290
- return { ms: sum / values.length };
1291
- }
1292
-
1293
- // src/evals/metrics/standard.ts
1294
- Metric.of({
1295
- id: "token-count",
1296
- name: "Tokens",
1297
- aggregate: aggregateTokenCountSum,
1298
- format: (data, options) => {
1299
- const input = data.input ?? 0;
1300
- const output = data.output ?? 0;
1301
- const inputCached = data.inputCached ?? 0;
1302
- const outputCached = data.outputCached ?? 0;
1303
- const cached = inputCached + outputCached;
1304
- const base = `in:${input} out:${output} cached:${cached}`;
1305
- return options?.isAggregated ? `Total: ${base}` : base;
1306
- }
1307
- });
1308
- Metric.of({
1309
- id: "latency",
1310
- name: "Latency",
1311
- aggregate: aggregateLatencyAverage,
1312
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1313
- });
1314
-
1315
1558
  // src/evals/scores/standard.ts
1316
1559
  Score.of({
1317
1560
  id: "percent",
@@ -1418,15 +1661,17 @@ function readOutput(testCase) {
1418
1661
  }
1419
1662
  return candidate.getOutput();
1420
1663
  }
1421
- function buildEvaluationUnits(testCases) {
1664
+ function buildEvaluationUnits(testCases, repetitionCount) {
1665
+ const count = Math.max(1, repetitionCount);
1422
1666
  const units = [];
1423
1667
  for (const testCaseItem of testCases) {
1424
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1425
- for (let r = 0; r < rerunTotal; r++) {
1668
+ const repetitionId = `rep-${crypto.randomUUID()}`;
1669
+ for (let r = 0; r < count; r++) {
1426
1670
  units.push({
1427
1671
  testCaseItem,
1428
- rerunIndex: r + 1,
1429
- rerunTotal
1672
+ repetitionId,
1673
+ repetitionIndex: r + 1,
1674
+ repetitionCount: count
1430
1675
  });
1431
1676
  }
1432
1677
  }
@@ -1439,7 +1684,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1439
1684
  return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1440
1685
  }
1441
1686
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1442
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
1687
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
1443
1688
  return effect.Effect.gen(function* () {
1444
1689
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
1445
1690
  const started = Date.now();
@@ -1448,11 +1693,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1448
1693
  type: "TestCaseStarted",
1449
1694
  runId: task.runId,
1450
1695
  testCaseId: testCaseItem.id,
1451
- testCaseName: testCaseItem.testCase.getName(),
1696
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1452
1697
  startedTestCases: startedEvaluations,
1453
1698
  totalTestCases: totalEvaluations,
1454
- rerunIndex,
1455
- rerunTotal
1699
+ repetitionId,
1700
+ repetitionIndex,
1701
+ repetitionCount
1456
1702
  });
1457
1703
  const evaluatorScores = [];
1458
1704
  let testCaseError;
@@ -1486,8 +1732,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1486
1732
  meta: {
1487
1733
  triggerId: task.triggerId,
1488
1734
  runId: evaluatorRunId,
1489
- datasetId: task.datasetId
1735
+ datasetName: task.dataset.getDisplayLabel(),
1736
+ repetitionId,
1737
+ repetitionIndex,
1738
+ repetitionCount,
1739
+ runConfigName: task.runConfigName
1490
1740
  },
1741
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1742
+ runConfigTags: task.runConfigTags,
1743
+ evaluatorTags: getEvaluatorTagList(evaluator),
1491
1744
  logDiff,
1492
1745
  log,
1493
1746
  createError
@@ -1530,18 +1783,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1530
1783
  });
1531
1784
  }
1532
1785
  }
1533
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1786
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1534
1787
  const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1535
1788
  const progressEvent = {
1536
1789
  type: "TestCaseProgress",
1537
1790
  runId: task.runId,
1538
1791
  testCaseId: testCaseItem.id,
1539
- testCaseName: testCaseItem.testCase.getName(),
1792
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1540
1793
  completedTestCases: completedEvaluations,
1541
1794
  totalTestCases: totalEvaluations,
1542
- rerunIndex,
1543
- rerunTotal,
1544
- passed: rerunPassedThis,
1795
+ repetitionId,
1796
+ repetitionIndex,
1797
+ repetitionCount,
1798
+ passed: repetitionPassedThis,
1545
1799
  durationMs: Date.now() - started,
1546
1800
  evaluatorScores,
1547
1801
  output,
@@ -1562,9 +1816,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1562
1816
  (map) => {
1563
1817
  const key = testCaseItem.id;
1564
1818
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
1565
- const newResults = [...existing.results, rerunPassedThis];
1819
+ const newResults = [...existing.results, repetitionPassedThis];
1566
1820
  const newCompletedCount = existing.completedCount + 1;
1567
- const isLast = newCompletedCount === rerunTotal;
1821
+ const isLast = newCompletedCount === repetitionCount;
1568
1822
  const newMap = new Map(map);
1569
1823
  newMap.set(key, {
1570
1824
  completedCount: newCompletedCount,
@@ -1601,10 +1855,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1601
1855
  runId: task.runId,
1602
1856
  startedAt
1603
1857
  });
1604
- const totalEvaluations = task.testCases.reduce(
1605
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1606
- 0
1607
- );
1858
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
1608
1859
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1609
1860
  const completedRef = yield* effect.Ref.make(0);
1610
1861
  const startedRef = yield* effect.Ref.make(0);
@@ -1613,7 +1864,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1613
1864
  const testCaseResultsRef = yield* effect.Ref.make(
1614
1865
  /* @__PURE__ */ new Map()
1615
1866
  );
1616
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1867
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
1617
1868
  const processEvaluation = (unit) => processOneEvaluation(
1618
1869
  task,
1619
1870
  unit,
@@ -1627,11 +1878,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1627
1878
  failedRef,
1628
1879
  testCaseResultsRef
1629
1880
  );
1630
- yield* effect.Effect.forEach(
1631
- evaluationUnits,
1632
- processEvaluation,
1633
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1634
- );
1881
+ const globalSem = task.globalEvaluationSemaphore;
1882
+ if (globalSem !== void 0) {
1883
+ yield* effect.Effect.forEach(
1884
+ evaluationUnits,
1885
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1886
+ { concurrency: "unbounded", discard: true }
1887
+ );
1888
+ } else {
1889
+ yield* effect.Effect.forEach(
1890
+ evaluationUnits,
1891
+ processEvaluation,
1892
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1893
+ );
1894
+ }
1635
1895
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
1636
1896
  effect.Ref.get(completedRef),
1637
1897
  effect.Ref.get(passedRef),
@@ -1667,155 +1927,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1667
1927
  artifactPath: task.snapshot.artifactPath
1668
1928
  });
1669
1929
  });
1670
- async function loadRunSnapshotsFromArtifacts(config) {
1671
- const baseDir = path.resolve(config.artifactDirectory);
1672
- let entries;
1673
- try {
1674
- entries = await promises.readdir(baseDir);
1675
- } catch {
1676
- return [];
1677
- }
1678
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1679
- const snapshots = [];
1680
- for (const fileName of jsonlFiles) {
1681
- const filePath = path.join(baseDir, fileName);
1682
- try {
1683
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1684
- if (snapshot) {
1685
- snapshots.push(snapshot);
1686
- }
1687
- } catch {
1688
- }
1689
- }
1690
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1691
- }
1692
- async function parseArtifactToSnapshot(filePath, _config) {
1693
- const content = await promises.readFile(filePath, "utf8");
1694
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1695
- if (lines.length === 0) {
1696
- return null;
1697
- }
1698
- let runQueued = null;
1699
- let runCompleted = null;
1700
- let runFailed = null;
1701
- let runStarted = null;
1702
- for (const line of lines) {
1703
- try {
1704
- const event = JSON.parse(line);
1705
- const type = event.type;
1706
- if (type === "RunQueued") {
1707
- runQueued = {
1708
- runId: event.runId,
1709
- datasetId: event.datasetId,
1710
- datasetName: event.datasetName,
1711
- evaluatorIds: event.evaluatorIds,
1712
- totalTestCases: event.totalTestCases ?? 0,
1713
- artifactPath: event.artifactPath ?? filePath,
1714
- ts: event.ts
1715
- };
1716
- }
1717
- if (type === "RunStarted") {
1718
- runStarted = { startedAt: event.startedAt };
1719
- }
1720
- if (type === "RunCompleted") {
1721
- runCompleted = {
1722
- passedTestCases: event.passedTestCases,
1723
- failedTestCases: event.failedTestCases,
1724
- totalTestCases: event.totalTestCases,
1725
- finishedAt: event.finishedAt
1726
- };
1727
- }
1728
- if (type === "RunFailed") {
1729
- runFailed = {
1730
- finishedAt: event.finishedAt,
1731
- errorMessage: event.errorMessage
1732
- };
1733
- }
1734
- } catch {
1735
- }
1930
+
1931
+ // src/runner/name-pattern.ts
1932
+ function parseRegexLiteral(pattern) {
1933
+ if (!pattern.startsWith("/")) {
1934
+ return void 0;
1736
1935
  }
1737
- if (!runQueued) {
1738
- return null;
1936
+ const lastSlash = pattern.lastIndexOf("/");
1937
+ if (lastSlash <= 0) {
1938
+ return void 0;
1739
1939
  }
1740
- const artifactPath = filePath;
1741
- const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1742
- const progress = aggregateTestCaseProgress(lines);
1743
- const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1744
- const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1745
- const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1746
1940
  return {
1747
- runId: runQueued.runId,
1748
- datasetId: runQueued.datasetId,
1749
- datasetName: runQueued.datasetName,
1750
- evaluatorIds: runQueued.evaluatorIds,
1751
- queuedAt: runQueued.ts ?? 0,
1752
- startedAt: runStarted?.startedAt,
1753
- finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1754
- totalTestCases: runQueued.totalTestCases,
1755
- completedTestCases,
1756
- passedTestCases,
1757
- failedTestCases,
1758
- status,
1759
- artifactPath,
1760
- errorMessage: runFailed?.errorMessage
1941
+ source: pattern.slice(1, lastSlash),
1942
+ flags: pattern.slice(lastSlash + 1)
1761
1943
  };
1762
1944
  }
1763
- function aggregateTestCaseProgress(lines) {
1764
- let completedTestCases = 0;
1765
- const testCasePassedBy = /* @__PURE__ */ new Map();
1766
- for (const line of lines) {
1767
- try {
1768
- const event = JSON.parse(line);
1769
- if (event.type === "TestCaseProgress") {
1770
- const ev = event;
1771
- completedTestCases = ev.completedTestCases ?? completedTestCases;
1772
- const id = ev.testCaseId;
1773
- const current = testCasePassedBy.get(id);
1774
- testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1775
- }
1776
- } catch {
1777
- }
1778
- }
1779
- let passedTestCases = 0;
1780
- let failedTestCases = 0;
1781
- for (const passed of testCasePassedBy.values()) {
1782
- if (passed) {
1783
- passedTestCases += 1;
1784
- } else {
1785
- failedTestCases += 1;
1786
- }
1945
+ function createNameMatcher(pattern) {
1946
+ const normalizedPattern = pattern.trim();
1947
+ const regexLiteral = parseRegexLiteral(normalizedPattern);
1948
+ if (regexLiteral) {
1949
+ const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1950
+ return (value) => regex.test(value);
1787
1951
  }
1788
- return { completedTestCases, passedTestCases, failedTestCases };
1789
- }
1790
- async function parseArtifactFile(artifactPath) {
1791
- try {
1792
- const content = await promises.readFile(artifactPath, "utf8");
1793
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1794
- const results = [];
1795
- for (const line of lines) {
1796
- try {
1797
- const event = JSON.parse(line);
1798
- if (event.type === "TestCaseProgress") {
1799
- const ev = event;
1800
- results.push({
1801
- testCaseId: ev.testCaseId,
1802
- testCaseName: ev.testCaseName,
1803
- completedTestCases: ev.completedTestCases,
1804
- totalTestCases: ev.totalTestCases,
1805
- rerunIndex: ev.rerunIndex,
1806
- rerunTotal: ev.rerunTotal,
1807
- passed: ev.passed,
1808
- durationMs: ev.durationMs,
1809
- evaluatorScores: ev.evaluatorScores ?? []
1810
- });
1811
- }
1812
- } catch {
1813
- }
1814
- }
1815
- return results;
1816
- } catch {
1817
- return [];
1952
+ if (normalizedPattern.includes("*")) {
1953
+ const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1954
+ const regex = new RegExp(`^${escaped}$`, "i");
1955
+ return (value) => regex.test(value);
1818
1956
  }
1957
+ return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1819
1958
  }
1820
1959
  async function appendJsonLine(artifactPath, payload) {
1821
1960
  await promises.mkdir(path.dirname(artifactPath), { recursive: true });
@@ -1874,32 +2013,12 @@ function searchCollectedTestCases(all, query) {
1874
2013
  }
1875
2014
 
1876
2015
  // src/runner/api.ts
1877
- function parseRegexLiteral(pattern) {
1878
- if (!pattern.startsWith("/")) {
1879
- return void 0;
1880
- }
1881
- const lastSlash = pattern.lastIndexOf("/");
1882
- if (lastSlash <= 0) {
1883
- return void 0;
2016
+ function normalizeRunRepetitions(value) {
2017
+ const n = value ?? 1;
2018
+ if (!Number.isInteger(n) || n < 1) {
2019
+ throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
1884
2020
  }
1885
- return {
1886
- source: pattern.slice(1, lastSlash),
1887
- flags: pattern.slice(lastSlash + 1)
1888
- };
1889
- }
1890
- function createNameMatcher(pattern) {
1891
- const normalizedPattern = pattern.trim();
1892
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1893
- if (regexLiteral) {
1894
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1895
- return (value) => regex.test(value);
1896
- }
1897
- if (normalizedPattern.includes("*")) {
1898
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1899
- const regex = new RegExp(`^${escaped}$`, "i");
1900
- return (value) => regex.test(value);
1901
- }
1902
- return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
2021
+ return n;
1903
2022
  }
1904
2023
  function mergeRunnerOverrides(base, next) {
1905
2024
  if (!base) {
@@ -1934,6 +2053,7 @@ var EffectRunner = class {
1934
2053
  this.listeners = /* @__PURE__ */ new Set();
1935
2054
  this.datasetsById = /* @__PURE__ */ new Map();
1936
2055
  this.evaluatorsById = /* @__PURE__ */ new Map();
2056
+ this.runConfigsById = /* @__PURE__ */ new Map();
1937
2057
  this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
1938
2058
  this.persistenceFiber = effect.Effect.runFork(
1939
2059
  createPersistenceWorker(this.persistenceQueue)
@@ -1974,6 +2094,137 @@ var EffectRunner = class {
1974
2094
  (item) => matcher(item.evaluator.getName() ?? "")
1975
2095
  );
1976
2096
  }
2097
+ async collectRunConfigs() {
2098
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
2099
+ this.runConfigsById.clear();
2100
+ const byNameLower = /* @__PURE__ */ new Map();
2101
+ for (const item of runConfigs) {
2102
+ const id = item.runConfig.getName();
2103
+ const lower = id.toLowerCase();
2104
+ const prev = byNameLower.get(lower);
2105
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
2106
+ throw new Error(
2107
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
2108
+ );
2109
+ }
2110
+ byNameLower.set(lower, item);
2111
+ this.runConfigsById.set(id, item);
2112
+ }
2113
+ return runConfigs;
2114
+ }
2115
+ async resolveRunConfigByName(name) {
2116
+ if (this.runConfigsById.size === 0) {
2117
+ await this.collectRunConfigs();
2118
+ }
2119
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
2120
+ const keyLower = key.toLowerCase();
2121
+ const matches = Array.from(this.runConfigsById.values()).filter(
2122
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
2123
+ );
2124
+ if (matches.length === 0) {
2125
+ return void 0;
2126
+ }
2127
+ if (matches.length > 1) {
2128
+ throw new Error(
2129
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
2130
+ );
2131
+ }
2132
+ return matches[0];
2133
+ }
2134
+ async expandRunConfigToJobs(collected) {
2135
+ if (this.datasetsById.size === 0) {
2136
+ await this.collectDatasets();
2137
+ }
2138
+ if (this.evaluatorsById.size === 0) {
2139
+ await this.collectEvaluators();
2140
+ }
2141
+ const rcName = collected.runConfig.getName();
2142
+ const jobs = [];
2143
+ const runs = collected.runConfig.getRuns();
2144
+ for (const [i, row] of runs.entries()) {
2145
+ const dsCollected = Array.from(this.datasetsById.values()).find(
2146
+ (d) => d.dataset === row.dataset
2147
+ );
2148
+ if (!dsCollected) {
2149
+ throw new Error(
2150
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2151
+ );
2152
+ }
2153
+ let evaluatorIds;
2154
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
2155
+ const matcher = createNameMatcher(row.evaluatorPattern);
2156
+ const matched = Array.from(this.evaluatorsById.values()).filter(
2157
+ (item) => matcher(item.evaluator.getName() ?? "")
2158
+ );
2159
+ if (matched.length === 0) {
2160
+ throw new Error(
2161
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
2162
+ );
2163
+ }
2164
+ evaluatorIds = matched.map((item) => item.id);
2165
+ } else {
2166
+ const evaluators = row.evaluators;
2167
+ evaluatorIds = [];
2168
+ for (const ev of evaluators) {
2169
+ const found = Array.from(this.evaluatorsById.values()).find(
2170
+ (item) => item.evaluator === ev
2171
+ );
2172
+ if (!found) {
2173
+ throw new Error(
2174
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
2175
+ );
2176
+ }
2177
+ evaluatorIds.push(found.id);
2178
+ }
2179
+ }
2180
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
2181
+ jobs.push({
2182
+ datasetId: dsCollected.id,
2183
+ evaluatorIds,
2184
+ runConfigName: rcName,
2185
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
2186
+ runConfigTags: collected.runConfig.getTags(),
2187
+ repetitions
2188
+ });
2189
+ }
2190
+ return jobs;
2191
+ }
2192
+ async expandRunConfigNamesToJobs(names) {
2193
+ const jobs = [];
2194
+ for (const name of names) {
2195
+ const collected = await this.resolveRunConfigByName(name);
2196
+ if (!collected) {
2197
+ const known = await this.collectRunConfigs();
2198
+ const available = known.map((r) => r.runConfig.getName()).sort();
2199
+ throw new Error(
2200
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
2201
+ );
2202
+ }
2203
+ jobs.push(...await this.expandRunConfigToJobs(collected));
2204
+ }
2205
+ return jobs;
2206
+ }
2207
+ async runDatasetJobsWithSharedConcurrency(request) {
2208
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
2209
+ const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
2210
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
2211
+ const snapshots = [];
2212
+ for (const job of request.jobs) {
2213
+ snapshots.push(
2214
+ await this.startDatasetRun({
2215
+ datasetId: job.datasetId,
2216
+ evaluatorIds: job.evaluatorIds,
2217
+ triggerId,
2218
+ maxConcurrency: this.config.maxConcurrency ?? 1,
2219
+ globalEvaluationSemaphore: sem,
2220
+ runConfigName: job.runConfigName,
2221
+ runConfigTags: job.runConfigTags,
2222
+ repetitions: job.repetitions
2223
+ })
2224
+ );
2225
+ }
2226
+ return snapshots;
2227
+ }
1977
2228
  async searchTestCases(query) {
1978
2229
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1979
2230
  return searchCollectedTestCases(testCases, query);
@@ -1992,36 +2243,46 @@ var EffectRunner = class {
1992
2243
  );
1993
2244
  }
1994
2245
  async runDatasetWith(request) {
2246
+ const runConfigName = validateRunConfigName(
2247
+ request.runConfigName,
2248
+ "runDatasetWith.runConfigName"
2249
+ );
2250
+ return this.startDatasetRun({
2251
+ datasetId: request.datasetId,
2252
+ evaluatorIds: request.evaluatorIds,
2253
+ triggerId: request.triggerId,
2254
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2255
+ repetitions: request.repetitions,
2256
+ runConfigName,
2257
+ runConfigTags: request.runConfigTags
2258
+ });
2259
+ }
2260
+ async startDatasetRun(params) {
1995
2261
  if (this.datasetsById.size === 0) {
1996
2262
  await this.collectDatasets();
1997
2263
  }
1998
2264
  if (this.evaluatorsById.size === 0) {
1999
2265
  await this.collectEvaluators();
2000
2266
  }
2001
- const dataset = this.datasetsById.get(request.datasetId);
2267
+ const dataset = this.datasetsById.get(params.datasetId);
2002
2268
  if (!dataset) {
2003
- throw new Error(`Unknown dataset: ${request.datasetId}`);
2269
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
2004
2270
  }
2005
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2271
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2006
2272
  if (selectedEvaluators.length === 0) {
2007
2273
  throw new Error("No evaluators selected for run");
2008
2274
  }
2009
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
2010
- const totalEvaluations = selectedTestCases.reduce(
2011
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
2012
- 0
2013
- );
2014
- const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
2275
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
2276
+ const repetitions = normalizeRunRepetitions(params.repetitions);
2277
+ const totalEvaluations = selectedTestCases.length * repetitions;
2278
+ const runConfigTags = [...params.runConfigTags ?? []];
2279
+ const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
2015
2280
  const runId = `run-${crypto.randomUUID()}`;
2016
- const artifactPath = createArtifactPath(
2017
- this.config.artifactDirectory,
2018
- request.datasetId,
2019
- runId
2020
- );
2281
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
2021
2282
  const snapshot = {
2022
2283
  runId,
2023
- datasetId: request.datasetId,
2024
- datasetName: dataset.dataset.getName(),
2284
+ datasetId: params.datasetId,
2285
+ datasetName: dataset.dataset.getDisplayLabel(),
2025
2286
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2026
2287
  queuedAt: Date.now(),
2027
2288
  totalTestCases: totalEvaluations,
@@ -2041,8 +2302,8 @@ var EffectRunner = class {
2041
2302
  const queuedEvent = {
2042
2303
  type: "RunQueued",
2043
2304
  runId,
2044
- datasetId: request.datasetId,
2045
- datasetName: dataset.dataset.getName(),
2305
+ datasetId: params.datasetId,
2306
+ datasetName: dataset.dataset.getDisplayLabel(),
2046
2307
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2047
2308
  totalTestCases: totalEvaluations,
2048
2309
  artifactPath
@@ -2055,17 +2316,20 @@ var EffectRunner = class {
2055
2316
  payload: queuedEvent
2056
2317
  })
2057
2318
  );
2058
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
2059
2319
  await effect.Effect.runPromise(
2060
2320
  effect.Queue.offer(this.runQueue, {
2061
2321
  runId,
2062
2322
  triggerId,
2063
- datasetId: request.datasetId,
2323
+ datasetId: params.datasetId,
2064
2324
  dataset: dataset.dataset,
2065
2325
  evaluators: selectedEvaluators,
2066
2326
  testCases: selectedTestCases,
2067
2327
  snapshot,
2068
- maxConcurrency
2328
+ maxConcurrency: params.maxConcurrency,
2329
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2330
+ runConfigName: params.runConfigName,
2331
+ runConfigTags,
2332
+ repetitions
2069
2333
  })
2070
2334
  );
2071
2335
  return snapshot;
@@ -2136,6 +2400,11 @@ var EffectRunner = class {
2136
2400
  );
2137
2401
  }
2138
2402
  };
2403
+
2404
+ // src/runner/events.ts
2405
+ var PROGRAMMATIC_RUN_CONFIG = {
2406
+ runConfigName: "programmatic"
2407
+ };
2139
2408
  var LEFT_PANE_WIDTH2 = 44;
2140
2409
  var MAX_RUNS_FOR_CHART = 12;
2141
2410
  var MAX_RUNS_FOR_TREND = 20;
@@ -2483,7 +2752,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2483
2752
  rows.push(/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp6"));
2484
2753
  rows.push(/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
2485
2754
  for (const tc of testCases) {
2486
- const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
2755
+ const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
2487
2756
  rows.push(
2488
2757
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
2489
2758
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
@@ -2495,13 +2764,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2495
2764
  ] }),
2496
2765
  " ",
2497
2766
  tc.testCaseName,
2498
- rerunPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: rerunPart }) : null,
2767
+ repetitionPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: repetitionPart }) : null,
2499
2768
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2500
2769
  " (",
2501
2770
  tc.durationMs,
2502
2771
  "ms)"
2503
2772
  ] })
2504
- ] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
2773
+ ] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
2505
2774
  );
2506
2775
  for (const item of tc.evaluatorScores) {
2507
2776
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
@@ -2827,7 +3096,8 @@ function EvalsCliApp({ data, args, runner }) {
2827
3096
  }
2828
3097
  void runner.runDatasetWith({
2829
3098
  datasetId: selectedDataset.id,
2830
- evaluatorIds: clampedState.selectedEvaluatorIds
3099
+ evaluatorIds: clampedState.selectedEvaluatorIds,
3100
+ ...PROGRAMMATIC_RUN_CONFIG
2831
3101
  }).then((snapshot) => {
2832
3102
  setRuntimeMessage(
2833
3103
  `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`