@m4trix/evals 0.25.1 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -3,16 +3,16 @@ import { withFullScreen, useScreenSize } from 'fullscreen-ink';
3
3
  import React, { useState, useRef, useReducer, useEffect, useMemo } from 'react';
4
4
  import { useApp, useInput, Box, Text } from 'ink';
5
5
  import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
6
- import { resolve, relative, join, dirname } from 'path';
7
- import { LineGraph } from '@pppp606/ink-chart';
6
+ import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
8
7
  import { randomUUID } from 'crypto';
9
- import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
8
+ import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
9
+ import { resolve, join, relative, dirname } from 'path';
10
10
  import { existsSync } from 'fs';
11
11
  import * as jitiModule from 'jiti';
12
- import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
13
12
  import { pathToFileURL } from 'url';
14
13
  import { diffLines } from 'diff';
15
14
  import stringify from 'fast-json-stable-stringify';
15
+ import { LineGraph } from '@pppp606/ink-chart';
16
16
 
17
17
  var SEP = " ";
18
18
  var ARROW = "\u203A";
@@ -237,6 +237,59 @@ function isPrintableCharacter(input) {
237
237
/**
 * Reports whether a key event stands for a "back / erase" key.
 *
 * @param key - key descriptor carrying `backspace` and `delete` flags
 * @returns the `backspace` flag when it is truthy, otherwise the `delete` flag
 */
function isBackKey(key) {
  const { backspace } = key;
  if (backspace) {
    return backspace;
  }
  return key.delete;
}
240
// Characters allowed in an entity id: ASCII letters, digits, "_" and "-".
var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;

/**
 * Builds a branded string schema for an entity identifier.
 *
 * The schema chains, in order: `Schema.trimmed()`, a minimum length of 1,
 * the `ENTITY_ID_PATTERN` check, and a brand.
 *
 * @param brand - brand name attached to successfully decoded values
 * @param label - human-readable entity label used as the prefix of error messages
 * @returns the composed schema
 */
function makeEntityIdSchema(brand, label) {
  const trimmedCheck = Schema.trimmed();
  const nonEmptyCheck = Schema.minLength(1, {
    message: () => `${label} must be non-empty.`
  });
  const characterCheck = Schema.pattern(ENTITY_ID_PATTERN, {
    message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
  });
  const brandStep = Schema.brand(brand);
  return Schema.String.pipe(trimmedCheck, nonEmptyCheck, characterCheck, brandStep);
}

// Only the RunConfig schema is kept in a variable; the other three calls are
// made for the remaining entity kinds but their results are discarded here.
var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
makeEntityIdSchema("EvaluatorName", "Evaluator name");
makeEntityIdSchema("TestCaseName", "Test case name");
makeEntityIdSchema("DatasetName", "Dataset name");
257
/**
 * Trims a raw string and decodes it with the given schema.
 *
 * @param schema - schema used to decode the trimmed input
 * @param raw - raw string to validate (must support `.trim()`)
 * @param context - prefix placed in front of the formatted parse error
 * @returns the decoded value on success
 * @throws Error with message `${context}: <formatted parse error>` when decoding fails
 */
function validateWithSchema(schema, raw, context) {
  const candidate = raw.trim();
  const outcome = Schema.decodeUnknownEither(schema)(candidate);
  if (!Either.isLeft(outcome)) {
    return outcome.right;
  }
  const details = ParseResult.TreeFormatter.formatErrorSync(outcome.left);
  throw new Error(`${context}: ${details}`);
}
268
/**
 * Validates a RunConfig name against `RunConfigNameSchema`.
 *
 * @param raw - candidate name
 * @param context - prefix for the error message raised on invalid input
 * @returns the validated name
 * @throws Error when the name does not satisfy the schema
 */
function validateRunConfigName(raw, context) {
  const validatedName = validateWithSchema(RunConfigNameSchema, raw, context);
  return validatedName;
}
271
+
272
+ // src/evals/dataset.ts
273
/**
 * Resolves the label shown for a dataset.
 *
 * Prefers `getDisplayLabel()` when the dataset provides it and it yields a
 * value; otherwise falls back to `getName()`, and finally to an empty string.
 *
 * @param dataset - dataset-like object; both getters are optional
 * @returns the display label, the name, or `""` when neither is available
 */
function getDatasetDisplayLabel(dataset) {
  if (typeof dataset.getDisplayLabel === "function") {
    const label = dataset.getDisplayLabel();
    // Mirror getEvaluatorDisplayLabel: an undefined label must not leak to
    // callers that use the result as a name; fall through to getName().
    if (label !== void 0) {
      return label;
    }
  }
  return typeof dataset.getName === "function" ? dataset.getName() : "";
}
279
+
280
+ // src/evals/evaluator.ts
281
/**
 * Resolves the label shown for an evaluator.
 *
 * Uses `getDisplayLabel()` when present and not `undefined`; otherwise uses
 * `getName()` when present.
 *
 * @param evaluator - evaluator-like object; both getters are optional
 * @returns the label, the name, or `undefined` when neither is available
 */
function getEvaluatorDisplayLabel(evaluator) {
  const hasLabelGetter = typeof evaluator.getDisplayLabel === "function";
  const label = hasLabelGetter ? evaluator.getDisplayLabel() : undefined;
  if (label !== undefined) {
    return label;
  }
  if (typeof evaluator.getName !== "function") {
    return undefined;
  }
  return evaluator.getName();
}
290
/**
 * Returns a fresh array holding the evaluator's tags.
 *
 * @param evaluator - evaluator-like object; `getTags` is optional
 * @returns a new array copied from `getTags()`, or `[]` when there is no such method
 */
function getEvaluatorTagList(evaluator) {
  if (typeof evaluator.getTags !== "function") {
    return [];
  }
  const tags = evaluator.getTags();
  return [...tags];
}
240
293
 
241
294
  // src/cli/data.mock.json
242
295
  var data_mock_default = {
@@ -484,7 +537,7 @@ function toEvalDataset(item, snapshots) {
484
537
  const runs = snapshots.filter((snapshot) => snapshot.datasetId === item.id).sort((a, b) => b.queuedAt - a.queuedAt).map(toEvalRun);
485
538
  return {
486
539
  id: item.id,
487
- name: item.dataset.getName(),
540
+ name: getDatasetDisplayLabel(item.dataset),
488
541
  overview: `Discovered from ${item.filePath}`,
489
542
  runs
490
543
  };
@@ -492,7 +545,7 @@ function toEvalDataset(item, snapshots) {
492
545
  function toEvaluatorOption(item) {
493
546
  return {
494
547
  id: item.id,
495
- name: item.evaluator.getName() ?? toSlug(item.id),
548
+ name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
496
549
  configPreview: `Source: ${item.filePath}`
497
550
  };
498
551
  }
@@ -735,6 +788,159 @@ function reduceCliState(state, action) {
735
788
  }
736
789
  return state;
737
790
  }
791
/**
 * Loads run snapshots from every `.jsonl` file in the artifact directory.
 *
 * Best effort by design: an unreadable directory yields an empty list, and a
 * file that fails to parse is skipped.
 *
 * @param config - object with `artifactDirectory`; also passed through to the per-file parser
 * @returns snapshots sorted by `queuedAt`, newest first
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = resolve(config.artifactDirectory);
  let directoryEntries;
  try {
    directoryEntries = await readdir(artifactRoot);
  } catch {
    // Missing or unreadable directory: there is nothing to load.
    return [];
  }
  const collected = [];
  for (const entryName of directoryEntries) {
    if (!entryName.endsWith(".jsonl")) {
      continue;
    }
    const artifactFile = join(artifactRoot, entryName);
    let snapshot;
    try {
      snapshot = await parseArtifactToSnapshot(artifactFile, config);
    } catch {
      // One broken artifact must not hide the others.
      continue;
    }
    if (snapshot) {
      collected.push(snapshot);
    }
  }
  return collected.sort((left, right) => right.queuedAt - left.queuedAt);
}
813
/**
 * Reads one JSONL artifact and condenses its run lifecycle events into a snapshot.
 *
 * Recognised event types are "RunQueued", "RunStarted", "RunCompleted" and
 * "RunFailed"; when a type occurs more than once the last occurrence wins.
 * Lines that are not valid JSON are ignored.
 *
 * @param filePath - path of the artifact file; also reported as `artifactPath`
 * @param _config - unused
 * @returns the snapshot, or `null` when the file has no non-blank lines or no "RunQueued" event
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const text = await readFile(filePath, "utf8");
  const lines = text.split("\n").filter((line) => line.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }
  const lifecycle = { queued: null, started: null, completed: null, failed: null };
  for (const line of lines) {
    try {
      const event = JSON.parse(line);
      switch (event.type) {
        case "RunQueued":
          lifecycle.queued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            artifactPath: event.artifactPath ?? filePath,
            ts: event.ts
          };
          break;
        case "RunStarted":
          lifecycle.started = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          lifecycle.completed = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            totalTestCases: event.totalTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          lifecycle.failed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Unparseable line: skip it and keep scanning.
    }
  }
  const { queued, started, completed, failed } = lifecycle;
  if (!queued) {
    return null;
  }
  // Precedence: failed > completed > running > queued.
  let status = "queued";
  if (failed) {
    status = "failed";
  } else if (completed) {
    status = "completed";
  } else if (started) {
    status = "running";
  }
  const progress = aggregateTestCaseProgress(lines);
  return {
    runId: queued.runId,
    datasetId: queued.datasetId,
    datasetName: queued.datasetName,
    evaluatorIds: queued.evaluatorIds,
    queuedAt: queued.ts ?? 0,
    startedAt: started?.startedAt,
    finishedAt: completed?.finishedAt ?? failed?.finishedAt,
    totalTestCases: queued.totalTestCases,
    // A completed run reports the queued total as its completed count.
    completedTestCases: completed ? queued.totalTestCases : progress.completedTestCases,
    passedTestCases: completed?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: completed?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: failed?.errorMessage
  };
}
884
/**
 * Summarises the "TestCaseProgress" events found in raw JSONL lines.
 *
 * A test case counts as passed only if every progress event recorded for its
 * id passed. `completedTestCases` is taken from the last progress event that
 * carries that field. Unparseable lines are ignored.
 *
 * @param lines - raw JSONL lines
 * @returns `{ completedTestCases, passedTestCases, failedTestCases }`
 */
function aggregateTestCaseProgress(lines) {
  const verdictByTestCase = /* @__PURE__ */ new Map();
  let completedTestCases = 0;
  for (const rawLine of lines) {
    let progress;
    try {
      progress = JSON.parse(rawLine);
      if (progress.type !== "TestCaseProgress") {
        continue;
      }
    } catch {
      continue;
    }
    completedTestCases = progress.completedTestCases ?? completedTestCases;
    const previous = verdictByTestCase.get(progress.testCaseId);
    // First sighting records the verdict as-is; later ones are AND-ed in.
    const merged = previous === undefined ? progress.passed : previous && progress.passed;
    verdictByTestCase.set(progress.testCaseId, merged);
  }
  const verdicts = [...verdictByTestCase.values()];
  const passedTestCases = verdicts.filter(Boolean).length;
  const failedTestCases = verdicts.length - passedTestCases;
  return { completedTestCases, passedTestCases, failedTestCases };
}
911
/**
 * Extracts every "TestCaseProgress" event from a JSONL artifact file.
 *
 * Both the current field names (`repetitionIndex` / `repetitionCount`) and
 * the older ones (`rerunIndex` / `rerunTotal`) are accepted. Unparseable
 * lines are skipped.
 *
 * @param artifactPath - path of the artifact file
 * @returns one result object per progress event, in file order; `[]` when the file cannot be read
 */
async function parseArtifactFile(artifactPath) {
  let text;
  try {
    text = await readFile(artifactPath, "utf8");
  } catch {
    return [];
  }
  const progressRows = [];
  for (const rawLine of text.split("\n")) {
    if (rawLine.trim().length === 0) {
      continue;
    }
    let ev;
    try {
      ev = JSON.parse(rawLine);
      if (ev.type !== "TestCaseProgress") {
        continue;
      }
    } catch {
      continue;
    }
    progressRows.push({
      testCaseId: ev.testCaseId,
      testCaseName: ev.testCaseName,
      completedTestCases: ev.completedTestCases,
      totalTestCases: ev.totalTestCases,
      repetitionId: ev.repetitionId,
      // Fall back to the legacy "rerun" field names.
      repetitionIndex: ev.repetitionIndex ?? ev.rerunIndex,
      repetitionCount: ev.repetitionCount ?? ev.rerunTotal,
      passed: ev.passed,
      durationMs: ev.durationMs,
      evaluatorScores: ev.evaluatorScores ?? []
    });
  }
  return progressRows;
}
738
944
 
739
945
  // src/runner/config.ts
740
946
  var defaultRunnerConfig = {
@@ -742,6 +948,7 @@ var defaultRunnerConfig = {
742
948
  rootDir: process.cwd(),
743
949
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
744
950
  evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
951
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
745
952
  testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
746
953
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
747
954
  },
@@ -767,6 +974,11 @@ function toRunnerConfigOverrides(config) {
767
974
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
768
975
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
769
976
  }
977
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
978
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
979
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
980
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
981
+ }
770
982
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
771
983
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
772
984
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -865,6 +1077,9 @@ function isDatasetLike(value) {
865
1077
/**
 * Duck-type check for evaluator objects: requires `getName`,
 * `resolveContext` and `getEvaluateFn`, each tested via `hasMethod`.
 *
 * @param value - candidate value
 * @returns the result of the chained `hasMethod` checks, stopping at the first falsy one
 */
function isEvaluatorLike(value) {
  let verdict = hasMethod(value, "getName");
  verdict = verdict && hasMethod(value, "resolveContext");
  verdict = verdict && hasMethod(value, "getEvaluateFn");
  return verdict;
}
1080
/**
 * Duck-type check for run-config objects: requires `getName` and `getRuns`
 * (via `hasMethod`), then confirms `getRuns` is a function.
 *
 * @param value - candidate value
 * @returns the result of the chained checks, stopping at the first falsy one
 */
function isRunConfigLike(value) {
  let verdict = hasMethod(value, "getName");
  verdict = verdict && hasMethod(value, "getRuns");
  verdict = verdict && typeof value.getRuns === "function";
  return verdict;
}
868
1083
/**
 * Duck-type check for test-case objects: requires `getName`, `getTags` and
 * `getInput`, each tested via `hasMethod`.
 *
 * @param value - candidate value
 * @returns the result of the chained `hasMethod` checks, stopping at the first falsy one
 */
function isTestCaseLike(value) {
  let verdict = hasMethod(value, "getName");
  verdict = verdict && hasMethod(value, "getTags");
  verdict = verdict && hasMethod(value, "getInput");
  return verdict;
}
@@ -953,6 +1168,23 @@ async function collectEvaluatorsFromFiles(config) {
953
1168
  );
954
1169
  return found.flat();
955
1170
  }
1171
/**
 * Discovers run configs exported by files under the discovery root.
 *
 * Walks `config.rootDir` (honouring `config.excludeDirectories`), keeps the
 * files whose name ends in one of `config.runConfigSuffixes`, loads each
 * matching module and collects every export accepted by `isRunConfigLike`.
 * Matching files are loaded concurrently.
 *
 * @param config - discovery settings (`rootDir`, `excludeDirectories`, `runConfigSuffixes`)
 * @returns a flat list of `{ id, filePath, runConfig }`, where `id` is the run config's name and `filePath` is relative to `rootDir`
 */
async function collectRunConfigsFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const candidatePaths = allFiles.filter(
    (candidate) => hasOneSuffix(candidate, config.runConfigSuffixes)
  );
  const loadFromFile = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const accepted = moduleExports.filter(isRunConfigLike);
    const filePath = relative(config.rootDir, absolutePath);
    const collected = [];
    for (const runConfig of accepted) {
      collected.push({ id: runConfig.getName(), filePath, runConfig });
    }
    return collected;
  };
  const perFile = await Promise.all(candidatePaths.map((path) => loadFromFile(path)));
  return perFile.flat();
}
956
1188
  async function collectTestCasesFromFiles(config) {
957
1189
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
958
1190
  const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
@@ -1106,6 +1338,17 @@ function getDiffLines(entry) {
1106
1338
  });
1107
1339
  }
1108
1340
 
1341
+ // src/evals/test-case.ts
1342
/**
 * Resolves the label shown for a test case.
 *
 * Prefers `getDisplayLabel()` when the test case provides it and it yields a
 * value; otherwise falls back to `getName()`, and finally to an empty string.
 *
 * @param testCase - test-case-like object; both getters are optional
 * @returns the display label, the name, or `""` when neither is available
 */
function getTestCaseDisplayLabel(testCase) {
  if (typeof testCase.getDisplayLabel === "function") {
    const label = testCase.getDisplayLabel();
    // An undefined label would otherwise be written into emitted events as
    // the test case name; fall through to getName() instead.
    if (label !== void 0) {
      return label;
    }
  }
  return typeof testCase.getName === "function" ? testCase.getName() : "";
}
1348
/**
 * Returns a fresh array holding the test case's tags.
 *
 * @param testCase - test-case-like object; `getTags` is optional
 * @returns a new array copied from `getTags()`, or `[]` when there is no such method
 */
function getTestCaseTagList(testCase) {
  if (typeof testCase.getTags !== "function") {
    return [];
  }
  const tags = testCase.getTags();
  return [...tags];
}
1351
+
1109
1352
  // src/evals/metric.ts
1110
1353
  var registry = /* @__PURE__ */ new Map();
1111
1354
  var Metric = {
@@ -1129,6 +1372,54 @@ function getMetricById(id) {
1129
1372
  return registry.get(id);
1130
1373
  }
1131
1374
 
1375
+ // src/evals/aggregators.ts
1376
/**
 * Sums token counts field by field.
 *
 * A field that is `null` or `undefined` on an entry contributes 0.
 *
 * @param values - array of `{ input, output, inputCached, outputCached }` records (every field optional)
 * @returns the per-field totals; all zeros for an empty array
 */
function aggregateTokenCountSum(values) {
  let input = 0;
  let output = 0;
  let inputCached = 0;
  let outputCached = 0;
  values.forEach((entry) => {
    input += entry.input ?? 0;
    output += entry.output ?? 0;
    inputCached += entry.inputCached ?? 0;
    outputCached += entry.outputCached ?? 0;
  });
  return { input, output, inputCached, outputCached };
}
1393
/**
 * Averages latency samples.
 *
 * @param values - array of `{ ms }` samples
 * @returns `{ ms }` holding the arithmetic mean, or `{ ms: 0 }` for an empty array
 */
function aggregateLatencyAverage(values) {
  const sampleCount = values.length;
  if (sampleCount === 0) {
    return { ms: 0 };
  }
  let total = 0;
  values.forEach((sample) => {
    total += sample.ms;
  });
  return { ms: total / sampleCount };
}
1400
+
1401
+ // src/evals/metrics/standard.ts
1402
// Standard "token-count" metric: aggregated by summing, rendered as
// "in:<n> out:<n> cached:<n>" (cached = inputCached + outputCached), with a
// "Total: " prefix when the value is an aggregate.
Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    const cachedTotal = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const summary = `in:${data.input ?? 0} out:${data.output ?? 0} cached:${cachedTotal}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});

// Standard "latency" metric: aggregated by averaging, rendered as "<ms>ms",
// with an "Avg: " prefix when the value is an aggregate.
Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    const reading = `${data.ms}ms`;
    return options?.isAggregated ? `Avg: ${reading}` : reading;
  }
});
1422
+
1132
1423
  // src/evals/score.ts
1133
1424
  var registry2 = /* @__PURE__ */ new Map();
1134
1425
  function formatScoreData(def, data, options) {
@@ -1237,54 +1528,6 @@ function getScoreById(id) {
1237
1528
  return registry2.get(id);
1238
1529
  }
1239
1530
 
1240
- // src/evals/aggregators.ts
1241
- function aggregateTokenCountSum(values) {
1242
- const initial = {
1243
- input: 0,
1244
- output: 0,
1245
- inputCached: 0,
1246
- outputCached: 0
1247
- };
1248
- return values.reduce(
1249
- (acc, v) => ({
1250
- input: acc.input + (v.input ?? 0),
1251
- output: acc.output + (v.output ?? 0),
1252
- inputCached: acc.inputCached + (v.inputCached ?? 0),
1253
- outputCached: acc.outputCached + (v.outputCached ?? 0)
1254
- }),
1255
- initial
1256
- );
1257
- }
1258
- function aggregateLatencyAverage(values) {
1259
- if (values.length === 0) {
1260
- return { ms: 0 };
1261
- }
1262
- const sum = values.reduce((s, v) => s + v.ms, 0);
1263
- return { ms: sum / values.length };
1264
- }
1265
-
1266
- // src/evals/metrics/standard.ts
1267
- Metric.of({
1268
- id: "token-count",
1269
- name: "Tokens",
1270
- aggregate: aggregateTokenCountSum,
1271
- format: (data, options) => {
1272
- const input = data.input ?? 0;
1273
- const output = data.output ?? 0;
1274
- const inputCached = data.inputCached ?? 0;
1275
- const outputCached = data.outputCached ?? 0;
1276
- const cached = inputCached + outputCached;
1277
- const base = `in:${input} out:${output} cached:${cached}`;
1278
- return options?.isAggregated ? `Total: ${base}` : base;
1279
- }
1280
- });
1281
- Metric.of({
1282
- id: "latency",
1283
- name: "Latency",
1284
- aggregate: aggregateLatencyAverage,
1285
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1286
- });
1287
-
1288
1531
  // src/evals/scores/standard.ts
1289
1532
  Score.of({
1290
1533
  id: "percent",
@@ -1391,15 +1634,17 @@ function readOutput(testCase) {
1391
1634
  }
1392
1635
  return candidate.getOutput();
1393
1636
  }
1394
- function buildEvaluationUnits(testCases) {
1637
+ function buildEvaluationUnits(testCases, repetitionCount) {
1638
+ const count = Math.max(1, repetitionCount);
1395
1639
  const units = [];
1396
1640
  for (const testCaseItem of testCases) {
1397
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1398
- for (let r = 0; r < rerunTotal; r++) {
1641
+ const repetitionId = `rep-${randomUUID()}`;
1642
+ for (let r = 0; r < count; r++) {
1399
1643
  units.push({
1400
1644
  testCaseItem,
1401
- rerunIndex: r + 1,
1402
- rerunTotal
1645
+ repetitionId,
1646
+ repetitionIndex: r + 1,
1647
+ repetitionCount: count
1403
1648
  });
1404
1649
  }
1405
1650
  }
@@ -1412,7 +1657,7 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1412
1657
  return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1413
1658
  }
1414
1659
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1415
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
1660
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
1416
1661
  return Effect.gen(function* () {
1417
1662
  const evaluatorRunId = `run-${randomUUID()}`;
1418
1663
  const started = Date.now();
@@ -1421,11 +1666,12 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1421
1666
  type: "TestCaseStarted",
1422
1667
  runId: task.runId,
1423
1668
  testCaseId: testCaseItem.id,
1424
- testCaseName: testCaseItem.testCase.getName(),
1669
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1425
1670
  startedTestCases: startedEvaluations,
1426
1671
  totalTestCases: totalEvaluations,
1427
- rerunIndex,
1428
- rerunTotal
1672
+ repetitionId,
1673
+ repetitionIndex,
1674
+ repetitionCount
1429
1675
  });
1430
1676
  const evaluatorScores = [];
1431
1677
  let testCaseError;
@@ -1459,8 +1705,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1459
1705
  meta: {
1460
1706
  triggerId: task.triggerId,
1461
1707
  runId: evaluatorRunId,
1462
- datasetId: task.datasetId
1708
+ datasetName: task.dataset.getDisplayLabel(),
1709
+ repetitionId,
1710
+ repetitionIndex,
1711
+ repetitionCount,
1712
+ runConfigName: task.runConfigName
1463
1713
  },
1714
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1715
+ runConfigTags: task.runConfigTags,
1716
+ evaluatorTags: getEvaluatorTagList(evaluator),
1464
1717
  logDiff,
1465
1718
  log,
1466
1719
  createError
@@ -1503,18 +1756,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1503
1756
  });
1504
1757
  }
1505
1758
  }
1506
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1759
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1507
1760
  const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1508
1761
  const progressEvent = {
1509
1762
  type: "TestCaseProgress",
1510
1763
  runId: task.runId,
1511
1764
  testCaseId: testCaseItem.id,
1512
- testCaseName: testCaseItem.testCase.getName(),
1765
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1513
1766
  completedTestCases: completedEvaluations,
1514
1767
  totalTestCases: totalEvaluations,
1515
- rerunIndex,
1516
- rerunTotal,
1517
- passed: rerunPassedThis,
1768
+ repetitionId,
1769
+ repetitionIndex,
1770
+ repetitionCount,
1771
+ passed: repetitionPassedThis,
1518
1772
  durationMs: Date.now() - started,
1519
1773
  evaluatorScores,
1520
1774
  output,
@@ -1535,9 +1789,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1535
1789
  (map) => {
1536
1790
  const key = testCaseItem.id;
1537
1791
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
1538
- const newResults = [...existing.results, rerunPassedThis];
1792
+ const newResults = [...existing.results, repetitionPassedThis];
1539
1793
  const newCompletedCount = existing.completedCount + 1;
1540
- const isLast = newCompletedCount === rerunTotal;
1794
+ const isLast = newCompletedCount === repetitionCount;
1541
1795
  const newMap = new Map(map);
1542
1796
  newMap.set(key, {
1543
1797
  completedCount: newCompletedCount,
@@ -1574,10 +1828,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1574
1828
  runId: task.runId,
1575
1829
  startedAt
1576
1830
  });
1577
- const totalEvaluations = task.testCases.reduce(
1578
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1579
- 0
1580
- );
1831
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
1581
1832
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1582
1833
  const completedRef = yield* Ref.make(0);
1583
1834
  const startedRef = yield* Ref.make(0);
@@ -1586,7 +1837,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1586
1837
  const testCaseResultsRef = yield* Ref.make(
1587
1838
  /* @__PURE__ */ new Map()
1588
1839
  );
1589
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1840
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
1590
1841
  const processEvaluation = (unit) => processOneEvaluation(
1591
1842
  task,
1592
1843
  unit,
@@ -1600,11 +1851,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1600
1851
  failedRef,
1601
1852
  testCaseResultsRef
1602
1853
  );
1603
- yield* Effect.forEach(
1604
- evaluationUnits,
1605
- processEvaluation,
1606
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1607
- );
1854
+ const globalSem = task.globalEvaluationSemaphore;
1855
+ if (globalSem !== void 0) {
1856
+ yield* Effect.forEach(
1857
+ evaluationUnits,
1858
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1859
+ { concurrency: "unbounded", discard: true }
1860
+ );
1861
+ } else {
1862
+ yield* Effect.forEach(
1863
+ evaluationUnits,
1864
+ processEvaluation,
1865
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1866
+ );
1867
+ }
1608
1868
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
1609
1869
  Ref.get(completedRef),
1610
1870
  Ref.get(passedRef),
@@ -1640,155 +1900,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1640
1900
  artifactPath: task.snapshot.artifactPath
1641
1901
  });
1642
1902
  });
1643
- async function loadRunSnapshotsFromArtifacts(config) {
1644
- const baseDir = resolve(config.artifactDirectory);
1645
- let entries;
1646
- try {
1647
- entries = await readdir(baseDir);
1648
- } catch {
1649
- return [];
1650
- }
1651
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1652
- const snapshots = [];
1653
- for (const fileName of jsonlFiles) {
1654
- const filePath = join(baseDir, fileName);
1655
- try {
1656
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1657
- if (snapshot) {
1658
- snapshots.push(snapshot);
1659
- }
1660
- } catch {
1661
- }
1662
- }
1663
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1664
- }
1665
- async function parseArtifactToSnapshot(filePath, _config) {
1666
- const content = await readFile(filePath, "utf8");
1667
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1668
- if (lines.length === 0) {
1669
- return null;
1670
- }
1671
- let runQueued = null;
1672
- let runCompleted = null;
1673
- let runFailed = null;
1674
- let runStarted = null;
1675
- for (const line of lines) {
1676
- try {
1677
- const event = JSON.parse(line);
1678
- const type = event.type;
1679
- if (type === "RunQueued") {
1680
- runQueued = {
1681
- runId: event.runId,
1682
- datasetId: event.datasetId,
1683
- datasetName: event.datasetName,
1684
- evaluatorIds: event.evaluatorIds,
1685
- totalTestCases: event.totalTestCases ?? 0,
1686
- artifactPath: event.artifactPath ?? filePath,
1687
- ts: event.ts
1688
- };
1689
- }
1690
- if (type === "RunStarted") {
1691
- runStarted = { startedAt: event.startedAt };
1692
- }
1693
- if (type === "RunCompleted") {
1694
- runCompleted = {
1695
- passedTestCases: event.passedTestCases,
1696
- failedTestCases: event.failedTestCases,
1697
- totalTestCases: event.totalTestCases,
1698
- finishedAt: event.finishedAt
1699
- };
1700
- }
1701
- if (type === "RunFailed") {
1702
- runFailed = {
1703
- finishedAt: event.finishedAt,
1704
- errorMessage: event.errorMessage
1705
- };
1706
- }
1707
- } catch {
1708
- }
1903
+
1904
+ // src/runner/name-pattern.ts
1905
+ function parseRegexLiteral(pattern) {
1906
+ if (!pattern.startsWith("/")) {
1907
+ return void 0;
1709
1908
  }
1710
- if (!runQueued) {
1711
- return null;
1909
+ const lastSlash = pattern.lastIndexOf("/");
1910
+ if (lastSlash <= 0) {
1911
+ return void 0;
1712
1912
  }
1713
- const artifactPath = filePath;
1714
- const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1715
- const progress = aggregateTestCaseProgress(lines);
1716
- const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1717
- const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1718
- const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1719
1913
  return {
1720
- runId: runQueued.runId,
1721
- datasetId: runQueued.datasetId,
1722
- datasetName: runQueued.datasetName,
1723
- evaluatorIds: runQueued.evaluatorIds,
1724
- queuedAt: runQueued.ts ?? 0,
1725
- startedAt: runStarted?.startedAt,
1726
- finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1727
- totalTestCases: runQueued.totalTestCases,
1728
- completedTestCases,
1729
- passedTestCases,
1730
- failedTestCases,
1731
- status,
1732
- artifactPath,
1733
- errorMessage: runFailed?.errorMessage
1914
+ source: pattern.slice(1, lastSlash),
1915
+ flags: pattern.slice(lastSlash + 1)
1734
1916
  };
1735
1917
  }
1736
- function aggregateTestCaseProgress(lines) {
1737
- let completedTestCases = 0;
1738
- const testCasePassedBy = /* @__PURE__ */ new Map();
1739
- for (const line of lines) {
1740
- try {
1741
- const event = JSON.parse(line);
1742
- if (event.type === "TestCaseProgress") {
1743
- const ev = event;
1744
- completedTestCases = ev.completedTestCases ?? completedTestCases;
1745
- const id = ev.testCaseId;
1746
- const current = testCasePassedBy.get(id);
1747
- testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1748
- }
1749
- } catch {
1750
- }
1751
- }
1752
- let passedTestCases = 0;
1753
- let failedTestCases = 0;
1754
- for (const passed of testCasePassedBy.values()) {
1755
- if (passed) {
1756
- passedTestCases += 1;
1757
- } else {
1758
- failedTestCases += 1;
1759
- }
1918
+ function createNameMatcher(pattern) {
1919
+ const normalizedPattern = pattern.trim();
1920
+ const regexLiteral = parseRegexLiteral(normalizedPattern);
1921
+ if (regexLiteral) {
1922
+ const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1923
+ return (value) => regex.test(value);
1760
1924
  }
1761
- return { completedTestCases, passedTestCases, failedTestCases };
1762
- }
1763
- async function parseArtifactFile(artifactPath) {
1764
- try {
1765
- const content = await readFile(artifactPath, "utf8");
1766
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1767
- const results = [];
1768
- for (const line of lines) {
1769
- try {
1770
- const event = JSON.parse(line);
1771
- if (event.type === "TestCaseProgress") {
1772
- const ev = event;
1773
- results.push({
1774
- testCaseId: ev.testCaseId,
1775
- testCaseName: ev.testCaseName,
1776
- completedTestCases: ev.completedTestCases,
1777
- totalTestCases: ev.totalTestCases,
1778
- rerunIndex: ev.rerunIndex,
1779
- rerunTotal: ev.rerunTotal,
1780
- passed: ev.passed,
1781
- durationMs: ev.durationMs,
1782
- evaluatorScores: ev.evaluatorScores ?? []
1783
- });
1784
- }
1785
- } catch {
1786
- }
1787
- }
1788
- return results;
1789
- } catch {
1790
- return [];
1925
+ if (normalizedPattern.includes("*")) {
1926
+ const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1927
+ const regex = new RegExp(`^${escaped}$`, "i");
1928
+ return (value) => regex.test(value);
1791
1929
  }
1930
+ return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1792
1931
  }
1793
1932
  async function appendJsonLine(artifactPath, payload) {
1794
1933
  await mkdir(dirname(artifactPath), { recursive: true });
@@ -1847,32 +1986,12 @@ function searchCollectedTestCases(all, query) {
1847
1986
  }
1848
1987
 
1849
1988
  // src/runner/api.ts
1850
- function parseRegexLiteral(pattern) {
1851
- if (!pattern.startsWith("/")) {
1852
- return void 0;
1853
- }
1854
- const lastSlash = pattern.lastIndexOf("/");
1855
- if (lastSlash <= 0) {
1856
- return void 0;
1989
+ function normalizeRunRepetitions(value) {
1990
+ const n = value ?? 1;
1991
+ if (!Number.isInteger(n) || n < 1) {
1992
+ throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
1857
1993
  }
1858
- return {
1859
- source: pattern.slice(1, lastSlash),
1860
- flags: pattern.slice(lastSlash + 1)
1861
- };
1862
- }
1863
- function createNameMatcher(pattern) {
1864
- const normalizedPattern = pattern.trim();
1865
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1866
- if (regexLiteral) {
1867
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1868
- return (value) => regex.test(value);
1869
- }
1870
- if (normalizedPattern.includes("*")) {
1871
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1872
- const regex = new RegExp(`^${escaped}$`, "i");
1873
- return (value) => regex.test(value);
1874
- }
1875
- return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1994
+ return n;
1876
1995
  }
1877
1996
  function mergeRunnerOverrides(base, next) {
1878
1997
  if (!base) {
@@ -1907,6 +2026,7 @@ var EffectRunner = class {
1907
2026
  this.listeners = /* @__PURE__ */ new Set();
1908
2027
  this.datasetsById = /* @__PURE__ */ new Map();
1909
2028
  this.evaluatorsById = /* @__PURE__ */ new Map();
2029
+ this.runConfigsById = /* @__PURE__ */ new Map();
1910
2030
  this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
1911
2031
  this.persistenceFiber = Effect.runFork(
1912
2032
  createPersistenceWorker(this.persistenceQueue)
@@ -1947,6 +2067,137 @@ var EffectRunner = class {
1947
2067
  (item) => matcher(item.evaluator.getName() ?? "")
1948
2068
  );
1949
2069
  }
2070
+ async collectRunConfigs() {
2071
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
2072
+ this.runConfigsById.clear();
2073
+ const byNameLower = /* @__PURE__ */ new Map();
2074
+ for (const item of runConfigs) {
2075
+ const id = item.runConfig.getName();
2076
+ const lower = id.toLowerCase();
2077
+ const prev = byNameLower.get(lower);
2078
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
2079
+ throw new Error(
2080
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
2081
+ );
2082
+ }
2083
+ byNameLower.set(lower, item);
2084
+ this.runConfigsById.set(id, item);
2085
+ }
2086
+ return runConfigs;
2087
+ }
2088
+ async resolveRunConfigByName(name) {
2089
+ if (this.runConfigsById.size === 0) {
2090
+ await this.collectRunConfigs();
2091
+ }
2092
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
2093
+ const keyLower = key.toLowerCase();
2094
+ const matches = Array.from(this.runConfigsById.values()).filter(
2095
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
2096
+ );
2097
+ if (matches.length === 0) {
2098
+ return void 0;
2099
+ }
2100
+ if (matches.length > 1) {
2101
+ throw new Error(
2102
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
2103
+ );
2104
+ }
2105
+ return matches[0];
2106
+ }
2107
+ async expandRunConfigToJobs(collected) {
2108
+ if (this.datasetsById.size === 0) {
2109
+ await this.collectDatasets();
2110
+ }
2111
+ if (this.evaluatorsById.size === 0) {
2112
+ await this.collectEvaluators();
2113
+ }
2114
+ const rcName = collected.runConfig.getName();
2115
+ const jobs = [];
2116
+ const runs = collected.runConfig.getRuns();
2117
+ for (const [i, row] of runs.entries()) {
2118
+ const dsCollected = Array.from(this.datasetsById.values()).find(
2119
+ (d) => d.dataset === row.dataset
2120
+ );
2121
+ if (!dsCollected) {
2122
+ throw new Error(
2123
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getDisplayLabel()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2124
+ );
2125
+ }
2126
+ let evaluatorIds;
2127
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
2128
+ const matcher = createNameMatcher(row.evaluatorPattern);
2129
+ const matched = Array.from(this.evaluatorsById.values()).filter(
2130
+ (item) => matcher(item.evaluator.getName() ?? "")
2131
+ );
2132
+ if (matched.length === 0) {
2133
+ throw new Error(
2134
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
2135
+ );
2136
+ }
2137
+ evaluatorIds = matched.map((item) => item.id);
2138
+ } else {
2139
+ const evaluators = row.evaluators;
2140
+ evaluatorIds = [];
2141
+ for (const ev of evaluators) {
2142
+ const found = Array.from(this.evaluatorsById.values()).find(
2143
+ (item) => item.evaluator === ev
2144
+ );
2145
+ if (!found) {
2146
+ throw new Error(
2147
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
2148
+ );
2149
+ }
2150
+ evaluatorIds.push(found.id);
2151
+ }
2152
+ }
2153
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
2154
+ jobs.push({
2155
+ datasetId: dsCollected.id,
2156
+ evaluatorIds,
2157
+ runConfigName: rcName,
2158
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
2159
+ runConfigTags: collected.runConfig.getTags(),
2160
+ repetitions
2161
+ });
2162
+ }
2163
+ return jobs;
2164
+ }
2165
+ async expandRunConfigNamesToJobs(names) {
2166
+ const jobs = [];
2167
+ for (const name of names) {
2168
+ const collected = await this.resolveRunConfigByName(name);
2169
+ if (!collected) {
2170
+ const known = await this.collectRunConfigs();
2171
+ const available = known.map((r) => r.runConfig.getName()).sort();
2172
+ throw new Error(
2173
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
2174
+ );
2175
+ }
2176
+ jobs.push(...await this.expandRunConfigToJobs(collected));
2177
+ }
2178
+ return jobs;
2179
+ }
2180
+ async runDatasetJobsWithSharedConcurrency(request) {
2181
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
2182
+ const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
2183
+ const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
2184
+ const snapshots = [];
2185
+ for (const job of request.jobs) {
2186
+ snapshots.push(
2187
+ await this.startDatasetRun({
2188
+ datasetId: job.datasetId,
2189
+ evaluatorIds: job.evaluatorIds,
2190
+ triggerId,
2191
+ maxConcurrency: this.config.maxConcurrency ?? 1,
2192
+ globalEvaluationSemaphore: sem,
2193
+ runConfigName: job.runConfigName,
2194
+ runConfigTags: job.runConfigTags,
2195
+ repetitions: job.repetitions
2196
+ })
2197
+ );
2198
+ }
2199
+ return snapshots;
2200
+ }
1950
2201
  async searchTestCases(query) {
1951
2202
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1952
2203
  return searchCollectedTestCases(testCases, query);
@@ -1965,36 +2216,46 @@ var EffectRunner = class {
1965
2216
  );
1966
2217
  }
1967
2218
  async runDatasetWith(request) {
2219
+ const runConfigName = validateRunConfigName(
2220
+ request.runConfigName,
2221
+ "runDatasetWith.runConfigName"
2222
+ );
2223
+ return this.startDatasetRun({
2224
+ datasetId: request.datasetId,
2225
+ evaluatorIds: request.evaluatorIds,
2226
+ triggerId: request.triggerId,
2227
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2228
+ repetitions: request.repetitions,
2229
+ runConfigName,
2230
+ runConfigTags: request.runConfigTags
2231
+ });
2232
+ }
2233
+ async startDatasetRun(params) {
1968
2234
  if (this.datasetsById.size === 0) {
1969
2235
  await this.collectDatasets();
1970
2236
  }
1971
2237
  if (this.evaluatorsById.size === 0) {
1972
2238
  await this.collectEvaluators();
1973
2239
  }
1974
- const dataset = this.datasetsById.get(request.datasetId);
2240
+ const dataset = this.datasetsById.get(params.datasetId);
1975
2241
  if (!dataset) {
1976
- throw new Error(`Unknown dataset: ${request.datasetId}`);
2242
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
1977
2243
  }
1978
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2244
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1979
2245
  if (selectedEvaluators.length === 0) {
1980
2246
  throw new Error("No evaluators selected for run");
1981
2247
  }
1982
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1983
- const totalEvaluations = selectedTestCases.reduce(
1984
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1985
- 0
1986
- );
1987
- const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
2248
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
2249
+ const repetitions = normalizeRunRepetitions(params.repetitions);
2250
+ const totalEvaluations = selectedTestCases.length * repetitions;
2251
+ const runConfigTags = [...params.runConfigTags ?? []];
2252
+ const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
1988
2253
  const runId = `run-${randomUUID()}`;
1989
- const artifactPath = createArtifactPath(
1990
- this.config.artifactDirectory,
1991
- request.datasetId,
1992
- runId
1993
- );
2254
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
1994
2255
  const snapshot = {
1995
2256
  runId,
1996
- datasetId: request.datasetId,
1997
- datasetName: dataset.dataset.getName(),
2257
+ datasetId: params.datasetId,
2258
+ datasetName: dataset.dataset.getDisplayLabel(),
1998
2259
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1999
2260
  queuedAt: Date.now(),
2000
2261
  totalTestCases: totalEvaluations,
@@ -2014,8 +2275,8 @@ var EffectRunner = class {
2014
2275
  const queuedEvent = {
2015
2276
  type: "RunQueued",
2016
2277
  runId,
2017
- datasetId: request.datasetId,
2018
- datasetName: dataset.dataset.getName(),
2278
+ datasetId: params.datasetId,
2279
+ datasetName: dataset.dataset.getDisplayLabel(),
2019
2280
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2020
2281
  totalTestCases: totalEvaluations,
2021
2282
  artifactPath
@@ -2028,17 +2289,20 @@ var EffectRunner = class {
2028
2289
  payload: queuedEvent
2029
2290
  })
2030
2291
  );
2031
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
2032
2292
  await Effect.runPromise(
2033
2293
  Queue.offer(this.runQueue, {
2034
2294
  runId,
2035
2295
  triggerId,
2036
- datasetId: request.datasetId,
2296
+ datasetId: params.datasetId,
2037
2297
  dataset: dataset.dataset,
2038
2298
  evaluators: selectedEvaluators,
2039
2299
  testCases: selectedTestCases,
2040
2300
  snapshot,
2041
- maxConcurrency
2301
+ maxConcurrency: params.maxConcurrency,
2302
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2303
+ runConfigName: params.runConfigName,
2304
+ runConfigTags,
2305
+ repetitions
2042
2306
  })
2043
2307
  );
2044
2308
  return snapshot;
@@ -2109,6 +2373,11 @@ var EffectRunner = class {
2109
2373
  );
2110
2374
  }
2111
2375
  };
2376
+
2377
+ // src/runner/events.ts
2378
/**
 * Default run-config fields for runs started directly through
 * `runDatasetWith`. The interactive CLI spreads this object into its request
 * so that the required `runConfigName` is present.
 */
var PROGRAMMATIC_RUN_CONFIG = { runConfigName: "programmatic" };
2112
2381
// Layout and history limits. NOTE(review): the code that reads these is
// outside the reviewed section; the descriptions below are inferred from the
// names only - confirm against the usage sites.

// Presumably the left pane's width in terminal columns.
var LEFT_PANE_WIDTH2 = 44;

// Presumably the maximum number of runs plotted in the chart.
var MAX_RUNS_FOR_CHART = 12;

// Presumably the maximum number of runs considered for the trend view.
var MAX_RUNS_FOR_TREND = 20;
@@ -2456,7 +2725,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2456
2725
  rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp6"));
2457
2726
  rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
2458
2727
  for (const tc of testCases) {
2459
- const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
2728
+ const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
2460
2729
  rows.push(
2461
2730
  /* @__PURE__ */ jsxs(Text, { children: [
2462
2731
  /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
@@ -2468,13 +2737,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2468
2737
  ] }),
2469
2738
  " ",
2470
2739
  tc.testCaseName,
2471
- rerunPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: rerunPart }) : null,
2740
+ repetitionPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: repetitionPart }) : null,
2472
2741
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2473
2742
  " (",
2474
2743
  tc.durationMs,
2475
2744
  "ms)"
2476
2745
  ] })
2477
- ] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
2746
+ ] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
2478
2747
  );
2479
2748
  for (const item of tc.evaluatorScores) {
2480
2749
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
@@ -2800,7 +3069,8 @@ function EvalsCliApp({ data, args, runner }) {
2800
3069
  }
2801
3070
  void runner.runDatasetWith({
2802
3071
  datasetId: selectedDataset.id,
2803
- evaluatorIds: clampedState.selectedEvaluatorIds
3072
+ evaluatorIds: clampedState.selectedEvaluatorIds,
3073
+ ...PROGRAMMATIC_RUN_CONFIG
2804
3074
  }).then((snapshot) => {
2805
3075
  setRuntimeMessage(
2806
3076
  `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`