@m4trix/evals 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -2,19 +2,19 @@
2
2
  'use strict';
3
3
 
4
4
  var fullscreenInk = require('fullscreen-ink');
5
- var React2 = require('react');
5
+ var React = require('react');
6
6
  var ink = require('ink');
7
7
  var jsxRuntime = require('react/jsx-runtime');
8
- var path = require('path');
9
- var inkChart = require('@pppp606/ink-chart');
10
- var crypto = require('crypto');
11
8
  var effect = require('effect');
9
+ var crypto = require('crypto');
10
+ var promises = require('fs/promises');
11
+ var path = require('path');
12
12
  var fs = require('fs');
13
13
  var jitiModule = require('jiti');
14
- var promises = require('fs/promises');
15
14
  var url = require('url');
16
15
  var diff = require('diff');
17
16
  var stringify = require('fast-json-stable-stringify');
17
+ var inkChart = require('@pppp606/ink-chart');
18
18
 
19
19
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
20
20
  function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
@@ -37,7 +37,7 @@ function _interopNamespace(e) {
37
37
  return Object.freeze(n);
38
38
  }
39
39
 
40
- var React2__default = /*#__PURE__*/_interopDefault(React2);
40
+ var React__default = /*#__PURE__*/_interopDefault(React);
41
41
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
42
42
  var stringify__default = /*#__PURE__*/_interopDefault(stringify);
43
43
 
@@ -117,11 +117,7 @@ function getFooterText(state) {
117
117
  }
118
118
  return "\u2191\u2193 move Enter add/remove S start run / search Esc cancel q quit";
119
119
  }
120
- function ListItem({
121
- selected,
122
- label,
123
- itemKey
124
- }) {
120
+ function ListItem({ selected, label, itemKey }) {
125
121
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
126
122
  selected ? "\u25B8 " : " ",
127
123
  label
@@ -148,9 +144,7 @@ function Pane({
148
144
  }
149
145
  );
150
146
  }
151
- function SectionHeader({
152
- children
153
- }) {
147
+ function SectionHeader({ children }) {
154
148
  return /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children });
155
149
  }
156
150
  function StatusText({ status }) {
@@ -162,10 +156,7 @@ function StatusText({ status }) {
162
156
  ] });
163
157
  }
164
158
  var LEFT_PANE_WIDTH = 44;
165
- function RunsSidebar({
166
- state,
167
- runs
168
- }) {
159
+ function RunsSidebar({ state, runs }) {
169
160
  const focused = state.focus === "left";
170
161
  return /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH, focused, children: [
171
162
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Runs" }),
@@ -194,11 +185,7 @@ function RunsSidebar({
194
185
  ] });
195
186
  }
196
187
  var BLOCKS = ["\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"];
197
- function Sparkline({
198
- data,
199
- width,
200
- label
201
- }) {
188
+ function Sparkline({ data, width, label }) {
202
189
  if (data.length === 0)
203
190
  return null;
204
191
  const max = Math.max(...data);
@@ -277,6 +264,50 @@ function isPrintableCharacter(input) {
277
264
  function isBackKey(key) {
278
265
  return key.backspace || key.delete;
279
266
  }
267
+ var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
268
+ function makeEntityIdSchema(brand, label) {
269
+ return effect.Schema.String.pipe(
270
+ effect.Schema.trimmed(),
271
+ effect.Schema.minLength(1, {
272
+ message: () => `${label} must be non-empty.`
273
+ }),
274
+ effect.Schema.pattern(ENTITY_ID_PATTERN, {
275
+ message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
276
+ }),
277
+ effect.Schema.brand(brand)
278
+ );
279
+ }
280
+ var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
281
+ makeEntityIdSchema("EvaluatorName", "Evaluator name");
282
+ makeEntityIdSchema("TestCaseName", "Test case name");
283
+ function validateWithSchema(schema, raw, context) {
284
+ const trimmed = raw.trim();
285
+ const decode = effect.Schema.decodeUnknownEither(
286
+ schema
287
+ );
288
+ const result = decode(trimmed);
289
+ if (effect.Either.isLeft(result)) {
290
+ throw new Error(`${context}: ${effect.ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
291
+ }
292
+ return result.right;
293
+ }
294
+ function validateRunConfigName(raw, context) {
295
+ return validateWithSchema(RunConfigNameSchema, raw, context);
296
+ }
297
+
298
+ // src/evals/evaluator.ts
299
+ function getEvaluatorDisplayLabel(evaluator) {
300
+ if (typeof evaluator.getDisplayLabel === "function") {
301
+ const label = evaluator.getDisplayLabel();
302
+ if (label !== void 0) {
303
+ return label;
304
+ }
305
+ }
306
+ return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
307
+ }
308
+ function getEvaluatorTagList(evaluator) {
309
+ return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
310
+ }
280
311
 
281
312
  // src/cli/data.mock.json
282
313
  var data_mock_default = {
@@ -428,9 +459,7 @@ var data_mock_default = {
428
459
  { name: "contract_match", score: 100 },
429
460
  { name: "arg_validity", score: 100 }
430
461
  ],
431
- checks: [
432
- { name: "tool_calls", passed: true, detail: "0 unexpected" }
433
- ],
462
+ checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
434
463
  failures: [],
435
464
  meta: {
436
465
  model: "gpt-4o-mini",
@@ -453,9 +482,21 @@ var data_mock_default = {
453
482
  }
454
483
  ],
455
484
  evaluators: [
456
- { id: "json-schema-validator", name: "JSON Schema Validator", configPreview: "strict=true" },
457
- { id: "tool-call-contract-checker", name: "Tool-call Contract Checker", configPreview: "unexpectedCalls=error" },
458
- { id: "rubric-judge", name: "Rubric Judge (LLM)", configPreview: "model=gpt-4o-mini; scale=0-100" },
485
+ {
486
+ id: "json-schema-validator",
487
+ name: "JSON Schema Validator",
488
+ configPreview: "strict=true"
489
+ },
490
+ {
491
+ id: "tool-call-contract-checker",
492
+ name: "Tool-call Contract Checker",
493
+ configPreview: "unexpectedCalls=error"
494
+ },
495
+ {
496
+ id: "rubric-judge",
497
+ name: "Rubric Judge (LLM)",
498
+ configPreview: "model=gpt-4o-mini; scale=0-100"
499
+ },
459
500
  { id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
460
501
  ]
461
502
  };
@@ -522,7 +563,7 @@ function toEvalDataset(item, snapshots) {
522
563
  function toEvaluatorOption(item) {
523
564
  return {
524
565
  id: item.id,
525
- name: item.evaluator.getName() ?? toSlug(item.id),
566
+ name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
526
567
  configPreview: `Source: ${item.filePath}`
527
568
  };
528
569
  }
@@ -535,9 +576,7 @@ async function loadRunnerData(runner) {
535
576
  const memSnapshots = runner.getAllRunSnapshots();
536
577
  const seen = new Set(memSnapshots.map((s) => s.runId));
537
578
  const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
538
- const snapshots = [...memSnapshots, ...fromDisk].sort(
539
- (a, b) => b.queuedAt - a.queuedAt
540
- );
579
+ const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
541
580
  if (datasets.length === 0 && evaluators.length === 0) {
542
581
  return loadMockData();
543
582
  }
@@ -659,7 +698,11 @@ function reduceCliState(state, action) {
659
698
  return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
660
699
  }
661
700
  if (state.level === "datasets") {
662
- return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1), overviewScrollOffset: 0 };
701
+ return {
702
+ ...state,
703
+ datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1),
704
+ overviewScrollOffset: 0
705
+ };
663
706
  }
664
707
  if (state.level === "runs") {
665
708
  return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
@@ -677,10 +720,17 @@ function reduceCliState(state, action) {
677
720
  return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
678
721
  }
679
722
  if (state.level === "datasets" && state.focus === "right") {
680
- return { ...state, overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1) };
723
+ return {
724
+ ...state,
725
+ overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1)
726
+ };
681
727
  }
682
728
  if (state.level === "datasets") {
683
- return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1), overviewScrollOffset: 0 };
729
+ return {
730
+ ...state,
731
+ datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1),
732
+ overviewScrollOffset: 0
733
+ };
684
734
  }
685
735
  if (state.level === "runs") {
686
736
  return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
@@ -756,24 +806,168 @@ function reduceCliState(state, action) {
756
806
  }
757
807
  return state;
758
808
  }
809
+ async function loadRunSnapshotsFromArtifacts(config) {
810
+ const baseDir = path.resolve(config.artifactDirectory);
811
+ let entries;
812
+ try {
813
+ entries = await promises.readdir(baseDir);
814
+ } catch {
815
+ return [];
816
+ }
817
+ const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
818
+ const snapshots = [];
819
+ for (const fileName of jsonlFiles) {
820
+ const filePath = path.join(baseDir, fileName);
821
+ try {
822
+ const snapshot = await parseArtifactToSnapshot(filePath, config);
823
+ if (snapshot) {
824
+ snapshots.push(snapshot);
825
+ }
826
+ } catch {
827
+ }
828
+ }
829
+ return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
830
+ }
831
+ async function parseArtifactToSnapshot(filePath, _config) {
832
+ const content = await promises.readFile(filePath, "utf8");
833
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
834
+ if (lines.length === 0) {
835
+ return null;
836
+ }
837
+ let runQueued = null;
838
+ let runCompleted = null;
839
+ let runFailed = null;
840
+ let runStarted = null;
841
+ for (const line of lines) {
842
+ try {
843
+ const event = JSON.parse(line);
844
+ const type = event.type;
845
+ if (type === "RunQueued") {
846
+ runQueued = {
847
+ runId: event.runId,
848
+ datasetId: event.datasetId,
849
+ datasetName: event.datasetName,
850
+ evaluatorIds: event.evaluatorIds,
851
+ totalTestCases: event.totalTestCases ?? 0,
852
+ artifactPath: event.artifactPath ?? filePath,
853
+ ts: event.ts
854
+ };
855
+ }
856
+ if (type === "RunStarted") {
857
+ runStarted = { startedAt: event.startedAt };
858
+ }
859
+ if (type === "RunCompleted") {
860
+ runCompleted = {
861
+ passedTestCases: event.passedTestCases,
862
+ failedTestCases: event.failedTestCases,
863
+ totalTestCases: event.totalTestCases,
864
+ finishedAt: event.finishedAt
865
+ };
866
+ }
867
+ if (type === "RunFailed") {
868
+ runFailed = {
869
+ finishedAt: event.finishedAt,
870
+ errorMessage: event.errorMessage
871
+ };
872
+ }
873
+ } catch {
874
+ }
875
+ }
876
+ if (!runQueued) {
877
+ return null;
878
+ }
879
+ const artifactPath = filePath;
880
+ const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
881
+ const progress = aggregateTestCaseProgress(lines);
882
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
883
+ const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
884
+ const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
885
+ return {
886
+ runId: runQueued.runId,
887
+ datasetId: runQueued.datasetId,
888
+ datasetName: runQueued.datasetName,
889
+ evaluatorIds: runQueued.evaluatorIds,
890
+ queuedAt: runQueued.ts ?? 0,
891
+ startedAt: runStarted?.startedAt,
892
+ finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
893
+ totalTestCases: runQueued.totalTestCases,
894
+ completedTestCases,
895
+ passedTestCases,
896
+ failedTestCases,
897
+ status,
898
+ artifactPath,
899
+ errorMessage: runFailed?.errorMessage
900
+ };
901
+ }
902
+ function aggregateTestCaseProgress(lines) {
903
+ let completedTestCases = 0;
904
+ const testCasePassedBy = /* @__PURE__ */ new Map();
905
+ for (const line of lines) {
906
+ try {
907
+ const event = JSON.parse(line);
908
+ if (event.type === "TestCaseProgress") {
909
+ const ev = event;
910
+ completedTestCases = ev.completedTestCases ?? completedTestCases;
911
+ const id = ev.testCaseId;
912
+ const current = testCasePassedBy.get(id);
913
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
914
+ }
915
+ } catch {
916
+ }
917
+ }
918
+ let passedTestCases = 0;
919
+ let failedTestCases = 0;
920
+ for (const passed of testCasePassedBy.values()) {
921
+ if (passed) {
922
+ passedTestCases += 1;
923
+ } else {
924
+ failedTestCases += 1;
925
+ }
926
+ }
927
+ return { completedTestCases, passedTestCases, failedTestCases };
928
+ }
929
+ async function parseArtifactFile(artifactPath) {
930
+ try {
931
+ const content = await promises.readFile(artifactPath, "utf8");
932
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
933
+ const results = [];
934
+ for (const line of lines) {
935
+ try {
936
+ const event = JSON.parse(line);
937
+ if (event.type === "TestCaseProgress") {
938
+ const ev = event;
939
+ const repetitionIndex = ev.repetitionIndex ?? ev.rerunIndex;
940
+ const repetitionCount = ev.repetitionCount ?? ev.rerunTotal;
941
+ results.push({
942
+ testCaseId: ev.testCaseId,
943
+ testCaseName: ev.testCaseName,
944
+ completedTestCases: ev.completedTestCases,
945
+ totalTestCases: ev.totalTestCases,
946
+ repetitionId: ev.repetitionId,
947
+ repetitionIndex,
948
+ repetitionCount,
949
+ passed: ev.passed,
950
+ durationMs: ev.durationMs,
951
+ evaluatorScores: ev.evaluatorScores ?? []
952
+ });
953
+ }
954
+ } catch {
955
+ }
956
+ }
957
+ return results;
958
+ } catch {
959
+ return [];
960
+ }
961
+ }
759
962
 
760
963
  // src/runner/config.ts
761
964
  var defaultRunnerConfig = {
762
965
  discovery: {
763
966
  rootDir: process.cwd(),
764
967
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
765
- evaluatorSuffixes: [
766
- ".evaluator.ts",
767
- ".evaluator.tsx",
768
- ".evaluator.js",
769
- ".evaluator.mjs"
770
- ],
771
- testCaseSuffixes: [
772
- ".test-case.ts",
773
- ".test-case.tsx",
774
- ".test-case.js",
775
- ".test-case.mjs"
776
- ],
968
+ evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
969
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
970
+ testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
777
971
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
778
972
  },
779
973
  artifactDirectory: ".eval-results",
@@ -798,6 +992,11 @@ function toRunnerConfigOverrides(config) {
798
992
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
799
993
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
800
994
  }
995
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
996
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
997
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
998
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
999
+ }
801
1000
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
802
1001
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
803
1002
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -840,14 +1039,15 @@ function getJitiLoader() {
840
1039
  }
841
1040
  const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
842
1041
  if (typeof createJiti2 !== "function") {
843
- throw new Error(
844
- "Failed to initialize jiti for m4trix eval config loading."
845
- );
1042
+ throw new Error("Failed to initialize jiti for m4trix eval config loading.");
846
1043
  }
847
- cachedLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
848
- interopDefault: true,
849
- moduleCache: true
850
- });
1044
+ cachedLoader = createJiti2(
1045
+ (typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
1046
+ {
1047
+ interopDefault: true,
1048
+ moduleCache: true
1049
+ }
1050
+ );
851
1051
  return cachedLoader;
852
1052
  }
853
1053
  function resolveConfigModuleExport(loadedModule) {
@@ -895,6 +1095,9 @@ function isDatasetLike(value) {
895
1095
  function isEvaluatorLike(value) {
896
1096
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
897
1097
  }
1098
+ function isRunConfigLike(value) {
1099
+ return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
1100
+ }
898
1101
  function isTestCaseLike(value) {
899
1102
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
900
1103
  }
@@ -951,9 +1154,7 @@ async function loadModuleExports(filePath) {
951
1154
  }
952
1155
  async function collectDatasetsFromFiles(config) {
953
1156
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
954
- const matched = files.filter(
955
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
956
- );
1157
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
957
1158
  const found = await Promise.all(
958
1159
  matched.map(async (absolutePath) => {
959
1160
  const exports = await loadModuleExports(absolutePath);
@@ -970,9 +1171,7 @@ async function collectDatasetsFromFiles(config) {
970
1171
  }
971
1172
  async function collectEvaluatorsFromFiles(config) {
972
1173
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
973
- const matched = files.filter(
974
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
975
- );
1174
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
976
1175
  const found = await Promise.all(
977
1176
  matched.map(async (absolutePath) => {
978
1177
  const exports = await loadModuleExports(absolutePath);
@@ -987,11 +1186,26 @@ async function collectEvaluatorsFromFiles(config) {
987
1186
  );
988
1187
  return found.flat();
989
1188
  }
990
- async function collectTestCasesFromFiles(config) {
1189
+ async function collectRunConfigsFromFiles(config) {
991
1190
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
992
- const matched = files.filter(
993
- (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
1191
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
1192
+ const found = await Promise.all(
1193
+ matched.map(async (absolutePath) => {
1194
+ const exports = await loadModuleExports(absolutePath);
1195
+ const runConfigs = exports.filter(isRunConfigLike);
1196
+ const relPath = path.relative(config.rootDir, absolutePath);
1197
+ return runConfigs.map((runConfig) => ({
1198
+ id: runConfig.getName(),
1199
+ filePath: relPath,
1200
+ runConfig
1201
+ }));
1202
+ })
994
1203
  );
1204
+ return found.flat();
1205
+ }
1206
+ async function collectTestCasesFromFiles(config) {
1207
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1208
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
995
1209
  const found = await Promise.all(
996
1210
  matched.map(async (absolutePath) => {
997
1211
  const exports = await loadModuleExports(absolutePath);
@@ -1063,16 +1277,8 @@ function createDiffString(expected, actual, diffOptions) {
1063
1277
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
1064
1278
  const actualProcessed = preprocessForDiff(actual, diffOptions);
1065
1279
  if (diffOptions?.keysOnly) {
1066
- const expectedKeys = JSON.stringify(
1067
- extractKeys(expectedProcessed),
1068
- null,
1069
- 2
1070
- );
1071
- const actualKeys = JSON.stringify(
1072
- extractKeys(actualProcessed),
1073
- null,
1074
- 2
1075
- );
1280
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
1281
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
1076
1282
  const parts2 = diff.diffLines(expectedKeys, actualKeys);
1077
1283
  return formatDiffParts(parts2);
1078
1284
  }
@@ -1083,9 +1289,7 @@ function createDiffString(expected, actual, diffOptions) {
1083
1289
  }
1084
1290
  const parts = diff.diffLines(expectedStr, actualStr);
1085
1291
  if (diffOptions?.outputNewOnly) {
1086
- const filtered = parts.filter(
1087
- (p) => p.added === true
1088
- );
1292
+ const filtered = parts.filter((p) => p.added === true);
1089
1293
  return formatDiffParts(filtered);
1090
1294
  }
1091
1295
  return formatDiffParts(parts);
@@ -1152,6 +1356,17 @@ function getDiffLines(entry) {
1152
1356
  });
1153
1357
  }
1154
1358
 
1359
+ // src/evals/test-case.ts
1360
+ function getTestCaseDisplayLabel(testCase) {
1361
+ if (typeof testCase.getDisplayLabel === "function") {
1362
+ return testCase.getDisplayLabel();
1363
+ }
1364
+ return typeof testCase.getName === "function" ? testCase.getName() : "";
1365
+ }
1366
+ function getTestCaseTagList(testCase) {
1367
+ return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
1368
+ }
1369
+
1155
1370
  // src/evals/metric.ts
1156
1371
  var registry = /* @__PURE__ */ new Map();
1157
1372
  var Metric = {
@@ -1175,25 +1390,70 @@ function getMetricById(id) {
1175
1390
  return registry.get(id);
1176
1391
  }
1177
1392
 
1178
- // src/evals/score.ts
1179
- var registry2 = /* @__PURE__ */ new Map();
1180
- function formatScoreData(def, data, options) {
1181
- return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
1393
+ // src/evals/aggregators.ts
1394
+ function aggregateTokenCountSum(values) {
1395
+ const initial = {
1396
+ input: 0,
1397
+ output: 0,
1398
+ inputCached: 0,
1399
+ outputCached: 0
1400
+ };
1401
+ return values.reduce(
1402
+ (acc, v) => ({
1403
+ input: acc.input + (v.input ?? 0),
1404
+ output: acc.output + (v.output ?? 0),
1405
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
1406
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
1407
+ }),
1408
+ initial
1409
+ );
1182
1410
  }
1183
- var ScoreAggregate = {
1184
- /** Average numeric fields. Use for scores like { value, delta }. */
1185
- averageFields(fields) {
1186
- return (values) => {
1187
- const count = values.length || 1;
1188
- const result = {};
1189
- for (const field of fields) {
1190
- result[field] = values.reduce(
1191
- (s, v) => s + (v[field] ?? 0),
1192
- 0
1193
- ) / count;
1194
- }
1195
- return result;
1196
- };
1411
+ function aggregateLatencyAverage(values) {
1412
+ if (values.length === 0) {
1413
+ return { ms: 0 };
1414
+ }
1415
+ const sum = values.reduce((s, v) => s + v.ms, 0);
1416
+ return { ms: sum / values.length };
1417
+ }
1418
+
1419
+ // src/evals/metrics/standard.ts
1420
+ Metric.of({
1421
+ id: "token-count",
1422
+ name: "Tokens",
1423
+ aggregate: aggregateTokenCountSum,
1424
+ format: (data, options) => {
1425
+ const input = data.input ?? 0;
1426
+ const output = data.output ?? 0;
1427
+ const inputCached = data.inputCached ?? 0;
1428
+ const outputCached = data.outputCached ?? 0;
1429
+ const cached = inputCached + outputCached;
1430
+ const base = `in:${input} out:${output} cached:${cached}`;
1431
+ return options?.isAggregated ? `Total: ${base}` : base;
1432
+ }
1433
+ });
1434
+ Metric.of({
1435
+ id: "latency",
1436
+ name: "Latency",
1437
+ aggregate: aggregateLatencyAverage,
1438
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1439
+ });
1440
+
1441
+ // src/evals/score.ts
1442
+ var registry2 = /* @__PURE__ */ new Map();
1443
+ function formatScoreData(def, data, options) {
1444
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
1445
+ }
1446
+ var ScoreAggregate = {
1447
+ /** Average numeric fields. Use for scores like { value, delta }. */
1448
+ averageFields(fields) {
1449
+ return (values) => {
1450
+ const count = values.length || 1;
1451
+ const result = {};
1452
+ for (const field of fields) {
1453
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
1454
+ }
1455
+ return result;
1456
+ };
1197
1457
  },
1198
1458
  /** Average selected numeric fields, with sample std dev tracked for `value`. */
1199
1459
  averageWithVariance(fields) {
@@ -1224,13 +1484,10 @@ var ScoreAggregate = {
1224
1484
  (s, v) => s + (v[valueField] ?? 0),
1225
1485
  0
1226
1486
  );
1227
- const sumSq = values.reduce(
1228
- (s, v) => {
1229
- const value = v[valueField] ?? 0;
1230
- return s + value * value;
1231
- },
1232
- 0
1233
- );
1487
+ const sumSq = values.reduce((s, v) => {
1488
+ const value = v[valueField] ?? 0;
1489
+ return s + value * value;
1490
+ }, 0);
1234
1491
  const mean = sum / count;
1235
1492
  const variance = (sumSq - count * mean * mean) / (count - 1);
1236
1493
  stdDev = variance > 0 ? Math.sqrt(variance) : 0;
@@ -1289,54 +1546,6 @@ function getScoreById(id) {
1289
1546
  return registry2.get(id);
1290
1547
  }
1291
1548
 
1292
- // src/evals/aggregators.ts
1293
- function aggregateTokenCountSum(values) {
1294
- const initial = {
1295
- input: 0,
1296
- output: 0,
1297
- inputCached: 0,
1298
- outputCached: 0
1299
- };
1300
- return values.reduce(
1301
- (acc, v) => ({
1302
- input: acc.input + (v.input ?? 0),
1303
- output: acc.output + (v.output ?? 0),
1304
- inputCached: acc.inputCached + (v.inputCached ?? 0),
1305
- outputCached: acc.outputCached + (v.outputCached ?? 0)
1306
- }),
1307
- initial
1308
- );
1309
- }
1310
- function aggregateLatencyAverage(values) {
1311
- if (values.length === 0) {
1312
- return { ms: 0 };
1313
- }
1314
- const sum = values.reduce((s, v) => s + v.ms, 0);
1315
- return { ms: sum / values.length };
1316
- }
1317
-
1318
- // src/evals/metrics/standard.ts
1319
- Metric.of({
1320
- id: "token-count",
1321
- name: "Tokens",
1322
- aggregate: aggregateTokenCountSum,
1323
- format: (data, options) => {
1324
- const input = data.input ?? 0;
1325
- const output = data.output ?? 0;
1326
- const inputCached = data.inputCached ?? 0;
1327
- const outputCached = data.outputCached ?? 0;
1328
- const cached = inputCached + outputCached;
1329
- const base = `in:${input} out:${output} cached:${cached}`;
1330
- return options?.isAggregated ? `Total: ${base}` : base;
1331
- }
1332
- });
1333
- Metric.of({
1334
- id: "latency",
1335
- name: "Latency",
1336
- aggregate: aggregateLatencyAverage,
1337
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1338
- });
1339
-
1340
1549
  // src/evals/scores/standard.ts
1341
1550
  Score.of({
1342
1551
  id: "percent",
@@ -1443,15 +1652,17 @@ function readOutput(testCase) {
1443
1652
  }
1444
1653
  return candidate.getOutput();
1445
1654
  }
1446
- function buildEvaluationUnits(testCases) {
1655
+ function buildEvaluationUnits(testCases, repetitionCount) {
1656
+ const count = Math.max(1, repetitionCount);
1447
1657
  const units = [];
1448
1658
  for (const testCaseItem of testCases) {
1449
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1450
- for (let r = 0; r < rerunTotal; r++) {
1659
+ const repetitionId = `rep-${crypto.randomUUID()}`;
1660
+ for (let r = 0; r < count; r++) {
1451
1661
  units.push({
1452
1662
  testCaseItem,
1453
- rerunIndex: r + 1,
1454
- rerunTotal
1663
+ repetitionId,
1664
+ repetitionIndex: r + 1,
1665
+ repetitionCount: count
1455
1666
  });
1456
1667
  }
1457
1668
  }
@@ -1461,29 +1672,24 @@ function nowIsoForFile() {
1461
1672
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1462
1673
  }
1463
1674
  function createArtifactPath(artifactDirectory, datasetId, runId) {
1464
- return path.join(
1465
- artifactDirectory,
1466
- `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1467
- );
1675
+ return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1468
1676
  }
1469
1677
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1470
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
1678
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
1471
1679
  return effect.Effect.gen(function* () {
1472
1680
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
1473
1681
  const started = Date.now();
1474
- const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
1475
- n + 1,
1476
- n + 1
1477
- ]);
1682
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
1478
1683
  yield* publishEvent({
1479
1684
  type: "TestCaseStarted",
1480
1685
  runId: task.runId,
1481
1686
  testCaseId: testCaseItem.id,
1482
- testCaseName: testCaseItem.testCase.getName(),
1687
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1483
1688
  startedTestCases: startedEvaluations,
1484
1689
  totalTestCases: totalEvaluations,
1485
- rerunIndex,
1486
- rerunTotal
1690
+ repetitionId,
1691
+ repetitionIndex,
1692
+ repetitionCount
1487
1693
  });
1488
1694
  const evaluatorScores = [];
1489
1695
  let testCaseError;
@@ -1507,9 +1713,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1507
1713
  return error;
1508
1714
  };
1509
1715
  try {
1510
- const ctx = yield* effect.Effect.promise(
1511
- () => Promise.resolve(evaluator.resolveContext())
1512
- );
1716
+ const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
1513
1717
  const result = yield* effect.Effect.promise(
1514
1718
  () => Promise.resolve().then(
1515
1719
  () => evaluateFn({
@@ -1519,8 +1723,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1519
1723
  meta: {
1520
1724
  triggerId: task.triggerId,
1521
1725
  runId: evaluatorRunId,
1522
- datasetId: task.datasetId
1726
+ datasetId: task.datasetId,
1727
+ repetitionId,
1728
+ repetitionIndex,
1729
+ repetitionCount,
1730
+ runConfigName: task.runConfigName
1523
1731
  },
1732
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1733
+ runConfigTags: task.runConfigTags,
1734
+ evaluatorTags: getEvaluatorTagList(evaluator),
1524
1735
  logDiff,
1525
1736
  log,
1526
1737
  createError
@@ -1563,21 +1774,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1563
1774
  });
1564
1775
  }
1565
1776
  }
1566
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1567
- const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
1568
- n + 1,
1569
- n + 1
1570
- ]);
1777
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1778
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1571
1779
  const progressEvent = {
1572
1780
  type: "TestCaseProgress",
1573
1781
  runId: task.runId,
1574
1782
  testCaseId: testCaseItem.id,
1575
- testCaseName: testCaseItem.testCase.getName(),
1783
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1576
1784
  completedTestCases: completedEvaluations,
1577
1785
  totalTestCases: totalEvaluations,
1578
- rerunIndex,
1579
- rerunTotal,
1580
- passed: rerunPassedThis,
1786
+ repetitionId,
1787
+ repetitionIndex,
1788
+ repetitionCount,
1789
+ passed: repetitionPassedThis,
1581
1790
  durationMs: Date.now() - started,
1582
1791
  evaluatorScores,
1583
1792
  output,
@@ -1598,9 +1807,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1598
1807
  (map) => {
1599
1808
  const key = testCaseItem.id;
1600
1809
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
1601
- const newResults = [...existing.results, rerunPassedThis];
1810
+ const newResults = [...existing.results, repetitionPassedThis];
1602
1811
  const newCompletedCount = existing.completedCount + 1;
1603
- const isLast = newCompletedCount === rerunTotal;
1812
+ const isLast = newCompletedCount === repetitionCount;
1604
1813
  const newMap = new Map(map);
1605
1814
  newMap.set(key, {
1606
1815
  completedCount: newCompletedCount,
@@ -1616,10 +1825,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1616
1825
  } else {
1617
1826
  yield* effect.Ref.update(failedRef, (n) => n + 1);
1618
1827
  }
1619
- const [passed, failed] = yield* effect.Effect.all([
1620
- effect.Ref.get(passedRef),
1621
- effect.Ref.get(failedRef)
1622
- ]);
1828
+ const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
1623
1829
  yield* updateSnapshot(task.runId, (snapshot) => ({
1624
1830
  ...snapshot,
1625
1831
  passedTestCases: passed,
@@ -1640,10 +1846,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1640
1846
  runId: task.runId,
1641
1847
  startedAt
1642
1848
  });
1643
- const totalEvaluations = task.testCases.reduce(
1644
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1645
- 0
1646
- );
1849
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
1647
1850
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1648
1851
  const completedRef = yield* effect.Ref.make(0);
1649
1852
  const startedRef = yield* effect.Ref.make(0);
@@ -1652,7 +1855,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1652
1855
  const testCaseResultsRef = yield* effect.Ref.make(
1653
1856
  /* @__PURE__ */ new Map()
1654
1857
  );
1655
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1858
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
1656
1859
  const processEvaluation = (unit) => processOneEvaluation(
1657
1860
  task,
1658
1861
  unit,
@@ -1666,11 +1869,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1666
1869
  failedRef,
1667
1870
  testCaseResultsRef
1668
1871
  );
1669
- yield* effect.Effect.forEach(
1670
- evaluationUnits,
1671
- processEvaluation,
1672
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1673
- );
1872
+ const globalSem = task.globalEvaluationSemaphore;
1873
+ if (globalSem !== void 0) {
1874
+ yield* effect.Effect.forEach(
1875
+ evaluationUnits,
1876
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1877
+ { concurrency: "unbounded", discard: true }
1878
+ );
1879
+ } else {
1880
+ yield* effect.Effect.forEach(
1881
+ evaluationUnits,
1882
+ processEvaluation,
1883
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1884
+ );
1885
+ }
1674
1886
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
1675
1887
  effect.Ref.get(completedRef),
1676
1888
  effect.Ref.get(passedRef),
@@ -1706,155 +1918,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1706
1918
  artifactPath: task.snapshot.artifactPath
1707
1919
  });
1708
1920
  });
1709
- async function loadRunSnapshotsFromArtifacts(config) {
1710
- const baseDir = path.resolve(config.artifactDirectory);
1711
- let entries;
1712
- try {
1713
- entries = await promises.readdir(baseDir);
1714
- } catch {
1715
- return [];
1716
- }
1717
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1718
- const snapshots = [];
1719
- for (const fileName of jsonlFiles) {
1720
- const filePath = path.join(baseDir, fileName);
1721
- try {
1722
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1723
- if (snapshot) {
1724
- snapshots.push(snapshot);
1725
- }
1726
- } catch {
1727
- }
1728
- }
1729
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1730
- }
1731
- async function parseArtifactToSnapshot(filePath, _config) {
1732
- const content = await promises.readFile(filePath, "utf8");
1733
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1734
- if (lines.length === 0) {
1735
- return null;
1736
- }
1737
- let runQueued = null;
1738
- let runCompleted = null;
1739
- let runFailed = null;
1740
- let runStarted = null;
1741
- for (const line of lines) {
1742
- try {
1743
- const event = JSON.parse(line);
1744
- const type = event.type;
1745
- if (type === "RunQueued") {
1746
- runQueued = {
1747
- runId: event.runId,
1748
- datasetId: event.datasetId,
1749
- datasetName: event.datasetName,
1750
- evaluatorIds: event.evaluatorIds,
1751
- totalTestCases: event.totalTestCases ?? 0,
1752
- artifactPath: event.artifactPath ?? filePath,
1753
- ts: event.ts
1754
- };
1755
- }
1756
- if (type === "RunStarted") {
1757
- runStarted = { startedAt: event.startedAt };
1758
- }
1759
- if (type === "RunCompleted") {
1760
- runCompleted = {
1761
- passedTestCases: event.passedTestCases,
1762
- failedTestCases: event.failedTestCases,
1763
- totalTestCases: event.totalTestCases,
1764
- finishedAt: event.finishedAt
1765
- };
1766
- }
1767
- if (type === "RunFailed") {
1768
- runFailed = {
1769
- finishedAt: event.finishedAt,
1770
- errorMessage: event.errorMessage
1771
- };
1772
- }
1773
- } catch {
1774
- }
1921
+
1922
+ // src/runner/name-pattern.ts
1923
+ function parseRegexLiteral(pattern) {
1924
+ if (!pattern.startsWith("/")) {
1925
+ return void 0;
1775
1926
  }
1776
- if (!runQueued) {
1777
- return null;
1927
+ const lastSlash = pattern.lastIndexOf("/");
1928
+ if (lastSlash <= 0) {
1929
+ return void 0;
1778
1930
  }
1779
- const artifactPath = filePath;
1780
- const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1781
- const progress = aggregateTestCaseProgress(lines);
1782
- const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1783
- const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1784
- const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1785
1931
  return {
1786
- runId: runQueued.runId,
1787
- datasetId: runQueued.datasetId,
1788
- datasetName: runQueued.datasetName,
1789
- evaluatorIds: runQueued.evaluatorIds,
1790
- queuedAt: runQueued.ts ?? 0,
1791
- startedAt: runStarted?.startedAt,
1792
- finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1793
- totalTestCases: runQueued.totalTestCases,
1794
- completedTestCases,
1795
- passedTestCases,
1796
- failedTestCases,
1797
- status,
1798
- artifactPath,
1799
- errorMessage: runFailed?.errorMessage
1932
+ source: pattern.slice(1, lastSlash),
1933
+ flags: pattern.slice(lastSlash + 1)
1800
1934
  };
1801
1935
  }
1802
- function aggregateTestCaseProgress(lines) {
1803
- let completedTestCases = 0;
1804
- const testCasePassedBy = /* @__PURE__ */ new Map();
1805
- for (const line of lines) {
1806
- try {
1807
- const event = JSON.parse(line);
1808
- if (event.type === "TestCaseProgress") {
1809
- const ev = event;
1810
- completedTestCases = ev.completedTestCases ?? completedTestCases;
1811
- const id = ev.testCaseId;
1812
- const current = testCasePassedBy.get(id);
1813
- testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1814
- }
1815
- } catch {
1816
- }
1817
- }
1818
- let passedTestCases = 0;
1819
- let failedTestCases = 0;
1820
- for (const passed of testCasePassedBy.values()) {
1821
- if (passed) {
1822
- passedTestCases += 1;
1823
- } else {
1824
- failedTestCases += 1;
1825
- }
1936
+ function createNameMatcher(pattern) {
1937
+ const normalizedPattern = pattern.trim();
1938
+ const regexLiteral = parseRegexLiteral(normalizedPattern);
1939
+ if (regexLiteral) {
1940
+ const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1941
+ return (value) => regex.test(value);
1826
1942
  }
1827
- return { completedTestCases, passedTestCases, failedTestCases };
1828
- }
1829
- async function parseArtifactFile(artifactPath) {
1830
- try {
1831
- const content = await promises.readFile(artifactPath, "utf8");
1832
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1833
- const results = [];
1834
- for (const line of lines) {
1835
- try {
1836
- const event = JSON.parse(line);
1837
- if (event.type === "TestCaseProgress") {
1838
- const ev = event;
1839
- results.push({
1840
- testCaseId: ev.testCaseId,
1841
- testCaseName: ev.testCaseName,
1842
- completedTestCases: ev.completedTestCases,
1843
- totalTestCases: ev.totalTestCases,
1844
- rerunIndex: ev.rerunIndex,
1845
- rerunTotal: ev.rerunTotal,
1846
- passed: ev.passed,
1847
- durationMs: ev.durationMs,
1848
- evaluatorScores: ev.evaluatorScores ?? []
1849
- });
1850
- }
1851
- } catch {
1852
- }
1853
- }
1854
- return results;
1855
- } catch {
1856
- return [];
1943
+ if (normalizedPattern.includes("*")) {
1944
+ const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1945
+ const regex = new RegExp(`^${escaped}$`, "i");
1946
+ return (value) => regex.test(value);
1857
1947
  }
1948
+ return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1858
1949
  }
1859
1950
  async function appendJsonLine(artifactPath, payload) {
1860
1951
  await promises.mkdir(path.dirname(artifactPath), { recursive: true });
@@ -1913,32 +2004,12 @@ function searchCollectedTestCases(all, query) {
1913
2004
  }
1914
2005
 
1915
2006
  // src/runner/api.ts
1916
- function parseRegexLiteral(pattern) {
1917
- if (!pattern.startsWith("/")) {
1918
- return void 0;
1919
- }
1920
- const lastSlash = pattern.lastIndexOf("/");
1921
- if (lastSlash <= 0) {
1922
- return void 0;
2007
+ function normalizeRunRepetitions(value) {
2008
+ const n = value ?? 1;
2009
+ if (!Number.isInteger(n) || n < 1) {
2010
+ throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
1923
2011
  }
1924
- return {
1925
- source: pattern.slice(1, lastSlash),
1926
- flags: pattern.slice(lastSlash + 1)
1927
- };
1928
- }
1929
- function createNameMatcher(pattern) {
1930
- const normalizedPattern = pattern.trim();
1931
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1932
- if (regexLiteral) {
1933
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1934
- return (value) => regex.test(value);
1935
- }
1936
- if (normalizedPattern.includes("*")) {
1937
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1938
- const regex = new RegExp(`^${escaped}$`, "i");
1939
- return (value) => regex.test(value);
1940
- }
1941
- return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
2012
+ return n;
1942
2013
  }
1943
2014
  function mergeRunnerOverrides(base, next) {
1944
2015
  if (!base) {
@@ -1969,15 +2040,12 @@ var EffectRunner = class {
1969
2040
  this.persistenceQueue = effect.Effect.runSync(
1970
2041
  effect.Queue.unbounded()
1971
2042
  );
1972
- this.snapshotsRef = effect.Effect.runSync(
1973
- effect.Ref.make(/* @__PURE__ */ new Map())
1974
- );
2043
+ this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
1975
2044
  this.listeners = /* @__PURE__ */ new Set();
1976
2045
  this.datasetsById = /* @__PURE__ */ new Map();
1977
2046
  this.evaluatorsById = /* @__PURE__ */ new Map();
1978
- this.schedulerFiber = effect.Effect.runFork(
1979
- this.createSchedulerEffect()
1980
- );
2047
+ this.runConfigsById = /* @__PURE__ */ new Map();
2048
+ this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
1981
2049
  this.persistenceFiber = effect.Effect.runFork(
1982
2050
  createPersistenceWorker(this.persistenceQueue)
1983
2051
  );
@@ -2017,6 +2085,137 @@ var EffectRunner = class {
2017
2085
  (item) => matcher(item.evaluator.getName() ?? "")
2018
2086
  );
2019
2087
  }
2088
+ async collectRunConfigs() {
2089
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
2090
+ this.runConfigsById.clear();
2091
+ const byNameLower = /* @__PURE__ */ new Map();
2092
+ for (const item of runConfigs) {
2093
+ const id = item.runConfig.getName();
2094
+ const lower = id.toLowerCase();
2095
+ const prev = byNameLower.get(lower);
2096
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
2097
+ throw new Error(
2098
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
2099
+ );
2100
+ }
2101
+ byNameLower.set(lower, item);
2102
+ this.runConfigsById.set(id, item);
2103
+ }
2104
+ return runConfigs;
2105
+ }
2106
+ async resolveRunConfigByName(name) {
2107
+ if (this.runConfigsById.size === 0) {
2108
+ await this.collectRunConfigs();
2109
+ }
2110
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
2111
+ const keyLower = key.toLowerCase();
2112
+ const matches = Array.from(this.runConfigsById.values()).filter(
2113
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
2114
+ );
2115
+ if (matches.length === 0) {
2116
+ return void 0;
2117
+ }
2118
+ if (matches.length > 1) {
2119
+ throw new Error(
2120
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
2121
+ );
2122
+ }
2123
+ return matches[0];
2124
+ }
2125
+ async expandRunConfigToJobs(collected) {
2126
+ if (this.datasetsById.size === 0) {
2127
+ await this.collectDatasets();
2128
+ }
2129
+ if (this.evaluatorsById.size === 0) {
2130
+ await this.collectEvaluators();
2131
+ }
2132
+ const rcName = collected.runConfig.getName();
2133
+ const jobs = [];
2134
+ const runs = collected.runConfig.getRuns();
2135
+ for (const [i, row] of runs.entries()) {
2136
+ const dsCollected = Array.from(this.datasetsById.values()).find(
2137
+ (d) => d.dataset === row.dataset
2138
+ );
2139
+ if (!dsCollected) {
2140
+ throw new Error(
2141
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2142
+ );
2143
+ }
2144
+ let evaluatorIds;
2145
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
2146
+ const matcher = createNameMatcher(row.evaluatorPattern);
2147
+ const matched = Array.from(this.evaluatorsById.values()).filter(
2148
+ (item) => matcher(item.evaluator.getName() ?? "")
2149
+ );
2150
+ if (matched.length === 0) {
2151
+ throw new Error(
2152
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
2153
+ );
2154
+ }
2155
+ evaluatorIds = matched.map((item) => item.id);
2156
+ } else {
2157
+ const evaluators = row.evaluators;
2158
+ evaluatorIds = [];
2159
+ for (const ev of evaluators) {
2160
+ const found = Array.from(this.evaluatorsById.values()).find(
2161
+ (item) => item.evaluator === ev
2162
+ );
2163
+ if (!found) {
2164
+ throw new Error(
2165
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
2166
+ );
2167
+ }
2168
+ evaluatorIds.push(found.id);
2169
+ }
2170
+ }
2171
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
2172
+ jobs.push({
2173
+ datasetId: dsCollected.id,
2174
+ evaluatorIds,
2175
+ runConfigName: rcName,
2176
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
2177
+ runConfigTags: collected.runConfig.getTags(),
2178
+ repetitions
2179
+ });
2180
+ }
2181
+ return jobs;
2182
+ }
2183
+ async expandRunConfigNamesToJobs(names) {
2184
+ const jobs = [];
2185
+ for (const name of names) {
2186
+ const collected = await this.resolveRunConfigByName(name);
2187
+ if (!collected) {
2188
+ const known = await this.collectRunConfigs();
2189
+ const available = known.map((r) => r.runConfig.getName()).sort();
2190
+ throw new Error(
2191
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
2192
+ );
2193
+ }
2194
+ jobs.push(...await this.expandRunConfigToJobs(collected));
2195
+ }
2196
+ return jobs;
2197
+ }
2198
+ async runDatasetJobsWithSharedConcurrency(request) {
2199
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
2200
+ const sem = effect.Effect.unsafeMakeSemaphore(globalConcurrency);
2201
+ const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
2202
+ const snapshots = [];
2203
+ for (const job of request.jobs) {
2204
+ snapshots.push(
2205
+ await this.startDatasetRun({
2206
+ datasetId: job.datasetId,
2207
+ evaluatorIds: job.evaluatorIds,
2208
+ triggerId,
2209
+ maxConcurrency: this.config.maxConcurrency ?? 1,
2210
+ globalEvaluationSemaphore: sem,
2211
+ runConfigName: job.runConfigName,
2212
+ runConfigTags: job.runConfigTags,
2213
+ repetitions: job.repetitions
2214
+ })
2215
+ );
2216
+ }
2217
+ return snapshots;
2218
+ }
2020
2219
  async searchTestCases(query) {
2021
2220
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
2022
2221
  return searchCollectedTestCases(testCases, query);
@@ -2035,35 +2234,45 @@ var EffectRunner = class {
2035
2234
  );
2036
2235
  }
2037
2236
  async runDatasetWith(request) {
2237
+ const runConfigName = validateRunConfigName(
2238
+ request.runConfigName,
2239
+ "runDatasetWith.runConfigName"
2240
+ );
2241
+ return this.startDatasetRun({
2242
+ datasetId: request.datasetId,
2243
+ evaluatorIds: request.evaluatorIds,
2244
+ triggerId: request.triggerId,
2245
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2246
+ repetitions: request.repetitions,
2247
+ runConfigName,
2248
+ runConfigTags: request.runConfigTags
2249
+ });
2250
+ }
2251
+ async startDatasetRun(params) {
2038
2252
  if (this.datasetsById.size === 0) {
2039
2253
  await this.collectDatasets();
2040
2254
  }
2041
2255
  if (this.evaluatorsById.size === 0) {
2042
2256
  await this.collectEvaluators();
2043
2257
  }
2044
- const dataset = this.datasetsById.get(request.datasetId);
2258
+ const dataset = this.datasetsById.get(params.datasetId);
2045
2259
  if (!dataset) {
2046
- throw new Error(`Unknown dataset: ${request.datasetId}`);
2260
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
2047
2261
  }
2048
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2262
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2049
2263
  if (selectedEvaluators.length === 0) {
2050
2264
  throw new Error("No evaluators selected for run");
2051
2265
  }
2052
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
2053
- const totalEvaluations = selectedTestCases.reduce(
2054
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
2055
- 0
2056
- );
2057
- const triggerId = request.triggerId ?? `trg-${crypto.randomUUID()}`;
2266
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
2267
+ const repetitions = normalizeRunRepetitions(params.repetitions);
2268
+ const totalEvaluations = selectedTestCases.length * repetitions;
2269
+ const runConfigTags = [...params.runConfigTags ?? []];
2270
+ const triggerId = params.triggerId ?? `trg-${crypto.randomUUID()}`;
2058
2271
  const runId = `run-${crypto.randomUUID()}`;
2059
- const artifactPath = createArtifactPath(
2060
- this.config.artifactDirectory,
2061
- request.datasetId,
2062
- runId
2063
- );
2272
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
2064
2273
  const snapshot = {
2065
2274
  runId,
2066
- datasetId: request.datasetId,
2275
+ datasetId: params.datasetId,
2067
2276
  datasetName: dataset.dataset.getName(),
2068
2277
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2069
2278
  queuedAt: Date.now(),
@@ -2084,7 +2293,7 @@ var EffectRunner = class {
2084
2293
  const queuedEvent = {
2085
2294
  type: "RunQueued",
2086
2295
  runId,
2087
- datasetId: request.datasetId,
2296
+ datasetId: params.datasetId,
2088
2297
  datasetName: dataset.dataset.getName(),
2089
2298
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2090
2299
  totalTestCases: totalEvaluations,
@@ -2098,17 +2307,20 @@ var EffectRunner = class {
2098
2307
  payload: queuedEvent
2099
2308
  })
2100
2309
  );
2101
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
2102
2310
  await effect.Effect.runPromise(
2103
2311
  effect.Queue.offer(this.runQueue, {
2104
2312
  runId,
2105
2313
  triggerId,
2106
- datasetId: request.datasetId,
2314
+ datasetId: params.datasetId,
2107
2315
  dataset: dataset.dataset,
2108
2316
  evaluators: selectedEvaluators,
2109
2317
  testCases: selectedTestCases,
2110
2318
  snapshot,
2111
- maxConcurrency
2319
+ maxConcurrency: params.maxConcurrency,
2320
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2321
+ runConfigName: params.runConfigName,
2322
+ runConfigTags,
2323
+ repetitions
2112
2324
  })
2113
2325
  );
2114
2326
  return snapshot;
@@ -2124,9 +2336,9 @@ var EffectRunner = class {
2124
2336
  return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
2125
2337
  }
2126
2338
  getAllRunSnapshots() {
2127
- return Array.from(
2128
- effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
2129
- ).sort((a, b) => b.queuedAt - a.queuedAt);
2339
+ return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
2340
+ (a, b) => b.queuedAt - a.queuedAt
2341
+ );
2130
2342
  }
2131
2343
  async loadRunSnapshotsFromArtifacts() {
2132
2344
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -2179,6 +2391,11 @@ var EffectRunner = class {
2179
2391
  );
2180
2392
  }
2181
2393
  };
2394
+
2395
+ // src/runner/events.ts
2396
+ var PROGRAMMATIC_RUN_CONFIG = {
2397
+ runConfigName: "programmatic"
2398
+ };
2182
2399
  var LEFT_PANE_WIDTH2 = 44;
2183
2400
  var MAX_RUNS_FOR_CHART = 12;
2184
2401
  var MAX_RUNS_FOR_TREND = 20;
@@ -2238,9 +2455,9 @@ function DatasetsView({
2238
2455
  }) {
2239
2456
  const leftFocused = state.focus === "left";
2240
2457
  const rightFocused = state.focus === "right";
2241
- const [runScores, setRunScores] = React2.useState([]);
2242
- const [loading, setLoading] = React2.useState(false);
2243
- React2.useEffect(() => {
2458
+ const [runScores, setRunScores] = React.useState([]);
2459
+ const [loading, setLoading] = React.useState(false);
2460
+ React.useEffect(() => {
2244
2461
  if (!selectedDataset?.runs?.length) {
2245
2462
  setRunScores([]);
2246
2463
  return;
@@ -2252,7 +2469,7 @@ function DatasetsView({
2252
2469
  const barData = runScores.slice(0, MAX_RUNS_FOR_CHART).reverse();
2253
2470
  const trendValues = runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse();
2254
2471
  const trendBatched = batchAverage(trendValues, TREND_BATCH_SIZE);
2255
- const overviewRows = React2.useMemo(() => {
2472
+ const overviewRows = React.useMemo(() => {
2256
2473
  const rows = [];
2257
2474
  rows.push(
2258
2475
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." }, "overview")
@@ -2342,11 +2559,7 @@ function DatasetsView({
2342
2559
  ] })
2343
2560
  ] });
2344
2561
  }
2345
- function RunsView({
2346
- state,
2347
- dataset,
2348
- selectedRun
2349
- }) {
2562
+ function RunsView({ state, dataset, selectedRun }) {
2350
2563
  const runs = dataset?.runs ?? [];
2351
2564
  const rightFocused = state.focus === "right";
2352
2565
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
@@ -2362,10 +2575,10 @@ function RunsView({
2362
2575
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2363
2576
  "Commit: ",
2364
2577
  selectedRun.meta.commit,
2365
- " Branch: ",
2578
+ " Branch: ",
2366
2579
  selectedRun.meta.branch,
2580
+ " Seed:",
2367
2581
  " ",
2368
- "Seed: ",
2369
2582
  selectedRun.meta.seed
2370
2583
  ] }),
2371
2584
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
@@ -2378,23 +2591,10 @@ function RunsView({
2378
2591
  format: (v) => `${v}%`
2379
2592
  }
2380
2593
  ),
2381
- /* @__PURE__ */ jsxRuntime.jsx(
2382
- TextBar,
2383
- {
2384
- label: "avg score",
2385
- value: Math.round(selectedRun.performance.avgScore * 100)
2386
- }
2387
- ),
2594
+ /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: "avg score", value: Math.round(selectedRun.performance.avgScore * 100) }),
2388
2595
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
2389
2596
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
2390
- selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
2391
- TextBar,
2392
- {
2393
- label: dimension.name,
2394
- value: dimension.score
2395
- },
2396
- dimension.name
2397
- )),
2597
+ selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: dimension.name, value: dimension.score }, dimension.name)),
2398
2598
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
2399
2599
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
2400
2600
  /* @__PURE__ */ jsxRuntime.jsx(
@@ -2497,15 +2697,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2497
2697
  ...dimensions.map((d) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
2498
2698
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp2"),
2499
2699
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
2500
- ...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
2501
- CheckRow,
2502
- {
2503
- name: c.name,
2504
- passed: c.passed,
2505
- detail: c.detail
2506
- },
2507
- `chk-${c.name}`
2508
- )),
2700
+ ...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
2509
2701
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp3"),
2510
2702
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Performance" }, "perf-h"),
2511
2703
  /* @__PURE__ */ jsxRuntime.jsx(
@@ -2551,7 +2743,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2551
2743
  rows.push(/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp6"));
2552
2744
  rows.push(/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
2553
2745
  for (const tc of testCases) {
2554
- const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
2746
+ const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
2555
2747
  rows.push(
2556
2748
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
2557
2749
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
@@ -2563,13 +2755,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2563
2755
  ] }),
2564
2756
  " ",
2565
2757
  tc.testCaseName,
2566
- rerunPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: rerunPart }) : null,
2758
+ repetitionPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: repetitionPart }) : null,
2567
2759
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2568
2760
  " (",
2569
2761
  tc.durationMs,
2570
2762
  "ms)"
2571
2763
  ] })
2572
- ] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
2764
+ ] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
2573
2765
  );
2574
2766
  for (const item of tc.evaluatorScores) {
2575
2767
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
@@ -2622,17 +2814,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2622
2814
  }
2623
2815
  } else {
2624
2816
  rows.push(
2625
- /* @__PURE__ */ jsxRuntime.jsxs(
2626
- ink.Text,
2627
- {
2628
- color: "gray",
2629
- children: [
2630
- " ",
2631
- "n/a"
2632
- ]
2633
- },
2634
- `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
2635
- )
2817
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2818
+ " ",
2819
+ "n/a"
2820
+ ] }, `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`)
2636
2821
  );
2637
2822
  }
2638
2823
  if (!item.passed && item.logs && item.logs.length > 0) {
@@ -2689,12 +2874,12 @@ function RunDetailsView({
2689
2874
  }) {
2690
2875
  const runs = dataset?.runs ?? [];
2691
2876
  const rightFocused = state.focus === "right";
2692
- const [testCases, setTestCases] = React2.useState([]);
2693
- const evaluatorNameById = React2__default.default.useMemo(
2877
+ const [testCases, setTestCases] = React.useState([]);
2878
+ const evaluatorNameById = React__default.default.useMemo(
2694
2879
  () => new Map(evaluators.map((e) => [e.id, e.name])),
2695
2880
  [evaluators]
2696
2881
  );
2697
- React2.useEffect(() => {
2882
+ React.useEffect(() => {
2698
2883
  if (!selectedRun?.meta?.artifact) {
2699
2884
  setTestCases([]);
2700
2885
  return;
@@ -2713,7 +2898,7 @@ function RunDetailsView({
2713
2898
  const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
2714
2899
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2715
2900
  /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
2716
- /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React2__default.default.Fragment, { children: row }, i)) }) })
2901
+ /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React__default.default.Fragment, { children: row }, i)) }) })
2717
2902
  ] });
2718
2903
  }
2719
2904
  var LEFT_PANE_WIDTH3 = 44;
@@ -2736,19 +2921,11 @@ function NewEvaluationView({
2736
2921
  visibleEvaluators.map((evaluator, index) => {
2737
2922
  const selected = index === state.evaluatorMenuIndex;
2738
2923
  const inSelection = state.selectedEvaluatorIds.includes(evaluator.id);
2739
- return /* @__PURE__ */ jsxRuntime.jsxs(
2740
- ink.Text,
2741
- {
2742
- color: selected ? "cyan" : "gray",
2743
- bold: selected,
2744
- children: [
2745
- selected ? "\u25B8 " : " ",
2746
- inSelection ? "[x] " : "[ ] ",
2747
- evaluator.name
2748
- ]
2749
- },
2750
- evaluator.id
2751
- );
2924
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
2925
+ selected ? "\u25B8 " : " ",
2926
+ inSelection ? "[x] " : "[ ] ",
2927
+ evaluator.name
2928
+ ] }, evaluator.id);
2752
2929
  })
2753
2930
  ] }),
2754
2931
  /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
@@ -2780,30 +2957,20 @@ function clampCursor(state, filteredDatasetsLength, selectedRunCount) {
2780
2957
  ...state,
2781
2958
  datasetMenuIndex: Math.max(0, Math.min(state.datasetMenuIndex, datasetMax)),
2782
2959
  runMenuIndex: Math.max(0, Math.min(state.runMenuIndex, runMax)),
2783
- evaluatorMenuIndex: Math.max(
2784
- 0,
2785
- Math.min(state.evaluatorMenuIndex, evaluatorMax)
2786
- )
2960
+ evaluatorMenuIndex: Math.max(0, Math.min(state.evaluatorMenuIndex, evaluatorMax))
2787
2961
  };
2788
2962
  }
2789
- function EvalsCliApp({
2790
- data,
2791
- args,
2792
- runner
2793
- }) {
2963
+ function EvalsCliApp({ data, args, runner }) {
2794
2964
  const { exit } = ink.useApp();
2795
2965
  const { width: stdoutWidth, height: stdoutHeight } = fullscreenInk.useScreenSize();
2796
- const [liveData, setLiveData] = React2.useState(data);
2797
- const [runtimeMessage, setRuntimeMessage] = React2.useState();
2798
- const overviewRowCountRef = React2.useRef(0);
2799
- const [state, dispatch] = React2.useReducer(
2800
- reduceCliState,
2801
- createInitialState(data, args)
2802
- );
2803
- React2.useEffect(() => {
2966
+ const [liveData, setLiveData] = React.useState(data);
2967
+ const [runtimeMessage, setRuntimeMessage] = React.useState();
2968
+ const overviewRowCountRef = React.useRef(0);
2969
+ const [state, dispatch] = React.useReducer(reduceCliState, createInitialState(data, args));
2970
+ React.useEffect(() => {
2804
2971
  setLiveData(data);
2805
2972
  }, [data]);
2806
- React2.useEffect(() => {
2973
+ React.useEffect(() => {
2807
2974
  if (!runner) {
2808
2975
  return void 0;
2809
2976
  }
@@ -2822,7 +2989,7 @@ function EvalsCliApp({
2822
2989
  }
2823
2990
  });
2824
2991
  }, [runner]);
2825
- const filteredDatasets = React2.useMemo(
2992
+ const filteredDatasets = React.useMemo(
2826
2993
  () => getFilteredDatasets(liveData, state.searchQuery),
2827
2994
  [liveData, state.searchQuery]
2828
2995
  );
@@ -2831,14 +2998,8 @@ function EvalsCliApp({
2831
2998
  filteredDatasets.length,
2832
2999
  getDatasetByMenuIndex(filteredDatasets, state.datasetMenuIndex)?.runs.length ?? 0
2833
3000
  );
2834
- const selectedDataset = getDatasetByMenuIndex(
2835
- filteredDatasets,
2836
- clampedState.datasetMenuIndex
2837
- );
2838
- const selectedRun = getRunByMenuIndex(
2839
- selectedDataset,
2840
- clampedState.runMenuIndex
2841
- );
3001
+ const selectedDataset = getDatasetByMenuIndex(filteredDatasets, clampedState.datasetMenuIndex);
3002
+ const selectedRun = getRunByMenuIndex(selectedDataset, clampedState.runMenuIndex);
2842
3003
  const visibleEvaluators = liveData.evaluators.filter(
2843
3004
  (evaluator) => evaluator.name.toLowerCase().includes(clampedState.searchQuery.toLowerCase())
2844
3005
  );
@@ -2926,15 +3087,14 @@ function EvalsCliApp({
2926
3087
  }
2927
3088
  void runner.runDatasetWith({
2928
3089
  datasetId: selectedDataset.id,
2929
- evaluatorIds: clampedState.selectedEvaluatorIds
3090
+ evaluatorIds: clampedState.selectedEvaluatorIds,
3091
+ ...PROGRAMMATIC_RUN_CONFIG
2930
3092
  }).then((snapshot) => {
2931
3093
  setRuntimeMessage(
2932
3094
  `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
2933
3095
  );
2934
3096
  }).catch((error) => {
2935
- setRuntimeMessage(
2936
- error instanceof Error ? error.message : "Failed to start evaluation."
2937
- );
3097
+ setRuntimeMessage(error instanceof Error ? error.message : "Failed to start evaluation.");
2938
3098
  });
2939
3099
  }
2940
3100
  });
@@ -2961,14 +3121,7 @@ function EvalsCliApp({
2961
3121
  );
2962
3122
  }
2963
3123
  if (clampedState.level === "runs") {
2964
- return /* @__PURE__ */ jsxRuntime.jsx(
2965
- RunsView,
2966
- {
2967
- state: clampedState,
2968
- dataset: selectedDataset,
2969
- selectedRun
2970
- }
2971
- );
3124
+ return /* @__PURE__ */ jsxRuntime.jsx(RunsView, { state: clampedState, dataset: selectedDataset, selectedRun });
2972
3125
  }
2973
3126
  return /* @__PURE__ */ jsxRuntime.jsx(
2974
3127
  RunDetailsView,
@@ -2980,82 +3133,44 @@ function EvalsCliApp({
2980
3133
  }
2981
3134
  );
2982
3135
  };
2983
- return /* @__PURE__ */ jsxRuntime.jsxs(
2984
- ink.Box,
2985
- {
2986
- flexDirection: "column",
2987
- flexGrow: 1,
2988
- width: stdoutWidth,
2989
- height: stdoutHeight,
2990
- children: [
2991
- /* @__PURE__ */ jsxRuntime.jsx(
2992
- ink.Box,
2993
- {
2994
- borderStyle: "round",
2995
- borderColor: "cyan",
2996
- paddingX: 1,
2997
- width: stdoutWidth,
2998
- children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: getBreadcrumbText(
2999
- clampedState,
3000
- selectedDataset?.name,
3001
- selectedRun?.label
3002
- ) })
3003
- }
3004
- ),
3005
- clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxRuntime.jsxs(
3006
- ink.Box,
3007
- {
3008
- marginTop: 1,
3009
- borderStyle: "round",
3010
- borderColor: "yellow",
3011
- paddingX: 1,
3012
- flexDirection: "column",
3013
- width: stdoutWidth,
3014
- children: [
3015
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "yellow", children: "Startup warnings:" }),
3016
- clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: warning }, `${warning}-${index}`))
3017
- ]
3018
- }
3019
- ),
3020
- clampedState.searchMode && /* @__PURE__ */ jsxRuntime.jsxs(
3021
- ink.Box,
3022
- {
3023
- marginTop: 1,
3024
- borderStyle: "round",
3025
- borderColor: "magenta",
3026
- paddingX: 1,
3027
- width: stdoutWidth,
3028
- children: [
3029
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", bold: true, children: "Search: " }),
3030
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "white", children: clampedState.searchQuery })
3031
- ]
3032
- }
3033
- ),
3034
- runtimeMessage && /* @__PURE__ */ jsxRuntime.jsx(
3035
- ink.Box,
3036
- {
3037
- marginTop: 1,
3038
- borderStyle: "round",
3039
- borderColor: "blue",
3040
- paddingX: 1,
3041
- width: stdoutWidth,
3042
- children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "blue", children: runtimeMessage })
3043
- }
3044
- ),
3045
- /* @__PURE__ */ jsxRuntime.jsx(
3046
- ink.Box,
3047
- {
3048
- marginTop: 1,
3049
- flexGrow: 1,
3050
- width: stdoutWidth,
3051
- flexDirection: "row",
3052
- children: renderContent()
3053
- }
3054
- ),
3055
- /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: getFooterText(clampedState) }) })
3056
- ]
3057
- }
3058
- );
3136
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", flexGrow: 1, width: stdoutWidth, height: stdoutHeight, children: [
3137
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { borderStyle: "round", borderColor: "cyan", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: getBreadcrumbText(clampedState, selectedDataset?.name, selectedRun?.label) }) }),
3138
+ clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxRuntime.jsxs(
3139
+ ink.Box,
3140
+ {
3141
+ marginTop: 1,
3142
+ borderStyle: "round",
3143
+ borderColor: "yellow",
3144
+ paddingX: 1,
3145
+ flexDirection: "column",
3146
+ width: stdoutWidth,
3147
+ children: [
3148
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "yellow", children: "Startup warnings:" }),
3149
+ clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: warning }, `${warning}-${index}`))
3150
+ ]
3151
+ }
3152
+ ),
3153
+ clampedState.searchMode && /* @__PURE__ */ jsxRuntime.jsxs(
3154
+ ink.Box,
3155
+ {
3156
+ marginTop: 1,
3157
+ borderStyle: "round",
3158
+ borderColor: "magenta",
3159
+ paddingX: 1,
3160
+ width: stdoutWidth,
3161
+ children: [
3162
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "magenta", bold: true, children: [
3163
+ "Search:",
3164
+ " "
3165
+ ] }),
3166
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "white", children: clampedState.searchQuery })
3167
+ ]
3168
+ }
3169
+ ),
3170
+ runtimeMessage && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, borderStyle: "round", borderColor: "blue", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "blue", children: runtimeMessage }) }),
3171
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, flexGrow: 1, width: stdoutWidth, flexDirection: "row", children: renderContent() }),
3172
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: getFooterText(clampedState) }) })
3173
+ ] });
3059
3174
  }
3060
3175
  async function main() {
3061
3176
  const args = parseStartupArgs(process.argv.slice(2));
@@ -3067,9 +3182,7 @@ async function main() {
3067
3182
  process.on("SIGTERM", () => {
3068
3183
  void runner.shutdown().finally(() => process.exit(0));
3069
3184
  });
3070
- fullscreenInk.withFullScreen(
3071
- /* @__PURE__ */ jsxRuntime.jsx(EvalsCliApp, { data, args, runner })
3072
- ).start();
3185
+ fullscreenInk.withFullScreen(/* @__PURE__ */ jsxRuntime.jsx(EvalsCliApp, { data, args, runner })).start();
3073
3186
  }
3074
3187
  void main();
3075
3188
  //# sourceMappingURL=out.js.map