@m4trix/evals 0.25.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1,18 +1,18 @@
1
1
  #!/usr/bin/env node
2
2
  import { withFullScreen, useScreenSize } from 'fullscreen-ink';
3
- import React2, { useState, useRef, useReducer, useEffect, useMemo } from 'react';
3
+ import React, { useState, useRef, useReducer, useEffect, useMemo } from 'react';
4
4
  import { useApp, useInput, Box, Text } from 'ink';
5
5
  import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
6
- import { resolve, relative, join, dirname } from 'path';
7
- import { LineGraph } from '@pppp606/ink-chart';
6
+ import { Schema, Effect, PubSub, Queue, Ref, Fiber, Either, ParseResult } from 'effect';
8
7
  import { randomUUID } from 'crypto';
9
- import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
8
+ import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
9
+ import { resolve, join, relative, dirname } from 'path';
10
10
  import { existsSync } from 'fs';
11
11
  import * as jitiModule from 'jiti';
12
- import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
13
12
  import { pathToFileURL } from 'url';
14
13
  import { diffLines } from 'diff';
15
14
  import stringify from 'fast-json-stable-stringify';
15
+ import { LineGraph } from '@pppp606/ink-chart';
16
16
 
17
17
  var SEP = " ";
18
18
  var ARROW = "\u203A";
@@ -90,11 +90,7 @@ function getFooterText(state) {
90
90
  }
91
91
  return "\u2191\u2193 move Enter add/remove S start run / search Esc cancel q quit";
92
92
  }
93
- function ListItem({
94
- selected,
95
- label,
96
- itemKey
97
- }) {
93
+ function ListItem({ selected, label, itemKey }) {
98
94
  return /* @__PURE__ */ jsxs(Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
99
95
  selected ? "\u25B8 " : " ",
100
96
  label
@@ -121,9 +117,7 @@ function Pane({
121
117
  }
122
118
  );
123
119
  }
124
- function SectionHeader({
125
- children
126
- }) {
120
+ function SectionHeader({ children }) {
127
121
  return /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children });
128
122
  }
129
123
  function StatusText({ status }) {
@@ -135,10 +129,7 @@ function StatusText({ status }) {
135
129
  ] });
136
130
  }
137
131
  var LEFT_PANE_WIDTH = 44;
138
- function RunsSidebar({
139
- state,
140
- runs
141
- }) {
132
+ function RunsSidebar({ state, runs }) {
142
133
  const focused = state.focus === "left";
143
134
  return /* @__PURE__ */ jsxs(Pane, { width: LEFT_PANE_WIDTH, focused, children: [
144
135
  /* @__PURE__ */ jsx(SectionHeader, { children: "Runs" }),
@@ -167,11 +158,7 @@ function RunsSidebar({
167
158
  ] });
168
159
  }
169
160
  var BLOCKS = ["\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"];
170
- function Sparkline({
171
- data,
172
- width,
173
- label
174
- }) {
161
+ function Sparkline({ data, width, label }) {
175
162
  if (data.length === 0)
176
163
  return null;
177
164
  const max = Math.max(...data);
@@ -250,6 +237,50 @@ function isPrintableCharacter(input) {
250
237
  function isBackKey(key) {
251
238
  return key.backspace || key.delete;
252
239
  }
240
+ var ENTITY_ID_PATTERN = /^[a-zA-Z0-9_-]+$/;
241
+ function makeEntityIdSchema(brand, label) {
242
+ return Schema.String.pipe(
243
+ Schema.trimmed(),
244
+ Schema.minLength(1, {
245
+ message: () => `${label} must be non-empty.`
246
+ }),
247
+ Schema.pattern(ENTITY_ID_PATTERN, {
248
+ message: () => `${label} may only contain letters, digits, underscores, and hyphens (no spaces). Examples: "my-nightly", "my_nightly", "myNightly".`
249
+ }),
250
+ Schema.brand(brand)
251
+ );
252
+ }
253
+ var RunConfigNameSchema = makeEntityIdSchema("RunConfigName", "RunConfig name");
254
+ makeEntityIdSchema("EvaluatorName", "Evaluator name");
255
+ makeEntityIdSchema("TestCaseName", "Test case name");
256
+ function validateWithSchema(schema, raw, context) {
257
+ const trimmed = raw.trim();
258
+ const decode = Schema.decodeUnknownEither(
259
+ schema
260
+ );
261
+ const result = decode(trimmed);
262
+ if (Either.isLeft(result)) {
263
+ throw new Error(`${context}: ${ParseResult.TreeFormatter.formatErrorSync(result.left)}`);
264
+ }
265
+ return result.right;
266
+ }
267
+ function validateRunConfigName(raw, context) {
268
+ return validateWithSchema(RunConfigNameSchema, raw, context);
269
+ }
270
+
271
+ // src/evals/evaluator.ts
272
+ function getEvaluatorDisplayLabel(evaluator) {
273
+ if (typeof evaluator.getDisplayLabel === "function") {
274
+ const label = evaluator.getDisplayLabel();
275
+ if (label !== void 0) {
276
+ return label;
277
+ }
278
+ }
279
+ return typeof evaluator.getName === "function" ? evaluator.getName() : void 0;
280
+ }
281
+ function getEvaluatorTagList(evaluator) {
282
+ return typeof evaluator.getTags === "function" ? [...evaluator.getTags()] : [];
283
+ }
253
284
 
254
285
  // src/cli/data.mock.json
255
286
  var data_mock_default = {
@@ -401,9 +432,7 @@ var data_mock_default = {
401
432
  { name: "contract_match", score: 100 },
402
433
  { name: "arg_validity", score: 100 }
403
434
  ],
404
- checks: [
405
- { name: "tool_calls", passed: true, detail: "0 unexpected" }
406
- ],
435
+ checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
407
436
  failures: [],
408
437
  meta: {
409
438
  model: "gpt-4o-mini",
@@ -426,9 +455,21 @@ var data_mock_default = {
426
455
  }
427
456
  ],
428
457
  evaluators: [
429
- { id: "json-schema-validator", name: "JSON Schema Validator", configPreview: "strict=true" },
430
- { id: "tool-call-contract-checker", name: "Tool-call Contract Checker", configPreview: "unexpectedCalls=error" },
431
- { id: "rubric-judge", name: "Rubric Judge (LLM)", configPreview: "model=gpt-4o-mini; scale=0-100" },
458
+ {
459
+ id: "json-schema-validator",
460
+ name: "JSON Schema Validator",
461
+ configPreview: "strict=true"
462
+ },
463
+ {
464
+ id: "tool-call-contract-checker",
465
+ name: "Tool-call Contract Checker",
466
+ configPreview: "unexpectedCalls=error"
467
+ },
468
+ {
469
+ id: "rubric-judge",
470
+ name: "Rubric Judge (LLM)",
471
+ configPreview: "model=gpt-4o-mini; scale=0-100"
472
+ },
432
473
  { id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
433
474
  ]
434
475
  };
@@ -495,7 +536,7 @@ function toEvalDataset(item, snapshots) {
495
536
  function toEvaluatorOption(item) {
496
537
  return {
497
538
  id: item.id,
498
- name: item.evaluator.getName() ?? toSlug(item.id),
539
+ name: getEvaluatorDisplayLabel(item.evaluator) ?? toSlug(item.id),
499
540
  configPreview: `Source: ${item.filePath}`
500
541
  };
501
542
  }
@@ -508,9 +549,7 @@ async function loadRunnerData(runner) {
508
549
  const memSnapshots = runner.getAllRunSnapshots();
509
550
  const seen = new Set(memSnapshots.map((s) => s.runId));
510
551
  const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
511
- const snapshots = [...memSnapshots, ...fromDisk].sort(
512
- (a, b) => b.queuedAt - a.queuedAt
513
- );
552
+ const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
514
553
  if (datasets.length === 0 && evaluators.length === 0) {
515
554
  return loadMockData();
516
555
  }
@@ -632,7 +671,11 @@ function reduceCliState(state, action) {
632
671
  return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
633
672
  }
634
673
  if (state.level === "datasets") {
635
- return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1), overviewScrollOffset: 0 };
674
+ return {
675
+ ...state,
676
+ datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1),
677
+ overviewScrollOffset: 0
678
+ };
636
679
  }
637
680
  if (state.level === "runs") {
638
681
  return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
@@ -650,10 +693,17 @@ function reduceCliState(state, action) {
650
693
  return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
651
694
  }
652
695
  if (state.level === "datasets" && state.focus === "right") {
653
- return { ...state, overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1) };
696
+ return {
697
+ ...state,
698
+ overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1)
699
+ };
654
700
  }
655
701
  if (state.level === "datasets") {
656
- return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1), overviewScrollOffset: 0 };
702
+ return {
703
+ ...state,
704
+ datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1),
705
+ overviewScrollOffset: 0
706
+ };
657
707
  }
658
708
  if (state.level === "runs") {
659
709
  return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
@@ -729,24 +779,168 @@ function reduceCliState(state, action) {
729
779
  }
730
780
  return state;
731
781
  }
782
+ async function loadRunSnapshotsFromArtifacts(config) {
783
+ const baseDir = resolve(config.artifactDirectory);
784
+ let entries;
785
+ try {
786
+ entries = await readdir(baseDir);
787
+ } catch {
788
+ return [];
789
+ }
790
+ const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
791
+ const snapshots = [];
792
+ for (const fileName of jsonlFiles) {
793
+ const filePath = join(baseDir, fileName);
794
+ try {
795
+ const snapshot = await parseArtifactToSnapshot(filePath, config);
796
+ if (snapshot) {
797
+ snapshots.push(snapshot);
798
+ }
799
+ } catch {
800
+ }
801
+ }
802
+ return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
803
+ }
804
+ async function parseArtifactToSnapshot(filePath, _config) {
805
+ const content = await readFile(filePath, "utf8");
806
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
807
+ if (lines.length === 0) {
808
+ return null;
809
+ }
810
+ let runQueued = null;
811
+ let runCompleted = null;
812
+ let runFailed = null;
813
+ let runStarted = null;
814
+ for (const line of lines) {
815
+ try {
816
+ const event = JSON.parse(line);
817
+ const type = event.type;
818
+ if (type === "RunQueued") {
819
+ runQueued = {
820
+ runId: event.runId,
821
+ datasetId: event.datasetId,
822
+ datasetName: event.datasetName,
823
+ evaluatorIds: event.evaluatorIds,
824
+ totalTestCases: event.totalTestCases ?? 0,
825
+ artifactPath: event.artifactPath ?? filePath,
826
+ ts: event.ts
827
+ };
828
+ }
829
+ if (type === "RunStarted") {
830
+ runStarted = { startedAt: event.startedAt };
831
+ }
832
+ if (type === "RunCompleted") {
833
+ runCompleted = {
834
+ passedTestCases: event.passedTestCases,
835
+ failedTestCases: event.failedTestCases,
836
+ totalTestCases: event.totalTestCases,
837
+ finishedAt: event.finishedAt
838
+ };
839
+ }
840
+ if (type === "RunFailed") {
841
+ runFailed = {
842
+ finishedAt: event.finishedAt,
843
+ errorMessage: event.errorMessage
844
+ };
845
+ }
846
+ } catch {
847
+ }
848
+ }
849
+ if (!runQueued) {
850
+ return null;
851
+ }
852
+ const artifactPath = filePath;
853
+ const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
854
+ const progress = aggregateTestCaseProgress(lines);
855
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
856
+ const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
857
+ const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
858
+ return {
859
+ runId: runQueued.runId,
860
+ datasetId: runQueued.datasetId,
861
+ datasetName: runQueued.datasetName,
862
+ evaluatorIds: runQueued.evaluatorIds,
863
+ queuedAt: runQueued.ts ?? 0,
864
+ startedAt: runStarted?.startedAt,
865
+ finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
866
+ totalTestCases: runQueued.totalTestCases,
867
+ completedTestCases,
868
+ passedTestCases,
869
+ failedTestCases,
870
+ status,
871
+ artifactPath,
872
+ errorMessage: runFailed?.errorMessage
873
+ };
874
+ }
875
+ function aggregateTestCaseProgress(lines) {
876
+ let completedTestCases = 0;
877
+ const testCasePassedBy = /* @__PURE__ */ new Map();
878
+ for (const line of lines) {
879
+ try {
880
+ const event = JSON.parse(line);
881
+ if (event.type === "TestCaseProgress") {
882
+ const ev = event;
883
+ completedTestCases = ev.completedTestCases ?? completedTestCases;
884
+ const id = ev.testCaseId;
885
+ const current = testCasePassedBy.get(id);
886
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
887
+ }
888
+ } catch {
889
+ }
890
+ }
891
+ let passedTestCases = 0;
892
+ let failedTestCases = 0;
893
+ for (const passed of testCasePassedBy.values()) {
894
+ if (passed) {
895
+ passedTestCases += 1;
896
+ } else {
897
+ failedTestCases += 1;
898
+ }
899
+ }
900
+ return { completedTestCases, passedTestCases, failedTestCases };
901
+ }
902
+ async function parseArtifactFile(artifactPath) {
903
+ try {
904
+ const content = await readFile(artifactPath, "utf8");
905
+ const lines = content.split("\n").filter((line) => line.trim().length > 0);
906
+ const results = [];
907
+ for (const line of lines) {
908
+ try {
909
+ const event = JSON.parse(line);
910
+ if (event.type === "TestCaseProgress") {
911
+ const ev = event;
912
+ const repetitionIndex = ev.repetitionIndex ?? ev.rerunIndex;
913
+ const repetitionCount = ev.repetitionCount ?? ev.rerunTotal;
914
+ results.push({
915
+ testCaseId: ev.testCaseId,
916
+ testCaseName: ev.testCaseName,
917
+ completedTestCases: ev.completedTestCases,
918
+ totalTestCases: ev.totalTestCases,
919
+ repetitionId: ev.repetitionId,
920
+ repetitionIndex,
921
+ repetitionCount,
922
+ passed: ev.passed,
923
+ durationMs: ev.durationMs,
924
+ evaluatorScores: ev.evaluatorScores ?? []
925
+ });
926
+ }
927
+ } catch {
928
+ }
929
+ }
930
+ return results;
931
+ } catch {
932
+ return [];
933
+ }
934
+ }
732
935
 
733
936
  // src/runner/config.ts
734
937
  var defaultRunnerConfig = {
735
938
  discovery: {
736
939
  rootDir: process.cwd(),
737
940
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
738
- evaluatorSuffixes: [
739
- ".evaluator.ts",
740
- ".evaluator.tsx",
741
- ".evaluator.js",
742
- ".evaluator.mjs"
743
- ],
744
- testCaseSuffixes: [
745
- ".test-case.ts",
746
- ".test-case.tsx",
747
- ".test-case.js",
748
- ".test-case.mjs"
749
- ],
941
+ evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
942
+ runConfigSuffixes: [".run-config.ts", ".run-config.tsx", ".run-config.js", ".run-config.mjs"],
943
+ testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
750
944
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
751
945
  },
752
946
  artifactDirectory: ".eval-results",
@@ -771,6 +965,11 @@ function toRunnerConfigOverrides(config) {
771
965
  } else if (rawDiscovery?.evaluatorSuffixes !== void 0) {
772
966
  discovery.evaluatorSuffixes = rawDiscovery.evaluatorSuffixes;
773
967
  }
968
+ if (rawDiscovery?.runConfigFilePatterns !== void 0) {
969
+ discovery.runConfigSuffixes = rawDiscovery.runConfigFilePatterns;
970
+ } else if (rawDiscovery?.runConfigSuffixes !== void 0) {
971
+ discovery.runConfigSuffixes = rawDiscovery.runConfigSuffixes;
972
+ }
774
973
  if (rawDiscovery?.testCaseFilePatterns !== void 0) {
775
974
  discovery.testCaseSuffixes = rawDiscovery.testCaseFilePatterns;
776
975
  } else if (rawDiscovery?.testCaseSuffixes !== void 0) {
@@ -813,14 +1012,15 @@ function getJitiLoader() {
813
1012
  }
814
1013
  const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
815
1014
  if (typeof createJiti2 !== "function") {
816
- throw new Error(
817
- "Failed to initialize jiti for m4trix eval config loading."
818
- );
1015
+ throw new Error("Failed to initialize jiti for m4trix eval config loading.");
819
1016
  }
820
- cachedLoader = createJiti2(import.meta.url, {
821
- interopDefault: true,
822
- moduleCache: true
823
- });
1017
+ cachedLoader = createJiti2(
1018
+ import.meta.url,
1019
+ {
1020
+ interopDefault: true,
1021
+ moduleCache: true
1022
+ }
1023
+ );
824
1024
  return cachedLoader;
825
1025
  }
826
1026
  function resolveConfigModuleExport(loadedModule) {
@@ -868,6 +1068,9 @@ function isDatasetLike(value) {
868
1068
  function isEvaluatorLike(value) {
869
1069
  return hasMethod(value, "getName") && hasMethod(value, "resolveContext") && hasMethod(value, "getEvaluateFn");
870
1070
  }
1071
+ function isRunConfigLike(value) {
1072
+ return hasMethod(value, "getName") && hasMethod(value, "getRuns") && typeof value.getRuns === "function";
1073
+ }
871
1074
  function isTestCaseLike(value) {
872
1075
  return hasMethod(value, "getName") && hasMethod(value, "getTags") && hasMethod(value, "getInput");
873
1076
  }
@@ -924,9 +1127,7 @@ async function loadModuleExports(filePath) {
924
1127
  }
925
1128
  async function collectDatasetsFromFiles(config) {
926
1129
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
927
- const matched = files.filter(
928
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
929
- );
1130
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
930
1131
  const found = await Promise.all(
931
1132
  matched.map(async (absolutePath) => {
932
1133
  const exports = await loadModuleExports(absolutePath);
@@ -943,9 +1144,7 @@ async function collectDatasetsFromFiles(config) {
943
1144
  }
944
1145
  async function collectEvaluatorsFromFiles(config) {
945
1146
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
946
- const matched = files.filter(
947
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
948
- );
1147
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
949
1148
  const found = await Promise.all(
950
1149
  matched.map(async (absolutePath) => {
951
1150
  const exports = await loadModuleExports(absolutePath);
@@ -960,11 +1159,26 @@ async function collectEvaluatorsFromFiles(config) {
960
1159
  );
961
1160
  return found.flat();
962
1161
  }
963
- async function collectTestCasesFromFiles(config) {
1162
+ async function collectRunConfigsFromFiles(config) {
964
1163
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
965
- const matched = files.filter(
966
- (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
1164
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.runConfigSuffixes));
1165
+ const found = await Promise.all(
1166
+ matched.map(async (absolutePath) => {
1167
+ const exports = await loadModuleExports(absolutePath);
1168
+ const runConfigs = exports.filter(isRunConfigLike);
1169
+ const relPath = relative(config.rootDir, absolutePath);
1170
+ return runConfigs.map((runConfig) => ({
1171
+ id: runConfig.getName(),
1172
+ filePath: relPath,
1173
+ runConfig
1174
+ }));
1175
+ })
967
1176
  );
1177
+ return found.flat();
1178
+ }
1179
+ async function collectTestCasesFromFiles(config) {
1180
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1181
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
968
1182
  const found = await Promise.all(
969
1183
  matched.map(async (absolutePath) => {
970
1184
  const exports = await loadModuleExports(absolutePath);
@@ -1036,16 +1250,8 @@ function createDiffString(expected, actual, diffOptions) {
1036
1250
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
1037
1251
  const actualProcessed = preprocessForDiff(actual, diffOptions);
1038
1252
  if (diffOptions?.keysOnly) {
1039
- const expectedKeys = JSON.stringify(
1040
- extractKeys(expectedProcessed),
1041
- null,
1042
- 2
1043
- );
1044
- const actualKeys = JSON.stringify(
1045
- extractKeys(actualProcessed),
1046
- null,
1047
- 2
1048
- );
1253
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
1254
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
1049
1255
  const parts2 = diffLines(expectedKeys, actualKeys);
1050
1256
  return formatDiffParts(parts2);
1051
1257
  }
@@ -1056,9 +1262,7 @@ function createDiffString(expected, actual, diffOptions) {
1056
1262
  }
1057
1263
  const parts = diffLines(expectedStr, actualStr);
1058
1264
  if (diffOptions?.outputNewOnly) {
1059
- const filtered = parts.filter(
1060
- (p) => p.added === true
1061
- );
1265
+ const filtered = parts.filter((p) => p.added === true);
1062
1266
  return formatDiffParts(filtered);
1063
1267
  }
1064
1268
  return formatDiffParts(parts);
@@ -1125,6 +1329,17 @@ function getDiffLines(entry) {
1125
1329
  });
1126
1330
  }
1127
1331
 
1332
+ // src/evals/test-case.ts
1333
+ function getTestCaseDisplayLabel(testCase) {
1334
+ if (typeof testCase.getDisplayLabel === "function") {
1335
+ return testCase.getDisplayLabel();
1336
+ }
1337
+ return typeof testCase.getName === "function" ? testCase.getName() : "";
1338
+ }
1339
+ function getTestCaseTagList(testCase) {
1340
+ return typeof testCase.getTags === "function" ? [...testCase.getTags()] : [];
1341
+ }
1342
+
1128
1343
  // src/evals/metric.ts
1129
1344
  var registry = /* @__PURE__ */ new Map();
1130
1345
  var Metric = {
@@ -1148,29 +1363,74 @@ function getMetricById(id) {
1148
1363
  return registry.get(id);
1149
1364
  }
1150
1365
 
1151
- // src/evals/score.ts
1152
- var registry2 = /* @__PURE__ */ new Map();
1153
- function formatScoreData(def, data, options) {
1154
- return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
1366
+ // src/evals/aggregators.ts
1367
+ function aggregateTokenCountSum(values) {
1368
+ const initial = {
1369
+ input: 0,
1370
+ output: 0,
1371
+ inputCached: 0,
1372
+ outputCached: 0
1373
+ };
1374
+ return values.reduce(
1375
+ (acc, v) => ({
1376
+ input: acc.input + (v.input ?? 0),
1377
+ output: acc.output + (v.output ?? 0),
1378
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
1379
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
1380
+ }),
1381
+ initial
1382
+ );
1155
1383
  }
1156
- var ScoreAggregate = {
1157
- /** Average numeric fields. Use for scores like { value, delta }. */
1158
- averageFields(fields) {
1159
- return (values) => {
1160
- const count = values.length || 1;
1161
- const result = {};
1162
- for (const field of fields) {
1163
- result[field] = values.reduce(
1164
- (s, v) => s + (v[field] ?? 0),
1165
- 0
1166
- ) / count;
1167
- }
1168
- return result;
1169
- };
1170
- },
1171
- /** Average selected numeric fields, with sample std dev tracked for `value`. */
1172
- averageWithVariance(fields) {
1173
- return (values) => {
1384
+ function aggregateLatencyAverage(values) {
1385
+ if (values.length === 0) {
1386
+ return { ms: 0 };
1387
+ }
1388
+ const sum = values.reduce((s, v) => s + v.ms, 0);
1389
+ return { ms: sum / values.length };
1390
+ }
1391
+
1392
+ // src/evals/metrics/standard.ts
1393
+ Metric.of({
1394
+ id: "token-count",
1395
+ name: "Tokens",
1396
+ aggregate: aggregateTokenCountSum,
1397
+ format: (data, options) => {
1398
+ const input = data.input ?? 0;
1399
+ const output = data.output ?? 0;
1400
+ const inputCached = data.inputCached ?? 0;
1401
+ const outputCached = data.outputCached ?? 0;
1402
+ const cached = inputCached + outputCached;
1403
+ const base = `in:${input} out:${output} cached:${cached}`;
1404
+ return options?.isAggregated ? `Total: ${base}` : base;
1405
+ }
1406
+ });
1407
+ Metric.of({
1408
+ id: "latency",
1409
+ name: "Latency",
1410
+ aggregate: aggregateLatencyAverage,
1411
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1412
+ });
1413
+
1414
+ // src/evals/score.ts
1415
+ var registry2 = /* @__PURE__ */ new Map();
1416
+ function formatScoreData(def, data, options) {
1417
+ return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
1418
+ }
1419
+ var ScoreAggregate = {
1420
+ /** Average numeric fields. Use for scores like { value, delta }. */
1421
+ averageFields(fields) {
1422
+ return (values) => {
1423
+ const count = values.length || 1;
1424
+ const result = {};
1425
+ for (const field of fields) {
1426
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
1427
+ }
1428
+ return result;
1429
+ };
1430
+ },
1431
+ /** Average selected numeric fields, with sample std dev tracked for `value`. */
1432
+ averageWithVariance(fields) {
1433
+ return (values) => {
1174
1434
  const count = values.length;
1175
1435
  const result = {};
1176
1436
  for (const field of fields) {
@@ -1197,13 +1457,10 @@ var ScoreAggregate = {
1197
1457
  (s, v) => s + (v[valueField] ?? 0),
1198
1458
  0
1199
1459
  );
1200
- const sumSq = values.reduce(
1201
- (s, v) => {
1202
- const value = v[valueField] ?? 0;
1203
- return s + value * value;
1204
- },
1205
- 0
1206
- );
1460
+ const sumSq = values.reduce((s, v) => {
1461
+ const value = v[valueField] ?? 0;
1462
+ return s + value * value;
1463
+ }, 0);
1207
1464
  const mean = sum / count;
1208
1465
  const variance = (sumSq - count * mean * mean) / (count - 1);
1209
1466
  stdDev = variance > 0 ? Math.sqrt(variance) : 0;
@@ -1262,54 +1519,6 @@ function getScoreById(id) {
1262
1519
  return registry2.get(id);
1263
1520
  }
1264
1521
 
1265
- // src/evals/aggregators.ts
1266
- function aggregateTokenCountSum(values) {
1267
- const initial = {
1268
- input: 0,
1269
- output: 0,
1270
- inputCached: 0,
1271
- outputCached: 0
1272
- };
1273
- return values.reduce(
1274
- (acc, v) => ({
1275
- input: acc.input + (v.input ?? 0),
1276
- output: acc.output + (v.output ?? 0),
1277
- inputCached: acc.inputCached + (v.inputCached ?? 0),
1278
- outputCached: acc.outputCached + (v.outputCached ?? 0)
1279
- }),
1280
- initial
1281
- );
1282
- }
1283
- function aggregateLatencyAverage(values) {
1284
- if (values.length === 0) {
1285
- return { ms: 0 };
1286
- }
1287
- const sum = values.reduce((s, v) => s + v.ms, 0);
1288
- return { ms: sum / values.length };
1289
- }
1290
-
1291
- // src/evals/metrics/standard.ts
1292
- Metric.of({
1293
- id: "token-count",
1294
- name: "Tokens",
1295
- aggregate: aggregateTokenCountSum,
1296
- format: (data, options) => {
1297
- const input = data.input ?? 0;
1298
- const output = data.output ?? 0;
1299
- const inputCached = data.inputCached ?? 0;
1300
- const outputCached = data.outputCached ?? 0;
1301
- const cached = inputCached + outputCached;
1302
- const base = `in:${input} out:${output} cached:${cached}`;
1303
- return options?.isAggregated ? `Total: ${base}` : base;
1304
- }
1305
- });
1306
- Metric.of({
1307
- id: "latency",
1308
- name: "Latency",
1309
- aggregate: aggregateLatencyAverage,
1310
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1311
- });
1312
-
1313
1522
  // src/evals/scores/standard.ts
1314
1523
  Score.of({
1315
1524
  id: "percent",
@@ -1416,15 +1625,17 @@ function readOutput(testCase) {
1416
1625
  }
1417
1626
  return candidate.getOutput();
1418
1627
  }
1419
- function buildEvaluationUnits(testCases) {
1628
+ function buildEvaluationUnits(testCases, repetitionCount) {
1629
+ const count = Math.max(1, repetitionCount);
1420
1630
  const units = [];
1421
1631
  for (const testCaseItem of testCases) {
1422
- const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1423
- for (let r = 0; r < rerunTotal; r++) {
1632
+ const repetitionId = `rep-${randomUUID()}`;
1633
+ for (let r = 0; r < count; r++) {
1424
1634
  units.push({
1425
1635
  testCaseItem,
1426
- rerunIndex: r + 1,
1427
- rerunTotal
1636
+ repetitionId,
1637
+ repetitionIndex: r + 1,
1638
+ repetitionCount: count
1428
1639
  });
1429
1640
  }
1430
1641
  }
@@ -1434,29 +1645,24 @@ function nowIsoForFile() {
1434
1645
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
1435
1646
  }
1436
1647
  function createArtifactPath(artifactDirectory, datasetId, runId) {
1437
- return join(
1438
- artifactDirectory,
1439
- `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1440
- );
1648
+ return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
1441
1649
  }
1442
1650
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
1443
- const { testCaseItem, rerunIndex, rerunTotal } = unit;
1651
+ const { testCaseItem, repetitionId, repetitionIndex, repetitionCount } = unit;
1444
1652
  return Effect.gen(function* () {
1445
1653
  const evaluatorRunId = `run-${randomUUID()}`;
1446
1654
  const started = Date.now();
1447
- const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1448
- n + 1,
1449
- n + 1
1450
- ]);
1655
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [n + 1, n + 1]);
1451
1656
  yield* publishEvent({
1452
1657
  type: "TestCaseStarted",
1453
1658
  runId: task.runId,
1454
1659
  testCaseId: testCaseItem.id,
1455
- testCaseName: testCaseItem.testCase.getName(),
1660
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1456
1661
  startedTestCases: startedEvaluations,
1457
1662
  totalTestCases: totalEvaluations,
1458
- rerunIndex,
1459
- rerunTotal
1663
+ repetitionId,
1664
+ repetitionIndex,
1665
+ repetitionCount
1460
1666
  });
1461
1667
  const evaluatorScores = [];
1462
1668
  let testCaseError;
@@ -1480,9 +1686,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1480
1686
  return error;
1481
1687
  };
1482
1688
  try {
1483
- const ctx = yield* Effect.promise(
1484
- () => Promise.resolve(evaluator.resolveContext())
1485
- );
1689
+ const ctx = yield* Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
1486
1690
  const result = yield* Effect.promise(
1487
1691
  () => Promise.resolve().then(
1488
1692
  () => evaluateFn({
@@ -1492,8 +1696,15 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1492
1696
  meta: {
1493
1697
  triggerId: task.triggerId,
1494
1698
  runId: evaluatorRunId,
1495
- datasetId: task.datasetId
1699
+ datasetId: task.datasetId,
1700
+ repetitionId,
1701
+ repetitionIndex,
1702
+ repetitionCount,
1703
+ runConfigName: task.runConfigName
1496
1704
  },
1705
+ testCaseTags: getTestCaseTagList(testCaseItem.testCase),
1706
+ runConfigTags: task.runConfigTags,
1707
+ evaluatorTags: getEvaluatorTagList(evaluator),
1497
1708
  logDiff,
1498
1709
  log,
1499
1710
  createError
@@ -1536,21 +1747,19 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1536
1747
  });
1537
1748
  }
1538
1749
  }
1539
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1540
- const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1541
- n + 1,
1542
- n + 1
1543
- ]);
1750
+ const repetitionPassedThis = evaluatorScores.every((s) => s.passed);
1751
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
1544
1752
  const progressEvent = {
1545
1753
  type: "TestCaseProgress",
1546
1754
  runId: task.runId,
1547
1755
  testCaseId: testCaseItem.id,
1548
- testCaseName: testCaseItem.testCase.getName(),
1756
+ testCaseName: getTestCaseDisplayLabel(testCaseItem.testCase),
1549
1757
  completedTestCases: completedEvaluations,
1550
1758
  totalTestCases: totalEvaluations,
1551
- rerunIndex,
1552
- rerunTotal,
1553
- passed: rerunPassedThis,
1759
+ repetitionId,
1760
+ repetitionIndex,
1761
+ repetitionCount,
1762
+ passed: repetitionPassedThis,
1554
1763
  durationMs: Date.now() - started,
1555
1764
  evaluatorScores,
1556
1765
  output,
@@ -1571,9 +1780,9 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1571
1780
  (map) => {
1572
1781
  const key = testCaseItem.id;
1573
1782
  const existing = map.get(key) ?? { completedCount: 0, results: [] };
1574
- const newResults = [...existing.results, rerunPassedThis];
1783
+ const newResults = [...existing.results, repetitionPassedThis];
1575
1784
  const newCompletedCount = existing.completedCount + 1;
1576
- const isLast = newCompletedCount === rerunTotal;
1785
+ const isLast = newCompletedCount === repetitionCount;
1577
1786
  const newMap = new Map(map);
1578
1787
  newMap.set(key, {
1579
1788
  completedCount: newCompletedCount,
@@ -1589,10 +1798,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
1589
1798
  } else {
1590
1799
  yield* Ref.update(failedRef, (n) => n + 1);
1591
1800
  }
1592
- const [passed, failed] = yield* Effect.all([
1593
- Ref.get(passedRef),
1594
- Ref.get(failedRef)
1595
- ]);
1801
+ const [passed, failed] = yield* Effect.all([Ref.get(passedRef), Ref.get(failedRef)]);
1596
1802
  yield* updateSnapshot(task.runId, (snapshot) => ({
1597
1803
  ...snapshot,
1598
1804
  passedTestCases: passed,
@@ -1613,10 +1819,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1613
1819
  runId: task.runId,
1614
1820
  startedAt
1615
1821
  });
1616
- const totalEvaluations = task.testCases.reduce(
1617
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1618
- 0
1619
- );
1822
+ const totalEvaluations = task.testCases.length * Math.max(1, task.repetitions);
1620
1823
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1621
1824
  const completedRef = yield* Ref.make(0);
1622
1825
  const startedRef = yield* Ref.make(0);
@@ -1625,7 +1828,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1625
1828
  const testCaseResultsRef = yield* Ref.make(
1626
1829
  /* @__PURE__ */ new Map()
1627
1830
  );
1628
- const evaluationUnits = buildEvaluationUnits(task.testCases);
1831
+ const evaluationUnits = buildEvaluationUnits(task.testCases, task.repetitions);
1629
1832
  const processEvaluation = (unit) => processOneEvaluation(
1630
1833
  task,
1631
1834
  unit,
@@ -1639,11 +1842,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1639
1842
  failedRef,
1640
1843
  testCaseResultsRef
1641
1844
  );
1642
- yield* Effect.forEach(
1643
- evaluationUnits,
1644
- processEvaluation,
1645
- maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1646
- );
1845
+ const globalSem = task.globalEvaluationSemaphore;
1846
+ if (globalSem !== void 0) {
1847
+ yield* Effect.forEach(
1848
+ evaluationUnits,
1849
+ (unit) => globalSem.withPermits(1)(processEvaluation(unit)),
1850
+ { concurrency: "unbounded", discard: true }
1851
+ );
1852
+ } else {
1853
+ yield* Effect.forEach(
1854
+ evaluationUnits,
1855
+ processEvaluation,
1856
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1857
+ );
1858
+ }
1647
1859
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
1648
1860
  Ref.get(completedRef),
1649
1861
  Ref.get(passedRef),
@@ -1679,155 +1891,34 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1679
1891
  artifactPath: task.snapshot.artifactPath
1680
1892
  });
1681
1893
  });
1682
- async function loadRunSnapshotsFromArtifacts(config) {
1683
- const baseDir = resolve(config.artifactDirectory);
1684
- let entries;
1685
- try {
1686
- entries = await readdir(baseDir);
1687
- } catch {
1688
- return [];
1689
- }
1690
- const jsonlFiles = entries.filter((name) => name.endsWith(".jsonl"));
1691
- const snapshots = [];
1692
- for (const fileName of jsonlFiles) {
1693
- const filePath = join(baseDir, fileName);
1694
- try {
1695
- const snapshot = await parseArtifactToSnapshot(filePath, config);
1696
- if (snapshot) {
1697
- snapshots.push(snapshot);
1698
- }
1699
- } catch {
1700
- }
1701
- }
1702
- return snapshots.sort((a, b) => b.queuedAt - a.queuedAt);
1703
- }
1704
- async function parseArtifactToSnapshot(filePath, _config) {
1705
- const content = await readFile(filePath, "utf8");
1706
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1707
- if (lines.length === 0) {
1708
- return null;
1709
- }
1710
- let runQueued = null;
1711
- let runCompleted = null;
1712
- let runFailed = null;
1713
- let runStarted = null;
1714
- for (const line of lines) {
1715
- try {
1716
- const event = JSON.parse(line);
1717
- const type = event.type;
1718
- if (type === "RunQueued") {
1719
- runQueued = {
1720
- runId: event.runId,
1721
- datasetId: event.datasetId,
1722
- datasetName: event.datasetName,
1723
- evaluatorIds: event.evaluatorIds,
1724
- totalTestCases: event.totalTestCases ?? 0,
1725
- artifactPath: event.artifactPath ?? filePath,
1726
- ts: event.ts
1727
- };
1728
- }
1729
- if (type === "RunStarted") {
1730
- runStarted = { startedAt: event.startedAt };
1731
- }
1732
- if (type === "RunCompleted") {
1733
- runCompleted = {
1734
- passedTestCases: event.passedTestCases,
1735
- failedTestCases: event.failedTestCases,
1736
- totalTestCases: event.totalTestCases,
1737
- finishedAt: event.finishedAt
1738
- };
1739
- }
1740
- if (type === "RunFailed") {
1741
- runFailed = {
1742
- finishedAt: event.finishedAt,
1743
- errorMessage: event.errorMessage
1744
- };
1745
- }
1746
- } catch {
1747
- }
1894
+
1895
+ // src/runner/name-pattern.ts
1896
+ function parseRegexLiteral(pattern) {
1897
+ if (!pattern.startsWith("/")) {
1898
+ return void 0;
1748
1899
  }
1749
- if (!runQueued) {
1750
- return null;
1900
+ const lastSlash = pattern.lastIndexOf("/");
1901
+ if (lastSlash <= 0) {
1902
+ return void 0;
1751
1903
  }
1752
- const artifactPath = filePath;
1753
- const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1754
- const progress = aggregateTestCaseProgress(lines);
1755
- const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1756
- const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1757
- const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1758
1904
  return {
1759
- runId: runQueued.runId,
1760
- datasetId: runQueued.datasetId,
1761
- datasetName: runQueued.datasetName,
1762
- evaluatorIds: runQueued.evaluatorIds,
1763
- queuedAt: runQueued.ts ?? 0,
1764
- startedAt: runStarted?.startedAt,
1765
- finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
1766
- totalTestCases: runQueued.totalTestCases,
1767
- completedTestCases,
1768
- passedTestCases,
1769
- failedTestCases,
1770
- status,
1771
- artifactPath,
1772
- errorMessage: runFailed?.errorMessage
1905
+ source: pattern.slice(1, lastSlash),
1906
+ flags: pattern.slice(lastSlash + 1)
1773
1907
  };
1774
1908
  }
1775
- function aggregateTestCaseProgress(lines) {
1776
- let completedTestCases = 0;
1777
- const testCasePassedBy = /* @__PURE__ */ new Map();
1778
- for (const line of lines) {
1779
- try {
1780
- const event = JSON.parse(line);
1781
- if (event.type === "TestCaseProgress") {
1782
- const ev = event;
1783
- completedTestCases = ev.completedTestCases ?? completedTestCases;
1784
- const id = ev.testCaseId;
1785
- const current = testCasePassedBy.get(id);
1786
- testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1787
- }
1788
- } catch {
1789
- }
1790
- }
1791
- let passedTestCases = 0;
1792
- let failedTestCases = 0;
1793
- for (const passed of testCasePassedBy.values()) {
1794
- if (passed) {
1795
- passedTestCases += 1;
1796
- } else {
1797
- failedTestCases += 1;
1798
- }
1909
+ function createNameMatcher(pattern) {
1910
+ const normalizedPattern = pattern.trim();
1911
+ const regexLiteral = parseRegexLiteral(normalizedPattern);
1912
+ if (regexLiteral) {
1913
+ const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1914
+ return (value) => regex.test(value);
1799
1915
  }
1800
- return { completedTestCases, passedTestCases, failedTestCases };
1801
- }
1802
- async function parseArtifactFile(artifactPath) {
1803
- try {
1804
- const content = await readFile(artifactPath, "utf8");
1805
- const lines = content.split("\n").filter((line) => line.trim().length > 0);
1806
- const results = [];
1807
- for (const line of lines) {
1808
- try {
1809
- const event = JSON.parse(line);
1810
- if (event.type === "TestCaseProgress") {
1811
- const ev = event;
1812
- results.push({
1813
- testCaseId: ev.testCaseId,
1814
- testCaseName: ev.testCaseName,
1815
- completedTestCases: ev.completedTestCases,
1816
- totalTestCases: ev.totalTestCases,
1817
- rerunIndex: ev.rerunIndex,
1818
- rerunTotal: ev.rerunTotal,
1819
- passed: ev.passed,
1820
- durationMs: ev.durationMs,
1821
- evaluatorScores: ev.evaluatorScores ?? []
1822
- });
1823
- }
1824
- } catch {
1825
- }
1826
- }
1827
- return results;
1828
- } catch {
1829
- return [];
1916
+ if (normalizedPattern.includes("*")) {
1917
+ const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1918
+ const regex = new RegExp(`^${escaped}$`, "i");
1919
+ return (value) => regex.test(value);
1830
1920
  }
1921
+ return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1831
1922
  }
1832
1923
  async function appendJsonLine(artifactPath, payload) {
1833
1924
  await mkdir(dirname(artifactPath), { recursive: true });
@@ -1886,32 +1977,12 @@ function searchCollectedTestCases(all, query) {
1886
1977
  }
1887
1978
 
1888
1979
  // src/runner/api.ts
1889
- function parseRegexLiteral(pattern) {
1890
- if (!pattern.startsWith("/")) {
1891
- return void 0;
1892
- }
1893
- const lastSlash = pattern.lastIndexOf("/");
1894
- if (lastSlash <= 0) {
1895
- return void 0;
1896
- }
1897
- return {
1898
- source: pattern.slice(1, lastSlash),
1899
- flags: pattern.slice(lastSlash + 1)
1900
- };
1901
- }
1902
- function createNameMatcher(pattern) {
1903
- const normalizedPattern = pattern.trim();
1904
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1905
- if (regexLiteral) {
1906
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1907
- return (value) => regex.test(value);
1980
+ function normalizeRunRepetitions(value) {
1981
+ const n = value ?? 1;
1982
+ if (!Number.isInteger(n) || n < 1) {
1983
+ throw new Error(`repetitions must be a positive integer, got ${String(value)}`);
1908
1984
  }
1909
- if (normalizedPattern.includes("*")) {
1910
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1911
- const regex = new RegExp(`^${escaped}$`, "i");
1912
- return (value) => regex.test(value);
1913
- }
1914
- return (value) => value.toLowerCase() === normalizedPattern.toLowerCase();
1985
+ return n;
1915
1986
  }
1916
1987
  function mergeRunnerOverrides(base, next) {
1917
1988
  if (!base) {
@@ -1942,15 +2013,12 @@ var EffectRunner = class {
1942
2013
  this.persistenceQueue = Effect.runSync(
1943
2014
  Queue.unbounded()
1944
2015
  );
1945
- this.snapshotsRef = Effect.runSync(
1946
- Ref.make(/* @__PURE__ */ new Map())
1947
- );
2016
+ this.snapshotsRef = Effect.runSync(Ref.make(/* @__PURE__ */ new Map()));
1948
2017
  this.listeners = /* @__PURE__ */ new Set();
1949
2018
  this.datasetsById = /* @__PURE__ */ new Map();
1950
2019
  this.evaluatorsById = /* @__PURE__ */ new Map();
1951
- this.schedulerFiber = Effect.runFork(
1952
- this.createSchedulerEffect()
1953
- );
2020
+ this.runConfigsById = /* @__PURE__ */ new Map();
2021
+ this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
1954
2022
  this.persistenceFiber = Effect.runFork(
1955
2023
  createPersistenceWorker(this.persistenceQueue)
1956
2024
  );
@@ -1990,6 +2058,137 @@ var EffectRunner = class {
1990
2058
  (item) => matcher(item.evaluator.getName() ?? "")
1991
2059
  );
1992
2060
  }
2061
+ async collectRunConfigs() {
2062
+ const runConfigs = await collectRunConfigsFromFiles(this.config.discovery);
2063
+ this.runConfigsById.clear();
2064
+ const byNameLower = /* @__PURE__ */ new Map();
2065
+ for (const item of runConfigs) {
2066
+ const id = item.runConfig.getName();
2067
+ const lower = id.toLowerCase();
2068
+ const prev = byNameLower.get(lower);
2069
+ if (prev !== void 0 && prev.filePath !== item.filePath) {
2070
+ throw new Error(
2071
+ `Duplicate RunConfig name "${id}" (matches "${prev.runConfig.getName()}" case-insensitively): ${prev.filePath} and ${item.filePath}`
2072
+ );
2073
+ }
2074
+ byNameLower.set(lower, item);
2075
+ this.runConfigsById.set(id, item);
2076
+ }
2077
+ return runConfigs;
2078
+ }
2079
+ async resolveRunConfigByName(name) {
2080
+ if (this.runConfigsById.size === 0) {
2081
+ await this.collectRunConfigs();
2082
+ }
2083
+ const key = validateRunConfigName(name, `RunConfig "${name.trim()}"`);
2084
+ const keyLower = key.toLowerCase();
2085
+ const matches = Array.from(this.runConfigsById.values()).filter(
2086
+ (item) => item.runConfig.getName().toLowerCase() === keyLower
2087
+ );
2088
+ if (matches.length === 0) {
2089
+ return void 0;
2090
+ }
2091
+ if (matches.length > 1) {
2092
+ throw new Error(
2093
+ `Multiple RunConfigs named "${name}": ${matches.map((m) => m.filePath).join(", ")}`
2094
+ );
2095
+ }
2096
+ return matches[0];
2097
+ }
2098
+ async expandRunConfigToJobs(collected) {
2099
+ if (this.datasetsById.size === 0) {
2100
+ await this.collectDatasets();
2101
+ }
2102
+ if (this.evaluatorsById.size === 0) {
2103
+ await this.collectEvaluators();
2104
+ }
2105
+ const rcName = collected.runConfig.getName();
2106
+ const jobs = [];
2107
+ const runs = collected.runConfig.getRuns();
2108
+ for (const [i, row] of runs.entries()) {
2109
+ const dsCollected = Array.from(this.datasetsById.values()).find(
2110
+ (d) => d.dataset === row.dataset
2111
+ );
2112
+ if (!dsCollected) {
2113
+ throw new Error(
2114
+ `RunConfig "${rcName}" run[${i}]: dataset "${row.dataset.getName()}" was not found among discovered dataset exports (import the same module instances the scanner loads).`
2115
+ );
2116
+ }
2117
+ let evaluatorIds;
2118
+ if ("evaluatorPattern" in row && typeof row.evaluatorPattern === "string") {
2119
+ const matcher = createNameMatcher(row.evaluatorPattern);
2120
+ const matched = Array.from(this.evaluatorsById.values()).filter(
2121
+ (item) => matcher(item.evaluator.getName() ?? "")
2122
+ );
2123
+ if (matched.length === 0) {
2124
+ throw new Error(
2125
+ `RunConfig "${rcName}" run[${i}]: no evaluator matched pattern "${row.evaluatorPattern}"`
2126
+ );
2127
+ }
2128
+ evaluatorIds = matched.map((item) => item.id);
2129
+ } else {
2130
+ const evaluators = row.evaluators;
2131
+ evaluatorIds = [];
2132
+ for (const ev of evaluators) {
2133
+ const found = Array.from(this.evaluatorsById.values()).find(
2134
+ (item) => item.evaluator === ev
2135
+ );
2136
+ if (!found) {
2137
+ throw new Error(
2138
+ `RunConfig "${rcName}" run[${i}]: evaluator "${getEvaluatorDisplayLabel(ev) ?? "unknown"}" was not found among discovered evaluator exports`
2139
+ );
2140
+ }
2141
+ evaluatorIds.push(found.id);
2142
+ }
2143
+ }
2144
+ const repetitions = "repetitions" in row && row.repetitions !== void 0 ? row.repetitions : 1;
2145
+ jobs.push({
2146
+ datasetId: dsCollected.id,
2147
+ evaluatorIds,
2148
+ runConfigName: rcName,
2149
+ runConfigDisplayLabel: collected.runConfig.getDisplayLabel(),
2150
+ runConfigTags: collected.runConfig.getTags(),
2151
+ repetitions
2152
+ });
2153
+ }
2154
+ return jobs;
2155
+ }
2156
+ async expandRunConfigNamesToJobs(names) {
2157
+ const jobs = [];
2158
+ for (const name of names) {
2159
+ const collected = await this.resolveRunConfigByName(name);
2160
+ if (!collected) {
2161
+ const known = await this.collectRunConfigs();
2162
+ const available = known.map((r) => r.runConfig.getName()).sort();
2163
+ throw new Error(
2164
+ available.length > 0 ? `RunConfig "${name}" not found. Available RunConfigs: ${available.join(", ")}` : `RunConfig "${name}" not found and no RunConfigs were discovered.`
2165
+ );
2166
+ }
2167
+ jobs.push(...await this.expandRunConfigToJobs(collected));
2168
+ }
2169
+ return jobs;
2170
+ }
2171
+ async runDatasetJobsWithSharedConcurrency(request) {
2172
+ const globalConcurrency = Math.max(1, request.globalConcurrency);
2173
+ const sem = Effect.unsafeMakeSemaphore(globalConcurrency);
2174
+ const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
2175
+ const snapshots = [];
2176
+ for (const job of request.jobs) {
2177
+ snapshots.push(
2178
+ await this.startDatasetRun({
2179
+ datasetId: job.datasetId,
2180
+ evaluatorIds: job.evaluatorIds,
2181
+ triggerId,
2182
+ maxConcurrency: this.config.maxConcurrency ?? 1,
2183
+ globalEvaluationSemaphore: sem,
2184
+ runConfigName: job.runConfigName,
2185
+ runConfigTags: job.runConfigTags,
2186
+ repetitions: job.repetitions
2187
+ })
2188
+ );
2189
+ }
2190
+ return snapshots;
2191
+ }
1993
2192
  async searchTestCases(query) {
1994
2193
  const testCases = await collectTestCasesFromFiles(this.config.discovery);
1995
2194
  return searchCollectedTestCases(testCases, query);
@@ -2008,35 +2207,45 @@ var EffectRunner = class {
2008
2207
  );
2009
2208
  }
2010
2209
  async runDatasetWith(request) {
2210
+ const runConfigName = validateRunConfigName(
2211
+ request.runConfigName,
2212
+ "runDatasetWith.runConfigName"
2213
+ );
2214
+ return this.startDatasetRun({
2215
+ datasetId: request.datasetId,
2216
+ evaluatorIds: request.evaluatorIds,
2217
+ triggerId: request.triggerId,
2218
+ maxConcurrency: request.concurrency ?? this.config.maxConcurrency ?? 1,
2219
+ repetitions: request.repetitions,
2220
+ runConfigName,
2221
+ runConfigTags: request.runConfigTags
2222
+ });
2223
+ }
2224
+ async startDatasetRun(params) {
2011
2225
  if (this.datasetsById.size === 0) {
2012
2226
  await this.collectDatasets();
2013
2227
  }
2014
2228
  if (this.evaluatorsById.size === 0) {
2015
2229
  await this.collectEvaluators();
2016
2230
  }
2017
- const dataset = this.datasetsById.get(request.datasetId);
2231
+ const dataset = this.datasetsById.get(params.datasetId);
2018
2232
  if (!dataset) {
2019
- throw new Error(`Unknown dataset: ${request.datasetId}`);
2233
+ throw new Error(`Unknown dataset: ${params.datasetId}`);
2020
2234
  }
2021
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2235
+ const selectedEvaluators = params.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
2022
2236
  if (selectedEvaluators.length === 0) {
2023
2237
  throw new Error("No evaluators selected for run");
2024
2238
  }
2025
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
2026
- const totalEvaluations = selectedTestCases.reduce(
2027
- (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
2028
- 0
2029
- );
2030
- const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
2239
+ const selectedTestCases = await this.collectDatasetTestCases(params.datasetId);
2240
+ const repetitions = normalizeRunRepetitions(params.repetitions);
2241
+ const totalEvaluations = selectedTestCases.length * repetitions;
2242
+ const runConfigTags = [...params.runConfigTags ?? []];
2243
+ const triggerId = params.triggerId ?? `trg-${randomUUID()}`;
2031
2244
  const runId = `run-${randomUUID()}`;
2032
- const artifactPath = createArtifactPath(
2033
- this.config.artifactDirectory,
2034
- request.datasetId,
2035
- runId
2036
- );
2245
+ const artifactPath = createArtifactPath(this.config.artifactDirectory, params.datasetId, runId);
2037
2246
  const snapshot = {
2038
2247
  runId,
2039
- datasetId: request.datasetId,
2248
+ datasetId: params.datasetId,
2040
2249
  datasetName: dataset.dataset.getName(),
2041
2250
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2042
2251
  queuedAt: Date.now(),
@@ -2057,7 +2266,7 @@ var EffectRunner = class {
2057
2266
  const queuedEvent = {
2058
2267
  type: "RunQueued",
2059
2268
  runId,
2060
- datasetId: request.datasetId,
2269
+ datasetId: params.datasetId,
2061
2270
  datasetName: dataset.dataset.getName(),
2062
2271
  evaluatorIds: selectedEvaluators.map((item) => item.id),
2063
2272
  totalTestCases: totalEvaluations,
@@ -2071,17 +2280,20 @@ var EffectRunner = class {
2071
2280
  payload: queuedEvent
2072
2281
  })
2073
2282
  );
2074
- const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
2075
2283
  await Effect.runPromise(
2076
2284
  Queue.offer(this.runQueue, {
2077
2285
  runId,
2078
2286
  triggerId,
2079
- datasetId: request.datasetId,
2287
+ datasetId: params.datasetId,
2080
2288
  dataset: dataset.dataset,
2081
2289
  evaluators: selectedEvaluators,
2082
2290
  testCases: selectedTestCases,
2083
2291
  snapshot,
2084
- maxConcurrency
2292
+ maxConcurrency: params.maxConcurrency,
2293
+ globalEvaluationSemaphore: params.globalEvaluationSemaphore,
2294
+ runConfigName: params.runConfigName,
2295
+ runConfigTags,
2296
+ repetitions
2085
2297
  })
2086
2298
  );
2087
2299
  return snapshot;
@@ -2097,9 +2309,9 @@ var EffectRunner = class {
2097
2309
  return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
2098
2310
  }
2099
2311
  getAllRunSnapshots() {
2100
- return Array.from(
2101
- Effect.runSync(Ref.get(this.snapshotsRef)).values()
2102
- ).sort((a, b) => b.queuedAt - a.queuedAt);
2312
+ return Array.from(Effect.runSync(Ref.get(this.snapshotsRef)).values()).sort(
2313
+ (a, b) => b.queuedAt - a.queuedAt
2314
+ );
2103
2315
  }
2104
2316
  async loadRunSnapshotsFromArtifacts() {
2105
2317
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -2152,6 +2364,11 @@ var EffectRunner = class {
2152
2364
  );
2153
2365
  }
2154
2366
  };
2367
+
2368
+ // src/runner/events.ts
2369
+ var PROGRAMMATIC_RUN_CONFIG = {
2370
+ runConfigName: "programmatic"
2371
+ };
2155
2372
  var LEFT_PANE_WIDTH2 = 44;
2156
2373
  var MAX_RUNS_FOR_CHART = 12;
2157
2374
  var MAX_RUNS_FOR_TREND = 20;
@@ -2315,11 +2532,7 @@ function DatasetsView({
2315
2532
  ] })
2316
2533
  ] });
2317
2534
  }
2318
- function RunsView({
2319
- state,
2320
- dataset,
2321
- selectedRun
2322
- }) {
2535
+ function RunsView({ state, dataset, selectedRun }) {
2323
2536
  const runs = dataset?.runs ?? [];
2324
2537
  const rightFocused = state.focus === "right";
2325
2538
  return /* @__PURE__ */ jsxs(Fragment, { children: [
@@ -2335,10 +2548,10 @@ function RunsView({
2335
2548
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2336
2549
  "Commit: ",
2337
2550
  selectedRun.meta.commit,
2338
- " Branch: ",
2551
+ " Branch: ",
2339
2552
  selectedRun.meta.branch,
2553
+ " Seed:",
2340
2554
  " ",
2341
- "Seed: ",
2342
2555
  selectedRun.meta.seed
2343
2556
  ] }),
2344
2557
  /* @__PURE__ */ jsx(Text, { children: " " }),
@@ -2351,23 +2564,10 @@ function RunsView({
2351
2564
  format: (v) => `${v}%`
2352
2565
  }
2353
2566
  ),
2354
- /* @__PURE__ */ jsx(
2355
- TextBar,
2356
- {
2357
- label: "avg score",
2358
- value: Math.round(selectedRun.performance.avgScore * 100)
2359
- }
2360
- ),
2567
+ /* @__PURE__ */ jsx(TextBar, { label: "avg score", value: Math.round(selectedRun.performance.avgScore * 100) }),
2361
2568
  /* @__PURE__ */ jsx(Text, { children: " " }),
2362
2569
  /* @__PURE__ */ jsx(SectionHeader, { children: "Dimensions" }),
2363
- selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsx(
2364
- TextBar,
2365
- {
2366
- label: dimension.name,
2367
- value: dimension.score
2368
- },
2369
- dimension.name
2370
- )),
2570
+ selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsx(TextBar, { label: dimension.name, value: dimension.score }, dimension.name)),
2371
2571
  /* @__PURE__ */ jsx(Text, { children: " " }),
2372
2572
  /* @__PURE__ */ jsx(SectionHeader, { children: "Latency trend" }),
2373
2573
  /* @__PURE__ */ jsx(
@@ -2470,15 +2670,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2470
2670
  ...dimensions.map((d) => /* @__PURE__ */ jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
2471
2671
  /* @__PURE__ */ jsx(Text, { children: " " }, "sp2"),
2472
2672
  /* @__PURE__ */ jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
2473
- ...checks.map((c) => /* @__PURE__ */ jsx(
2474
- CheckRow,
2475
- {
2476
- name: c.name,
2477
- passed: c.passed,
2478
- detail: c.detail
2479
- },
2480
- `chk-${c.name}`
2481
- )),
2673
+ ...checks.map((c) => /* @__PURE__ */ jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
2482
2674
  /* @__PURE__ */ jsx(Text, { children: " " }, "sp3"),
2483
2675
  /* @__PURE__ */ jsx(SectionHeader, { children: "Performance" }, "perf-h"),
2484
2676
  /* @__PURE__ */ jsx(
@@ -2524,7 +2716,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2524
2716
  rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp6"));
2525
2717
  rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
2526
2718
  for (const tc of testCases) {
2527
- const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
2719
+ const repetitionPart = tc.repetitionCount != null && tc.repetitionCount > 1 && tc.repetitionIndex != null ? ` (${tc.repetitionIndex}/${tc.repetitionCount})` : "";
2528
2720
  rows.push(
2529
2721
  /* @__PURE__ */ jsxs(Text, { children: [
2530
2722
  /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
@@ -2536,13 +2728,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2536
2728
  ] }),
2537
2729
  " ",
2538
2730
  tc.testCaseName,
2539
- rerunPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: rerunPart }) : null,
2731
+ repetitionPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: repetitionPart }) : null,
2540
2732
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2541
2733
  " (",
2542
2734
  tc.durationMs,
2543
2735
  "ms)"
2544
2736
  ] })
2545
- ] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
2737
+ ] }, `tc-${tc.testCaseId}-${tc.repetitionId ?? "x"}-${tc.repetitionIndex ?? 0}`)
2546
2738
  );
2547
2739
  for (const item of tc.evaluatorScores) {
2548
2740
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
@@ -2595,17 +2787,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2595
2787
  }
2596
2788
  } else {
2597
2789
  rows.push(
2598
- /* @__PURE__ */ jsxs(
2599
- Text,
2600
- {
2601
- color: "gray",
2602
- children: [
2603
- " ",
2604
- "n/a"
2605
- ]
2606
- },
2607
- `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
2608
- )
2790
+ /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2791
+ " ",
2792
+ "n/a"
2793
+ ] }, `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`)
2609
2794
  );
2610
2795
  }
2611
2796
  if (!item.passed && item.logs && item.logs.length > 0) {
@@ -2663,7 +2848,7 @@ function RunDetailsView({
2663
2848
  const runs = dataset?.runs ?? [];
2664
2849
  const rightFocused = state.focus === "right";
2665
2850
  const [testCases, setTestCases] = useState([]);
2666
- const evaluatorNameById = React2.useMemo(
2851
+ const evaluatorNameById = React.useMemo(
2667
2852
  () => new Map(evaluators.map((e) => [e.id, e.name])),
2668
2853
  [evaluators]
2669
2854
  );
@@ -2686,7 +2871,7 @@ function RunDetailsView({
2686
2871
  const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
2687
2872
  return /* @__PURE__ */ jsxs(Fragment, { children: [
2688
2873
  /* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
2689
- /* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(React2.Fragment, { children: row }, i)) }) })
2874
+ /* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(React.Fragment, { children: row }, i)) }) })
2690
2875
  ] });
2691
2876
  }
2692
2877
  var LEFT_PANE_WIDTH3 = 44;
@@ -2709,19 +2894,11 @@ function NewEvaluationView({
2709
2894
  visibleEvaluators.map((evaluator, index) => {
2710
2895
  const selected = index === state.evaluatorMenuIndex;
2711
2896
  const inSelection = state.selectedEvaluatorIds.includes(evaluator.id);
2712
- return /* @__PURE__ */ jsxs(
2713
- Text,
2714
- {
2715
- color: selected ? "cyan" : "gray",
2716
- bold: selected,
2717
- children: [
2718
- selected ? "\u25B8 " : " ",
2719
- inSelection ? "[x] " : "[ ] ",
2720
- evaluator.name
2721
- ]
2722
- },
2723
- evaluator.id
2724
- );
2897
+ return /* @__PURE__ */ jsxs(Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
2898
+ selected ? "\u25B8 " : " ",
2899
+ inSelection ? "[x] " : "[ ] ",
2900
+ evaluator.name
2901
+ ] }, evaluator.id);
2725
2902
  })
2726
2903
  ] }),
2727
2904
  /* @__PURE__ */ jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
@@ -2753,26 +2930,16 @@ function clampCursor(state, filteredDatasetsLength, selectedRunCount) {
2753
2930
  ...state,
2754
2931
  datasetMenuIndex: Math.max(0, Math.min(state.datasetMenuIndex, datasetMax)),
2755
2932
  runMenuIndex: Math.max(0, Math.min(state.runMenuIndex, runMax)),
2756
- evaluatorMenuIndex: Math.max(
2757
- 0,
2758
- Math.min(state.evaluatorMenuIndex, evaluatorMax)
2759
- )
2933
+ evaluatorMenuIndex: Math.max(0, Math.min(state.evaluatorMenuIndex, evaluatorMax))
2760
2934
  };
2761
2935
  }
2762
- function EvalsCliApp({
2763
- data,
2764
- args,
2765
- runner
2766
- }) {
2936
+ function EvalsCliApp({ data, args, runner }) {
2767
2937
  const { exit } = useApp();
2768
2938
  const { width: stdoutWidth, height: stdoutHeight } = useScreenSize();
2769
2939
  const [liveData, setLiveData] = useState(data);
2770
2940
  const [runtimeMessage, setRuntimeMessage] = useState();
2771
2941
  const overviewRowCountRef = useRef(0);
2772
- const [state, dispatch] = useReducer(
2773
- reduceCliState,
2774
- createInitialState(data, args)
2775
- );
2942
+ const [state, dispatch] = useReducer(reduceCliState, createInitialState(data, args));
2776
2943
  useEffect(() => {
2777
2944
  setLiveData(data);
2778
2945
  }, [data]);
@@ -2804,14 +2971,8 @@ function EvalsCliApp({
2804
2971
  filteredDatasets.length,
2805
2972
  getDatasetByMenuIndex(filteredDatasets, state.datasetMenuIndex)?.runs.length ?? 0
2806
2973
  );
2807
- const selectedDataset = getDatasetByMenuIndex(
2808
- filteredDatasets,
2809
- clampedState.datasetMenuIndex
2810
- );
2811
- const selectedRun = getRunByMenuIndex(
2812
- selectedDataset,
2813
- clampedState.runMenuIndex
2814
- );
2974
+ const selectedDataset = getDatasetByMenuIndex(filteredDatasets, clampedState.datasetMenuIndex);
2975
+ const selectedRun = getRunByMenuIndex(selectedDataset, clampedState.runMenuIndex);
2815
2976
  const visibleEvaluators = liveData.evaluators.filter(
2816
2977
  (evaluator) => evaluator.name.toLowerCase().includes(clampedState.searchQuery.toLowerCase())
2817
2978
  );
@@ -2899,15 +3060,14 @@ function EvalsCliApp({
2899
3060
  }
2900
3061
  void runner.runDatasetWith({
2901
3062
  datasetId: selectedDataset.id,
2902
- evaluatorIds: clampedState.selectedEvaluatorIds
3063
+ evaluatorIds: clampedState.selectedEvaluatorIds,
3064
+ ...PROGRAMMATIC_RUN_CONFIG
2903
3065
  }).then((snapshot) => {
2904
3066
  setRuntimeMessage(
2905
3067
  `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
2906
3068
  );
2907
3069
  }).catch((error) => {
2908
- setRuntimeMessage(
2909
- error instanceof Error ? error.message : "Failed to start evaluation."
2910
- );
3070
+ setRuntimeMessage(error instanceof Error ? error.message : "Failed to start evaluation.");
2911
3071
  });
2912
3072
  }
2913
3073
  });
@@ -2934,14 +3094,7 @@ function EvalsCliApp({
2934
3094
  );
2935
3095
  }
2936
3096
  if (clampedState.level === "runs") {
2937
- return /* @__PURE__ */ jsx(
2938
- RunsView,
2939
- {
2940
- state: clampedState,
2941
- dataset: selectedDataset,
2942
- selectedRun
2943
- }
2944
- );
3097
+ return /* @__PURE__ */ jsx(RunsView, { state: clampedState, dataset: selectedDataset, selectedRun });
2945
3098
  }
2946
3099
  return /* @__PURE__ */ jsx(
2947
3100
  RunDetailsView,
@@ -2953,82 +3106,44 @@ function EvalsCliApp({
2953
3106
  }
2954
3107
  );
2955
3108
  };
2956
- return /* @__PURE__ */ jsxs(
2957
- Box,
2958
- {
2959
- flexDirection: "column",
2960
- flexGrow: 1,
2961
- width: stdoutWidth,
2962
- height: stdoutHeight,
2963
- children: [
2964
- /* @__PURE__ */ jsx(
2965
- Box,
2966
- {
2967
- borderStyle: "round",
2968
- borderColor: "cyan",
2969
- paddingX: 1,
2970
- width: stdoutWidth,
2971
- children: /* @__PURE__ */ jsx(Text, { children: getBreadcrumbText(
2972
- clampedState,
2973
- selectedDataset?.name,
2974
- selectedRun?.label
2975
- ) })
2976
- }
2977
- ),
2978
- clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxs(
2979
- Box,
2980
- {
2981
- marginTop: 1,
2982
- borderStyle: "round",
2983
- borderColor: "yellow",
2984
- paddingX: 1,
2985
- flexDirection: "column",
2986
- width: stdoutWidth,
2987
- children: [
2988
- /* @__PURE__ */ jsx(Text, { color: "yellow", children: "Startup warnings:" }),
2989
- clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsx(Text, { children: warning }, `${warning}-${index}`))
2990
- ]
2991
- }
2992
- ),
2993
- clampedState.searchMode && /* @__PURE__ */ jsxs(
2994
- Box,
2995
- {
2996
- marginTop: 1,
2997
- borderStyle: "round",
2998
- borderColor: "magenta",
2999
- paddingX: 1,
3000
- width: stdoutWidth,
3001
- children: [
3002
- /* @__PURE__ */ jsx(Text, { color: "magenta", bold: true, children: "Search: " }),
3003
- /* @__PURE__ */ jsx(Text, { color: "white", children: clampedState.searchQuery })
3004
- ]
3005
- }
3006
- ),
3007
- runtimeMessage && /* @__PURE__ */ jsx(
3008
- Box,
3009
- {
3010
- marginTop: 1,
3011
- borderStyle: "round",
3012
- borderColor: "blue",
3013
- paddingX: 1,
3014
- width: stdoutWidth,
3015
- children: /* @__PURE__ */ jsx(Text, { color: "blue", children: runtimeMessage })
3016
- }
3017
- ),
3018
- /* @__PURE__ */ jsx(
3019
- Box,
3020
- {
3021
- marginTop: 1,
3022
- flexGrow: 1,
3023
- width: stdoutWidth,
3024
- flexDirection: "row",
3025
- children: renderContent()
3026
- }
3027
- ),
3028
- /* @__PURE__ */ jsx(Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: getFooterText(clampedState) }) })
3029
- ]
3030
- }
3031
- );
3109
+ return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", flexGrow: 1, width: stdoutWidth, height: stdoutHeight, children: [
3110
+ /* @__PURE__ */ jsx(Box, { borderStyle: "round", borderColor: "cyan", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsx(Text, { children: getBreadcrumbText(clampedState, selectedDataset?.name, selectedRun?.label) }) }),
3111
+ clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxs(
3112
+ Box,
3113
+ {
3114
+ marginTop: 1,
3115
+ borderStyle: "round",
3116
+ borderColor: "yellow",
3117
+ paddingX: 1,
3118
+ flexDirection: "column",
3119
+ width: stdoutWidth,
3120
+ children: [
3121
+ /* @__PURE__ */ jsx(Text, { color: "yellow", children: "Startup warnings:" }),
3122
+ clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsx(Text, { children: warning }, `${warning}-${index}`))
3123
+ ]
3124
+ }
3125
+ ),
3126
+ clampedState.searchMode && /* @__PURE__ */ jsxs(
3127
+ Box,
3128
+ {
3129
+ marginTop: 1,
3130
+ borderStyle: "round",
3131
+ borderColor: "magenta",
3132
+ paddingX: 1,
3133
+ width: stdoutWidth,
3134
+ children: [
3135
+ /* @__PURE__ */ jsxs(Text, { color: "magenta", bold: true, children: [
3136
+ "Search:",
3137
+ " "
3138
+ ] }),
3139
+ /* @__PURE__ */ jsx(Text, { color: "white", children: clampedState.searchQuery })
3140
+ ]
3141
+ }
3142
+ ),
3143
+ runtimeMessage && /* @__PURE__ */ jsx(Box, { marginTop: 1, borderStyle: "round", borderColor: "blue", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsx(Text, { color: "blue", children: runtimeMessage }) }),
3144
+ /* @__PURE__ */ jsx(Box, { marginTop: 1, flexGrow: 1, width: stdoutWidth, flexDirection: "row", children: renderContent() }),
3145
+ /* @__PURE__ */ jsx(Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: getFooterText(clampedState) }) })
3146
+ ] });
3032
3147
  }
3033
3148
  async function main() {
3034
3149
  const args = parseStartupArgs(process.argv.slice(2));
@@ -3040,9 +3155,7 @@ async function main() {
3040
3155
  process.on("SIGTERM", () => {
3041
3156
  void runner.shutdown().finally(() => process.exit(0));
3042
3157
  });
3043
- withFullScreen(
3044
- /* @__PURE__ */ jsx(EvalsCliApp, { data, args, runner })
3045
- ).start();
3158
+ withFullScreen(/* @__PURE__ */ jsx(EvalsCliApp, { data, args, runner })).start();
3046
3159
  }
3047
3160
  void main();
3048
3161
  //# sourceMappingURL=out.js.map