@m4trix/evals 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -2,17 +2,18 @@
2
2
  'use strict';
3
3
 
4
4
  var fullscreenInk = require('fullscreen-ink');
5
- var React = require('react');
5
+ var React2 = require('react');
6
6
  var ink = require('ink');
7
7
  var jsxRuntime = require('react/jsx-runtime');
8
8
  var path = require('path');
9
- var jsonDiff = require('json-diff');
9
+ var inkChart = require('@pppp606/ink-chart');
10
10
  var crypto = require('crypto');
11
11
  var effect = require('effect');
12
12
  var fs = require('fs');
13
13
  var jitiModule = require('jiti');
14
14
  var promises = require('fs/promises');
15
15
  var url = require('url');
16
+ var diff = require('diff');
16
17
 
17
18
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
18
19
  function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
@@ -35,7 +36,7 @@ function _interopNamespace(e) {
35
36
  return Object.freeze(n);
36
37
  }
37
38
 
38
- var React__default = /*#__PURE__*/_interopDefault(React);
39
+ var React2__default = /*#__PURE__*/_interopDefault(React2);
39
40
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
40
41
 
41
42
  var SEP = " ";
@@ -104,7 +105,7 @@ function getBreadcrumbText(state, datasetName, runLabel) {
104
105
  // src/cli/components/Footer.tsx
105
106
  function getFooterText(state) {
106
107
  if (state.level === "datasets") {
107
- return "\u2191\u2193 move Enter open / search Tab focus q quit";
108
+ return state.focus === "right" ? "\u2191\u2193 scroll Tab focus left / search q quit" : "\u2191\u2193 move Enter open Tab focus right / search q quit";
108
109
  }
109
110
  if (state.level === "runs") {
110
111
  return "\u2191\u2193 move Enter details Backspace datasets Tab focus q quit";
@@ -636,6 +637,7 @@ function createInitialState(data, args) {
636
637
  datasetMenuIndex,
637
638
  runMenuIndex,
638
639
  detailsScrollOffset: 0,
640
+ overviewScrollOffset: 0,
639
641
  selectedEvaluatorIds: data.evaluators.slice(0, 2).map((item) => item.id),
640
642
  evaluatorMenuIndex: 0,
641
643
  searchQuery,
@@ -651,8 +653,11 @@ function reduceCliState(state, action) {
651
653
  if (state.level === "details" && state.focus === "right") {
652
654
  return { ...state, detailsScrollOffset: Math.max(0, state.detailsScrollOffset - 1) };
653
655
  }
656
+ if (state.level === "datasets" && state.focus === "right") {
657
+ return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
658
+ }
654
659
  if (state.level === "datasets") {
655
- return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1) };
660
+ return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1), overviewScrollOffset: 0 };
656
661
  }
657
662
  if (state.level === "runs") {
658
663
  return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
@@ -669,8 +674,11 @@ function reduceCliState(state, action) {
669
674
  if (state.level === "details" && state.focus === "right") {
670
675
  return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
671
676
  }
677
+ if (state.level === "datasets" && state.focus === "right") {
678
+ return { ...state, overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1) };
679
+ }
672
680
  if (state.level === "datasets") {
673
- return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1) };
681
+ return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1), overviewScrollOffset: 0 };
674
682
  }
675
683
  if (state.level === "runs") {
676
684
  return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
@@ -746,249 +754,6 @@ function reduceCliState(state, action) {
746
754
  }
747
755
  return state;
748
756
  }
749
- var LEFT_PANE_WIDTH2 = 44;
750
- function DatasetsView({
751
- state,
752
- filteredDatasets,
753
- selectedDataset
754
- }) {
755
- const leftFocused = state.focus === "left";
756
- const rightFocused = state.focus === "right";
757
- return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
758
- /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
759
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
760
- /* @__PURE__ */ jsxRuntime.jsx(
761
- ListItem,
762
- {
763
- selected: state.datasetMenuIndex === 0,
764
- label: "New evaluation",
765
- itemKey: "datasets-new-eval"
766
- }
767
- ),
768
- filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsxRuntime.jsx(
769
- ListItem,
770
- {
771
- selected: state.datasetMenuIndex === index + 1,
772
- label: dataset.name,
773
- itemKey: `dataset-${dataset.id}`
774
- },
775
- dataset.id
776
- ))
777
- ] }),
778
- /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
779
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
780
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." })
781
- ] })
782
- ] });
783
- }
784
- function RunsView({
785
- state,
786
- dataset,
787
- selectedRun
788
- }) {
789
- const runs = dataset?.runs ?? [];
790
- const rightFocused = state.focus === "right";
791
- return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
792
- /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
793
- /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: !selectedRun ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." }) : /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
794
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
795
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
796
- " ",
797
- selectedRun.label,
798
- " ",
799
- /* @__PURE__ */ jsxRuntime.jsx(StatusText, { status: selectedRun.status })
800
- ] }),
801
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
802
- "Commit: ",
803
- selectedRun.meta.commit,
804
- " Branch: ",
805
- selectedRun.meta.branch,
806
- " ",
807
- "Seed: ",
808
- selectedRun.meta.seed
809
- ] }),
810
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
811
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overall" }),
812
- /* @__PURE__ */ jsxRuntime.jsx(
813
- TextBar,
814
- {
815
- label: "pass rate",
816
- value: selectedRun.performance.passRate,
817
- format: (v) => `${v}%`
818
- }
819
- ),
820
- /* @__PURE__ */ jsxRuntime.jsx(
821
- TextBar,
822
- {
823
- label: "avg score",
824
- value: Math.round(selectedRun.performance.avgScore * 100)
825
- }
826
- ),
827
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
828
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
829
- selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
830
- TextBar,
831
- {
832
- label: dimension.name,
833
- value: dimension.score
834
- },
835
- dimension.name
836
- )),
837
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
838
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
839
- /* @__PURE__ */ jsxRuntime.jsx(
840
- Sparkline,
841
- {
842
- data: selectedRun.performance.latencyHistoryMs ?? [
843
- selectedRun.performance.latencyAvgMs - 40,
844
- selectedRun.performance.latencyAvgMs - 10,
845
- selectedRun.performance.latencyAvgMs + 20,
846
- selectedRun.performance.latencyP95Ms - 80,
847
- selectedRun.performance.latencyP95Ms
848
- ],
849
- width: 24
850
- }
851
- )
852
- ] }) })
853
- ] });
854
- }
855
-
856
- // src/evals/metric.ts
857
- var registry = /* @__PURE__ */ new Map();
858
- var Metric = {
859
- of(config) {
860
- const def = {
861
- id: config.id,
862
- name: config.name,
863
- format: config.format,
864
- make: (data) => ({ id: config.id, data })
865
- };
866
- registry.set(config.id, def);
867
- return def;
868
- }
869
- };
870
- function getMetricById(id) {
871
- return registry.get(id);
872
- }
873
-
874
- // src/evals/score.ts
875
- var registry2 = /* @__PURE__ */ new Map();
876
- var Score = {
877
- of(config) {
878
- const def = {
879
- id: config.id,
880
- name: config.name,
881
- displayStrategy: config.displayStrategy,
882
- format: config.format,
883
- make: (data, options) => {
884
- const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
885
- return {
886
- id: config.id,
887
- data,
888
- ...passed !== void 0 && { passed }
889
- };
890
- }
891
- };
892
- registry2.set(config.id, def);
893
- return def;
894
- }
895
- };
896
- function getScoreById(id) {
897
- return registry2.get(id);
898
- }
899
-
900
- // src/evals/metrics/standard.ts
901
- Metric.of({
902
- id: "token-count",
903
- name: "Tokens",
904
- format: (data) => {
905
- const input = data.input ?? 0;
906
- const output = data.output ?? 0;
907
- const inputCached = data.inputCached ?? 0;
908
- const outputCached = data.outputCached ?? 0;
909
- const cached = inputCached + outputCached;
910
- return `in:${input} out:${output} cached:${cached}`;
911
- }
912
- });
913
- Metric.of({
914
- id: "latency",
915
- name: "Latency",
916
- format: (data) => `${data.ms}ms`
917
- });
918
-
919
- // src/evals/scores/standard.ts
920
- Score.of({
921
- id: "percent",
922
- name: "Score",
923
- displayStrategy: "bar",
924
- format: (data) => data.value.toFixed(2)
925
- });
926
- Score.of({
927
- id: "binary",
928
- name: "Result",
929
- displayStrategy: "passFail",
930
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
931
- });
932
- function createDiffLogEntry(expected, actual, options) {
933
- const diff = jsonDiff.diffString(expected, actual, { color: false });
934
- return {
935
- type: "diff",
936
- label: options?.label,
937
- expected,
938
- actual,
939
- diff: diff || "(no differences)"
940
- };
941
- }
942
- function getDiffLines(entry) {
943
- const raw = jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
944
- return raw.split("\n").map((line) => {
945
- const trimmed = line.trimStart();
946
- if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
947
- return { type: "remove", line };
948
- }
949
- if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
950
- return { type: "add", line };
951
- }
952
- return { type: "context", line };
953
- });
954
- }
955
-
956
- // src/runner/score-utils.ts
957
- function toNumericScoreFromScores(scores) {
958
- for (const item of scores) {
959
- const def = getScoreById(item.id);
960
- if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
961
- const value = item.data.value;
962
- if (typeof value === "number" && Number.isFinite(value)) {
963
- return value;
964
- }
965
- }
966
- const numeric = toNumericScore(item.data);
967
- if (numeric !== void 0) {
968
- return numeric;
969
- }
970
- }
971
- return void 0;
972
- }
973
- function toNumericScore(value) {
974
- if (typeof value === "number" && Number.isFinite(value)) {
975
- return value;
976
- }
977
- if (typeof value !== "object" || value === null) {
978
- return void 0;
979
- }
980
- const obj = value;
981
- if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
982
- return obj.score;
983
- }
984
- const numberValues = Object.values(value).filter(
985
- (entry) => typeof entry === "number" && Number.isFinite(entry)
986
- );
987
- if (numberValues.length === 0) {
988
- return void 0;
989
- }
990
- return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
991
- }
992
757
 
993
758
  // src/runner/config.ts
994
759
  var defaultRunnerConfig = {
@@ -1009,7 +774,8 @@ var defaultRunnerConfig = {
1009
774
  ],
1010
775
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
1011
776
  },
1012
- artifactDirectory: ".eval-results"
777
+ artifactDirectory: ".eval-results",
778
+ maxConcurrency: 1
1013
779
  };
1014
780
  function toRunnerConfigOverrides(config) {
1015
781
  if (!config) {
@@ -1042,6 +808,9 @@ function toRunnerConfigOverrides(config) {
1042
808
  if (config.artifactDirectory !== void 0) {
1043
809
  overrides.artifactDirectory = config.artifactDirectory;
1044
810
  }
811
+ if (config.maxConcurrency !== void 0) {
812
+ overrides.maxConcurrency = config.maxConcurrency;
813
+ }
1045
814
  if (Object.keys(discovery).length > 0) {
1046
815
  overrides.discovery = discovery;
1047
816
  }
@@ -1216,25 +985,261 @@ async function collectEvaluatorsFromFiles(config) {
1216
985
  );
1217
986
  return found.flat();
1218
987
  }
1219
- async function collectTestCasesFromFiles(config) {
1220
- const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1221
- const matched = files.filter(
1222
- (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
1223
- );
1224
- const found = await Promise.all(
1225
- matched.map(async (absolutePath) => {
1226
- const exports = await loadModuleExports(absolutePath);
1227
- const testCases = exports.filter(isTestCaseLike);
1228
- const relPath = path.relative(config.rootDir, absolutePath);
1229
- return testCases.map((testCase) => ({
1230
- id: toId("test-case", relPath, testCase.getName()),
1231
- filePath: relPath,
1232
- testCase
1233
- }));
1234
- })
988
+ async function collectTestCasesFromFiles(config) {
989
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
990
+ const matched = files.filter(
991
+ (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
992
+ );
993
+ const found = await Promise.all(
994
+ matched.map(async (absolutePath) => {
995
+ const exports = await loadModuleExports(absolutePath);
996
+ const testCases = exports.filter(isTestCaseLike);
997
+ const relPath = path.relative(config.rootDir, absolutePath);
998
+ return testCases.map((testCase) => ({
999
+ id: toId("test-case", relPath, testCase.getName()),
1000
+ filePath: relPath,
1001
+ testCase
1002
+ }));
1003
+ })
1004
+ );
1005
+ return found.flat();
1006
+ }
1007
+ function toJsonLines(value) {
1008
+ try {
1009
+ return JSON.stringify(value, null, 2);
1010
+ } catch {
1011
+ return String(value);
1012
+ }
1013
+ }
1014
+ function formatDiffString(changes) {
1015
+ const lines = [];
1016
+ for (const part of changes) {
1017
+ const prefix = part.added ? "+" : part.removed ? "-" : " ";
1018
+ const partLines = part.value.split("\n");
1019
+ if (partLines[partLines.length - 1] === "") {
1020
+ partLines.pop();
1021
+ }
1022
+ for (const line of partLines) {
1023
+ lines.push(`${prefix} ${line}`);
1024
+ }
1025
+ }
1026
+ return lines.join("\n");
1027
+ }
1028
+ function createDiffString(expected, actual) {
1029
+ const expectedStr = toJsonLines(expected);
1030
+ const actualStr = toJsonLines(actual);
1031
+ const changes = diff.diffLines(expectedStr, actualStr);
1032
+ return formatDiffString(changes);
1033
+ }
1034
+ function createDiffLogEntry(expected, actual, options) {
1035
+ const diff = createDiffString(expected, actual);
1036
+ return {
1037
+ type: "diff",
1038
+ label: options?.label,
1039
+ expected,
1040
+ actual,
1041
+ diff: diff || "(no differences)"
1042
+ };
1043
+ }
1044
+ function getDiffLines(entry) {
1045
+ const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
1046
+ return raw.split("\n").map((line) => {
1047
+ const trimmed = line.trimStart();
1048
+ if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
1049
+ return { type: "remove", line };
1050
+ }
1051
+ if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
1052
+ return { type: "add", line };
1053
+ }
1054
+ return { type: "context", line };
1055
+ });
1056
+ }
1057
+
1058
+ // src/evals/metric.ts
1059
+ var registry = /* @__PURE__ */ new Map();
1060
+ var Metric = {
1061
+ of(config) {
1062
+ const def = {
1063
+ id: config.id,
1064
+ name: config.name,
1065
+ aggregate: config.aggregate,
1066
+ format: config.format,
1067
+ make: (data) => ({ id: config.id, data })
1068
+ };
1069
+ registry.set(config.id, def);
1070
+ return def;
1071
+ }
1072
+ };
1073
+ function getMetricById(id) {
1074
+ return registry.get(id);
1075
+ }
1076
+
1077
+ // src/evals/score.ts
1078
+ var registry2 = /* @__PURE__ */ new Map();
1079
+ var Score = {
1080
+ of(config) {
1081
+ const def = {
1082
+ id: config.id,
1083
+ name: config.name,
1084
+ displayStrategy: config.displayStrategy,
1085
+ aggregate: config.aggregate,
1086
+ format: config.format,
1087
+ make: (data, options) => {
1088
+ const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
1089
+ return {
1090
+ id: config.id,
1091
+ data,
1092
+ ...passed !== void 0 && { passed }
1093
+ };
1094
+ }
1095
+ };
1096
+ registry2.set(config.id, def);
1097
+ return def;
1098
+ }
1099
+ };
1100
+ function getScoreById(id) {
1101
+ return registry2.get(id);
1102
+ }
1103
+
1104
+ // src/evals/aggregators.ts
1105
+ function aggregateAverageWithVariance(values) {
1106
+ if (values.length === 0) {
1107
+ return { value: 0, count: 0 };
1108
+ }
1109
+ const sum = values.reduce((s, v) => s + v.value, 0);
1110
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1111
+ const mean = sum / values.length;
1112
+ let stdDev;
1113
+ if (values.length >= 2) {
1114
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1115
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1116
+ }
1117
+ return { value: mean, stdDev, count: values.length };
1118
+ }
1119
+ function aggregateAll(values) {
1120
+ const total = values.length;
1121
+ const passedCount = values.filter((v) => v.passed).length;
1122
+ return {
1123
+ passed: total > 0 && values.every((v) => v.passed),
1124
+ passedCount,
1125
+ totalCount: total
1126
+ };
1127
+ }
1128
+ function aggregateTokenCountSum(values) {
1129
+ const initial = {
1130
+ input: 0,
1131
+ output: 0,
1132
+ inputCached: 0,
1133
+ outputCached: 0
1134
+ };
1135
+ return values.reduce(
1136
+ (acc, v) => ({
1137
+ input: acc.input + (v.input ?? 0),
1138
+ output: acc.output + (v.output ?? 0),
1139
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
1140
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
1141
+ }),
1142
+ initial
1143
+ );
1144
+ }
1145
+ function aggregateLatencyAverage(values) {
1146
+ if (values.length === 0) {
1147
+ return { ms: 0 };
1148
+ }
1149
+ const sum = values.reduce((s, v) => s + v.ms, 0);
1150
+ return { ms: sum / values.length };
1151
+ }
1152
+
1153
+ // src/evals/metrics/standard.ts
1154
+ Metric.of({
1155
+ id: "token-count",
1156
+ name: "Tokens",
1157
+ aggregate: aggregateTokenCountSum,
1158
+ format: (data, options) => {
1159
+ const input = data.input ?? 0;
1160
+ const output = data.output ?? 0;
1161
+ const inputCached = data.inputCached ?? 0;
1162
+ const outputCached = data.outputCached ?? 0;
1163
+ const cached = inputCached + outputCached;
1164
+ const base = `in:${input} out:${output} cached:${cached}`;
1165
+ return options?.isAggregated ? `Total: ${base}` : base;
1166
+ }
1167
+ });
1168
+ Metric.of({
1169
+ id: "latency",
1170
+ name: "Latency",
1171
+ aggregate: aggregateLatencyAverage,
1172
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1173
+ });
1174
+
1175
+ // src/evals/scores/standard.ts
1176
+ Score.of({
1177
+ id: "percent",
1178
+ name: "Score",
1179
+ displayStrategy: "bar",
1180
+ format: (data, options) => {
1181
+ if (options?.isAggregated) {
1182
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
1183
+ }
1184
+ return data.value.toFixed(2);
1185
+ },
1186
+ aggregate: aggregateAverageWithVariance
1187
+ });
1188
+ Score.of({
1189
+ id: "binary",
1190
+ name: "Result",
1191
+ displayStrategy: "passFail",
1192
+ format: (data, options) => {
1193
+ if (options?.isAggregated) {
1194
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
1195
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1196
+ return `${base} (${data.passedCount}/${data.totalCount})`;
1197
+ }
1198
+ return base;
1199
+ }
1200
+ return data.passed ? "PASSED" : "NOT PASSED";
1201
+ },
1202
+ aggregate: aggregateAll
1203
+ });
1204
+
1205
+ // src/runner/score-utils.ts
1206
+ function toNumericScoreFromScores(scores) {
1207
+ for (const item of scores) {
1208
+ const def = getScoreById(item.id);
1209
+ if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
1210
+ const value = item.data.value;
1211
+ if (typeof value === "number" && Number.isFinite(value)) {
1212
+ return value;
1213
+ }
1214
+ }
1215
+ const numeric = toNumericScore(item.data);
1216
+ if (numeric !== void 0) {
1217
+ return numeric;
1218
+ }
1219
+ }
1220
+ return void 0;
1221
+ }
1222
+ function toNumericScore(value) {
1223
+ if (typeof value === "number" && Number.isFinite(value)) {
1224
+ return value;
1225
+ }
1226
+ if (typeof value !== "object" || value === null) {
1227
+ return void 0;
1228
+ }
1229
+ const obj = value;
1230
+ if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
1231
+ return obj.score;
1232
+ }
1233
+ const numberValues = Object.values(value).filter(
1234
+ (entry) => typeof entry === "number" && Number.isFinite(entry)
1235
1235
  );
1236
- return found.flat();
1236
+ if (numberValues.length === 0) {
1237
+ return void 0;
1238
+ }
1239
+ return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
1237
1240
  }
1241
+
1242
+ // src/runner/execution.ts
1238
1243
  function computeEvaluatorPassed(evaluator, result, scores) {
1239
1244
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1240
1245
  if (scoresWithPassed.length > 0) {
@@ -1276,6 +1281,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1276
1281
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1277
1282
  );
1278
1283
  }
1284
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1285
+ return effect.Effect.gen(function* () {
1286
+ const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1287
+ const rerunPassed = [];
1288
+ for (let r = 0; r < reruns; r++) {
1289
+ const started = Date.now();
1290
+ const evaluatorScores = [];
1291
+ let testCaseError;
1292
+ const output = readOutput(testCaseItem.testCase);
1293
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
1294
+ const evaluateFn = evaluator.getEvaluateFn();
1295
+ if (!evaluateFn) {
1296
+ continue;
1297
+ }
1298
+ try {
1299
+ const logs = [];
1300
+ const logDiff = (expected, actual, options) => {
1301
+ logs.push(createDiffLogEntry(expected, actual, options));
1302
+ };
1303
+ const ctx = yield* effect.Effect.promise(
1304
+ () => Promise.resolve(evaluator.resolveContext())
1305
+ );
1306
+ const result = yield* effect.Effect.promise(
1307
+ () => Promise.resolve(
1308
+ evaluateFn({
1309
+ input: testCaseItem.testCase.getInput(),
1310
+ ctx,
1311
+ output,
1312
+ logDiff
1313
+ })
1314
+ )
1315
+ );
1316
+ const { scores, metrics } = normalizeResult(result);
1317
+ const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1318
+ evaluatorScores.push({
1319
+ evaluatorId,
1320
+ scores,
1321
+ passed: passed2,
1322
+ metrics,
1323
+ logs: logs.length > 0 ? logs : void 0
1324
+ });
1325
+ } catch (error) {
1326
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1327
+ evaluatorScores.push({
1328
+ evaluatorId,
1329
+ scores: [],
1330
+ passed: false
1331
+ });
1332
+ }
1333
+ }
1334
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1335
+ rerunPassed.push(rerunPassedThis);
1336
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
1337
+ n + 1,
1338
+ n + 1
1339
+ ]);
1340
+ const progressEvent = {
1341
+ type: "TestCaseProgress",
1342
+ runId: task.runId,
1343
+ testCaseId: testCaseItem.id,
1344
+ testCaseName: testCaseItem.testCase.getName(),
1345
+ completedTestCases: completedEvaluations,
1346
+ totalTestCases: totalEvaluations,
1347
+ rerunIndex: r + 1,
1348
+ rerunTotal: reruns,
1349
+ passed: rerunPassedThis,
1350
+ durationMs: Date.now() - started,
1351
+ evaluatorScores,
1352
+ output,
1353
+ errorMessage: testCaseError
1354
+ };
1355
+ updateSnapshot(task.runId, (snapshot) => ({
1356
+ ...snapshot,
1357
+ completedTestCases: completedEvaluations
1358
+ }));
1359
+ yield* publishEvent(progressEvent);
1360
+ yield* effect.Queue.offer(persistenceQueue, {
1361
+ runId: task.runId,
1362
+ artifactPath: task.snapshot.artifactPath,
1363
+ payload: progressEvent
1364
+ });
1365
+ }
1366
+ const testCasePassed = rerunPassed.every(Boolean);
1367
+ if (testCasePassed) {
1368
+ yield* effect.Ref.update(passedRef, (n) => n + 1);
1369
+ } else {
1370
+ yield* effect.Ref.update(failedRef, (n) => n + 1);
1371
+ }
1372
+ const [passed, failed] = yield* effect.Effect.all([
1373
+ effect.Ref.get(passedRef),
1374
+ effect.Ref.get(failedRef)
1375
+ ]);
1376
+ updateSnapshot(task.runId, (snapshot) => ({
1377
+ ...snapshot,
1378
+ passedTestCases: passed,
1379
+ failedTestCases: failed
1380
+ }));
1381
+ });
1382
+ }
1279
1383
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
1280
1384
  const startedAt = Date.now();
1281
1385
  updateSnapshot(task.runId, (snapshot) => ({
@@ -1288,104 +1392,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
1288
1392
  runId: task.runId,
1289
1393
  startedAt
1290
1394
  });
1291
- let completedTestCases = 0;
1292
- let passedTestCases = 0;
1293
- let failedTestCases = 0;
1294
- for (const testCaseItem of task.testCases) {
1295
- const started = Date.now();
1296
- const evaluatorScores = [];
1297
- let testCaseError;
1298
- const output = readOutput(testCaseItem.testCase);
1299
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
1300
- const evaluateFn = evaluator.getEvaluateFn();
1301
- if (!evaluateFn) {
1302
- continue;
1303
- }
1304
- try {
1305
- const logs = [];
1306
- const logDiff = (expected, actual, options) => {
1307
- logs.push(createDiffLogEntry(expected, actual, options));
1308
- };
1309
- const ctx = yield* effect.Effect.promise(
1310
- () => Promise.resolve(evaluator.resolveContext())
1311
- );
1312
- const result = yield* effect.Effect.promise(
1313
- () => Promise.resolve(
1314
- evaluateFn({
1315
- input: testCaseItem.testCase.getInput(),
1316
- ctx,
1317
- output,
1318
- logDiff
1319
- })
1320
- )
1321
- );
1322
- const { scores, metrics } = normalizeResult(result);
1323
- const passed = computeEvaluatorPassed(evaluator, result, scores);
1324
- evaluatorScores.push({
1325
- evaluatorId,
1326
- scores,
1327
- passed,
1328
- metrics,
1329
- logs: logs.length > 0 ? logs : void 0
1330
- });
1331
- } catch (error) {
1332
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1333
- evaluatorScores.push({
1334
- evaluatorId,
1335
- scores: [],
1336
- passed: false
1337
- });
1338
- }
1339
- }
1340
- const testCasePassed = evaluatorScores.every((s) => s.passed);
1341
- completedTestCases += 1;
1342
- if (testCasePassed) {
1343
- passedTestCases += 1;
1344
- } else {
1345
- failedTestCases += 1;
1346
- }
1347
- const progressEvent = {
1348
- type: "TestCaseProgress",
1349
- runId: task.runId,
1350
- testCaseId: testCaseItem.id,
1351
- testCaseName: testCaseItem.testCase.getName(),
1352
- completedTestCases,
1353
- totalTestCases: task.testCases.length,
1354
- passed: testCasePassed,
1355
- durationMs: Date.now() - started,
1356
- evaluatorScores,
1357
- output,
1358
- errorMessage: testCaseError
1359
- };
1360
- updateSnapshot(task.runId, (snapshot) => ({
1361
- ...snapshot,
1362
- completedTestCases,
1363
- passedTestCases,
1364
- failedTestCases
1365
- }));
1366
- yield* publishEvent(progressEvent);
1367
- yield* effect.Queue.offer(persistenceQueue, {
1368
- runId: task.runId,
1369
- artifactPath: task.snapshot.artifactPath,
1370
- payload: progressEvent
1371
- });
1372
- }
1395
+ const totalEvaluations = task.testCases.reduce(
1396
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1397
+ 0
1398
+ );
1399
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1400
+ const completedRef = yield* effect.Ref.make(0);
1401
+ const passedRef = yield* effect.Ref.make(0);
1402
+ const failedRef = yield* effect.Ref.make(0);
1403
+ const processTestCase = (testCaseItem) => processOneTestCase(
1404
+ task,
1405
+ testCaseItem,
1406
+ totalEvaluations,
1407
+ publishEvent,
1408
+ persistenceQueue,
1409
+ updateSnapshot,
1410
+ completedRef,
1411
+ passedRef,
1412
+ failedRef
1413
+ );
1414
+ yield* effect.Effect.forEach(
1415
+ task.testCases,
1416
+ processTestCase,
1417
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1418
+ );
1419
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
1420
+ effect.Ref.get(completedRef),
1421
+ effect.Ref.get(passedRef),
1422
+ effect.Ref.get(failedRef)
1423
+ ]);
1373
1424
  const finishedAt = Date.now();
1374
1425
  const completedEvent = {
1375
1426
  type: "RunCompleted",
1376
1427
  runId: task.runId,
1377
1428
  finishedAt,
1378
- passedTestCases,
1379
- failedTestCases,
1429
+ passedTestCases: passedUniqueTestCases,
1430
+ failedTestCases: failedUniqueTestCases,
1380
1431
  totalTestCases: task.testCases.length,
1381
1432
  artifactPath: task.snapshot.artifactPath
1382
1433
  };
1383
1434
  updateSnapshot(task.runId, (snapshot) => ({
1384
1435
  ...snapshot,
1385
1436
  status: "completed",
1386
- completedTestCases,
1387
- passedTestCases,
1388
- failedTestCases,
1437
+ completedTestCases: completedEvaluations,
1438
+ passedTestCases: passedUniqueTestCases,
1439
+ failedTestCases: failedUniqueTestCases,
1389
1440
  finishedAt
1390
1441
  }));
1391
1442
  yield* publishEvent(completedEvent);
@@ -1473,7 +1524,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
1473
1524
  const artifactPath = filePath;
1474
1525
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1475
1526
  const progress = aggregateTestCaseProgress(lines);
1476
- const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
1527
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1477
1528
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1478
1529
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1479
1530
  return {
@@ -1495,23 +1546,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
1495
1546
  }
1496
1547
  function aggregateTestCaseProgress(lines) {
1497
1548
  let completedTestCases = 0;
1498
- let passedTestCases = 0;
1499
- let failedTestCases = 0;
1549
+ const testCasePassedBy = /* @__PURE__ */ new Map();
1500
1550
  for (const line of lines) {
1501
1551
  try {
1502
1552
  const event = JSON.parse(line);
1503
1553
  if (event.type === "TestCaseProgress") {
1504
1554
  const ev = event;
1505
1555
  completedTestCases = ev.completedTestCases ?? completedTestCases;
1506
- if (ev.passed) {
1507
- passedTestCases += 1;
1508
- } else {
1509
- failedTestCases += 1;
1510
- }
1556
+ const id = ev.testCaseId;
1557
+ const current = testCasePassedBy.get(id);
1558
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1511
1559
  }
1512
1560
  } catch {
1513
1561
  }
1514
1562
  }
1563
+ let passedTestCases = 0;
1564
+ let failedTestCases = 0;
1565
+ for (const passed of testCasePassedBy.values()) {
1566
+ if (passed) {
1567
+ passedTestCases += 1;
1568
+ } else {
1569
+ failedTestCases += 1;
1570
+ }
1571
+ }
1515
1572
  return { completedTestCases, passedTestCases, failedTestCases };
1516
1573
  }
1517
1574
  async function parseArtifactFile(artifactPath) {
@@ -1529,6 +1586,8 @@ async function parseArtifactFile(artifactPath) {
1529
1586
  testCaseName: ev.testCaseName,
1530
1587
  completedTestCases: ev.completedTestCases,
1531
1588
  totalTestCases: ev.totalTestCases,
1589
+ rerunIndex: ev.rerunIndex,
1590
+ rerunTotal: ev.rerunTotal,
1532
1591
  passed: ev.passed,
1533
1592
  durationMs: ev.durationMs,
1534
1593
  evaluatorScores: ev.evaluatorScores ?? []
@@ -1734,6 +1793,10 @@ var EffectRunner = class {
1734
1793
  throw new Error("No evaluators selected for run");
1735
1794
  }
1736
1795
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1796
+ const totalEvaluations = selectedTestCases.reduce(
1797
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1798
+ 0
1799
+ );
1737
1800
  const runId = `run-${crypto.randomUUID()}`;
1738
1801
  const artifactPath = createArtifactPath(
1739
1802
  this.config.artifactDirectory,
@@ -1746,7 +1809,7 @@ var EffectRunner = class {
1746
1809
  datasetName: dataset.dataset.getName(),
1747
1810
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1748
1811
  queuedAt: Date.now(),
1749
- totalTestCases: selectedTestCases.length,
1812
+ totalTestCases: totalEvaluations,
1750
1813
  completedTestCases: 0,
1751
1814
  passedTestCases: 0,
1752
1815
  failedTestCases: 0,
@@ -1760,7 +1823,7 @@ var EffectRunner = class {
1760
1823
  datasetId: request.datasetId,
1761
1824
  datasetName: dataset.dataset.getName(),
1762
1825
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1763
- totalTestCases: selectedTestCases.length,
1826
+ totalTestCases: totalEvaluations,
1764
1827
  artifactPath
1765
1828
  };
1766
1829
  await effect.Effect.runPromise(this.publishEvent(queuedEvent));
@@ -1771,6 +1834,7 @@ var EffectRunner = class {
1771
1834
  payload: queuedEvent
1772
1835
  })
1773
1836
  );
1837
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1774
1838
  await effect.Effect.runPromise(
1775
1839
  effect.Queue.offer(this.runQueue, {
1776
1840
  runId,
@@ -1778,7 +1842,8 @@ var EffectRunner = class {
1778
1842
  dataset: dataset.dataset,
1779
1843
  evaluators: selectedEvaluators,
1780
1844
  testCases: selectedTestCases,
1781
- snapshot
1845
+ snapshot,
1846
+ maxConcurrency
1782
1847
  })
1783
1848
  );
1784
1849
  return snapshot;
@@ -1845,6 +1910,240 @@ var EffectRunner = class {
1845
1910
  );
1846
1911
  }
1847
1912
  };
1913
+ var LEFT_PANE_WIDTH2 = 44;
1914
+ var MAX_RUNS_FOR_CHART = 12;
1915
+ var MAX_RUNS_FOR_TREND = 20;
1916
+ var TREND_BATCH_SIZE = 4;
1917
/**
 * Averages the numeric evaluator scores found across a run's test cases.
 *
 * Each evaluator score entry is converted with `toNumericScoreFromScores`;
 * entries for which that helper yields `undefined` are left out of the mean.
 *
 * @param {Iterable<{evaluatorScores: Iterable<{scores: unknown}>}>} testCases
 * @returns {number | undefined} The mean, or `undefined` when no entry
 *   produced a numeric score.
 */
function extractRunAverageScore(testCases) {
  let total = 0;
  let count = 0;
  for (const { evaluatorScores } of testCases) {
    for (const { scores } of evaluatorScores) {
      const numeric = toNumericScoreFromScores(scores);
      if (numeric === void 0) {
        continue;
      }
      total += numeric;
      count += 1;
    }
  }
  return count === 0 ? void 0 : total / count;
}
1931
/**
 * Loads the average evaluator score of each run from its artifact file.
 *
 * Runs without `meta.artifact`, runs whose artifact cannot be read or
 * parsed, and runs that yield no numeric score are omitted. The result keeps
 * the order of `runs`.
 *
 * The artifacts are independent of one another, so they are loaded
 * concurrently instead of one after another.
 *
 * @param {Array<{id: unknown, label: unknown, meta?: {artifact?: string}}>} runs
 * @returns {Promise<Array<{runId: unknown, label: unknown, value: number}>>}
 */
async function loadRunScores(runs) {
  const loaded = await Promise.all(
    runs.map(async (run) => {
      const artifact = run.meta?.artifact;
      if (!artifact) {
        return void 0;
      }
      try {
        const testCases = await parseArtifactFile(path.resolve(artifact));
        const value = extractRunAverageScore(testCases);
        return value === void 0 ? void 0 : { runId: run.id, label: run.label, value };
      } catch {
        // Best-effort: a missing or corrupt artifact only drops that run
        // from the result instead of failing the whole load.
        return void 0;
      }
    })
  );
  return loaded.filter((entry) => entry !== void 0);
}
1953
/**
 * Collapses a numeric series into the mean of each consecutive batch.
 *
 * The last batch may hold fewer than `batchSize` values; it is averaged over
 * the values it actually contains.
 *
 * @param {number[]} values - Series to batch, in order.
 * @param {number} batchSize - Number of values per batch; a positive integer.
 * @returns {number[]} One mean per batch (empty when `values` is empty).
 * @throws {RangeError} If `batchSize` is not a positive integer. A step of
 *   zero or less would never advance the loop.
 */
function batchAverage(values, batchSize) {
  if (!Number.isInteger(batchSize) || batchSize < 1) {
    throw new RangeError(`batchSize must be a positive integer, received ${batchSize}`);
  }
  const batches = [];
  for (let start = 0; start < values.length; start += batchSize) {
    // `start < values.length` guarantees the slice holds at least one value.
    const batch = values.slice(start, start + batchSize);
    batches.push(batch.reduce((sum, value) => sum + value, 0) / batch.length);
  }
  return batches;
}
1963
+ var OVERVIEW_PAGE_SIZE = 15;
1964
/**
 * Datasets screen: a left pane listing "New evaluation" plus the filtered
 * datasets, and a right "Overview" pane for the selected dataset.
 *
 * The overview is built as a flat list of rows (overview text, per-run score
 * bars, batched trend graph) and shown one page of `OVERVIEW_PAGE_SIZE` rows
 * at a time, starting at `state.overviewScrollOffset`.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; reads `focus`, `datasetMenuIndex`
 *   and `overviewScrollOffset`.
 * @param {Array<{id: string, name: string}>} props.filteredDatasets
 * @param {object} [props.selectedDataset] - Reads `id`, `overview`, `runs`.
 * @param {{current: number}} [props.overviewRowCountRef] - When given, its
 *   `current` is set to the total number of overview rows on every render.
 */
function DatasetsView({
  state,
  filteredDatasets,
  selectedDataset,
  overviewRowCountRef
}) {
  const leftFocused = state.focus === "left";
  const rightFocused = state.focus === "right";
  const [runScores, setRunScores] = React2.useState([]);
  const [loading, setLoading] = React2.useState(false);
  React2.useEffect(() => {
    if (!selectedDataset?.runs?.length) {
      setRunScores([]);
      // A load for the previous dataset may have been cancelled before it
      // could clear the flag, so clear it here.
      setLoading(false);
      return void 0;
    }
    // Guards against a stale load (for a dataset that is no longer selected)
    // resolving late and overwriting the scores of the current one.
    let cancelled = false;
    setLoading(true);
    const runs = selectedDataset.runs.slice(0, MAX_RUNS_FOR_TREND);
    loadRunScores(runs).then(
      (scores) => {
        if (!cancelled) {
          setRunScores(scores);
        }
      },
      () => {
        if (!cancelled) {
          setRunScores([]);
        }
      }
    ).finally(() => {
      if (!cancelled) {
        setLoading(false);
      }
    });
    return () => {
      cancelled = true;
    };
  }, [selectedDataset?.id, selectedDataset?.runs?.length]);
  // Memoised so the `overviewRows` memo below is not invalidated on every
  // render by freshly allocated arrays.
  // NOTE(review): the reversal presumably turns newest-first runs into
  // oldest-first display order (the graph is labelled "older" -> "newer").
  const barData = React2.useMemo(
    () => runScores.slice(0, MAX_RUNS_FOR_CHART).reverse(),
    [runScores]
  );
  const trendBatched = React2.useMemo(
    () => batchAverage(
      runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse(),
      TREND_BATCH_SIZE
    ),
    [runScores]
  );
  const overviewRows = React2.useMemo(() => {
    const rows = [];
    rows.push(
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." }, "overview")
    );
    if (selectedDataset && selectedDataset.runs.length > 0) {
      if (loading) {
        rows.push(
          /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Loading run scores\u2026" }, "loading")
        );
      } else if (runScores.length > 0) {
        rows.push(
          /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Scores (last runs)" }, "scores-header")
        );
        for (const d of barData) {
          rows.push(
            /* @__PURE__ */ jsxRuntime.jsx(
              TextBar,
              {
                label: d.label,
                value: d.value,
                labelWidth: 14,
                barWidth: 24,
                max: 100,
                format: (v) => v.toFixed(1)
              },
              d.runId
            )
          );
        }
        if (trendBatched.length > 0) {
          rows.push(
            /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Avg trend (last 20, batched by 4)" }, "trend-header")
          );
          rows.push(
            /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { children: /* @__PURE__ */ jsxRuntime.jsx(
              inkChart.LineGraph,
              {
                data: [{ values: trendBatched, color: "cyan" }],
                height: 5,
                width: 45,
                showYAxis: true,
                xLabels: ["older", "newer"]
              }
            ) }, "trend-graph")
          );
        }
      }
    }
    return rows;
  }, [
    selectedDataset?.overview,
    selectedDataset?.runs?.length,
    loading,
    runScores,
    barData,
    trendBatched
  ]);
  // Publish the row count so the owner of the ref can bound scrolling.
  if (overviewRowCountRef) {
    overviewRowCountRef.current = overviewRows.length;
  }
  const offset = Math.max(0, state.overviewScrollOffset);
  const visibleRows = overviewRows.slice(offset, offset + OVERVIEW_PAGE_SIZE);
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
    /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
      /* @__PURE__ */ jsxRuntime.jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === 0,
          label: "New evaluation",
          itemKey: "datasets-new-eval"
        }
      ),
      // Menu index 0 is "New evaluation", so datasets start at index 1.
      filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsxRuntime.jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === index + 1,
          label: dataset.name,
          itemKey: `dataset-${dataset.id}`
        },
        dataset.id
      ))
    ] }),
    /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visibleRows.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { children: row }, offset + i)) })
    ] })
  ] });
}
2076
/**
 * Runs screen: the runs sidebar on the left and, on the right, a summary of
 * the selected run (identity line, commit/branch/seed, overall bars,
 * per-dimension bars and a latency sparkline). With no run selected the
 * right pane shows a hint instead.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; reads `focus` and is forwarded to
 *   the sidebar.
 * @param {object} [props.dataset] - Reads `runs`; forwarded to the sidebar.
 * @param {object} [props.selectedRun] - Reads `label`, `status`, `meta`,
 *   `performance` and `dimensions`.
 */
function RunsView({
  state,
  dataset,
  selectedRun
}) {
  const runs = dataset?.runs ?? [];
  const rightFocused = state.focus === "right";
  let paneContent;
  if (!selectedRun) {
    paneContent = /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." });
  } else {
    const { label, status, meta, performance, dimensions } = selectedRun;
    // Without recorded history, a series is synthesised from the average and
    // p95 latency so the sparkline still has data to draw.
    const latencySeries = performance.latencyHistoryMs ?? [
      performance.latencyAvgMs - 40,
      performance.latencyAvgMs - 10,
      performance.latencyAvgMs + 20,
      performance.latencyP95Ms - 80,
      performance.latencyP95Ms
    ];
    const dimensionBars = dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
      TextBar,
      {
        label: dimension.name,
        value: dimension.score
      },
      dimension.name
    ));
    paneContent = /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
        " ",
        label,
        " ",
        /* @__PURE__ */ jsxRuntime.jsx(StatusText, { status })
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
        "Commit: ",
        meta.commit,
        " Branch: ",
        meta.branch,
        " ",
        "Seed: ",
        meta.seed
      ] }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overall" }),
      /* @__PURE__ */ jsxRuntime.jsx(
        TextBar,
        {
          label: "pass rate",
          value: performance.passRate,
          format: (v) => `${v}%`
        }
      ),
      /* @__PURE__ */ jsxRuntime.jsx(
        TextBar,
        {
          label: "avg score",
          value: Math.round(performance.avgScore * 100)
        }
      ),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
      dimensionBars,
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
      /* @__PURE__ */ jsxRuntime.jsx(
        Sparkline,
        {
          data: latencySeries,
          width: 24
        }
      )
    ] });
  }
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
    /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
    /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: paneContent })
  ] });
}
1848
2147
  var DETAILS_PAGE_SIZE = 20;
1849
2148
  function scoreColor(score) {
1850
2149
  if (score >= 80)
@@ -1853,7 +2152,7 @@ function scoreColor(score) {
1853
2152
  return "yellow";
1854
2153
  return "red";
1855
2154
  }
1856
- function formatScorePart(item, scoreToColor) {
2155
+ function formatScorePart(item) {
1857
2156
  const def = getScoreById(item.id);
1858
2157
  if (!def) {
1859
2158
  const numeric = toNumericScore(item.data);
@@ -1883,7 +2182,7 @@ function CheckRow({
1883
2182
  " ",
1884
2183
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color, bold: true, children: status }),
1885
2184
  detail ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1886
- " (",
2185
+ " (",
1887
2186
  detail,
1888
2187
  ")"
1889
2188
  ] }) : null
@@ -1903,21 +2202,21 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1903
2202
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1904
2203
  "Model: ",
1905
2204
  meta.model,
1906
- " Provider: ",
2205
+ " Provider: ",
1907
2206
  meta.provider
1908
2207
  ] }, "meta-1"),
1909
2208
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1910
2209
  "Commit: ",
1911
2210
  meta.commit,
1912
- " Branch: ",
2211
+ " Branch: ",
1913
2212
  meta.branch,
1914
- " Seed: ",
2213
+ " Seed: ",
1915
2214
  meta.seed
1916
2215
  ] }, "meta-2"),
1917
2216
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1918
2217
  "Duration: ",
1919
2218
  meta.duration,
1920
- " Concurrency: ",
2219
+ " Concurrency: ",
1921
2220
  meta.concurrency
1922
2221
  ] }, "meta-3"),
1923
2222
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
@@ -1929,7 +2228,15 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1929
2228
  ...dimensions.map((d) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
1930
2229
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp2"),
1931
2230
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
1932
- ...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
2231
+ ...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
2232
+ CheckRow,
2233
+ {
2234
+ name: c.name,
2235
+ passed: c.passed,
2236
+ detail: c.detail
2237
+ },
2238
+ `chk-${c.name}`
2239
+ )),
1933
2240
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp3"),
1934
2241
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Performance" }, "perf-h"),
1935
2242
  /* @__PURE__ */ jsxRuntime.jsx(
@@ -1942,16 +2249,16 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1942
2249
  "perf-rate"
1943
2250
  ),
1944
2251
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1945
- "latency avg ",
2252
+ "latency avg ",
1946
2253
  performance.latencyAvgMs,
1947
- "ms p95 ",
2254
+ "ms p95 ",
1948
2255
  performance.latencyP95Ms,
1949
2256
  "ms"
1950
2257
  ] }, "perf-lat"),
1951
2258
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1952
- "tokens avg ",
2259
+ "tokens avg ",
1953
2260
  performance.tokensAvg,
1954
- " p95 ",
2261
+ " p95 ",
1955
2262
  performance.tokensP95
1956
2263
  ] }, "perf-tok"),
1957
2264
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp4"),
@@ -1975,6 +2282,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1975
2282
  rows.push(/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp6"));
1976
2283
  rows.push(/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
1977
2284
  for (const tc of testCases) {
2285
+ const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
1978
2286
  rows.push(
1979
2287
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1980
2288
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
@@ -1986,12 +2294,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1986
2294
  ] }),
1987
2295
  " ",
1988
2296
  tc.testCaseName,
2297
+ rerunPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: rerunPart }) : null,
1989
2298
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1990
2299
  " (",
1991
2300
  tc.durationMs,
1992
2301
  "ms)"
1993
2302
  ] })
1994
- ] }, `tc-${tc.testCaseId}`)
2303
+ ] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
1995
2304
  );
1996
2305
  for (const item of tc.evaluatorScores) {
1997
2306
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
@@ -2059,12 +2368,12 @@ function RunDetailsView({
2059
2368
  }) {
2060
2369
  const runs = dataset?.runs ?? [];
2061
2370
  const rightFocused = state.focus === "right";
2062
- const [testCases, setTestCases] = React.useState([]);
2063
- const evaluatorNameById = React__default.default.useMemo(
2371
+ const [testCases, setTestCases] = React2.useState([]);
2372
+ const evaluatorNameById = React2__default.default.useMemo(
2064
2373
  () => new Map(evaluators.map((e) => [e.id, e.name])),
2065
2374
  [evaluators]
2066
2375
  );
2067
- React.useEffect(() => {
2376
+ React2.useEffect(() => {
2068
2377
  if (!selectedRun?.meta?.artifact) {
2069
2378
  setTestCases([]);
2070
2379
  return;
@@ -2083,7 +2392,7 @@ function RunDetailsView({
2083
2392
  const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
2084
2393
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2085
2394
  /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
2086
- /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React__default.default.Fragment, { children: row }, i)) }) })
2395
+ /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React2__default.default.Fragment, { children: row }, i)) }) })
2087
2396
  ] });
2088
2397
  }
2089
2398
  var LEFT_PANE_WIDTH3 = 44;
@@ -2163,16 +2472,17 @@ function EvalsCliApp({
2163
2472
  }) {
2164
2473
  const { exit } = ink.useApp();
2165
2474
  const { width: stdoutWidth, height: stdoutHeight } = fullscreenInk.useScreenSize();
2166
- const [liveData, setLiveData] = React.useState(data);
2167
- const [runtimeMessage, setRuntimeMessage] = React.useState();
2168
- const [state, dispatch] = React.useReducer(
2475
+ const [liveData, setLiveData] = React2.useState(data);
2476
+ const [runtimeMessage, setRuntimeMessage] = React2.useState();
2477
+ const overviewRowCountRef = React2.useRef(0);
2478
+ const [state, dispatch] = React2.useReducer(
2169
2479
  reduceCliState,
2170
2480
  createInitialState(data, args)
2171
2481
  );
2172
- React.useEffect(() => {
2482
+ React2.useEffect(() => {
2173
2483
  setLiveData(data);
2174
2484
  }, [data]);
2175
- React.useEffect(() => {
2485
+ React2.useEffect(() => {
2176
2486
  if (!runner) {
2177
2487
  return void 0;
2178
2488
  }
@@ -2191,7 +2501,7 @@ function EvalsCliApp({
2191
2501
  }
2192
2502
  });
2193
2503
  }, [runner]);
2194
- const filteredDatasets = React.useMemo(
2504
+ const filteredDatasets = React2.useMemo(
2195
2505
  () => getFilteredDatasets(liveData, state.searchQuery),
2196
2506
  [liveData, state.searchQuery]
2197
2507
  );
@@ -2244,7 +2554,16 @@ function EvalsCliApp({
2244
2554
  return;
2245
2555
  }
2246
2556
  if (key.downArrow) {
2247
- const max = clampedState.level === "datasets" ? filteredDatasets.length : clampedState.level === "runs" ? selectedDataset?.runs.length ?? 0 : clampedState.level === "new-evaluation" ? Math.max(0, visibleEvaluators.length - 1) : 100;
2557
+ let max;
2558
+ if (clampedState.level === "datasets") {
2559
+ max = clampedState.focus === "right" ? Math.max(0, overviewRowCountRef.current - OVERVIEW_PAGE_SIZE) : filteredDatasets.length;
2560
+ } else if (clampedState.level === "runs") {
2561
+ max = selectedDataset?.runs.length ?? 0;
2562
+ } else if (clampedState.level === "new-evaluation") {
2563
+ max = Math.max(0, visibleEvaluators.length - 1);
2564
+ } else {
2565
+ max = 100;
2566
+ }
2248
2567
  dispatch({ type: "MOVE_DOWN", max });
2249
2568
  return;
2250
2569
  }
@@ -2262,7 +2581,7 @@ function EvalsCliApp({
2262
2581
  }
2263
2582
  return;
2264
2583
  }
2265
- if (isBackKey(key)) {
2584
+ if (isBackKey(key) || input === "\x7F" || input === "\b") {
2266
2585
  dispatch({ type: "BACK" });
2267
2586
  return;
2268
2587
  }
@@ -2315,7 +2634,8 @@ function EvalsCliApp({
2315
2634
  {
2316
2635
  state: clampedState,
2317
2636
  filteredDatasets,
2318
- selectedDataset
2637
+ selectedDataset,
2638
+ overviewRowCountRef
2319
2639
  }
2320
2640
  );
2321
2641
  }