@m4trix/evals 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1,16 +1,17 @@
1
1
  #!/usr/bin/env node
2
2
  import { withFullScreen, useScreenSize } from 'fullscreen-ink';
3
- import React, { useState, useReducer, useEffect, useMemo } from 'react';
3
+ import React2, { useState, useRef, useReducer, useEffect, useMemo } from 'react';
4
4
  import { useApp, useInput, Box, Text } from 'ink';
5
5
  import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
6
6
  import { resolve, relative, join, dirname } from 'path';
7
- import { diffString } from 'json-diff';
7
+ import { LineGraph } from '@pppp606/ink-chart';
8
8
  import { randomUUID } from 'crypto';
9
- import { Effect, PubSub, Queue, Fiber } from 'effect';
9
+ import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
10
10
  import { existsSync } from 'fs';
11
11
  import * as jitiModule from 'jiti';
12
12
  import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
13
13
  import { pathToFileURL } from 'url';
14
+ import { diffLines } from 'diff';
14
15
 
15
16
  var SEP = " ";
16
17
  var ARROW = "\u203A";
@@ -78,7 +79,7 @@ function getBreadcrumbText(state, datasetName, runLabel) {
78
79
  // src/cli/components/Footer.tsx
79
80
  function getFooterText(state) {
80
81
  if (state.level === "datasets") {
81
- return "\u2191\u2193 move Enter open / search Tab focus q quit";
82
+ return state.focus === "right" ? "\u2191\u2193 scroll Tab focus left / search q quit" : "\u2191\u2193 move Enter open Tab focus right / search q quit";
82
83
  }
83
84
  if (state.level === "runs") {
84
85
  return "\u2191\u2193 move Enter details Backspace datasets Tab focus q quit";
@@ -610,6 +611,7 @@ function createInitialState(data, args) {
610
611
  datasetMenuIndex,
611
612
  runMenuIndex,
612
613
  detailsScrollOffset: 0,
614
+ overviewScrollOffset: 0,
613
615
  selectedEvaluatorIds: data.evaluators.slice(0, 2).map((item) => item.id),
614
616
  evaluatorMenuIndex: 0,
615
617
  searchQuery,
@@ -625,8 +627,11 @@ function reduceCliState(state, action) {
625
627
  if (state.level === "details" && state.focus === "right") {
626
628
  return { ...state, detailsScrollOffset: Math.max(0, state.detailsScrollOffset - 1) };
627
629
  }
630
+ if (state.level === "datasets" && state.focus === "right") {
631
+ return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
632
+ }
628
633
  if (state.level === "datasets") {
629
- return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1) };
634
+ return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1), overviewScrollOffset: 0 };
630
635
  }
631
636
  if (state.level === "runs") {
632
637
  return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
@@ -643,8 +648,11 @@ function reduceCliState(state, action) {
643
648
  if (state.level === "details" && state.focus === "right") {
644
649
  return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
645
650
  }
651
+ if (state.level === "datasets" && state.focus === "right") {
652
+ return { ...state, overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1) };
653
+ }
646
654
  if (state.level === "datasets") {
647
- return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1) };
655
+ return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1), overviewScrollOffset: 0 };
648
656
  }
649
657
  if (state.level === "runs") {
650
658
  return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
@@ -720,249 +728,6 @@ function reduceCliState(state, action) {
720
728
  }
721
729
  return state;
722
730
  }
723
- var LEFT_PANE_WIDTH2 = 44;
724
- function DatasetsView({
725
- state,
726
- filteredDatasets,
727
- selectedDataset
728
- }) {
729
- const leftFocused = state.focus === "left";
730
- const rightFocused = state.focus === "right";
731
- return /* @__PURE__ */ jsxs(Fragment, { children: [
732
- /* @__PURE__ */ jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
733
- /* @__PURE__ */ jsx(SectionHeader, { children: "Datasets" }),
734
- /* @__PURE__ */ jsx(
735
- ListItem,
736
- {
737
- selected: state.datasetMenuIndex === 0,
738
- label: "New evaluation",
739
- itemKey: "datasets-new-eval"
740
- }
741
- ),
742
- filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsx(
743
- ListItem,
744
- {
745
- selected: state.datasetMenuIndex === index + 1,
746
- label: dataset.name,
747
- itemKey: `dataset-${dataset.id}`
748
- },
749
- dataset.id
750
- ))
751
- ] }),
752
- /* @__PURE__ */ jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
753
- /* @__PURE__ */ jsx(SectionHeader, { children: "Overview" }),
754
- /* @__PURE__ */ jsx(Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." })
755
- ] })
756
- ] });
757
- }
758
- function RunsView({
759
- state,
760
- dataset,
761
- selectedRun
762
- }) {
763
- const runs = dataset?.runs ?? [];
764
- const rightFocused = state.focus === "right";
765
- return /* @__PURE__ */ jsxs(Fragment, { children: [
766
- /* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
767
- /* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: !selectedRun ? /* @__PURE__ */ jsx(Text, { color: "gray", children: "Select a run to see summary metrics." }) : /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
768
- /* @__PURE__ */ jsxs(Text, { children: [
769
- /* @__PURE__ */ jsx(Text, { color: "gray", children: "Run:" }),
770
- " ",
771
- selectedRun.label,
772
- " ",
773
- /* @__PURE__ */ jsx(StatusText, { status: selectedRun.status })
774
- ] }),
775
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
776
- "Commit: ",
777
- selectedRun.meta.commit,
778
- " Branch: ",
779
- selectedRun.meta.branch,
780
- " ",
781
- "Seed: ",
782
- selectedRun.meta.seed
783
- ] }),
784
- /* @__PURE__ */ jsx(Text, { children: " " }),
785
- /* @__PURE__ */ jsx(SectionHeader, { children: "Overall" }),
786
- /* @__PURE__ */ jsx(
787
- TextBar,
788
- {
789
- label: "pass rate",
790
- value: selectedRun.performance.passRate,
791
- format: (v) => `${v}%`
792
- }
793
- ),
794
- /* @__PURE__ */ jsx(
795
- TextBar,
796
- {
797
- label: "avg score",
798
- value: Math.round(selectedRun.performance.avgScore * 100)
799
- }
800
- ),
801
- /* @__PURE__ */ jsx(Text, { children: " " }),
802
- /* @__PURE__ */ jsx(SectionHeader, { children: "Dimensions" }),
803
- selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsx(
804
- TextBar,
805
- {
806
- label: dimension.name,
807
- value: dimension.score
808
- },
809
- dimension.name
810
- )),
811
- /* @__PURE__ */ jsx(Text, { children: " " }),
812
- /* @__PURE__ */ jsx(SectionHeader, { children: "Latency trend" }),
813
- /* @__PURE__ */ jsx(
814
- Sparkline,
815
- {
816
- data: selectedRun.performance.latencyHistoryMs ?? [
817
- selectedRun.performance.latencyAvgMs - 40,
818
- selectedRun.performance.latencyAvgMs - 10,
819
- selectedRun.performance.latencyAvgMs + 20,
820
- selectedRun.performance.latencyP95Ms - 80,
821
- selectedRun.performance.latencyP95Ms
822
- ],
823
- width: 24
824
- }
825
- )
826
- ] }) })
827
- ] });
828
- }
829
-
830
- // src/evals/metric.ts
831
- var registry = /* @__PURE__ */ new Map();
832
- var Metric = {
833
- of(config) {
834
- const def = {
835
- id: config.id,
836
- name: config.name,
837
- format: config.format,
838
- make: (data) => ({ id: config.id, data })
839
- };
840
- registry.set(config.id, def);
841
- return def;
842
- }
843
- };
844
- function getMetricById(id) {
845
- return registry.get(id);
846
- }
847
-
848
- // src/evals/score.ts
849
- var registry2 = /* @__PURE__ */ new Map();
850
- var Score = {
851
- of(config) {
852
- const def = {
853
- id: config.id,
854
- name: config.name,
855
- displayStrategy: config.displayStrategy,
856
- format: config.format,
857
- make: (data, options) => {
858
- const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
859
- return {
860
- id: config.id,
861
- data,
862
- ...passed !== void 0 && { passed }
863
- };
864
- }
865
- };
866
- registry2.set(config.id, def);
867
- return def;
868
- }
869
- };
870
- function getScoreById(id) {
871
- return registry2.get(id);
872
- }
873
-
874
- // src/evals/metrics/standard.ts
875
- Metric.of({
876
- id: "token-count",
877
- name: "Tokens",
878
- format: (data) => {
879
- const input = data.input ?? 0;
880
- const output = data.output ?? 0;
881
- const inputCached = data.inputCached ?? 0;
882
- const outputCached = data.outputCached ?? 0;
883
- const cached = inputCached + outputCached;
884
- return `in:${input} out:${output} cached:${cached}`;
885
- }
886
- });
887
- Metric.of({
888
- id: "latency",
889
- name: "Latency",
890
- format: (data) => `${data.ms}ms`
891
- });
892
-
893
- // src/evals/scores/standard.ts
894
- Score.of({
895
- id: "percent",
896
- name: "Score",
897
- displayStrategy: "bar",
898
- format: (data) => data.value.toFixed(2)
899
- });
900
- Score.of({
901
- id: "binary",
902
- name: "Result",
903
- displayStrategy: "passFail",
904
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
905
- });
906
- function createDiffLogEntry(expected, actual, options) {
907
- const diff = diffString(expected, actual, { color: false });
908
- return {
909
- type: "diff",
910
- label: options?.label,
911
- expected,
912
- actual,
913
- diff: diff || "(no differences)"
914
- };
915
- }
916
- function getDiffLines(entry) {
917
- const raw = diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
918
- return raw.split("\n").map((line) => {
919
- const trimmed = line.trimStart();
920
- if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
921
- return { type: "remove", line };
922
- }
923
- if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
924
- return { type: "add", line };
925
- }
926
- return { type: "context", line };
927
- });
928
- }
929
-
930
- // src/runner/score-utils.ts
931
- function toNumericScoreFromScores(scores) {
932
- for (const item of scores) {
933
- const def = getScoreById(item.id);
934
- if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
935
- const value = item.data.value;
936
- if (typeof value === "number" && Number.isFinite(value)) {
937
- return value;
938
- }
939
- }
940
- const numeric = toNumericScore(item.data);
941
- if (numeric !== void 0) {
942
- return numeric;
943
- }
944
- }
945
- return void 0;
946
- }
947
- function toNumericScore(value) {
948
- if (typeof value === "number" && Number.isFinite(value)) {
949
- return value;
950
- }
951
- if (typeof value !== "object" || value === null) {
952
- return void 0;
953
- }
954
- const obj = value;
955
- if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
956
- return obj.score;
957
- }
958
- const numberValues = Object.values(value).filter(
959
- (entry) => typeof entry === "number" && Number.isFinite(entry)
960
- );
961
- if (numberValues.length === 0) {
962
- return void 0;
963
- }
964
- return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
965
- }
966
731
 
967
732
  // src/runner/config.ts
968
733
  var defaultRunnerConfig = {
@@ -983,7 +748,8 @@ var defaultRunnerConfig = {
983
748
  ],
984
749
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
985
750
  },
986
- artifactDirectory: ".eval-results"
751
+ artifactDirectory: ".eval-results",
752
+ maxConcurrency: 1
987
753
  };
988
754
  function toRunnerConfigOverrides(config) {
989
755
  if (!config) {
@@ -1016,6 +782,9 @@ function toRunnerConfigOverrides(config) {
1016
782
  if (config.artifactDirectory !== void 0) {
1017
783
  overrides.artifactDirectory = config.artifactDirectory;
1018
784
  }
785
+ if (config.maxConcurrency !== void 0) {
786
+ overrides.maxConcurrency = config.maxConcurrency;
787
+ }
1019
788
  if (Object.keys(discovery).length > 0) {
1020
789
  overrides.discovery = discovery;
1021
790
  }
@@ -1190,25 +959,261 @@ async function collectEvaluatorsFromFiles(config) {
1190
959
  );
1191
960
  return found.flat();
1192
961
  }
1193
- async function collectTestCasesFromFiles(config) {
1194
- const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1195
- const matched = files.filter(
1196
- (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
1197
- );
1198
- const found = await Promise.all(
1199
- matched.map(async (absolutePath) => {
1200
- const exports = await loadModuleExports(absolutePath);
1201
- const testCases = exports.filter(isTestCaseLike);
1202
- const relPath = relative(config.rootDir, absolutePath);
1203
- return testCases.map((testCase) => ({
1204
- id: toId("test-case", relPath, testCase.getName()),
1205
- filePath: relPath,
1206
- testCase
1207
- }));
1208
- })
962
/**
 * Discovers test cases by scanning the configured root directory.
 *
 * Walks `config.rootDir` (passing `config.excludeDirectories` to the walker), keeps the
 * files whose path matches one of `config.testCaseSuffixes`, loads each file's exports
 * and retains the exports accepted by `isTestCaseLike`.
 *
 * @param config - Discovery settings; reads `rootDir`, `excludeDirectories` and `testCaseSuffixes`.
 * @returns A promise of a flat array of `{ id, filePath, testCase }` records, where
 *   `filePath` is relative to `config.rootDir`.
 */
async function collectTestCasesFromFiles(config) {
  const allFiles = await walkDirectory(config.rootDir, config.excludeDirectories);
  const candidateFiles = allFiles.filter(
    (candidate) => hasOneSuffix(candidate, config.testCaseSuffixes)
  );
  const loadFromFile = async (absolutePath) => {
    const moduleExports = await loadModuleExports(absolutePath);
    const relPath = relative(config.rootDir, absolutePath);
    return moduleExports.filter(isTestCaseLike).map((testCase) => {
      const id = toId("test-case", relPath, testCase.getName());
      return { id, filePath: relPath, testCase };
    });
  };
  const perFile = await Promise.all(candidateFiles.map((absolutePath) => loadFromFile(absolutePath)));
  return perFile.flat();
}
981
/**
 * Renders a value as pretty-printed (2-space) JSON text for line-based diffing.
 *
 * Always returns a string: `JSON.stringify` yields `undefined` for `undefined`,
 * functions and symbols, and throws for cycles and BigInt; in all of those cases the
 * value is rendered with `String(value)` instead.
 *
 * @param value - Any value.
 * @returns The JSON text, or `String(value)` when the value has no JSON representation.
 */
function toJsonLines(value) {
  try {
    // `??` covers the "not serializable, but did not throw" case (undefined/function/symbol).
    return JSON.stringify(value, null, 2) ?? String(value);
  } catch {
    return String(value);
  }
}
988
/**
 * Flattens a list of diff parts into a single text where every line is prefixed with
 * `"+ "` (part flagged `added`), `"- "` (part flagged `removed`) or `"  "` (neither).
 *
 * A part's `value` is split on `"\n"`; the empty segment produced by a trailing newline
 * (or by an empty value) is dropped.
 *
 * @param changes - Iterable of parts exposing `value`, `added` and `removed`.
 * @returns The prefixed lines joined with `"\n"`.
 */
function formatDiffString(changes) {
  const rendered = [];
  for (const change of changes) {
    let marker = " ";
    if (change.added) {
      marker = "+";
    } else if (change.removed) {
      marker = "-";
    }
    const segments = change.value.split("\n");
    const lastIndex = segments.length - 1;
    if (segments[lastIndex] === "") {
      segments.length = lastIndex;
    }
    rendered.push(...segments.map((segment) => `${marker} ${segment}`));
  }
  return rendered.join("\n");
}
1002
/**
 * Produces a prefixed, line-oriented diff between two values.
 *
 * Both values are rendered with `toJsonLines`, compared with `diffLines`, and the
 * resulting parts are formatted by `formatDiffString`.
 *
 * @param expected - The reference value.
 * @param actual - The value to compare against the reference.
 * @returns The formatted diff text.
 */
function createDiffString(expected, actual) {
  const parts = diffLines(toJsonLines(expected), toJsonLines(actual));
  return formatDiffString(parts);
}
1008
/**
 * Builds a `"diff"` log entry for an expected/actual pair.
 *
 * @param expected - The reference value; stored on the entry as-is.
 * @param actual - The observed value; stored on the entry as-is.
 * @param options - Optional; only `options.label` is read.
 * @returns `{ type: "diff", label, expected, actual, diff }`, where `diff` is the text from
 *   `createDiffString`, or `"(no differences)"` when that text is empty.
 */
function createDiffLogEntry(expected, actual, options) {
  const rendered = createDiffString(expected, actual);
  const label = options?.label;
  return {
    type: "diff",
    label,
    expected,
    actual,
    diff: rendered ? rendered : "(no differences)"
  };
}
1018
/**
 * Recomputes the diff for a log entry and classifies each line for display.
 *
 * Lines come from `createDiffString`, which prefixes every line with `"+ "`, `"- "` or
 * `"  "`. Classification looks at that leading marker only. Inspecting the trimmed
 * content instead would mislabel unchanged lines whose JSON text itself begins with
 * `-` or `+` (for example a negative number inside an array).
 *
 * @param entry - Log entry; reads `entry.expected` and `entry.actual`.
 * @returns Array of `{ type, line }` with `type` being `"remove"`, `"add"` or `"context"`.
 */
function getDiffLines(entry) {
  const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
  return raw.split("\n").map((line) => {
    if (line.startsWith("- ")) {
      return { type: "remove", line };
    }
    if (line.startsWith("+ ")) {
      return { type: "add", line };
    }
    // Unchanged lines and the "(no differences)" placeholder.
    return { type: "context", line };
  });
}
1031
+
1032
+ // src/evals/metric.ts
1033
// Metric definitions keyed by id; a later `Metric.of` with the same id replaces the earlier entry.
var registry = /* @__PURE__ */ new Map();
var Metric = {
  /**
   * Creates a metric definition from `config` and stores it in the registry under `config.id`.
   *
   * @param config - Reads `id`, `name`, `aggregate` and `format`.
   * @returns The definition `{ id, name, aggregate, format, make }`, where `make(data)`
   *   returns `{ id, data }`.
   */
  of(config) {
    const make = (data) => {
      return { id: config.id, data };
    };
    const def = {
      id: config.id,
      name: config.name,
      aggregate: config.aggregate,
      format: config.format,
      make
    };
    registry.set(config.id, def);
    return def;
  }
};
/**
 * Looks up a metric definition created through `Metric.of`.
 *
 * @param id - Metric id.
 * @returns The stored definition, or `undefined` when the id is unknown.
 */
function getMetricById(id) {
  const found = registry.get(id);
  return found;
}
1050
+
1051
+ // src/evals/score.ts
1052
// Score definitions keyed by id; a later `Score.of` with the same id replaces the earlier entry.
var registry2 = /* @__PURE__ */ new Map();
var Score = {
  /**
   * Creates a score definition from `config` and stores it in the registry under `config.id`.
   *
   * @param config - Reads `id`, `name`, `displayStrategy`, `aggregate` and `format`.
   * @returns The definition `{ id, name, displayStrategy, aggregate, format, make }`.
   *   `make(data, options)` returns `{ id, data }` and adds `passed` only when
   *   `options.definePassed` is provided and returns something other than `undefined`.
   */
  of(config) {
    const make = (data, options) => {
      const item = { id: config.id, data };
      if (options?.definePassed === void 0) {
        return item;
      }
      const verdict = options.definePassed(data);
      if (verdict !== void 0) {
        item.passed = verdict;
      }
      return item;
    };
    const def = {
      id: config.id,
      name: config.name,
      displayStrategy: config.displayStrategy,
      aggregate: config.aggregate,
      format: config.format,
      make
    };
    registry2.set(config.id, def);
    return def;
  }
};
/**
 * Looks up a score definition created through `Score.of`.
 *
 * @param id - Score id.
 * @returns The stored definition, or `undefined` when the id is unknown.
 */
function getScoreById(id) {
  const found = registry2.get(id);
  return found;
}
1077
+
1078
+ // src/evals/aggregators.ts
1079
/**
 * Aggregates `{ value }` samples into their mean and sample standard deviation.
 *
 * The deviation uses the n-1 (sample) denominator and is computed in a second pass
 * from the squared distances to the mean. The single-pass `sumSq - n * mean^2` form
 * loses all precision when the values are large relative to their spread.
 *
 * @param values - Array of objects with a numeric `value`.
 * @returns `{ value: 0, count: 0 }` for an empty array; otherwise
 *   `{ value: mean, stdDev, count }`, where `stdDev` is `undefined` for fewer than two samples.
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  const mean = values.reduce((sum, sample) => sum + sample.value, 0) / count;
  let stdDev;
  if (count >= 2) {
    const squaredDeviations = values.reduce(
      (sum, sample) => sum + (sample.value - mean) * (sample.value - mean),
      0
    );
    // A sum of squares is never negative, so no clamping is needed before the root.
    stdDev = Math.sqrt(squaredDeviations / (count - 1));
  }
  return { value: mean, stdDev, count };
}
1093
/**
 * Aggregates pass/fail samples: the aggregate passes only when there is at least one
 * sample and every sample has a truthy `passed`.
 *
 * @param values - Array of objects with a `passed` flag.
 * @returns `{ passed, passedCount, totalCount }`.
 */
function aggregateAll(values) {
  const totalCount = values.length;
  const passedCount = values.reduce((tally, sample) => (sample.passed ? tally + 1 : tally), 0);
  const passed = totalCount > 0 && passedCount === totalCount;
  return { passed, passedCount, totalCount };
}
1102
/**
 * Sums token counters field by field across samples.
 *
 * @param values - Array of objects that may carry `input`, `output`, `inputCached`
 *   and `outputCached`; a missing (nullish) field counts as 0.
 * @returns `{ input, output, inputCached, outputCached }` totals (all 0 for an empty array).
 */
function aggregateTokenCountSum(values) {
  let input = 0;
  let output = 0;
  let inputCached = 0;
  let outputCached = 0;
  for (const sample of values) {
    input += sample.input ?? 0;
    output += sample.output ?? 0;
    inputCached += sample.inputCached ?? 0;
    outputCached += sample.outputCached ?? 0;
  }
  return { input, output, inputCached, outputCached };
}
1119
/**
 * Averages the `ms` field of latency samples.
 *
 * @param values - Array of objects with a numeric `ms`.
 * @returns `{ ms }` holding the arithmetic mean, or `{ ms: 0 }` for an empty array.
 */
function aggregateLatencyAverage(values) {
  const count = values.length;
  if (count === 0) {
    return { ms: 0 };
  }
  let totalMs = 0;
  for (const sample of values) {
    totalMs += sample.ms;
  }
  return { ms: totalMs / count };
}
1126
+
1127
+ // src/evals/metrics/standard.ts
1128
// Built-in "token-count" metric: aggregated by summing each counter; missing counters
// are shown as 0 and the two cached counters are shown as one combined figure.
Metric.of({
  id: "token-count",
  name: "Tokens",
  aggregate: aggregateTokenCountSum,
  format: (data, options) => {
    const input = data.input ?? 0;
    const output = data.output ?? 0;
    const cached = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    const summary = `in:${input} out:${output} cached:${cached}`;
    if (options?.isAggregated) {
      return `Total: ${summary}`;
    }
    return summary;
  }
});
// Built-in "latency" metric: aggregated by averaging `ms`.
Metric.of({
  id: "latency",
  name: "Latency",
  aggregate: aggregateLatencyAverage,
  format: (data, options) => {
    const text = `${data.ms}ms`;
    return options?.isAggregated ? `Avg: ${text}` : text;
  }
});
1148
+
1149
+ // src/evals/scores/standard.ts
1150
// Built-in "percent" score: shown with two decimals; the aggregated form adds the
// standard deviation when one is available.
Score.of({
  id: "percent",
  name: "Score",
  displayStrategy: "bar",
  format: (data, options) => {
    const shown = data.value.toFixed(2);
    if (!options?.isAggregated) {
      return shown;
    }
    if (data.stdDev != null) {
      return `Avg: ${shown} \xB1 ${data.stdDev.toFixed(2)}`;
    }
    return `Avg: ${shown}`;
  },
  aggregate: aggregateAverageWithVariance
});
// Built-in "binary" score: pass/fail; the aggregated form appends "(passed/total)"
// when both counts are present and more than one sample was aggregated.
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data, options) => {
    if (!options?.isAggregated) {
      return data.passed ? "PASSED" : "NOT PASSED";
    }
    const verdict = data.passed ? "All: PASSED" : "Some: FAILED";
    const hasCounts = data.passedCount != null && data.totalCount != null && data.totalCount > 1;
    return hasCounts ? `${verdict} (${data.passedCount}/${data.totalCount})` : verdict;
  },
  aggregate: aggregateAll
});
1178
+
1179
+ // src/runner/score-utils.ts
1180
/**
 * Returns the first usable numeric value found in a list of score items.
 *
 * For each item, in order: when its registered definition uses the `"bar"` display
 * strategy and `item.data` is an object with a finite numeric `value`, that value wins;
 * otherwise `toNumericScore(item.data)` is tried. Items yielding nothing are skipped.
 *
 * @param scores - Iterable of `{ id, data }` score items.
 * @returns The first number found, or `undefined` when no item yields one.
 */
function toNumericScoreFromScores(scores) {
  for (const item of scores) {
    const payload = item.data;
    const isBar = getScoreById(item.id)?.displayStrategy === "bar";
    if (isBar && payload !== null && typeof payload === "object" && "value" in payload) {
      const barValue = payload.value;
      if (typeof barValue === "number" && Number.isFinite(barValue)) {
        return barValue;
      }
    }
    const fallback = toNumericScore(payload);
    if (fallback !== void 0) {
      return fallback;
    }
  }
  return void 0;
}
1196
+ function toNumericScore(value) {
1197
+ if (typeof value === "number" && Number.isFinite(value)) {
1198
+ return value;
1199
+ }
1200
+ if (typeof value !== "object" || value === null) {
1201
+ return void 0;
1202
+ }
1203
+ const obj = value;
1204
+ if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
1205
+ return obj.score;
1206
+ }
1207
+ const numberValues = Object.values(value).filter(
1208
+ (entry) => typeof entry === "number" && Number.isFinite(entry)
1209
1209
  );
1210
- return found.flat();
1210
+ if (numberValues.length === 0) {
1211
+ return void 0;
1212
+ }
1213
+ return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
1211
1214
  }
1215
+
1216
+ // src/runner/execution.ts
1212
1217
  function computeEvaluatorPassed(evaluator, result, scores) {
1213
1218
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1214
1219
  if (scoresWithPassed.length > 0) {
@@ -1250,6 +1255,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1250
1255
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1251
1256
  );
1252
1257
  }
1258
/**
 * Builds the Effect that evaluates one test case for a run, once per configured rerun.
 *
 * For every rerun, each evaluator in `task.evaluators` that exposes an evaluate function
 * is executed. The shared completed counter is incremented, a `TestCaseProgress` event
 * is published and queued for persistence, and the snapshot's `completedTestCases` is
 * updated. After all reruns, the test case counts as passed only if every rerun passed;
 * the shared passed/failed counters and the snapshot are updated accordingly.
 *
 * Evaluator work runs inside `Effect.tryPromise`, and its outcome is inspected through
 * `Effect.either`. A `try/catch` around `yield*` inside `Effect.gen` does not observe a
 * rejected promise wrapped with `Effect.promise`, so a throwing or rejecting evaluator
 * would otherwise abort the whole run instead of being recorded as a failed evaluator.
 *
 * @param task - Run task; reads `runId`, `evaluators` and `snapshot.artifactPath`.
 * @param testCaseItem - `{ id, testCase }` for the test case to evaluate.
 * @param totalEvaluations - Value reported as `totalTestCases` in progress events.
 * @param publishEvent - Function returning an Effect that publishes an event.
 * @param persistenceQueue - Queue receiving `{ runId, artifactPath, payload }` records.
 * @param updateSnapshot - `(runId, updater)` callback used to patch the run snapshot.
 * @param completedRef - Ref counting completed evaluations (one per rerun).
 * @param passedRef - Ref counting test cases whose reruns all passed.
 * @param failedRef - Ref counting test cases with at least one failed rerun.
 * @returns An Effect that performs the evaluation and produces no value.
 */
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
  return Effect.gen(function* () {
    const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
    const rerunPassed = [];
    for (let r = 0; r < reruns; r++) {
      const started = Date.now();
      const evaluatorScores = [];
      let testCaseError;
      const output = readOutput(testCaseItem.testCase);
      for (const { id: evaluatorId, evaluator } of task.evaluators) {
        const evaluateFn = evaluator.getEvaluateFn();
        if (!evaluateFn) {
          continue;
        }
        const logs = [];
        const logDiff = (expected, actual, options) => {
          logs.push(createDiffLogEntry(expected, actual, options));
        };
        // Synchronous throws and promise rejections both surface as a Left here.
        const outcome = yield* Effect.either(
          Effect.tryPromise({
            try: async () => {
              const ctx = await evaluator.resolveContext();
              const result = await evaluateFn({
                input: testCaseItem.testCase.getInput(),
                ctx,
                output,
                logDiff
              });
              const { scores, metrics } = normalizeResult(result);
              return {
                scores,
                metrics,
                passed: computeEvaluatorPassed(evaluator, result, scores)
              };
            },
            catch: (error) => error
          })
        );
        if (outcome._tag === "Right") {
          const { scores, metrics, passed: evaluatorPassed } = outcome.right;
          evaluatorScores.push({
            evaluatorId,
            scores,
            passed: evaluatorPassed,
            metrics,
            logs: logs.length > 0 ? logs : void 0
          });
        } else {
          const error = outcome.left;
          testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
          evaluatorScores.push({
            evaluatorId,
            scores: [],
            passed: false
          });
        }
      }
      const rerunPassedThis = evaluatorScores.every((s) => s.passed);
      rerunPassed.push(rerunPassedThis);
      // Increment and read in one step so concurrently running test cases get distinct counts.
      const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
        n + 1,
        n + 1
      ]);
      const progressEvent = {
        type: "TestCaseProgress",
        runId: task.runId,
        testCaseId: testCaseItem.id,
        testCaseName: testCaseItem.testCase.getName(),
        completedTestCases: completedEvaluations,
        totalTestCases: totalEvaluations,
        rerunIndex: r + 1,
        rerunTotal: reruns,
        passed: rerunPassedThis,
        durationMs: Date.now() - started,
        evaluatorScores,
        output,
        errorMessage: testCaseError
      };
      updateSnapshot(task.runId, (snapshot) => ({
        ...snapshot,
        completedTestCases: completedEvaluations
      }));
      yield* publishEvent(progressEvent);
      yield* Queue.offer(persistenceQueue, {
        runId: task.runId,
        artifactPath: task.snapshot.artifactPath,
        payload: progressEvent
      });
    }
    const testCasePassed = rerunPassed.every(Boolean);
    if (testCasePassed) {
      yield* Ref.update(passedRef, (n) => n + 1);
    } else {
      yield* Ref.update(failedRef, (n) => n + 1);
    }
    const [passed, failed] = yield* Effect.all([
      Ref.get(passedRef),
      Ref.get(failedRef)
    ]);
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      passedTestCases: passed,
      failedTestCases: failed
    }));
  });
}
1253
1357
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
1254
1358
  const startedAt = Date.now();
1255
1359
  updateSnapshot(task.runId, (snapshot) => ({
@@ -1262,104 +1366,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1262
1366
  runId: task.runId,
1263
1367
  startedAt
1264
1368
  });
1265
- let completedTestCases = 0;
1266
- let passedTestCases = 0;
1267
- let failedTestCases = 0;
1268
- for (const testCaseItem of task.testCases) {
1269
- const started = Date.now();
1270
- const evaluatorScores = [];
1271
- let testCaseError;
1272
- const output = readOutput(testCaseItem.testCase);
1273
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
1274
- const evaluateFn = evaluator.getEvaluateFn();
1275
- if (!evaluateFn) {
1276
- continue;
1277
- }
1278
- try {
1279
- const logs = [];
1280
- const logDiff = (expected, actual, options) => {
1281
- logs.push(createDiffLogEntry(expected, actual, options));
1282
- };
1283
- const ctx = yield* Effect.promise(
1284
- () => Promise.resolve(evaluator.resolveContext())
1285
- );
1286
- const result = yield* Effect.promise(
1287
- () => Promise.resolve(
1288
- evaluateFn({
1289
- input: testCaseItem.testCase.getInput(),
1290
- ctx,
1291
- output,
1292
- logDiff
1293
- })
1294
- )
1295
- );
1296
- const { scores, metrics } = normalizeResult(result);
1297
- const passed = computeEvaluatorPassed(evaluator, result, scores);
1298
- evaluatorScores.push({
1299
- evaluatorId,
1300
- scores,
1301
- passed,
1302
- metrics,
1303
- logs: logs.length > 0 ? logs : void 0
1304
- });
1305
- } catch (error) {
1306
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1307
- evaluatorScores.push({
1308
- evaluatorId,
1309
- scores: [],
1310
- passed: false
1311
- });
1312
- }
1313
- }
1314
- const testCasePassed = evaluatorScores.every((s) => s.passed);
1315
- completedTestCases += 1;
1316
- if (testCasePassed) {
1317
- passedTestCases += 1;
1318
- } else {
1319
- failedTestCases += 1;
1320
- }
1321
- const progressEvent = {
1322
- type: "TestCaseProgress",
1323
- runId: task.runId,
1324
- testCaseId: testCaseItem.id,
1325
- testCaseName: testCaseItem.testCase.getName(),
1326
- completedTestCases,
1327
- totalTestCases: task.testCases.length,
1328
- passed: testCasePassed,
1329
- durationMs: Date.now() - started,
1330
- evaluatorScores,
1331
- output,
1332
- errorMessage: testCaseError
1333
- };
1334
- updateSnapshot(task.runId, (snapshot) => ({
1335
- ...snapshot,
1336
- completedTestCases,
1337
- passedTestCases,
1338
- failedTestCases
1339
- }));
1340
- yield* publishEvent(progressEvent);
1341
- yield* Queue.offer(persistenceQueue, {
1342
- runId: task.runId,
1343
- artifactPath: task.snapshot.artifactPath,
1344
- payload: progressEvent
1345
- });
1346
- }
1369
+ const totalEvaluations = task.testCases.reduce(
1370
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1371
+ 0
1372
+ );
1373
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1374
+ const completedRef = yield* Ref.make(0);
1375
+ const passedRef = yield* Ref.make(0);
1376
+ const failedRef = yield* Ref.make(0);
1377
+ const processTestCase = (testCaseItem) => processOneTestCase(
1378
+ task,
1379
+ testCaseItem,
1380
+ totalEvaluations,
1381
+ publishEvent,
1382
+ persistenceQueue,
1383
+ updateSnapshot,
1384
+ completedRef,
1385
+ passedRef,
1386
+ failedRef
1387
+ );
1388
+ yield* Effect.forEach(
1389
+ task.testCases,
1390
+ processTestCase,
1391
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1392
+ );
1393
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
1394
+ Ref.get(completedRef),
1395
+ Ref.get(passedRef),
1396
+ Ref.get(failedRef)
1397
+ ]);
1347
1398
  const finishedAt = Date.now();
1348
1399
  const completedEvent = {
1349
1400
  type: "RunCompleted",
1350
1401
  runId: task.runId,
1351
1402
  finishedAt,
1352
- passedTestCases,
1353
- failedTestCases,
1403
+ passedTestCases: passedUniqueTestCases,
1404
+ failedTestCases: failedUniqueTestCases,
1354
1405
  totalTestCases: task.testCases.length,
1355
1406
  artifactPath: task.snapshot.artifactPath
1356
1407
  };
1357
1408
  updateSnapshot(task.runId, (snapshot) => ({
1358
1409
  ...snapshot,
1359
1410
  status: "completed",
1360
- completedTestCases,
1361
- passedTestCases,
1362
- failedTestCases,
1411
+ completedTestCases: completedEvaluations,
1412
+ passedTestCases: passedUniqueTestCases,
1413
+ failedTestCases: failedUniqueTestCases,
1363
1414
  finishedAt
1364
1415
  }));
1365
1416
  yield* publishEvent(completedEvent);
@@ -1447,7 +1498,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
1447
1498
  const artifactPath = filePath;
1448
1499
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
1449
1500
  const progress = aggregateTestCaseProgress(lines);
1450
- const completedTestCases = runCompleted?.totalTestCases ?? progress.completedTestCases;
1501
+ const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
1451
1502
  const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
1452
1503
  const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
1453
1504
  return {
@@ -1469,23 +1520,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
1469
1520
  }
1470
1521
  function aggregateTestCaseProgress(lines) {
1471
1522
  let completedTestCases = 0;
1472
- let passedTestCases = 0;
1473
- let failedTestCases = 0;
1523
+ const testCasePassedBy = /* @__PURE__ */ new Map();
1474
1524
  for (const line of lines) {
1475
1525
  try {
1476
1526
  const event = JSON.parse(line);
1477
1527
  if (event.type === "TestCaseProgress") {
1478
1528
  const ev = event;
1479
1529
  completedTestCases = ev.completedTestCases ?? completedTestCases;
1480
- if (ev.passed) {
1481
- passedTestCases += 1;
1482
- } else {
1483
- failedTestCases += 1;
1484
- }
1530
+ const id = ev.testCaseId;
1531
+ const current = testCasePassedBy.get(id);
1532
+ testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
1485
1533
  }
1486
1534
  } catch {
1487
1535
  }
1488
1536
  }
1537
+ let passedTestCases = 0;
1538
+ let failedTestCases = 0;
1539
+ for (const passed of testCasePassedBy.values()) {
1540
+ if (passed) {
1541
+ passedTestCases += 1;
1542
+ } else {
1543
+ failedTestCases += 1;
1544
+ }
1545
+ }
1489
1546
  return { completedTestCases, passedTestCases, failedTestCases };
1490
1547
  }
1491
1548
  async function parseArtifactFile(artifactPath) {
@@ -1503,6 +1560,8 @@ async function parseArtifactFile(artifactPath) {
1503
1560
  testCaseName: ev.testCaseName,
1504
1561
  completedTestCases: ev.completedTestCases,
1505
1562
  totalTestCases: ev.totalTestCases,
1563
+ rerunIndex: ev.rerunIndex,
1564
+ rerunTotal: ev.rerunTotal,
1506
1565
  passed: ev.passed,
1507
1566
  durationMs: ev.durationMs,
1508
1567
  evaluatorScores: ev.evaluatorScores ?? []
@@ -1708,6 +1767,10 @@ var EffectRunner = class {
1708
1767
  throw new Error("No evaluators selected for run");
1709
1768
  }
1710
1769
  const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1770
+ const totalEvaluations = selectedTestCases.reduce(
1771
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1772
+ 0
1773
+ );
1711
1774
  const runId = `run-${randomUUID()}`;
1712
1775
  const artifactPath = createArtifactPath(
1713
1776
  this.config.artifactDirectory,
@@ -1720,7 +1783,7 @@ var EffectRunner = class {
1720
1783
  datasetName: dataset.dataset.getName(),
1721
1784
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1722
1785
  queuedAt: Date.now(),
1723
- totalTestCases: selectedTestCases.length,
1786
+ totalTestCases: totalEvaluations,
1724
1787
  completedTestCases: 0,
1725
1788
  passedTestCases: 0,
1726
1789
  failedTestCases: 0,
@@ -1734,7 +1797,7 @@ var EffectRunner = class {
1734
1797
  datasetId: request.datasetId,
1735
1798
  datasetName: dataset.dataset.getName(),
1736
1799
  evaluatorIds: selectedEvaluators.map((item) => item.id),
1737
- totalTestCases: selectedTestCases.length,
1800
+ totalTestCases: totalEvaluations,
1738
1801
  artifactPath
1739
1802
  };
1740
1803
  await Effect.runPromise(this.publishEvent(queuedEvent));
@@ -1745,6 +1808,7 @@ var EffectRunner = class {
1745
1808
  payload: queuedEvent
1746
1809
  })
1747
1810
  );
1811
+ const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
1748
1812
  await Effect.runPromise(
1749
1813
  Queue.offer(this.runQueue, {
1750
1814
  runId,
@@ -1752,7 +1816,8 @@ var EffectRunner = class {
1752
1816
  dataset: dataset.dataset,
1753
1817
  evaluators: selectedEvaluators,
1754
1818
  testCases: selectedTestCases,
1755
- snapshot
1819
+ snapshot,
1820
+ maxConcurrency
1756
1821
  })
1757
1822
  );
1758
1823
  return snapshot;
@@ -1819,6 +1884,240 @@ var EffectRunner = class {
1819
1884
  );
1820
1885
  }
1821
1886
  };
1887
+ var LEFT_PANE_WIDTH2 = 44;
1888
+ var MAX_RUNS_FOR_CHART = 12;
1889
+ var MAX_RUNS_FOR_TREND = 20;
1890
+ var TREND_BATCH_SIZE = 4;
/**
 * Averages every numeric evaluator score found across a run's test cases.
 *
 * Each entry of `tc.evaluatorScores` is converted with
 * `toNumericScoreFromScores(entry.scores)`; entries for which that helper
 * yields `undefined` do not take part in the average.
 *
 * @param {Iterable<{evaluatorScores: Iterable<{scores: unknown}>}>} testCases
 * @returns {number | undefined} The arithmetic mean, or `undefined` when no
 *   entry produced a numeric score.
 */
function extractRunAverageScore(testCases) {
  let total = 0;
  let count = 0;
  for (const testCase of testCases) {
    for (const evaluatorScore of testCase.evaluatorScores) {
      const numeric = toNumericScoreFromScores(evaluatorScore.scores);
      if (numeric === void 0) {
        continue;
      }
      total += numeric;
      count += 1;
    }
  }
  return count === 0 ? void 0 : total / count;
}
/**
 * Loads the average score of each run from its artifact file.
 *
 * Runs without `meta.artifact`, runs whose artifact cannot be read or parsed,
 * and runs without any numeric score are left out of the result. The artifact
 * files are independent of each other, so they are loaded concurrently; the
 * result keeps the order of `runs`.
 *
 * @param {Iterable<{id: unknown, label: unknown, meta?: {artifact?: string}}>} runs
 * @returns {Promise<Array<{runId: unknown, label: unknown, value: number}>>}
 */
async function loadRunScores(runs) {
  const loaded = await Promise.all(
    Array.from(runs, async (run) => {
      const artifact = run.meta?.artifact;
      if (!artifact) {
        return void 0;
      }
      try {
        const testCases = await parseArtifactFile(resolve(artifact));
        const avg = extractRunAverageScore(testCases);
        if (avg === void 0) {
          return void 0;
        }
        return { runId: run.id, label: run.label, value: avg };
      } catch {
        // Best effort: a run whose artifact fails to load is skipped rather
        // than failing the whole batch.
        return void 0;
      }
    })
  );
  return loaded.filter((entry) => entry !== void 0);
}
/**
 * Splits `values` into consecutive batches of `batchSize` elements and
 * returns the arithmetic mean of each batch. The last batch may be shorter.
 *
 * @param {number[]} values - Values to average, in order.
 * @param {number} batchSize - Number of values per batch; must be greater than 0.
 * @returns {number[]} One mean per batch; empty when `values` is empty.
 * @throws {RangeError} When `batchSize` is not greater than 0. Such a step
 *   would never advance the loop index.
 */
function batchAverage(values, batchSize) {
  if (!(batchSize > 0)) {
    throw new RangeError(`batchSize must be greater than 0, received ${batchSize}`);
  }
  const batches = [];
  for (let i = 0; i < values.length; i += batchSize) {
    // `i < values.length` guarantees the slice holds at least one element.
    const slice = values.slice(i, i + batchSize);
    batches.push(slice.reduce((a, b) => a + b, 0) / slice.length);
  }
  return batches;
}
1937
+ var OVERVIEW_PAGE_SIZE = 15;
/**
 * Datasets screen: a left pane listing "New evaluation" plus the filtered
 * datasets, and a right "Overview" pane for the selected dataset showing its
 * overview text, a score bar per recent run and a batched average trend.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; reads `focus`, `datasetMenuIndex`
 *   and `overviewScrollOffset`.
 * @param {Array<{id: string, name: string}>} props.filteredDatasets - Datasets
 *   to list in the left pane.
 * @param {object} [props.selectedDataset] - Dataset whose overview is shown.
 * @param {{current: number}} [props.overviewRowCountRef] - Receives the total
 *   number of overview rows so the owner can bound scrolling.
 */
function DatasetsView({
  state,
  filteredDatasets,
  selectedDataset,
  overviewRowCountRef
}) {
  const leftFocused = state.focus === "left";
  const rightFocused = state.focus === "right";
  const [runScores, setRunScores] = useState([]);
  const [loading, setLoading] = useState(false);
  const hasRuns = Boolean(selectedDataset?.runs?.length);
  useEffect(() => {
    if (!selectedDataset?.runs?.length) {
      setRunScores([]);
      // A previous load may have been cancelled before it could clear the flag.
      setLoading(false);
      return void 0;
    }
    // Guards against a slower, older load overwriting the scores (or the
    // loading flag) of a dataset selected afterwards.
    let cancelled = false;
    setLoading(true);
    const runs = selectedDataset.runs.slice(0, MAX_RUNS_FOR_TREND);
    loadRunScores(runs).then((scores) => {
      if (!cancelled) {
        setRunScores(scores);
      }
    }).catch(() => {
      if (!cancelled) {
        setRunScores([]);
      }
    }).finally(() => {
      if (!cancelled) {
        setLoading(false);
      }
    });
    return () => {
      cancelled = true;
    };
  }, [selectedDataset?.id, selectedDataset?.runs?.length]);
  // Memoised so the arrays keep their identity between renders; otherwise the
  // `overviewRows` memo below would be recomputed on every render.
  const barData = useMemo(
    () => runScores.slice(0, MAX_RUNS_FOR_CHART).reverse(),
    [runScores]
  );
  const trendBatched = useMemo(
    () => batchAverage(
      runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse(),
      TREND_BATCH_SIZE
    ),
    [runScores]
  );
  const overviewRows = useMemo(() => {
    const rows = [
      jsx(Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." }, "overview")
    ];
    if (!hasRuns) {
      return rows;
    }
    if (loading) {
      rows.push(
        jsx(Text, { color: "gray", children: "Loading run scores\u2026" }, "loading")
      );
      return rows;
    }
    if (runScores.length === 0) {
      return rows;
    }
    rows.push(
      jsx(Text, { color: "gray", children: "Scores (last runs)" }, "scores-header")
    );
    for (const d of barData) {
      rows.push(
        jsx(
          TextBar,
          {
            label: d.label,
            value: d.value,
            labelWidth: 14,
            barWidth: 24,
            max: 100,
            format: (v) => v.toFixed(1)
          },
          d.runId
        )
      );
    }
    if (trendBatched.length > 0) {
      rows.push(
        jsx(
          Text,
          {
            color: "gray",
            children: `Avg trend (last ${MAX_RUNS_FOR_TREND}, batched by ${TREND_BATCH_SIZE})`
          },
          "trend-header"
        )
      );
      rows.push(
        jsx(Box, { children: jsx(
          LineGraph,
          {
            data: [{ values: trendBatched, color: "cyan" }],
            height: 5,
            width: 45,
            showYAxis: true,
            xLabels: ["older", "newer"]
          }
        ) }, "trend-graph")
      );
    }
    return rows;
  }, [
    selectedDataset?.overview,
    hasRuns,
    loading,
    runScores,
    barData,
    trendBatched
  ]);
  // Publish the row count so the owner of the ref can compute the scroll limit.
  if (overviewRowCountRef) {
    overviewRowCountRef.current = overviewRows.length;
  }
  // Clamp on both ends: an offset left over from a longer overview would
  // otherwise slice past the end and render an empty pane.
  const maxOffset = Math.max(0, overviewRows.length - OVERVIEW_PAGE_SIZE);
  const offset = Math.min(Math.max(0, state.overviewScrollOffset ?? 0), maxOffset);
  const visibleRows = overviewRows.slice(offset, offset + OVERVIEW_PAGE_SIZE);
  return jsxs(Fragment, { children: [
    jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
      jsx(SectionHeader, { children: "Datasets" }),
      jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === 0,
          label: "New evaluation",
          itemKey: "datasets-new-eval"
        }
      ),
      // Index 0 is the "New evaluation" entry, so datasets start at 1.
      filteredDatasets.map((dataset, index) => jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === index + 1,
          label: dataset.name,
          itemKey: `dataset-${dataset.id}`
        },
        dataset.id
      ))
    ] }),
    jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
      jsx(SectionHeader, { children: "Overview" }),
      jsx(Box, { flexDirection: "column", children: visibleRows.map((row, i) => jsx(Box, { children: row }, offset + i)) })
    ] })
  ] });
}
/**
 * Runs screen: the runs sidebar on the left and, on the right, a summary of
 * the selected run (label, status, commit/branch/seed, pass rate, average
 * score, per-dimension scores and a latency sparkline). A placeholder is
 * shown while no run is selected.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; `focus` decides the pane highlight.
 * @param {object} [props.dataset] - Dataset whose `runs` feed the sidebar.
 * @param {object} [props.selectedRun] - Run to summarise.
 */
function RunsView({
  state,
  dataset,
  selectedRun
}) {
  const runs = dataset?.runs ?? [];
  const rightFocused = state.focus === "right";
  let paneContent;
  if (!selectedRun) {
    paneContent = jsx(Text, { color: "gray", children: "Select a run to see summary metrics." });
  } else {
    const { meta, performance: perf, dimensions } = selectedRun;
    // Without a recorded history, the sparkline is fed points derived from
    // the average and p95 latency.
    const latencySeries = perf.latencyHistoryMs ?? [
      perf.latencyAvgMs - 40,
      perf.latencyAvgMs - 10,
      perf.latencyAvgMs + 20,
      perf.latencyP95Ms - 80,
      perf.latencyP95Ms
    ];
    const titleLine = jsxs(Text, { children: [
      jsx(Text, { color: "gray", children: "Run:" }),
      " ",
      selectedRun.label,
      " ",
      jsx(StatusText, { status: selectedRun.status })
    ] });
    const metaLine = jsxs(Text, { color: "gray", children: [
      "Commit: ",
      meta.commit,
      " Branch: ",
      meta.branch,
      " ",
      "Seed: ",
      meta.seed
    ] });
    const dimensionBars = dimensions.map((dimension) => jsx(
      TextBar,
      {
        label: dimension.name,
        value: dimension.score
      },
      dimension.name
    ));
    paneContent = jsxs(Box, { flexDirection: "column", children: [
      titleLine,
      metaLine,
      jsx(Text, { children: " " }),
      jsx(SectionHeader, { children: "Overall" }),
      jsx(
        TextBar,
        {
          label: "pass rate",
          value: perf.passRate,
          format: (v) => `${v}%`
        }
      ),
      jsx(
        TextBar,
        {
          label: "avg score",
          value: Math.round(perf.avgScore * 100)
        }
      ),
      jsx(Text, { children: " " }),
      jsx(SectionHeader, { children: "Dimensions" }),
      dimensionBars,
      jsx(Text, { children: " " }),
      jsx(SectionHeader, { children: "Latency trend" }),
      jsx(
        Sparkline,
        {
          data: latencySeries,
          width: 24
        }
      )
    ] });
  }
  return jsxs(Fragment, { children: [
    jsx(RunsSidebar, { state, dataset, runs }),
    jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: paneContent })
  ] });
}
1822
2121
  var DETAILS_PAGE_SIZE = 20;
1823
2122
  function scoreColor(score) {
1824
2123
  if (score >= 80)
@@ -1827,7 +2126,7 @@ function scoreColor(score) {
1827
2126
  return "yellow";
1828
2127
  return "red";
1829
2128
  }
1830
- function formatScorePart(item, scoreToColor) {
2129
+ function formatScorePart(item) {
1831
2130
  const def = getScoreById(item.id);
1832
2131
  if (!def) {
1833
2132
  const numeric = toNumericScore(item.data);
@@ -1857,7 +2156,7 @@ function CheckRow({
1857
2156
  " ",
1858
2157
  /* @__PURE__ */ jsx(Text, { color, bold: true, children: status }),
1859
2158
  detail ? /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1860
- " (",
2159
+ " (",
1861
2160
  detail,
1862
2161
  ")"
1863
2162
  ] }) : null
@@ -1877,21 +2176,21 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1877
2176
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1878
2177
  "Model: ",
1879
2178
  meta.model,
1880
- " Provider: ",
2179
+ " Provider: ",
1881
2180
  meta.provider
1882
2181
  ] }, "meta-1"),
1883
2182
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1884
2183
  "Commit: ",
1885
2184
  meta.commit,
1886
- " Branch: ",
2185
+ " Branch: ",
1887
2186
  meta.branch,
1888
- " Seed: ",
2187
+ " Seed: ",
1889
2188
  meta.seed
1890
2189
  ] }, "meta-2"),
1891
2190
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1892
2191
  "Duration: ",
1893
2192
  meta.duration,
1894
- " Concurrency: ",
2193
+ " Concurrency: ",
1895
2194
  meta.concurrency
1896
2195
  ] }, "meta-3"),
1897
2196
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
@@ -1903,7 +2202,15 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1903
2202
  ...dimensions.map((d) => /* @__PURE__ */ jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
1904
2203
  /* @__PURE__ */ jsx(Text, { children: " " }, "sp2"),
1905
2204
  /* @__PURE__ */ jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
1906
- ...checks.map((c) => /* @__PURE__ */ jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
2205
+ ...checks.map((c) => /* @__PURE__ */ jsx(
2206
+ CheckRow,
2207
+ {
2208
+ name: c.name,
2209
+ passed: c.passed,
2210
+ detail: c.detail
2211
+ },
2212
+ `chk-${c.name}`
2213
+ )),
1907
2214
  /* @__PURE__ */ jsx(Text, { children: " " }, "sp3"),
1908
2215
  /* @__PURE__ */ jsx(SectionHeader, { children: "Performance" }, "perf-h"),
1909
2216
  /* @__PURE__ */ jsx(
@@ -1916,16 +2223,16 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1916
2223
  "perf-rate"
1917
2224
  ),
1918
2225
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1919
- "latency avg ",
2226
+ "latency avg ",
1920
2227
  performance.latencyAvgMs,
1921
- "ms p95 ",
2228
+ "ms p95 ",
1922
2229
  performance.latencyP95Ms,
1923
2230
  "ms"
1924
2231
  ] }, "perf-lat"),
1925
2232
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1926
- "tokens avg ",
2233
+ "tokens avg ",
1927
2234
  performance.tokensAvg,
1928
- " p95 ",
2235
+ " p95 ",
1929
2236
  performance.tokensP95
1930
2237
  ] }, "perf-tok"),
1931
2238
  /* @__PURE__ */ jsx(Text, { children: " " }, "sp4"),
@@ -1949,6 +2256,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1949
2256
  rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp6"));
1950
2257
  rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
1951
2258
  for (const tc of testCases) {
2259
+ const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
1952
2260
  rows.push(
1953
2261
  /* @__PURE__ */ jsxs(Text, { children: [
1954
2262
  /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
@@ -1960,12 +2268,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
1960
2268
  ] }),
1961
2269
  " ",
1962
2270
  tc.testCaseName,
2271
+ rerunPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: rerunPart }) : null,
1963
2272
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1964
2273
  " (",
1965
2274
  tc.durationMs,
1966
2275
  "ms)"
1967
2276
  ] })
1968
- ] }, `tc-${tc.testCaseId}`)
2277
+ ] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
1969
2278
  );
1970
2279
  for (const item of tc.evaluatorScores) {
1971
2280
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
@@ -2034,7 +2343,7 @@ function RunDetailsView({
2034
2343
  const runs = dataset?.runs ?? [];
2035
2344
  const rightFocused = state.focus === "right";
2036
2345
  const [testCases, setTestCases] = useState([]);
2037
- const evaluatorNameById = React.useMemo(
2346
+ const evaluatorNameById = React2.useMemo(
2038
2347
  () => new Map(evaluators.map((e) => [e.id, e.name])),
2039
2348
  [evaluators]
2040
2349
  );
@@ -2057,7 +2366,7 @@ function RunDetailsView({
2057
2366
  const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
2058
2367
  return /* @__PURE__ */ jsxs(Fragment, { children: [
2059
2368
  /* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
2060
- /* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(React.Fragment, { children: row }, i)) }) })
2369
+ /* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(React2.Fragment, { children: row }, i)) }) })
2061
2370
  ] });
2062
2371
  }
2063
2372
  var LEFT_PANE_WIDTH3 = 44;
@@ -2139,6 +2448,7 @@ function EvalsCliApp({
2139
2448
  const { width: stdoutWidth, height: stdoutHeight } = useScreenSize();
2140
2449
  const [liveData, setLiveData] = useState(data);
2141
2450
  const [runtimeMessage, setRuntimeMessage] = useState();
2451
+ const overviewRowCountRef = useRef(0);
2142
2452
  const [state, dispatch] = useReducer(
2143
2453
  reduceCliState,
2144
2454
  createInitialState(data, args)
@@ -2218,7 +2528,16 @@ function EvalsCliApp({
2218
2528
  return;
2219
2529
  }
2220
2530
  if (key.downArrow) {
2221
- const max = clampedState.level === "datasets" ? filteredDatasets.length : clampedState.level === "runs" ? selectedDataset?.runs.length ?? 0 : clampedState.level === "new-evaluation" ? Math.max(0, visibleEvaluators.length - 1) : 100;
2531
+ let max;
2532
+ if (clampedState.level === "datasets") {
2533
+ max = clampedState.focus === "right" ? Math.max(0, overviewRowCountRef.current - OVERVIEW_PAGE_SIZE) : filteredDatasets.length;
2534
+ } else if (clampedState.level === "runs") {
2535
+ max = selectedDataset?.runs.length ?? 0;
2536
+ } else if (clampedState.level === "new-evaluation") {
2537
+ max = Math.max(0, visibleEvaluators.length - 1);
2538
+ } else {
2539
+ max = 100;
2540
+ }
2222
2541
  dispatch({ type: "MOVE_DOWN", max });
2223
2542
  return;
2224
2543
  }
@@ -2236,7 +2555,7 @@ function EvalsCliApp({
2236
2555
  }
2237
2556
  return;
2238
2557
  }
2239
- if (isBackKey(key)) {
2558
+ if (isBackKey(key) || input === "\x7F" || input === "\b") {
2240
2559
  dispatch({ type: "BACK" });
2241
2560
  return;
2242
2561
  }
@@ -2289,7 +2608,8 @@ function EvalsCliApp({
2289
2608
  {
2290
2609
  state: clampedState,
2291
2610
  filteredDatasets,
2292
- selectedDataset
2611
+ selectedDataset,
2612
+ overviewRowCountRef
2293
2613
  }
2294
2614
  );
2295
2615
  }