@m4trix/evals 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -2,17 +2,18 @@
2
2
  'use strict';
3
3
 
4
4
  var fullscreenInk = require('fullscreen-ink');
5
- var React = require('react');
5
+ var React2 = require('react');
6
6
  var ink = require('ink');
7
7
  var jsxRuntime = require('react/jsx-runtime');
8
8
  var path = require('path');
9
- var jsonDiff = require('json-diff');
9
+ var inkChart = require('@pppp606/ink-chart');
10
10
  var crypto = require('crypto');
11
11
  var effect = require('effect');
12
12
  var fs = require('fs');
13
13
  var jitiModule = require('jiti');
14
14
  var promises = require('fs/promises');
15
15
  var url = require('url');
16
+ var diff = require('diff');
16
17
 
17
18
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
18
19
  function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
@@ -35,7 +36,7 @@ function _interopNamespace(e) {
35
36
  return Object.freeze(n);
36
37
  }
37
38
 
38
- var React__default = /*#__PURE__*/_interopDefault(React);
39
+ var React2__default = /*#__PURE__*/_interopDefault(React2);
39
40
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
40
41
 
41
42
  var SEP = " ";
@@ -104,7 +105,7 @@ function getBreadcrumbText(state, datasetName, runLabel) {
104
105
  // src/cli/components/Footer.tsx
105
106
  function getFooterText(state) {
106
107
  if (state.level === "datasets") {
107
- return "\u2191\u2193 move Enter open / search Tab focus q quit";
108
+ return state.focus === "right" ? "\u2191\u2193 scroll Tab focus left / search q quit" : "\u2191\u2193 move Enter open Tab focus right / search q quit";
108
109
  }
109
110
  if (state.level === "runs") {
110
111
  return "\u2191\u2193 move Enter details Backspace datasets Tab focus q quit";
@@ -636,6 +637,7 @@ function createInitialState(data, args) {
636
637
  datasetMenuIndex,
637
638
  runMenuIndex,
638
639
  detailsScrollOffset: 0,
640
+ overviewScrollOffset: 0,
639
641
  selectedEvaluatorIds: data.evaluators.slice(0, 2).map((item) => item.id),
640
642
  evaluatorMenuIndex: 0,
641
643
  searchQuery,
@@ -651,8 +653,11 @@ function reduceCliState(state, action) {
651
653
  if (state.level === "details" && state.focus === "right") {
652
654
  return { ...state, detailsScrollOffset: Math.max(0, state.detailsScrollOffset - 1) };
653
655
  }
656
+ if (state.level === "datasets" && state.focus === "right") {
657
+ return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
658
+ }
654
659
  if (state.level === "datasets") {
655
- return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1) };
660
+ return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1), overviewScrollOffset: 0 };
656
661
  }
657
662
  if (state.level === "runs") {
658
663
  return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
@@ -669,8 +674,11 @@ function reduceCliState(state, action) {
669
674
  if (state.level === "details" && state.focus === "right") {
670
675
  return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
671
676
  }
677
+ if (state.level === "datasets" && state.focus === "right") {
678
+ return { ...state, overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1) };
679
+ }
672
680
  if (state.level === "datasets") {
673
- return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1) };
681
+ return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1), overviewScrollOffset: 0 };
674
682
  }
675
683
  if (state.level === "runs") {
676
684
  return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
@@ -746,292 +754,6 @@ function reduceCliState(state, action) {
746
754
  }
747
755
  return state;
748
756
  }
749
- var LEFT_PANE_WIDTH2 = 44;
750
- function DatasetsView({
751
- state,
752
- filteredDatasets,
753
- selectedDataset
754
- }) {
755
- const leftFocused = state.focus === "left";
756
- const rightFocused = state.focus === "right";
757
- return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
758
- /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
759
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
760
- /* @__PURE__ */ jsxRuntime.jsx(
761
- ListItem,
762
- {
763
- selected: state.datasetMenuIndex === 0,
764
- label: "New evaluation",
765
- itemKey: "datasets-new-eval"
766
- }
767
- ),
768
- filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsxRuntime.jsx(
769
- ListItem,
770
- {
771
- selected: state.datasetMenuIndex === index + 1,
772
- label: dataset.name,
773
- itemKey: `dataset-${dataset.id}`
774
- },
775
- dataset.id
776
- ))
777
- ] }),
778
- /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
779
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
780
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." })
781
- ] })
782
- ] });
783
- }
784
- function RunsView({
785
- state,
786
- dataset,
787
- selectedRun
788
- }) {
789
- const runs = dataset?.runs ?? [];
790
- const rightFocused = state.focus === "right";
791
- return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
792
- /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
793
- /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: !selectedRun ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." }) : /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
794
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
795
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
796
- " ",
797
- selectedRun.label,
798
- " ",
799
- /* @__PURE__ */ jsxRuntime.jsx(StatusText, { status: selectedRun.status })
800
- ] }),
801
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
802
- "Commit: ",
803
- selectedRun.meta.commit,
804
- " Branch: ",
805
- selectedRun.meta.branch,
806
- " ",
807
- "Seed: ",
808
- selectedRun.meta.seed
809
- ] }),
810
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
811
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overall" }),
812
- /* @__PURE__ */ jsxRuntime.jsx(
813
- TextBar,
814
- {
815
- label: "pass rate",
816
- value: selectedRun.performance.passRate,
817
- format: (v) => `${v}%`
818
- }
819
- ),
820
- /* @__PURE__ */ jsxRuntime.jsx(
821
- TextBar,
822
- {
823
- label: "avg score",
824
- value: Math.round(selectedRun.performance.avgScore * 100)
825
- }
826
- ),
827
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
828
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
829
- selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
830
- TextBar,
831
- {
832
- label: dimension.name,
833
- value: dimension.score
834
- },
835
- dimension.name
836
- )),
837
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
838
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
839
- /* @__PURE__ */ jsxRuntime.jsx(
840
- Sparkline,
841
- {
842
- data: selectedRun.performance.latencyHistoryMs ?? [
843
- selectedRun.performance.latencyAvgMs - 40,
844
- selectedRun.performance.latencyAvgMs - 10,
845
- selectedRun.performance.latencyAvgMs + 20,
846
- selectedRun.performance.latencyP95Ms - 80,
847
- selectedRun.performance.latencyP95Ms
848
- ],
849
- width: 24
850
- }
851
- )
852
- ] }) })
853
- ] });
854
- }
855
-
856
- // src/evals/metric.ts
857
- var registry = /* @__PURE__ */ new Map();
858
- var Metric = {
859
- of(config) {
860
- const def = {
861
- id: config.id,
862
- name: config.name,
863
- aggregate: config.aggregate,
864
- format: config.format,
865
- make: (data) => ({ id: config.id, data })
866
- };
867
- registry.set(config.id, def);
868
- return def;
869
- }
870
- };
871
- function getMetricById(id) {
872
- return registry.get(id);
873
- }
874
-
875
- // src/evals/score.ts
876
- var registry2 = /* @__PURE__ */ new Map();
877
- var Score = {
878
- of(config) {
879
- const def = {
880
- id: config.id,
881
- name: config.name,
882
- displayStrategy: config.displayStrategy,
883
- aggregate: config.aggregate,
884
- format: config.format,
885
- make: (data, options) => {
886
- const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
887
- return {
888
- id: config.id,
889
- data,
890
- ...passed !== void 0 && { passed }
891
- };
892
- }
893
- };
894
- registry2.set(config.id, def);
895
- return def;
896
- }
897
- };
898
- function getScoreById(id) {
899
- return registry2.get(id);
900
- }
901
-
902
- // src/evals/aggregators.ts
903
- function aggregateAverage(values) {
904
- if (values.length === 0) {
905
- return { value: 0 };
906
- }
907
- const sum = values.reduce((s, v) => s + v.value, 0);
908
- return { value: sum / values.length };
909
- }
910
- function aggregateAll(values) {
911
- return { passed: values.length > 0 && values.every((v) => v.passed) };
912
- }
913
- function aggregateTokenCountSum(values) {
914
- const initial = {
915
- input: 0,
916
- output: 0,
917
- inputCached: 0,
918
- outputCached: 0
919
- };
920
- return values.reduce(
921
- (acc, v) => ({
922
- input: acc.input + (v.input ?? 0),
923
- output: acc.output + (v.output ?? 0),
924
- inputCached: acc.inputCached + (v.inputCached ?? 0),
925
- outputCached: acc.outputCached + (v.outputCached ?? 0)
926
- }),
927
- initial
928
- );
929
- }
930
- function aggregateLatencyAverage(values) {
931
- if (values.length === 0) {
932
- return { ms: 0 };
933
- }
934
- const sum = values.reduce((s, v) => s + v.ms, 0);
935
- return { ms: sum / values.length };
936
- }
937
-
938
- // src/evals/metrics/standard.ts
939
- Metric.of({
940
- id: "token-count",
941
- name: "Tokens",
942
- aggregate: aggregateTokenCountSum,
943
- format: (data, options) => {
944
- const input = data.input ?? 0;
945
- const output = data.output ?? 0;
946
- const inputCached = data.inputCached ?? 0;
947
- const outputCached = data.outputCached ?? 0;
948
- const cached = inputCached + outputCached;
949
- const base = `in:${input} out:${output} cached:${cached}`;
950
- return options?.isAggregated ? `Total: ${base}` : base;
951
- }
952
- });
953
- Metric.of({
954
- id: "latency",
955
- name: "Latency",
956
- aggregate: aggregateLatencyAverage,
957
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
958
- });
959
-
960
- // src/evals/scores/standard.ts
961
- Score.of({
962
- id: "percent",
963
- name: "Score",
964
- displayStrategy: "bar",
965
- format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
966
- aggregate: aggregateAverage
967
- });
968
- Score.of({
969
- id: "binary",
970
- name: "Result",
971
- displayStrategy: "passFail",
972
- format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
973
- aggregate: aggregateAll
974
- });
975
- function createDiffLogEntry(expected, actual, options) {
976
- const diff = jsonDiff.diffString(expected, actual, { color: false });
977
- return {
978
- type: "diff",
979
- label: options?.label,
980
- expected,
981
- actual,
982
- diff: diff || "(no differences)"
983
- };
984
- }
985
- function getDiffLines(entry) {
986
- const raw = jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
987
- return raw.split("\n").map((line) => {
988
- const trimmed = line.trimStart();
989
- if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
990
- return { type: "remove", line };
991
- }
992
- if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
993
- return { type: "add", line };
994
- }
995
- return { type: "context", line };
996
- });
997
- }
998
-
999
- // src/runner/score-utils.ts
1000
- function toNumericScoreFromScores(scores) {
1001
- for (const item of scores) {
1002
- const def = getScoreById(item.id);
1003
- if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
1004
- const value = item.data.value;
1005
- if (typeof value === "number" && Number.isFinite(value)) {
1006
- return value;
1007
- }
1008
- }
1009
- const numeric = toNumericScore(item.data);
1010
- if (numeric !== void 0) {
1011
- return numeric;
1012
- }
1013
- }
1014
- return void 0;
1015
- }
1016
- function toNumericScore(value) {
1017
- if (typeof value === "number" && Number.isFinite(value)) {
1018
- return value;
1019
- }
1020
- if (typeof value !== "object" || value === null) {
1021
- return void 0;
1022
- }
1023
- const obj = value;
1024
- if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
1025
- return obj.score;
1026
- }
1027
- const numberValues = Object.values(value).filter(
1028
- (entry) => typeof entry === "number" && Number.isFinite(entry)
1029
- );
1030
- if (numberValues.length === 0) {
1031
- return void 0;
1032
- }
1033
- return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
1034
- }
1035
757
 
1036
758
  // src/runner/config.ts
1037
759
  var defaultRunnerConfig = {
@@ -1213,75 +935,311 @@ async function loadModuleExports(filePath) {
1213
935
  if (!createJiti2) {
1214
936
  throw new Error("Failed to initialize jiti TypeScript loader");
1215
937
  }
1216
- jitiLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
1217
- interopDefault: true,
1218
- moduleCache: true
1219
- });
938
+ jitiLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
939
+ interopDefault: true,
940
+ moduleCache: true
941
+ });
942
+ }
943
+ const loaded2 = jitiLoader.import ? await jitiLoader.import(filePath) : await Promise.resolve(jitiLoader(filePath));
944
+ return Object.values(loaded2);
945
+ }
946
+ const moduleUrl = url.pathToFileURL(filePath).href;
947
+ const loaded = await import(moduleUrl);
948
+ return Object.values(loaded);
949
+ }
950
+ async function collectDatasetsFromFiles(config) {
951
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
952
+ const matched = files.filter(
953
+ (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
954
+ );
955
+ const found = await Promise.all(
956
+ matched.map(async (absolutePath) => {
957
+ const exports = await loadModuleExports(absolutePath);
958
+ const datasets = exports.filter(isDatasetLike);
959
+ const relPath = path.relative(config.rootDir, absolutePath);
960
+ return datasets.map((dataset) => ({
961
+ id: toId("dataset", relPath, dataset.getName()),
962
+ filePath: relPath,
963
+ dataset
964
+ }));
965
+ })
966
+ );
967
+ return found.flat();
968
+ }
969
+ async function collectEvaluatorsFromFiles(config) {
970
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
971
+ const matched = files.filter(
972
+ (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
973
+ );
974
+ const found = await Promise.all(
975
+ matched.map(async (absolutePath) => {
976
+ const exports = await loadModuleExports(absolutePath);
977
+ const evaluators = exports.filter(isEvaluatorLike);
978
+ const relPath = path.relative(config.rootDir, absolutePath);
979
+ return evaluators.map((evaluator) => ({
980
+ id: toId("evaluator", relPath, evaluator.getName()),
981
+ filePath: relPath,
982
+ evaluator
983
+ }));
984
+ })
985
+ );
986
+ return found.flat();
987
+ }
988
+ async function collectTestCasesFromFiles(config) {
989
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
990
+ const matched = files.filter(
991
+ (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
992
+ );
993
+ const found = await Promise.all(
994
+ matched.map(async (absolutePath) => {
995
+ const exports = await loadModuleExports(absolutePath);
996
+ const testCases = exports.filter(isTestCaseLike);
997
+ const relPath = path.relative(config.rootDir, absolutePath);
998
+ return testCases.map((testCase) => ({
999
+ id: toId("test-case", relPath, testCase.getName()),
1000
+ filePath: relPath,
1001
+ testCase
1002
+ }));
1003
+ })
1004
+ );
1005
+ return found.flat();
1006
+ }
1007
/**
 * Renders a value as pretty-printed JSON text (2-space indent) so it can be
 * compared line by line.
 *
 * Always returns a string: values that JSON cannot represent fall back to
 * `String(value)`.
 *
 * @param {unknown} value - Any value to serialize.
 * @returns {string} JSON text, or the `String()` form when JSON is unavailable.
 */
function toJsonLines(value) {
  try {
    const json = JSON.stringify(value, null, 2);
    // JSON.stringify yields `undefined` (not a string) for undefined,
    // functions and symbols; only accept a real string result.
    if (typeof json === "string") {
      return json;
    }
  } catch {
    // Circular structures and BigInt make JSON.stringify throw; fall through.
  }
  return String(value);
}
1014
+ function formatDiffString(changes) {
1015
+ const lines = [];
1016
+ for (const part of changes) {
1017
+ const prefix = part.added ? "+" : part.removed ? "-" : " ";
1018
+ const partLines = part.value.split("\n");
1019
+ if (partLines[partLines.length - 1] === "") {
1020
+ partLines.pop();
1021
+ }
1022
+ for (const line of partLines) {
1023
+ lines.push(`${prefix} ${line}`);
1024
+ }
1025
+ }
1026
+ return lines.join("\n");
1027
+ }
1028
/**
 * Builds a line-based textual diff between two values.
 *
 * Both values are serialized with `toJsonLines`, compared with
 * `diff.diffLines`, and rendered with `formatDiffString`.
 *
 * @param {unknown} expected - The reference value.
 * @param {unknown} actual - The value to compare against the reference.
 * @returns {string} The formatted diff, or `""` when nothing was added or
 *   removed, so callers can substitute their own "no differences" text.
 */
function createDiffString(expected, actual) {
  const changes = diff.diffLines(toJsonLines(expected), toJsonLines(actual));
  // An all-context result means the inputs serialize identically; report that
  // as "no diff" instead of echoing the whole value back as context lines.
  const hasDifference = changes.some((part) => part.added || part.removed);
  return hasDifference ? formatDiffString(changes) : "";
}
1034
/**
 * Creates a log entry of type `"diff"` describing how `actual` differs from
 * `expected`.
 *
 * @param {unknown} expected - The reference value.
 * @param {unknown} actual - The observed value.
 * @param {{ label?: string }} [options] - Optional label stored on the entry.
 * @returns {{ type: "diff", label: (string|undefined), expected: unknown, actual: unknown, diff: string }}
 *   The entry; `diff` is `"(no differences)"` when the diff text is empty.
 */
function createDiffLogEntry(expected, actual, options) {
  // Named `diffText` so it does not shadow the module-level `diff` import.
  const diffText = createDiffString(expected, actual);
  return {
    type: "diff",
    label: options?.label,
    expected,
    actual,
    diff: diffText || "(no differences)"
  };
}
1044
/**
 * Recomputes the diff for a log entry and splits it into classified lines.
 *
 * @param {{ expected: unknown, actual: unknown }} entry - Entry holding the
 *   two values to compare.
 * @returns {Array<{ type: "remove" | "add" | "context", line: string }>}
 *   One item per diff line; a single context line `"(no differences)"` when
 *   the diff text is empty.
 */
function getDiffLines(entry) {
  const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
  return raw.split("\n").map((line) => {
    // The diff marker is always the first character of the line
    // (formatDiffString emits `${prefix} ${line}`). Inspect column 0 only:
    // trimming first would treat indented JSON content such as `-5` on an
    // unchanged line as a removal.
    if (line.startsWith("-") && !line.startsWith("---")) {
      return { type: "remove", line };
    }
    if (line.startsWith("+") && !line.startsWith("+++")) {
      return { type: "add", line };
    }
    return { type: "context", line };
  });
}
1057
+
1058
+ // src/evals/metric.ts
1059
+ var registry = /* @__PURE__ */ new Map();
1060
+ var Metric = {
1061
+ of(config) {
1062
+ const def = {
1063
+ id: config.id,
1064
+ name: config.name,
1065
+ aggregate: config.aggregate,
1066
+ format: config.format,
1067
+ make: (data) => ({ id: config.id, data })
1068
+ };
1069
+ registry.set(config.id, def);
1070
+ return def;
1071
+ }
1072
+ };
1073
+ function getMetricById(id) {
1074
+ return registry.get(id);
1075
+ }
1076
+
1077
+ // src/evals/score.ts
1078
+ var registry2 = /* @__PURE__ */ new Map();
1079
+ var Score = {
1080
+ of(config) {
1081
+ const def = {
1082
+ id: config.id,
1083
+ name: config.name,
1084
+ displayStrategy: config.displayStrategy,
1085
+ aggregate: config.aggregate,
1086
+ format: config.format,
1087
+ make: (data, options) => {
1088
+ const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
1089
+ return {
1090
+ id: config.id,
1091
+ data,
1092
+ ...passed !== void 0 && { passed }
1093
+ };
1094
+ }
1095
+ };
1096
+ registry2.set(config.id, def);
1097
+ return def;
1098
+ }
1099
+ };
1100
+ function getScoreById(id) {
1101
+ return registry2.get(id);
1102
+ }
1103
+
1104
+ // src/evals/aggregators.ts
1105
/**
 * Aggregates numeric score values into their mean and sample standard
 * deviation.
 *
 * @param {Array<{ value: number }>} values - Individual score payloads.
 * @returns {{ value: number, stdDev?: number, count: number }}
 *   `value` is the arithmetic mean (0 for empty input), `count` the number of
 *   inputs, and `stdDev` the sample (n - 1) standard deviation, left
 *   `undefined` when fewer than two values are given.
 */
function aggregateAverageWithVariance(values) {
  const count = values.length;
  if (count === 0) {
    return { value: 0, count: 0 };
  }
  const mean = values.reduce((sum, item) => sum + item.value, 0) / count;
  let stdDev;
  if (count >= 2) {
    // Two-pass form: summing squared deviations from the mean avoids the
    // catastrophic cancellation of `sumSq - n * mean^2`, which can go
    // negative or lose all precision when values are large and close together.
    const squaredDeviations = values.reduce(
      (sum, item) => sum + (item.value - mean) ** 2,
      0
    );
    stdDev = Math.sqrt(squaredDeviations / (count - 1));
  }
  return { value: mean, stdDev, count };
}
1119
+ function aggregateAll(values) {
1120
+ const total = values.length;
1121
+ const passedCount = values.filter((v) => v.passed).length;
1122
+ return {
1123
+ passed: total > 0 && values.every((v) => v.passed),
1124
+ passedCount,
1125
+ totalCount: total
1126
+ };
1127
+ }
1128
+ function aggregateTokenCountSum(values) {
1129
+ const initial = {
1130
+ input: 0,
1131
+ output: 0,
1132
+ inputCached: 0,
1133
+ outputCached: 0
1134
+ };
1135
+ return values.reduce(
1136
+ (acc, v) => ({
1137
+ input: acc.input + (v.input ?? 0),
1138
+ output: acc.output + (v.output ?? 0),
1139
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
1140
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
1141
+ }),
1142
+ initial
1143
+ );
1144
+ }
1145
+ function aggregateLatencyAverage(values) {
1146
+ if (values.length === 0) {
1147
+ return { ms: 0 };
1148
+ }
1149
+ const sum = values.reduce((s, v) => s + v.ms, 0);
1150
+ return { ms: sum / values.length };
1151
+ }
1152
+
1153
+ // src/evals/metrics/standard.ts
1154
+ Metric.of({
1155
+ id: "token-count",
1156
+ name: "Tokens",
1157
+ aggregate: aggregateTokenCountSum,
1158
+ format: (data, options) => {
1159
+ const input = data.input ?? 0;
1160
+ const output = data.output ?? 0;
1161
+ const inputCached = data.inputCached ?? 0;
1162
+ const outputCached = data.outputCached ?? 0;
1163
+ const cached = inputCached + outputCached;
1164
+ const base = `in:${input} out:${output} cached:${cached}`;
1165
+ return options?.isAggregated ? `Total: ${base}` : base;
1166
+ }
1167
+ });
1168
+ Metric.of({
1169
+ id: "latency",
1170
+ name: "Latency",
1171
+ aggregate: aggregateLatencyAverage,
1172
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1173
+ });
1174
+
1175
+ // src/evals/scores/standard.ts
1176
+ Score.of({
1177
+ id: "percent",
1178
+ name: "Score",
1179
+ displayStrategy: "bar",
1180
+ format: (data, options) => {
1181
+ if (options?.isAggregated) {
1182
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
1183
+ }
1184
+ return data.value.toFixed(2);
1185
+ },
1186
+ aggregate: aggregateAverageWithVariance
1187
+ });
1188
+ Score.of({
1189
+ id: "binary",
1190
+ name: "Result",
1191
+ displayStrategy: "passFail",
1192
+ format: (data, options) => {
1193
+ if (options?.isAggregated) {
1194
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
1195
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1196
+ return `${base} (${data.passedCount}/${data.totalCount})`;
1197
+ }
1198
+ return base;
1199
+ }
1200
+ return data.passed ? "PASSED" : "NOT PASSED";
1201
+ },
1202
+ aggregate: aggregateAll
1203
+ });
1204
+
1205
+ // src/runner/score-utils.ts
1206
+ function toNumericScoreFromScores(scores) {
1207
+ for (const item of scores) {
1208
+ const def = getScoreById(item.id);
1209
+ if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
1210
+ const value = item.data.value;
1211
+ if (typeof value === "number" && Number.isFinite(value)) {
1212
+ return value;
1213
+ }
1214
+ }
1215
+ const numeric = toNumericScore(item.data);
1216
+ if (numeric !== void 0) {
1217
+ return numeric;
1220
1218
  }
1221
- const loaded2 = jitiLoader.import ? await jitiLoader.import(filePath) : await Promise.resolve(jitiLoader(filePath));
1222
- return Object.values(loaded2);
1223
1219
  }
1224
- const moduleUrl = url.pathToFileURL(filePath).href;
1225
- const loaded = await import(moduleUrl);
1226
- return Object.values(loaded);
1227
- }
1228
- async function collectDatasetsFromFiles(config) {
1229
- const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1230
- const matched = files.filter(
1231
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
1232
- );
1233
- const found = await Promise.all(
1234
- matched.map(async (absolutePath) => {
1235
- const exports = await loadModuleExports(absolutePath);
1236
- const datasets = exports.filter(isDatasetLike);
1237
- const relPath = path.relative(config.rootDir, absolutePath);
1238
- return datasets.map((dataset) => ({
1239
- id: toId("dataset", relPath, dataset.getName()),
1240
- filePath: relPath,
1241
- dataset
1242
- }));
1243
- })
1244
- );
1245
- return found.flat();
1246
- }
1247
- async function collectEvaluatorsFromFiles(config) {
1248
- const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1249
- const matched = files.filter(
1250
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
1251
- );
1252
- const found = await Promise.all(
1253
- matched.map(async (absolutePath) => {
1254
- const exports = await loadModuleExports(absolutePath);
1255
- const evaluators = exports.filter(isEvaluatorLike);
1256
- const relPath = path.relative(config.rootDir, absolutePath);
1257
- return evaluators.map((evaluator) => ({
1258
- id: toId("evaluator", relPath, evaluator.getName()),
1259
- filePath: relPath,
1260
- evaluator
1261
- }));
1262
- })
1263
- );
1264
- return found.flat();
1220
+ return void 0;
1265
1221
  }
1266
- async function collectTestCasesFromFiles(config) {
1267
- const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1268
- const matched = files.filter(
1269
- (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
1270
- );
1271
- const found = await Promise.all(
1272
- matched.map(async (absolutePath) => {
1273
- const exports = await loadModuleExports(absolutePath);
1274
- const testCases = exports.filter(isTestCaseLike);
1275
- const relPath = path.relative(config.rootDir, absolutePath);
1276
- return testCases.map((testCase) => ({
1277
- id: toId("test-case", relPath, testCase.getName()),
1278
- filePath: relPath,
1279
- testCase
1280
- }));
1281
- })
1222
+ function toNumericScore(value) {
1223
+ if (typeof value === "number" && Number.isFinite(value)) {
1224
+ return value;
1225
+ }
1226
+ if (typeof value !== "object" || value === null) {
1227
+ return void 0;
1228
+ }
1229
+ const obj = value;
1230
+ if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
1231
+ return obj.score;
1232
+ }
1233
+ const numberValues = Object.values(value).filter(
1234
+ (entry) => typeof entry === "number" && Number.isFinite(entry)
1282
1235
  );
1283
- return found.flat();
1236
+ if (numberValues.length === 0) {
1237
+ return void 0;
1238
+ }
1239
+ return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
1284
1240
  }
1241
+
1242
+ // src/runner/execution.ts
1285
1243
  function computeEvaluatorPassed(evaluator, result, scores) {
1286
1244
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1287
1245
  if (scoresWithPassed.length > 0) {
@@ -1952,6 +1910,240 @@ var EffectRunner = class {
1952
1910
  );
1953
1911
  }
1954
1912
  };
1913
+ var LEFT_PANE_WIDTH2 = 44;
1914
+ var MAX_RUNS_FOR_CHART = 12;
1915
+ var MAX_RUNS_FOR_TREND = 20;
1916
+ var TREND_BATCH_SIZE = 4;
1917
+ function extractRunAverageScore(testCases) {
1918
+ const scores = [];
1919
+ for (const tc of testCases) {
1920
+ for (const es of tc.evaluatorScores) {
1921
+ const n = toNumericScoreFromScores(es.scores);
1922
+ if (n !== void 0) {
1923
+ scores.push(n);
1924
+ }
1925
+ }
1926
+ }
1927
+ if (scores.length === 0)
1928
+ return void 0;
1929
+ return scores.reduce((a, b) => a + b, 0) / scores.length;
1930
+ }
1931
/**
 * Loads the average score of each run from its artifact file.
 *
 * Runs without `meta.artifact`, runs whose artifact cannot be parsed, and
 * runs with no numeric score are omitted. The result keeps the order of
 * `runs`.
 *
 * @param {Array<{ id: string, label: string, meta?: { artifact?: string } }>} runs
 *   Runs to load.
 * @returns {Promise<Array<{ runId: string, label: string, value: number }>>}
 *   One item per run that produced an average score.
 */
async function loadRunScores(runs) {
  // The artifact reads are independent, so start them together instead of
  // awaiting each file in turn; Promise.all preserves input order.
  const perRun = await Promise.all(
    runs.map(async (run) => {
      const artifact = run.meta?.artifact;
      if (!artifact) {
        return void 0;
      }
      try {
        const testCases = await parseArtifactFile(path.resolve(artifact));
        const avg = extractRunAverageScore(testCases);
        return avg === void 0 ? void 0 : { runId: run.id, label: run.label, value: avg };
      } catch {
        // Best effort: a missing or unparsable artifact only drops this run.
        return void 0;
      }
    })
  );
  return perRun.filter((item) => item !== void 0);
}
1953
/**
 * Averages consecutive groups of `batchSize` values.
 *
 * The final group may be shorter than `batchSize`; it is averaged over the
 * values it actually contains.
 *
 * @param {number[]} values - Values to group, in order.
 * @param {number} batchSize - Group size; must be a positive integer.
 * @returns {number[]} One average per group (empty for empty input).
 * @throws {RangeError} When `batchSize` is not a positive integer. A zero or
 *   negative step would never advance the loop.
 */
function batchAverage(values, batchSize) {
  if (!Number.isInteger(batchSize) || batchSize < 1) {
    throw new RangeError(`batchSize must be a positive integer, received ${batchSize}`);
  }
  const batches = [];
  for (let start = 0; start < values.length; start += batchSize) {
    const group = values.slice(start, start + batchSize);
    batches.push(group.reduce((sum, entry) => sum + entry, 0) / group.length);
  }
  return batches;
}
1963
+ var OVERVIEW_PAGE_SIZE = 15;
1964
/**
 * Datasets screen: a dataset list in the left pane and a scrollable overview
 * of the selected dataset in the right pane.
 *
 * The overview shows the dataset's overview text and, once run scores have
 * been loaded from the run artifacts, one score bar per run plus a batched
 * trend graph.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; reads `focus`, `datasetMenuIndex`
 *   and `overviewScrollOffset`.
 * @param {Array<{ id: string, name: string }>} props.filteredDatasets -
 *   Datasets listed in the left pane.
 * @param {object} [props.selectedDataset] - Dataset whose overview is shown;
 *   reads `id`, `overview` and `runs`.
 * @param {{ current: number }} [props.overviewRowCountRef] - When given,
 *   receives the total number of overview rows on every render.
 *   NOTE(review): presumably read by the parent to bound scrolling; confirm.
 * @returns The rendered two-pane fragment.
 */
function DatasetsView({
  state,
  filteredDatasets,
  selectedDataset,
  overviewRowCountRef
}) {
  const leftFocused = state.focus === "left";
  const rightFocused = state.focus === "right";
  const [runScores, setRunScores] = React2.useState([]);
  const [loading, setLoading] = React2.useState(false);
  const datasetId = selectedDataset?.id;
  const runCount = selectedDataset?.runs?.length ?? 0;
  React2.useEffect(() => {
    if (runCount === 0) {
      setRunScores([]);
      // A cancelled in-flight load no longer resets the flag, so clear it here.
      setLoading(false);
      return void 0;
    }
    // Guards against a slow load for a previously selected dataset resolving
    // after a newer selection and overwriting its scores.
    let cancelled = false;
    setLoading(true);
    const runs = selectedDataset.runs.slice(0, MAX_RUNS_FOR_TREND);
    loadRunScores(runs).then(
      (scores) => {
        if (!cancelled) {
          setRunScores(scores);
        }
      },
      () => {
        if (!cancelled) {
          setRunScores([]);
        }
      }
    ).finally(() => {
      if (!cancelled) {
        setLoading(false);
      }
    });
    return () => {
      cancelled = true;
    };
    // Keyed on id and run count rather than object identity, as before.
  }, [datasetId, runCount]);
  const overviewRows = React2.useMemo(() => {
    const rows = [
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." }, "overview")
    ];
    if (runCount === 0) {
      return rows;
    }
    if (loading) {
      rows.push(
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Loading run scores\u2026" }, "loading")
      );
      return rows;
    }
    if (runScores.length === 0) {
      return rows;
    }
    // Derived inside the memo so the dependency list holds only stable values;
    // arrays rebuilt on every render would defeat the memoization.
    // Both lists are reversed relative to the order of `runScores`.
    const barData = runScores.slice(0, MAX_RUNS_FOR_CHART).reverse();
    const trendValues = runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse();
    const trendBatched = batchAverage(trendValues, TREND_BATCH_SIZE);
    rows.push(
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Scores (last runs)" }, "scores-header")
    );
    for (const d of barData) {
      rows.push(
        /* @__PURE__ */ jsxRuntime.jsx(
          TextBar,
          {
            label: d.label,
            value: d.value,
            labelWidth: 14,
            barWidth: 24,
            max: 100,
            format: (v) => v.toFixed(1)
          },
          d.runId
        )
      );
    }
    if (trendBatched.length > 0) {
      rows.push(
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: `Avg trend (last ${MAX_RUNS_FOR_TREND}, batched by ${TREND_BATCH_SIZE})` }, "trend-header")
      );
      rows.push(
        /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { children: /* @__PURE__ */ jsxRuntime.jsx(
          inkChart.LineGraph,
          {
            data: [{ values: trendBatched, color: "cyan" }],
            height: 5,
            width: 45,
            showYAxis: true,
            xLabels: ["older", "newer"]
          }
        ) }, "trend-graph")
      );
    }
    return rows;
  }, [selectedDataset?.overview, runCount, loading, runScores]);
  if (overviewRowCountRef) {
    overviewRowCountRef.current = overviewRows.length;
  }
  // Show a window of OVERVIEW_PAGE_SIZE rows starting at the scroll offset.
  const offset = Math.max(0, state.overviewScrollOffset ?? 0);
  const visibleRows = overviewRows.slice(offset, offset + OVERVIEW_PAGE_SIZE);
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
    /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
      /* @__PURE__ */ jsxRuntime.jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === 0,
          label: "New evaluation",
          itemKey: "datasets-new-eval"
        }
      ),
      // Index 0 is the "New evaluation" entry, so datasets start at index 1.
      filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsxRuntime.jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === index + 1,
          label: dataset.name,
          itemKey: `dataset-${dataset.id}`
        },
        dataset.id
      ))
    ] }),
    /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visibleRows.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { children: row }, offset + i)) })
    ] })
  ] });
}
2076
+ function RunsView({
2077
+ state,
2078
+ dataset,
2079
+ selectedRun
2080
+ }) {
2081
+ const runs = dataset?.runs ?? [];
2082
+ const rightFocused = state.focus === "right";
2083
+ return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2084
+ /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
2085
+ /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: !selectedRun ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." }) : /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
2086
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
2087
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
2088
+ " ",
2089
+ selectedRun.label,
2090
+ " ",
2091
+ /* @__PURE__ */ jsxRuntime.jsx(StatusText, { status: selectedRun.status })
2092
+ ] }),
2093
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2094
+ "Commit: ",
2095
+ selectedRun.meta.commit,
2096
+ " Branch: ",
2097
+ selectedRun.meta.branch,
2098
+ " ",
2099
+ "Seed: ",
2100
+ selectedRun.meta.seed
2101
+ ] }),
2102
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
2103
+ /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overall" }),
2104
+ /* @__PURE__ */ jsxRuntime.jsx(
2105
+ TextBar,
2106
+ {
2107
+ label: "pass rate",
2108
+ value: selectedRun.performance.passRate,
2109
+ format: (v) => `${v}%`
2110
+ }
2111
+ ),
2112
+ /* @__PURE__ */ jsxRuntime.jsx(
2113
+ TextBar,
2114
+ {
2115
+ label: "avg score",
2116
+ value: Math.round(selectedRun.performance.avgScore * 100)
2117
+ }
2118
+ ),
2119
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
2120
+ /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
2121
+ selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
2122
+ TextBar,
2123
+ {
2124
+ label: dimension.name,
2125
+ value: dimension.score
2126
+ },
2127
+ dimension.name
2128
+ )),
2129
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
2130
+ /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
2131
+ /* @__PURE__ */ jsxRuntime.jsx(
2132
+ Sparkline,
2133
+ {
2134
+ data: selectedRun.performance.latencyHistoryMs ?? [
2135
+ selectedRun.performance.latencyAvgMs - 40,
2136
+ selectedRun.performance.latencyAvgMs - 10,
2137
+ selectedRun.performance.latencyAvgMs + 20,
2138
+ selectedRun.performance.latencyP95Ms - 80,
2139
+ selectedRun.performance.latencyP95Ms
2140
+ ],
2141
+ width: 24
2142
+ }
2143
+ )
2144
+ ] }) })
2145
+ ] });
2146
+ }
1955
2147
  var DETAILS_PAGE_SIZE = 20;
1956
2148
  function scoreColor(score) {
1957
2149
  if (score >= 80)
@@ -1960,7 +2152,7 @@ function scoreColor(score) {
1960
2152
  return "yellow";
1961
2153
  return "red";
1962
2154
  }
1963
- function formatScorePart(item, scoreToColor) {
2155
+ function formatScorePart(item) {
1964
2156
  const def = getScoreById(item.id);
1965
2157
  if (!def) {
1966
2158
  const numeric = toNumericScore(item.data);
@@ -1990,7 +2182,7 @@ function CheckRow({
1990
2182
  " ",
1991
2183
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color, bold: true, children: status }),
1992
2184
  detail ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1993
- " (",
2185
+ " (",
1994
2186
  detail,
1995
2187
  ")"
1996
2188
  ] }) : null
@@ -2010,21 +2202,21 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2010
2202
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2011
2203
  "Model: ",
2012
2204
  meta.model,
2013
- " Provider: ",
2205
+ " Provider: ",
2014
2206
  meta.provider
2015
2207
  ] }, "meta-1"),
2016
2208
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2017
2209
  "Commit: ",
2018
2210
  meta.commit,
2019
- " Branch: ",
2211
+ " Branch: ",
2020
2212
  meta.branch,
2021
- " Seed: ",
2213
+ " Seed: ",
2022
2214
  meta.seed
2023
2215
  ] }, "meta-2"),
2024
2216
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2025
2217
  "Duration: ",
2026
2218
  meta.duration,
2027
- " Concurrency: ",
2219
+ " Concurrency: ",
2028
2220
  meta.concurrency
2029
2221
  ] }, "meta-3"),
2030
2222
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
@@ -2036,7 +2228,15 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2036
2228
  ...dimensions.map((d) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
2037
2229
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp2"),
2038
2230
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
2039
- ...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
2231
+ ...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
2232
+ CheckRow,
2233
+ {
2234
+ name: c.name,
2235
+ passed: c.passed,
2236
+ detail: c.detail
2237
+ },
2238
+ `chk-${c.name}`
2239
+ )),
2040
2240
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp3"),
2041
2241
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Performance" }, "perf-h"),
2042
2242
  /* @__PURE__ */ jsxRuntime.jsx(
@@ -2049,16 +2249,16 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2049
2249
  "perf-rate"
2050
2250
  ),
2051
2251
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2052
- "latency avg ",
2252
+ "latency avg ",
2053
2253
  performance.latencyAvgMs,
2054
- "ms p95 ",
2254
+ "ms p95 ",
2055
2255
  performance.latencyP95Ms,
2056
2256
  "ms"
2057
2257
  ] }, "perf-lat"),
2058
2258
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2059
- "tokens avg ",
2259
+ "tokens avg ",
2060
2260
  performance.tokensAvg,
2061
- " p95 ",
2261
+ " p95 ",
2062
2262
  performance.tokensP95
2063
2263
  ] }, "perf-tok"),
2064
2264
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp4"),
@@ -2168,12 +2368,12 @@ function RunDetailsView({
2168
2368
  }) {
2169
2369
  const runs = dataset?.runs ?? [];
2170
2370
  const rightFocused = state.focus === "right";
2171
- const [testCases, setTestCases] = React.useState([]);
2172
- const evaluatorNameById = React__default.default.useMemo(
2371
+ const [testCases, setTestCases] = React2.useState([]);
2372
+ const evaluatorNameById = React2__default.default.useMemo(
2173
2373
  () => new Map(evaluators.map((e) => [e.id, e.name])),
2174
2374
  [evaluators]
2175
2375
  );
2176
- React.useEffect(() => {
2376
+ React2.useEffect(() => {
2177
2377
  if (!selectedRun?.meta?.artifact) {
2178
2378
  setTestCases([]);
2179
2379
  return;
@@ -2192,7 +2392,7 @@ function RunDetailsView({
2192
2392
  const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
2193
2393
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2194
2394
  /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
2195
- /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React__default.default.Fragment, { children: row }, i)) }) })
2395
+ /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React2__default.default.Fragment, { children: row }, i)) }) })
2196
2396
  ] });
2197
2397
  }
2198
2398
  var LEFT_PANE_WIDTH3 = 44;
@@ -2272,16 +2472,17 @@ function EvalsCliApp({
2272
2472
  }) {
2273
2473
  const { exit } = ink.useApp();
2274
2474
  const { width: stdoutWidth, height: stdoutHeight } = fullscreenInk.useScreenSize();
2275
- const [liveData, setLiveData] = React.useState(data);
2276
- const [runtimeMessage, setRuntimeMessage] = React.useState();
2277
- const [state, dispatch] = React.useReducer(
2475
+ const [liveData, setLiveData] = React2.useState(data);
2476
+ const [runtimeMessage, setRuntimeMessage] = React2.useState();
2477
+ const overviewRowCountRef = React2.useRef(0);
2478
+ const [state, dispatch] = React2.useReducer(
2278
2479
  reduceCliState,
2279
2480
  createInitialState(data, args)
2280
2481
  );
2281
- React.useEffect(() => {
2482
+ React2.useEffect(() => {
2282
2483
  setLiveData(data);
2283
2484
  }, [data]);
2284
- React.useEffect(() => {
2485
+ React2.useEffect(() => {
2285
2486
  if (!runner) {
2286
2487
  return void 0;
2287
2488
  }
@@ -2300,7 +2501,7 @@ function EvalsCliApp({
2300
2501
  }
2301
2502
  });
2302
2503
  }, [runner]);
2303
- const filteredDatasets = React.useMemo(
2504
+ const filteredDatasets = React2.useMemo(
2304
2505
  () => getFilteredDatasets(liveData, state.searchQuery),
2305
2506
  [liveData, state.searchQuery]
2306
2507
  );
@@ -2353,7 +2554,16 @@ function EvalsCliApp({
2353
2554
  return;
2354
2555
  }
2355
2556
  if (key.downArrow) {
2356
- const max = clampedState.level === "datasets" ? filteredDatasets.length : clampedState.level === "runs" ? selectedDataset?.runs.length ?? 0 : clampedState.level === "new-evaluation" ? Math.max(0, visibleEvaluators.length - 1) : 100;
2557
+ let max;
2558
+ if (clampedState.level === "datasets") {
2559
+ max = clampedState.focus === "right" ? Math.max(0, overviewRowCountRef.current - OVERVIEW_PAGE_SIZE) : filteredDatasets.length;
2560
+ } else if (clampedState.level === "runs") {
2561
+ max = selectedDataset?.runs.length ?? 0;
2562
+ } else if (clampedState.level === "new-evaluation") {
2563
+ max = Math.max(0, visibleEvaluators.length - 1);
2564
+ } else {
2565
+ max = 100;
2566
+ }
2357
2567
  dispatch({ type: "MOVE_DOWN", max });
2358
2568
  return;
2359
2569
  }
@@ -2371,7 +2581,7 @@ function EvalsCliApp({
2371
2581
  }
2372
2582
  return;
2373
2583
  }
2374
- if (isBackKey(key)) {
2584
+ if (isBackKey(key) || input === "\x7F" || input === "\b") {
2375
2585
  dispatch({ type: "BACK" });
2376
2586
  return;
2377
2587
  }
@@ -2424,7 +2634,8 @@ function EvalsCliApp({
2424
2634
  {
2425
2635
  state: clampedState,
2426
2636
  filteredDatasets,
2427
- selectedDataset
2637
+ selectedDataset,
2638
+ overviewRowCountRef
2428
2639
  }
2429
2640
  );
2430
2641
  }