@m4trix/evals 0.13.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -2,17 +2,18 @@
2
2
  'use strict';
3
3
 
4
4
  var fullscreenInk = require('fullscreen-ink');
5
- var React = require('react');
5
+ var React2 = require('react');
6
6
  var ink = require('ink');
7
7
  var jsxRuntime = require('react/jsx-runtime');
8
8
  var path = require('path');
9
- var jsonDiff = require('json-diff');
9
+ var inkChart = require('@pppp606/ink-chart');
10
10
  var crypto = require('crypto');
11
11
  var effect = require('effect');
12
12
  var fs = require('fs');
13
13
  var jitiModule = require('jiti');
14
14
  var promises = require('fs/promises');
15
15
  var url = require('url');
16
+ var diff = require('diff');
16
17
 
17
18
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
18
19
  function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
@@ -35,7 +36,7 @@ function _interopNamespace(e) {
35
36
  return Object.freeze(n);
36
37
  }
37
38
 
38
- var React__default = /*#__PURE__*/_interopDefault(React);
39
+ var React2__default = /*#__PURE__*/_interopDefault(React2);
39
40
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
40
41
 
41
42
  var SEP = " ";
@@ -104,7 +105,7 @@ function getBreadcrumbText(state, datasetName, runLabel) {
104
105
  // src/cli/components/Footer.tsx
105
106
  function getFooterText(state) {
106
107
  if (state.level === "datasets") {
107
- return "\u2191\u2193 move Enter open / search Tab focus q quit";
108
+ return state.focus === "right" ? "\u2191\u2193 scroll Tab focus left / search q quit" : "\u2191\u2193 move Enter open Tab focus right / search q quit";
108
109
  }
109
110
  if (state.level === "runs") {
110
111
  return "\u2191\u2193 move Enter details Backspace datasets Tab focus q quit";
@@ -636,6 +637,7 @@ function createInitialState(data, args) {
636
637
  datasetMenuIndex,
637
638
  runMenuIndex,
638
639
  detailsScrollOffset: 0,
640
+ overviewScrollOffset: 0,
639
641
  selectedEvaluatorIds: data.evaluators.slice(0, 2).map((item) => item.id),
640
642
  evaluatorMenuIndex: 0,
641
643
  searchQuery,
@@ -651,8 +653,11 @@ function reduceCliState(state, action) {
651
653
  if (state.level === "details" && state.focus === "right") {
652
654
  return { ...state, detailsScrollOffset: Math.max(0, state.detailsScrollOffset - 1) };
653
655
  }
656
+ if (state.level === "datasets" && state.focus === "right") {
657
+ return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
658
+ }
654
659
  if (state.level === "datasets") {
655
- return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1) };
660
+ return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1), overviewScrollOffset: 0 };
656
661
  }
657
662
  if (state.level === "runs") {
658
663
  return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
@@ -669,8 +674,11 @@ function reduceCliState(state, action) {
669
674
  if (state.level === "details" && state.focus === "right") {
670
675
  return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
671
676
  }
677
+ if (state.level === "datasets" && state.focus === "right") {
678
+ return { ...state, overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1) };
679
+ }
672
680
  if (state.level === "datasets") {
673
- return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1) };
681
+ return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1), overviewScrollOffset: 0 };
674
682
  }
675
683
  if (state.level === "runs") {
676
684
  return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
@@ -746,292 +754,6 @@ function reduceCliState(state, action) {
746
754
  }
747
755
  return state;
748
756
  }
749
- var LEFT_PANE_WIDTH2 = 44;
750
- function DatasetsView({
751
- state,
752
- filteredDatasets,
753
- selectedDataset
754
- }) {
755
- const leftFocused = state.focus === "left";
756
- const rightFocused = state.focus === "right";
757
- return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
758
- /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
759
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
760
- /* @__PURE__ */ jsxRuntime.jsx(
761
- ListItem,
762
- {
763
- selected: state.datasetMenuIndex === 0,
764
- label: "New evaluation",
765
- itemKey: "datasets-new-eval"
766
- }
767
- ),
768
- filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsxRuntime.jsx(
769
- ListItem,
770
- {
771
- selected: state.datasetMenuIndex === index + 1,
772
- label: dataset.name,
773
- itemKey: `dataset-${dataset.id}`
774
- },
775
- dataset.id
776
- ))
777
- ] }),
778
- /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
779
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
780
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." })
781
- ] })
782
- ] });
783
- }
784
- function RunsView({
785
- state,
786
- dataset,
787
- selectedRun
788
- }) {
789
- const runs = dataset?.runs ?? [];
790
- const rightFocused = state.focus === "right";
791
- return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
792
- /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
793
- /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: !selectedRun ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." }) : /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
794
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
795
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
796
- " ",
797
- selectedRun.label,
798
- " ",
799
- /* @__PURE__ */ jsxRuntime.jsx(StatusText, { status: selectedRun.status })
800
- ] }),
801
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
802
- "Commit: ",
803
- selectedRun.meta.commit,
804
- " Branch: ",
805
- selectedRun.meta.branch,
806
- " ",
807
- "Seed: ",
808
- selectedRun.meta.seed
809
- ] }),
810
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
811
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overall" }),
812
- /* @__PURE__ */ jsxRuntime.jsx(
813
- TextBar,
814
- {
815
- label: "pass rate",
816
- value: selectedRun.performance.passRate,
817
- format: (v) => `${v}%`
818
- }
819
- ),
820
- /* @__PURE__ */ jsxRuntime.jsx(
821
- TextBar,
822
- {
823
- label: "avg score",
824
- value: Math.round(selectedRun.performance.avgScore * 100)
825
- }
826
- ),
827
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
828
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
829
- selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
830
- TextBar,
831
- {
832
- label: dimension.name,
833
- value: dimension.score
834
- },
835
- dimension.name
836
- )),
837
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
838
- /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
839
- /* @__PURE__ */ jsxRuntime.jsx(
840
- Sparkline,
841
- {
842
- data: selectedRun.performance.latencyHistoryMs ?? [
843
- selectedRun.performance.latencyAvgMs - 40,
844
- selectedRun.performance.latencyAvgMs - 10,
845
- selectedRun.performance.latencyAvgMs + 20,
846
- selectedRun.performance.latencyP95Ms - 80,
847
- selectedRun.performance.latencyP95Ms
848
- ],
849
- width: 24
850
- }
851
- )
852
- ] }) })
853
- ] });
854
- }
855
-
856
- // src/evals/metric.ts
857
- var registry = /* @__PURE__ */ new Map();
858
- var Metric = {
859
- of(config) {
860
- const def = {
861
- id: config.id,
862
- name: config.name,
863
- aggregate: config.aggregate,
864
- format: config.format,
865
- make: (data) => ({ id: config.id, data })
866
- };
867
- registry.set(config.id, def);
868
- return def;
869
- }
870
- };
871
- function getMetricById(id) {
872
- return registry.get(id);
873
- }
874
-
875
- // src/evals/score.ts
876
- var registry2 = /* @__PURE__ */ new Map();
877
- var Score = {
878
- of(config) {
879
- const def = {
880
- id: config.id,
881
- name: config.name,
882
- displayStrategy: config.displayStrategy,
883
- aggregate: config.aggregate,
884
- format: config.format,
885
- make: (data, options) => {
886
- const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
887
- return {
888
- id: config.id,
889
- data,
890
- ...passed !== void 0 && { passed }
891
- };
892
- }
893
- };
894
- registry2.set(config.id, def);
895
- return def;
896
- }
897
- };
898
- function getScoreById(id) {
899
- return registry2.get(id);
900
- }
901
-
902
- // src/evals/aggregators.ts
903
- function aggregateAverage(values) {
904
- if (values.length === 0) {
905
- return { value: 0 };
906
- }
907
- const sum = values.reduce((s, v) => s + v.value, 0);
908
- return { value: sum / values.length };
909
- }
910
- function aggregateAll(values) {
911
- return { passed: values.length > 0 && values.every((v) => v.passed) };
912
- }
913
- function aggregateTokenCountSum(values) {
914
- const initial = {
915
- input: 0,
916
- output: 0,
917
- inputCached: 0,
918
- outputCached: 0
919
- };
920
- return values.reduce(
921
- (acc, v) => ({
922
- input: acc.input + (v.input ?? 0),
923
- output: acc.output + (v.output ?? 0),
924
- inputCached: acc.inputCached + (v.inputCached ?? 0),
925
- outputCached: acc.outputCached + (v.outputCached ?? 0)
926
- }),
927
- initial
928
- );
929
- }
930
- function aggregateLatencyAverage(values) {
931
- if (values.length === 0) {
932
- return { ms: 0 };
933
- }
934
- const sum = values.reduce((s, v) => s + v.ms, 0);
935
- return { ms: sum / values.length };
936
- }
937
-
938
- // src/evals/metrics/standard.ts
939
- Metric.of({
940
- id: "token-count",
941
- name: "Tokens",
942
- aggregate: aggregateTokenCountSum,
943
- format: (data, options) => {
944
- const input = data.input ?? 0;
945
- const output = data.output ?? 0;
946
- const inputCached = data.inputCached ?? 0;
947
- const outputCached = data.outputCached ?? 0;
948
- const cached = inputCached + outputCached;
949
- const base = `in:${input} out:${output} cached:${cached}`;
950
- return options?.isAggregated ? `Total: ${base}` : base;
951
- }
952
- });
953
- Metric.of({
954
- id: "latency",
955
- name: "Latency",
956
- aggregate: aggregateLatencyAverage,
957
- format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
958
- });
959
-
960
- // src/evals/scores/standard.ts
961
- Score.of({
962
- id: "percent",
963
- name: "Score",
964
- displayStrategy: "bar",
965
- format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
966
- aggregate: aggregateAverage
967
- });
968
- Score.of({
969
- id: "binary",
970
- name: "Result",
971
- displayStrategy: "passFail",
972
- format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
973
- aggregate: aggregateAll
974
- });
975
- function createDiffLogEntry(expected, actual, options) {
976
- const diff = jsonDiff.diffString(expected, actual, { color: false });
977
- return {
978
- type: "diff",
979
- label: options?.label,
980
- expected,
981
- actual,
982
- diff: diff || "(no differences)"
983
- };
984
- }
985
- function getDiffLines(entry) {
986
- const raw = jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
987
- return raw.split("\n").map((line) => {
988
- const trimmed = line.trimStart();
989
- if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
990
- return { type: "remove", line };
991
- }
992
- if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
993
- return { type: "add", line };
994
- }
995
- return { type: "context", line };
996
- });
997
- }
998
-
999
- // src/runner/score-utils.ts
1000
- function toNumericScoreFromScores(scores) {
1001
- for (const item of scores) {
1002
- const def = getScoreById(item.id);
1003
- if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
1004
- const value = item.data.value;
1005
- if (typeof value === "number" && Number.isFinite(value)) {
1006
- return value;
1007
- }
1008
- }
1009
- const numeric = toNumericScore(item.data);
1010
- if (numeric !== void 0) {
1011
- return numeric;
1012
- }
1013
- }
1014
- return void 0;
1015
- }
1016
- function toNumericScore(value) {
1017
- if (typeof value === "number" && Number.isFinite(value)) {
1018
- return value;
1019
- }
1020
- if (typeof value !== "object" || value === null) {
1021
- return void 0;
1022
- }
1023
- const obj = value;
1024
- if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
1025
- return obj.score;
1026
- }
1027
- const numberValues = Object.values(value).filter(
1028
- (entry) => typeof entry === "number" && Number.isFinite(entry)
1029
- );
1030
- if (numberValues.length === 0) {
1031
- return void 0;
1032
- }
1033
- return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
1034
- }
1035
757
 
1036
758
  // src/runner/config.ts
1037
759
  var defaultRunnerConfig = {
@@ -1213,75 +935,311 @@ async function loadModuleExports(filePath) {
1213
935
  if (!createJiti2) {
1214
936
  throw new Error("Failed to initialize jiti TypeScript loader");
1215
937
  }
1216
- jitiLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
1217
- interopDefault: true,
1218
- moduleCache: true
1219
- });
938
+ jitiLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
939
+ interopDefault: true,
940
+ moduleCache: true
941
+ });
942
+ }
943
+ const loaded2 = jitiLoader.import ? await jitiLoader.import(filePath) : await Promise.resolve(jitiLoader(filePath));
944
+ return Object.values(loaded2);
945
+ }
946
+ const moduleUrl = url.pathToFileURL(filePath).href;
947
+ const loaded = await import(moduleUrl);
948
+ return Object.values(loaded);
949
+ }
950
+ async function collectDatasetsFromFiles(config) {
951
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
952
+ const matched = files.filter(
953
+ (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
954
+ );
955
+ const found = await Promise.all(
956
+ matched.map(async (absolutePath) => {
957
+ const exports = await loadModuleExports(absolutePath);
958
+ const datasets = exports.filter(isDatasetLike);
959
+ const relPath = path.relative(config.rootDir, absolutePath);
960
+ return datasets.map((dataset) => ({
961
+ id: toId("dataset", relPath, dataset.getName()),
962
+ filePath: relPath,
963
+ dataset
964
+ }));
965
+ })
966
+ );
967
+ return found.flat();
968
+ }
969
+ async function collectEvaluatorsFromFiles(config) {
970
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
971
+ const matched = files.filter(
972
+ (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
973
+ );
974
+ const found = await Promise.all(
975
+ matched.map(async (absolutePath) => {
976
+ const exports = await loadModuleExports(absolutePath);
977
+ const evaluators = exports.filter(isEvaluatorLike);
978
+ const relPath = path.relative(config.rootDir, absolutePath);
979
+ return evaluators.map((evaluator) => ({
980
+ id: toId("evaluator", relPath, evaluator.getName()),
981
+ filePath: relPath,
982
+ evaluator
983
+ }));
984
+ })
985
+ );
986
+ return found.flat();
987
+ }
988
+ async function collectTestCasesFromFiles(config) {
989
+ const files = await walkDirectory(config.rootDir, config.excludeDirectories);
990
+ const matched = files.filter(
991
+ (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
992
+ );
993
+ const found = await Promise.all(
994
+ matched.map(async (absolutePath) => {
995
+ const exports = await loadModuleExports(absolutePath);
996
+ const testCases = exports.filter(isTestCaseLike);
997
+ const relPath = path.relative(config.rootDir, absolutePath);
998
+ return testCases.map((testCase) => ({
999
+ id: toId("test-case", relPath, testCase.getName()),
1000
+ filePath: relPath,
1001
+ testCase
1002
+ }));
1003
+ })
1004
+ );
1005
+ return found.flat();
1006
+ }
1007
+ function toJsonLines(value) {
1008
+ try {
1009
+ return JSON.stringify(value, null, 2);
1010
+ } catch {
1011
+ return String(value);
1012
+ }
1013
+ }
1014
+ function formatDiffString(changes) {
1015
+ const lines = [];
1016
+ for (const part of changes) {
1017
+ const prefix = part.added ? "+" : part.removed ? "-" : " ";
1018
+ const partLines = part.value.split("\n");
1019
+ if (partLines[partLines.length - 1] === "") {
1020
+ partLines.pop();
1021
+ }
1022
+ for (const line of partLines) {
1023
+ lines.push(`${prefix} ${line}`);
1024
+ }
1025
+ }
1026
+ return lines.join("\n");
1027
+ }
1028
+ function createDiffString(expected, actual) {
1029
+ const expectedStr = toJsonLines(expected);
1030
+ const actualStr = toJsonLines(actual);
1031
+ const changes = diff.diffLines(expectedStr, actualStr);
1032
+ return formatDiffString(changes);
1033
+ }
1034
+ function createDiffLogEntry(expected, actual, options) {
1035
+ const diff = createDiffString(expected, actual);
1036
+ return {
1037
+ type: "diff",
1038
+ label: options?.label,
1039
+ expected,
1040
+ actual,
1041
+ diff: diff || "(no differences)"
1042
+ };
1043
+ }
1044
+ function getDiffLines(entry) {
1045
+ const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
1046
+ return raw.split("\n").map((line) => {
1047
+ const trimmed = line.trimStart();
1048
+ if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
1049
+ return { type: "remove", line };
1050
+ }
1051
+ if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
1052
+ return { type: "add", line };
1053
+ }
1054
+ return { type: "context", line };
1055
+ });
1056
+ }
1057
+
1058
+ // src/evals/metric.ts
1059
+ var registry = /* @__PURE__ */ new Map();
1060
+ var Metric = {
1061
+ of(config) {
1062
+ const def = {
1063
+ id: config.id,
1064
+ name: config.name,
1065
+ aggregate: config.aggregate,
1066
+ format: config.format,
1067
+ make: (data) => ({ id: config.id, data })
1068
+ };
1069
+ registry.set(config.id, def);
1070
+ return def;
1071
+ }
1072
+ };
1073
+ function getMetricById(id) {
1074
+ return registry.get(id);
1075
+ }
1076
+
1077
+ // src/evals/score.ts
1078
+ var registry2 = /* @__PURE__ */ new Map();
1079
+ var Score = {
1080
+ of(config) {
1081
+ const def = {
1082
+ id: config.id,
1083
+ name: config.name,
1084
+ displayStrategy: config.displayStrategy,
1085
+ aggregate: config.aggregate,
1086
+ format: config.format,
1087
+ make: (data, options) => {
1088
+ const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
1089
+ return {
1090
+ id: config.id,
1091
+ data,
1092
+ ...passed !== void 0 && { passed }
1093
+ };
1094
+ }
1095
+ };
1096
+ registry2.set(config.id, def);
1097
+ return def;
1098
+ }
1099
+ };
1100
+ function getScoreById(id) {
1101
+ return registry2.get(id);
1102
+ }
1103
+
1104
+ // src/evals/aggregators.ts
1105
+ function aggregateAverageWithVariance(values) {
1106
+ if (values.length === 0) {
1107
+ return { value: 0, count: 0 };
1108
+ }
1109
+ const sum = values.reduce((s, v) => s + v.value, 0);
1110
+ const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
1111
+ const mean = sum / values.length;
1112
+ let stdDev;
1113
+ if (values.length >= 2) {
1114
+ const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
1115
+ stdDev = variance > 0 ? Math.sqrt(variance) : 0;
1116
+ }
1117
+ return { value: mean, stdDev, count: values.length };
1118
+ }
1119
+ function aggregateAll(values) {
1120
+ const total = values.length;
1121
+ const passedCount = values.filter((v) => v.passed).length;
1122
+ return {
1123
+ passed: total > 0 && values.every((v) => v.passed),
1124
+ passedCount,
1125
+ totalCount: total
1126
+ };
1127
+ }
1128
+ function aggregateTokenCountSum(values) {
1129
+ const initial = {
1130
+ input: 0,
1131
+ output: 0,
1132
+ inputCached: 0,
1133
+ outputCached: 0
1134
+ };
1135
+ return values.reduce(
1136
+ (acc, v) => ({
1137
+ input: acc.input + (v.input ?? 0),
1138
+ output: acc.output + (v.output ?? 0),
1139
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
1140
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
1141
+ }),
1142
+ initial
1143
+ );
1144
+ }
1145
+ function aggregateLatencyAverage(values) {
1146
+ if (values.length === 0) {
1147
+ return { ms: 0 };
1148
+ }
1149
+ const sum = values.reduce((s, v) => s + v.ms, 0);
1150
+ return { ms: sum / values.length };
1151
+ }
1152
+
1153
+ // src/evals/metrics/standard.ts
1154
+ Metric.of({
1155
+ id: "token-count",
1156
+ name: "Tokens",
1157
+ aggregate: aggregateTokenCountSum,
1158
+ format: (data, options) => {
1159
+ const input = data.input ?? 0;
1160
+ const output = data.output ?? 0;
1161
+ const inputCached = data.inputCached ?? 0;
1162
+ const outputCached = data.outputCached ?? 0;
1163
+ const cached = inputCached + outputCached;
1164
+ const base = `in:${input} out:${output} cached:${cached}`;
1165
+ return options?.isAggregated ? `Total: ${base}` : base;
1166
+ }
1167
+ });
1168
+ Metric.of({
1169
+ id: "latency",
1170
+ name: "Latency",
1171
+ aggregate: aggregateLatencyAverage,
1172
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
1173
+ });
1174
+
1175
+ // src/evals/scores/standard.ts
1176
+ Score.of({
1177
+ id: "percent",
1178
+ name: "Score",
1179
+ displayStrategy: "bar",
1180
+ format: (data, options) => {
1181
+ if (options?.isAggregated) {
1182
+ return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
1183
+ }
1184
+ return data.value.toFixed(2);
1185
+ },
1186
+ aggregate: aggregateAverageWithVariance
1187
+ });
1188
+ Score.of({
1189
+ id: "binary",
1190
+ name: "Result",
1191
+ displayStrategy: "passFail",
1192
+ format: (data, options) => {
1193
+ if (options?.isAggregated) {
1194
+ const base = data.passed ? "All: PASSED" : "Some: FAILED";
1195
+ if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
1196
+ return `${base} (${data.passedCount}/${data.totalCount})`;
1197
+ }
1198
+ return base;
1199
+ }
1200
+ return data.passed ? "PASSED" : "NOT PASSED";
1201
+ },
1202
+ aggregate: aggregateAll
1203
+ });
1204
+
1205
+ // src/runner/score-utils.ts
1206
+ function toNumericScoreFromScores(scores) {
1207
+ for (const item of scores) {
1208
+ const def = getScoreById(item.id);
1209
+ if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
1210
+ const value = item.data.value;
1211
+ if (typeof value === "number" && Number.isFinite(value)) {
1212
+ return value;
1213
+ }
1214
+ }
1215
+ const numeric = toNumericScore(item.data);
1216
+ if (numeric !== void 0) {
1217
+ return numeric;
1220
1218
  }
1221
- const loaded2 = jitiLoader.import ? await jitiLoader.import(filePath) : await Promise.resolve(jitiLoader(filePath));
1222
- return Object.values(loaded2);
1223
1219
  }
1224
- const moduleUrl = url.pathToFileURL(filePath).href;
1225
- const loaded = await import(moduleUrl);
1226
- return Object.values(loaded);
1227
- }
1228
- async function collectDatasetsFromFiles(config) {
1229
- const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1230
- const matched = files.filter(
1231
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
1232
- );
1233
- const found = await Promise.all(
1234
- matched.map(async (absolutePath) => {
1235
- const exports = await loadModuleExports(absolutePath);
1236
- const datasets = exports.filter(isDatasetLike);
1237
- const relPath = path.relative(config.rootDir, absolutePath);
1238
- return datasets.map((dataset) => ({
1239
- id: toId("dataset", relPath, dataset.getName()),
1240
- filePath: relPath,
1241
- dataset
1242
- }));
1243
- })
1244
- );
1245
- return found.flat();
1246
- }
1247
- async function collectEvaluatorsFromFiles(config) {
1248
- const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1249
- const matched = files.filter(
1250
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
1251
- );
1252
- const found = await Promise.all(
1253
- matched.map(async (absolutePath) => {
1254
- const exports = await loadModuleExports(absolutePath);
1255
- const evaluators = exports.filter(isEvaluatorLike);
1256
- const relPath = path.relative(config.rootDir, absolutePath);
1257
- return evaluators.map((evaluator) => ({
1258
- id: toId("evaluator", relPath, evaluator.getName()),
1259
- filePath: relPath,
1260
- evaluator
1261
- }));
1262
- })
1263
- );
1264
- return found.flat();
1220
+ return void 0;
1265
1221
  }
1266
- async function collectTestCasesFromFiles(config) {
1267
- const files = await walkDirectory(config.rootDir, config.excludeDirectories);
1268
- const matched = files.filter(
1269
- (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
1270
- );
1271
- const found = await Promise.all(
1272
- matched.map(async (absolutePath) => {
1273
- const exports = await loadModuleExports(absolutePath);
1274
- const testCases = exports.filter(isTestCaseLike);
1275
- const relPath = path.relative(config.rootDir, absolutePath);
1276
- return testCases.map((testCase) => ({
1277
- id: toId("test-case", relPath, testCase.getName()),
1278
- filePath: relPath,
1279
- testCase
1280
- }));
1281
- })
1222
+ function toNumericScore(value) {
1223
+ if (typeof value === "number" && Number.isFinite(value)) {
1224
+ return value;
1225
+ }
1226
+ if (typeof value !== "object" || value === null) {
1227
+ return void 0;
1228
+ }
1229
+ const obj = value;
1230
+ if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
1231
+ return obj.score;
1232
+ }
1233
+ const numberValues = Object.values(value).filter(
1234
+ (entry) => typeof entry === "number" && Number.isFinite(entry)
1282
1235
  );
1283
- return found.flat();
1236
+ if (numberValues.length === 0) {
1237
+ return void 0;
1238
+ }
1239
+ return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
1284
1240
  }
1241
+
1242
+ // src/runner/execution.ts
1285
1243
  function computeEvaluatorPassed(evaluator, result, scores) {
1286
1244
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1287
1245
  if (scoresWithPassed.length > 0) {
@@ -1952,6 +1910,240 @@ var EffectRunner = class {
1952
1910
  );
1953
1911
  }
1954
1912
  };
1913
+ var LEFT_PANE_WIDTH2 = 44;
1914
+ var MAX_RUNS_FOR_CHART = 12;
1915
+ var MAX_RUNS_FOR_TREND = 20;
1916
+ var TREND_BATCH_SIZE = 4;
1917
+ function extractRunAverageScore(testCases) {
1918
+ const scores = [];
1919
+ for (const tc of testCases) {
1920
+ for (const es of tc.evaluatorScores) {
1921
+ const n = toNumericScoreFromScores(es.scores);
1922
+ if (n !== void 0) {
1923
+ scores.push(n);
1924
+ }
1925
+ }
1926
+ }
1927
+ if (scores.length === 0)
1928
+ return void 0;
1929
+ return scores.reduce((a, b) => a + b, 0) / scores.length;
1930
+ }
1931
+ async function loadRunScores(runs) {
1932
+ const results = [];
1933
+ for (const run of runs) {
1934
+ const artifact = run.meta?.artifact;
1935
+ if (!artifact)
1936
+ continue;
1937
+ try {
1938
+ const path$1 = path.resolve(artifact);
1939
+ const testCases = await parseArtifactFile(path$1);
1940
+ const avg = extractRunAverageScore(testCases);
1941
+ if (avg !== void 0) {
1942
+ results.push({
1943
+ runId: run.id,
1944
+ label: run.label,
1945
+ value: avg
1946
+ });
1947
+ }
1948
+ } catch {
1949
+ }
1950
+ }
1951
+ return results;
1952
+ }
1953
+ function batchAverage(values, batchSize) {
1954
+ const batches = [];
1955
+ for (let i = 0; i < values.length; i += batchSize) {
1956
+ const slice = values.slice(i, i + batchSize);
1957
+ if (slice.length > 0) {
1958
+ batches.push(slice.reduce((a, b) => a + b, 0) / slice.length);
1959
+ }
1960
+ }
1961
+ return batches;
1962
+ }
1963
+ var OVERVIEW_PAGE_SIZE = 15;
1964
+ function DatasetsView({
1965
+ state,
1966
+ filteredDatasets,
1967
+ selectedDataset,
1968
+ overviewRowCountRef
1969
+ }) {
1970
+ const leftFocused = state.focus === "left";
1971
+ const rightFocused = state.focus === "right";
1972
+ const [runScores, setRunScores] = React2.useState([]);
1973
+ const [loading, setLoading] = React2.useState(false);
1974
+ React2.useEffect(() => {
1975
+ if (!selectedDataset?.runs?.length) {
1976
+ setRunScores([]);
1977
+ return;
1978
+ }
1979
+ setLoading(true);
1980
+ const runs = selectedDataset.runs.slice(0, MAX_RUNS_FOR_TREND);
1981
+ loadRunScores(runs).then(setRunScores).finally(() => setLoading(false));
1982
+ }, [selectedDataset?.id, selectedDataset?.runs?.length]);
1983
+ const barData = runScores.slice(0, MAX_RUNS_FOR_CHART).reverse();
1984
+ const trendValues = runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse();
1985
+ const trendBatched = batchAverage(trendValues, TREND_BATCH_SIZE);
1986
+ const overviewRows = React2.useMemo(() => {
1987
+ const rows = [];
1988
+ rows.push(
1989
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." }, "overview")
1990
+ );
1991
+ if (selectedDataset && selectedDataset.runs.length > 0) {
1992
+ if (loading) {
1993
+ rows.push(
1994
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Loading run scores\u2026" }, "loading")
1995
+ );
1996
+ } else if (runScores.length > 0) {
1997
+ rows.push(
1998
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Scores (last runs)" }, "scores-header")
1999
+ );
2000
+ for (const d of barData) {
2001
+ rows.push(
2002
+ /* @__PURE__ */ jsxRuntime.jsx(
2003
+ TextBar,
2004
+ {
2005
+ label: d.label,
2006
+ value: d.value,
2007
+ labelWidth: 14,
2008
+ barWidth: 24,
2009
+ max: 100,
2010
+ format: (v) => v.toFixed(1)
2011
+ },
2012
+ d.runId
2013
+ )
2014
+ );
2015
+ }
2016
+ if (trendBatched.length > 0) {
2017
+ rows.push(
2018
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Avg trend (last 20, batched by 4)" }, "trend-header")
2019
+ );
2020
+ rows.push(
2021
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { children: /* @__PURE__ */ jsxRuntime.jsx(
2022
+ inkChart.LineGraph,
2023
+ {
2024
+ data: [{ values: trendBatched, color: "cyan" }],
2025
+ height: 5,
2026
+ width: 45,
2027
+ showYAxis: true,
2028
+ xLabels: ["older", "newer"]
2029
+ }
2030
+ ) }, "trend-graph")
2031
+ );
2032
+ }
2033
+ }
2034
+ }
2035
+ return rows;
2036
+ }, [
2037
+ selectedDataset?.overview,
2038
+ selectedDataset?.runs?.length,
2039
+ loading,
2040
+ runScores,
2041
+ barData,
2042
+ trendBatched
2043
+ ]);
2044
+ if (overviewRowCountRef) {
2045
+ overviewRowCountRef.current = overviewRows.length;
2046
+ }
2047
+ const offset = Math.max(0, state.overviewScrollOffset);
2048
+ const visibleRows = overviewRows.slice(offset, offset + OVERVIEW_PAGE_SIZE);
2049
+ return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2050
+ /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
2051
+ /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
2052
+ /* @__PURE__ */ jsxRuntime.jsx(
2053
+ ListItem,
2054
+ {
2055
+ selected: state.datasetMenuIndex === 0,
2056
+ label: "New evaluation",
2057
+ itemKey: "datasets-new-eval"
2058
+ }
2059
+ ),
2060
+ filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsxRuntime.jsx(
2061
+ ListItem,
2062
+ {
2063
+ selected: state.datasetMenuIndex === index + 1,
2064
+ label: dataset.name,
2065
+ itemKey: `dataset-${dataset.id}`
2066
+ },
2067
+ dataset.id
2068
+ ))
2069
+ ] }),
2070
+ /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
2071
+ /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
2072
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visibleRows.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { children: row }, offset + i)) })
2073
+ ] })
2074
+ ] });
2075
+ }
2076
+ function RunsView({
2077
+ state,
2078
+ dataset,
2079
+ selectedRun
2080
+ }) {
2081
+ const runs = dataset?.runs ?? [];
2082
+ const rightFocused = state.focus === "right";
2083
+ return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2084
+ /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
2085
+ /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: !selectedRun ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." }) : /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
2086
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
2087
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
2088
+ " ",
2089
+ selectedRun.label,
2090
+ " ",
2091
+ /* @__PURE__ */ jsxRuntime.jsx(StatusText, { status: selectedRun.status })
2092
+ ] }),
2093
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2094
+ "Commit: ",
2095
+ selectedRun.meta.commit,
2096
+ " Branch: ",
2097
+ selectedRun.meta.branch,
2098
+ " ",
2099
+ "Seed: ",
2100
+ selectedRun.meta.seed
2101
+ ] }),
2102
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
2103
+ /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overall" }),
2104
+ /* @__PURE__ */ jsxRuntime.jsx(
2105
+ TextBar,
2106
+ {
2107
+ label: "pass rate",
2108
+ value: selectedRun.performance.passRate,
2109
+ format: (v) => `${v}%`
2110
+ }
2111
+ ),
2112
+ /* @__PURE__ */ jsxRuntime.jsx(
2113
+ TextBar,
2114
+ {
2115
+ label: "avg score",
2116
+ value: Math.round(selectedRun.performance.avgScore * 100)
2117
+ }
2118
+ ),
2119
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
2120
+ /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
2121
+ selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
2122
+ TextBar,
2123
+ {
2124
+ label: dimension.name,
2125
+ value: dimension.score
2126
+ },
2127
+ dimension.name
2128
+ )),
2129
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
2130
+ /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
2131
+ /* @__PURE__ */ jsxRuntime.jsx(
2132
+ Sparkline,
2133
+ {
2134
+ data: selectedRun.performance.latencyHistoryMs ?? [
2135
+ selectedRun.performance.latencyAvgMs - 40,
2136
+ selectedRun.performance.latencyAvgMs - 10,
2137
+ selectedRun.performance.latencyAvgMs + 20,
2138
+ selectedRun.performance.latencyP95Ms - 80,
2139
+ selectedRun.performance.latencyP95Ms
2140
+ ],
2141
+ width: 24
2142
+ }
2143
+ )
2144
+ ] }) })
2145
+ ] });
2146
+ }
1955
2147
  var DETAILS_PAGE_SIZE = 20;
1956
2148
  function scoreColor(score) {
1957
2149
  if (score >= 80)
@@ -1960,7 +2152,7 @@ function scoreColor(score) {
1960
2152
  return "yellow";
1961
2153
  return "red";
1962
2154
  }
1963
- function formatScorePart(item, scoreToColor) {
2155
+ function formatScorePart(item) {
1964
2156
  const def = getScoreById(item.id);
1965
2157
  if (!def) {
1966
2158
  const numeric = toNumericScore(item.data);
@@ -1990,7 +2182,7 @@ function CheckRow({
1990
2182
  " ",
1991
2183
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color, bold: true, children: status }),
1992
2184
  detail ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1993
- " (",
2185
+ " (",
1994
2186
  detail,
1995
2187
  ")"
1996
2188
  ] }) : null
@@ -2010,21 +2202,21 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2010
2202
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2011
2203
  "Model: ",
2012
2204
  meta.model,
2013
- " Provider: ",
2205
+ " Provider: ",
2014
2206
  meta.provider
2015
2207
  ] }, "meta-1"),
2016
2208
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2017
2209
  "Commit: ",
2018
2210
  meta.commit,
2019
- " Branch: ",
2211
+ " Branch: ",
2020
2212
  meta.branch,
2021
- " Seed: ",
2213
+ " Seed: ",
2022
2214
  meta.seed
2023
2215
  ] }, "meta-2"),
2024
2216
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2025
2217
  "Duration: ",
2026
2218
  meta.duration,
2027
- " Concurrency: ",
2219
+ " Concurrency: ",
2028
2220
  meta.concurrency
2029
2221
  ] }, "meta-3"),
2030
2222
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
@@ -2036,7 +2228,15 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2036
2228
  ...dimensions.map((d) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
2037
2229
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp2"),
2038
2230
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
2039
- ...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
2231
+ ...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
2232
+ CheckRow,
2233
+ {
2234
+ name: c.name,
2235
+ passed: c.passed,
2236
+ detail: c.detail
2237
+ },
2238
+ `chk-${c.name}`
2239
+ )),
2040
2240
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp3"),
2041
2241
  /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Performance" }, "perf-h"),
2042
2242
  /* @__PURE__ */ jsxRuntime.jsx(
@@ -2049,16 +2249,16 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2049
2249
  "perf-rate"
2050
2250
  ),
2051
2251
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2052
- "latency avg ",
2252
+ "latency avg ",
2053
2253
  performance.latencyAvgMs,
2054
- "ms p95 ",
2254
+ "ms p95 ",
2055
2255
  performance.latencyP95Ms,
2056
2256
  "ms"
2057
2257
  ] }, "perf-lat"),
2058
2258
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2059
- "tokens avg ",
2259
+ "tokens avg ",
2060
2260
  performance.tokensAvg,
2061
- " p95 ",
2261
+ " p95 ",
2062
2262
  performance.tokensP95
2063
2263
  ] }, "perf-tok"),
2064
2264
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp4"),
@@ -2111,26 +2311,60 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
2111
2311
  ":",
2112
2312
  " ",
2113
2313
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
2114
- " ",
2115
- item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
2116
- formatScorePart(s),
2117
- " "
2118
- ] }, s.id)),
2119
- item.metrics?.map((m) => {
2120
- const def = getMetricById(m.id);
2121
- if (!def)
2122
- return null;
2123
- const formatted = def.format(m.data);
2124
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2125
- "[",
2126
- def.name ? `${def.name}: ` : "",
2127
- formatted,
2128
- "]",
2129
- " "
2130
- ] }, m.id);
2131
- })
2314
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2315
+ " ",
2316
+ item.metrics.map((m) => {
2317
+ const def = getMetricById(m.id);
2318
+ if (!def)
2319
+ return null;
2320
+ const formatted = def.format(m.data);
2321
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2322
+ "[",
2323
+ def.name ? `${def.name}: ` : "",
2324
+ formatted,
2325
+ "]",
2326
+ " "
2327
+ ] }, m.id);
2328
+ })
2329
+ ] }) : null
2132
2330
  ] }, `tc-${tc.testCaseId}-${item.evaluatorId}`)
2133
2331
  );
2332
+ if (item.scores.length > 0) {
2333
+ for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
2334
+ const s = item.scores[sIdx];
2335
+ const def = getScoreById(s.id);
2336
+ const scoreLabel = def ? def.name ?? def.id : s.id;
2337
+ rows.push(
2338
+ /* @__PURE__ */ jsxRuntime.jsxs(
2339
+ ink.Text,
2340
+ {
2341
+ color: scoreColor(toNumericScore(s.data) ?? 0),
2342
+ children: [
2343
+ " ",
2344
+ scoreLabel,
2345
+ ": ",
2346
+ formatScorePart(s)
2347
+ ]
2348
+ },
2349
+ `tc-${tc.testCaseId}-${item.evaluatorId}-score-${sIdx}`
2350
+ )
2351
+ );
2352
+ }
2353
+ } else {
2354
+ rows.push(
2355
+ /* @__PURE__ */ jsxRuntime.jsxs(
2356
+ ink.Text,
2357
+ {
2358
+ color: "gray",
2359
+ children: [
2360
+ " ",
2361
+ "n/a"
2362
+ ]
2363
+ },
2364
+ `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
2365
+ )
2366
+ );
2367
+ }
2134
2368
  if (!item.passed && item.logs && item.logs.length > 0) {
2135
2369
  for (let logIdx = 0; logIdx < item.logs.length; logIdx++) {
2136
2370
  const log = item.logs[logIdx];
@@ -2168,12 +2402,12 @@ function RunDetailsView({
2168
2402
  }) {
2169
2403
  const runs = dataset?.runs ?? [];
2170
2404
  const rightFocused = state.focus === "right";
2171
- const [testCases, setTestCases] = React.useState([]);
2172
- const evaluatorNameById = React__default.default.useMemo(
2405
+ const [testCases, setTestCases] = React2.useState([]);
2406
+ const evaluatorNameById = React2__default.default.useMemo(
2173
2407
  () => new Map(evaluators.map((e) => [e.id, e.name])),
2174
2408
  [evaluators]
2175
2409
  );
2176
- React.useEffect(() => {
2410
+ React2.useEffect(() => {
2177
2411
  if (!selectedRun?.meta?.artifact) {
2178
2412
  setTestCases([]);
2179
2413
  return;
@@ -2192,7 +2426,7 @@ function RunDetailsView({
2192
2426
  const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
2193
2427
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2194
2428
  /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
2195
- /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React__default.default.Fragment, { children: row }, i)) }) })
2429
+ /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React2__default.default.Fragment, { children: row }, i)) }) })
2196
2430
  ] });
2197
2431
  }
2198
2432
  var LEFT_PANE_WIDTH3 = 44;
@@ -2272,16 +2506,17 @@ function EvalsCliApp({
2272
2506
  }) {
2273
2507
  const { exit } = ink.useApp();
2274
2508
  const { width: stdoutWidth, height: stdoutHeight } = fullscreenInk.useScreenSize();
2275
- const [liveData, setLiveData] = React.useState(data);
2276
- const [runtimeMessage, setRuntimeMessage] = React.useState();
2277
- const [state, dispatch] = React.useReducer(
2509
+ const [liveData, setLiveData] = React2.useState(data);
2510
+ const [runtimeMessage, setRuntimeMessage] = React2.useState();
2511
+ const overviewRowCountRef = React2.useRef(0);
2512
+ const [state, dispatch] = React2.useReducer(
2278
2513
  reduceCliState,
2279
2514
  createInitialState(data, args)
2280
2515
  );
2281
- React.useEffect(() => {
2516
+ React2.useEffect(() => {
2282
2517
  setLiveData(data);
2283
2518
  }, [data]);
2284
- React.useEffect(() => {
2519
+ React2.useEffect(() => {
2285
2520
  if (!runner) {
2286
2521
  return void 0;
2287
2522
  }
@@ -2300,7 +2535,7 @@ function EvalsCliApp({
2300
2535
  }
2301
2536
  });
2302
2537
  }, [runner]);
2303
- const filteredDatasets = React.useMemo(
2538
+ const filteredDatasets = React2.useMemo(
2304
2539
  () => getFilteredDatasets(liveData, state.searchQuery),
2305
2540
  [liveData, state.searchQuery]
2306
2541
  );
@@ -2353,7 +2588,16 @@ function EvalsCliApp({
2353
2588
  return;
2354
2589
  }
2355
2590
  if (key.downArrow) {
2356
- const max = clampedState.level === "datasets" ? filteredDatasets.length : clampedState.level === "runs" ? selectedDataset?.runs.length ?? 0 : clampedState.level === "new-evaluation" ? Math.max(0, visibleEvaluators.length - 1) : 100;
2591
+ let max;
2592
+ if (clampedState.level === "datasets") {
2593
+ max = clampedState.focus === "right" ? Math.max(0, overviewRowCountRef.current - OVERVIEW_PAGE_SIZE) : filteredDatasets.length;
2594
+ } else if (clampedState.level === "runs") {
2595
+ max = selectedDataset?.runs.length ?? 0;
2596
+ } else if (clampedState.level === "new-evaluation") {
2597
+ max = Math.max(0, visibleEvaluators.length - 1);
2598
+ } else {
2599
+ max = 100;
2600
+ }
2357
2601
  dispatch({ type: "MOVE_DOWN", max });
2358
2602
  return;
2359
2603
  }
@@ -2371,7 +2615,7 @@ function EvalsCliApp({
2371
2615
  }
2372
2616
  return;
2373
2617
  }
2374
- if (isBackKey(key)) {
2618
+ if (isBackKey(key) || input === "\x7F" || input === "\b") {
2375
2619
  dispatch({ type: "BACK" });
2376
2620
  return;
2377
2621
  }
@@ -2424,7 +2668,8 @@ function EvalsCliApp({
2424
2668
  {
2425
2669
  state: clampedState,
2426
2670
  filteredDatasets,
2427
- selectedDataset
2671
+ selectedDataset,
2672
+ overviewRowCountRef
2428
2673
  }
2429
2674
  );
2430
2675
  }