@m4trix/evals 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -3,14 +3,14 @@ import { withFullScreen, useScreenSize } from 'fullscreen-ink';
3
3
  import React, { useState, useReducer, useEffect, useMemo } from 'react';
4
4
  import { useApp, useInput, Box, Text } from 'ink';
5
5
  import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
6
+ import { resolve, relative, join, dirname } from 'path';
7
+ import { diffString } from 'json-diff';
6
8
  import { randomUUID } from 'crypto';
7
- import { Effect, PubSub, Queue, Fiber } from 'effect';
9
+ import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
8
10
  import { existsSync } from 'fs';
9
- import { resolve, relative, join, dirname } from 'path';
10
11
  import * as jitiModule from 'jiti';
11
- import { mkdir, appendFile, readdir } from 'fs/promises';
12
+ import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
12
13
  import { pathToFileURL } from 'url';
13
- import { diffString } from 'json-diff';
14
14
 
15
15
  var SEP = " ";
16
16
  var ARROW = "\u203A";
@@ -498,11 +498,17 @@ function toEvaluatorOption(item) {
498
498
  };
499
499
  }
500
500
  async function loadRunnerData(runner) {
501
- const [datasets, evaluators] = await Promise.all([
501
+ const [datasets, evaluators, diskSnapshots] = await Promise.all([
502
502
  runner.collectDatasets(),
503
- runner.collectEvaluators()
503
+ runner.collectEvaluators(),
504
+ runner.loadRunSnapshotsFromArtifacts()
504
505
  ]);
505
- const snapshots = runner.getAllRunSnapshots();
506
+ const memSnapshots = runner.getAllRunSnapshots();
507
+ const seen = new Set(memSnapshots.map((s) => s.runId));
508
+ const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
509
+ const snapshots = [...memSnapshots, ...fromDisk].sort(
510
+ (a, b) => b.queuedAt - a.queuedAt
511
+ );
506
512
  if (datasets.length === 0 && evaluators.length === 0) {
507
513
  return loadMockData();
508
514
  }
@@ -820,458 +826,185 @@ function RunsView({
820
826
  ] }) })
821
827
  ] });
822
828
  }
823
- var DETAILS_PAGE_SIZE = 20;
824
- function CheckRow({
825
- name,
826
- passed,
827
- detail
828
- }) {
829
- const status = passed ? "PASSED" : "FAILED";
830
- const color = passed ? "green" : "red";
831
- return /* @__PURE__ */ jsxs(Text, { children: [
832
- /* @__PURE__ */ jsx(Text, { color: "gray", children: name.padEnd(14) }),
833
- " ",
834
- /* @__PURE__ */ jsx(Text, { color, bold: true, children: status }),
835
- detail ? /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
836
- " (",
837
- detail,
838
- ")"
839
- ] }) : null
840
- ] });
841
- }
842
- function buildDetailRows(run) {
843
- const { performance, dimensions, checks, failures, meta } = run;
844
- const latencyHistory = performance.latencyHistoryMs ?? [
845
- performance.latencyAvgMs - 40,
846
- performance.latencyAvgMs - 10,
847
- performance.latencyAvgMs + 20,
848
- performance.latencyP95Ms - 80,
849
- performance.latencyP95Ms
850
- ];
851
- const rows = [
852
- /* @__PURE__ */ jsx(SectionHeader, { children: "Meta" }, "meta-h"),
853
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
854
- "Model: ",
855
- meta.model,
856
- " Provider: ",
857
- meta.provider
858
- ] }, "meta-1"),
859
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
860
- "Commit: ",
861
- meta.commit,
862
- " Branch: ",
863
- meta.branch,
864
- " Seed: ",
865
- meta.seed
866
- ] }, "meta-2"),
867
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
868
- "Duration: ",
869
- meta.duration,
870
- " Concurrency: ",
871
- meta.concurrency
872
- ] }, "meta-3"),
873
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
874
- "Artifact: ",
875
- meta.artifact
876
- ] }, "meta-4"),
877
- /* @__PURE__ */ jsx(Text, { children: " " }, "sp1"),
878
- /* @__PURE__ */ jsx(SectionHeader, { children: "Scores (0\u2013100)" }, "scores-h"),
879
- ...dimensions.map((d) => /* @__PURE__ */ jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
880
- /* @__PURE__ */ jsx(Text, { children: " " }, "sp2"),
881
- /* @__PURE__ */ jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
882
- ...checks.map((c) => /* @__PURE__ */ jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
883
- /* @__PURE__ */ jsx(Text, { children: " " }, "sp3"),
884
- /* @__PURE__ */ jsx(SectionHeader, { children: "Performance" }, "perf-h"),
885
- /* @__PURE__ */ jsx(
886
- TextBar,
887
- {
888
- label: "pass rate",
889
- value: performance.passRate,
890
- format: (v) => `${v}%`
891
- },
892
- "perf-rate"
893
- ),
894
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
895
- "latency avg ",
896
- performance.latencyAvgMs,
897
- "ms p95 ",
898
- performance.latencyP95Ms,
899
- "ms"
900
- ] }, "perf-lat"),
901
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
902
- "tokens avg ",
903
- performance.tokensAvg,
904
- " p95 ",
905
- performance.tokensP95
906
- ] }, "perf-tok"),
907
- /* @__PURE__ */ jsx(Text, { children: " " }, "sp4"),
908
- /* @__PURE__ */ jsx(SectionHeader, { children: "Latency trend" }, "spark-h"),
909
- /* @__PURE__ */ jsx(Sparkline, { data: latencyHistory, width: 20 }, "spark")
910
- ];
911
- if (failures.length > 0) {
912
- rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp5"));
913
- rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Failures (top)" }, "fail-h"));
914
- failures.forEach((f, i) => {
915
- rows.push(
916
- /* @__PURE__ */ jsxs(Text, { color: "red", children: [
917
- i + 1,
918
- ") ",
919
- f.title
920
- ] }, `fail-${i}`)
921
- );
922
- });
829
+
830
+ // src/evals/metric.ts
831
+ var registry = /* @__PURE__ */ new Map();
832
+ var Metric = {
833
+ of(config) {
834
+ const def = {
835
+ id: config.id,
836
+ name: config.name,
837
+ aggregate: config.aggregate,
838
+ format: config.format,
839
+ make: (data) => ({ id: config.id, data })
840
+ };
841
+ registry.set(config.id, def);
842
+ return def;
923
843
  }
924
- return rows;
844
+ };
845
+ function getMetricById(id) {
846
+ return registry.get(id);
925
847
  }
926
- function RunDetailsView({
927
- state,
928
- dataset,
929
- selectedRun
930
- }) {
931
- const runs = dataset?.runs ?? [];
932
- const rightFocused = state.focus === "right";
933
- if (!selectedRun) {
934
- return /* @__PURE__ */ jsxs(Fragment, { children: [
935
- /* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
936
- /* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "Select a run to inspect details." }) })
937
- ] });
848
+
849
+ // src/evals/score.ts
850
+ var registry2 = /* @__PURE__ */ new Map();
851
+ var Score = {
852
+ of(config) {
853
+ const def = {
854
+ id: config.id,
855
+ name: config.name,
856
+ displayStrategy: config.displayStrategy,
857
+ aggregate: config.aggregate,
858
+ format: config.format,
859
+ make: (data, options) => {
860
+ const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
861
+ return {
862
+ id: config.id,
863
+ data,
864
+ ...passed !== void 0 && { passed }
865
+ };
866
+ }
867
+ };
868
+ registry2.set(config.id, def);
869
+ return def;
938
870
  }
939
- const rows = buildDetailRows(selectedRun);
940
- const offset = Math.max(0, state.detailsScrollOffset);
941
- const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
942
- return /* @__PURE__ */ jsxs(Fragment, { children: [
943
- /* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
944
- /* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(React.Fragment, { children: row }, i)) }) })
945
- ] });
946
- }
947
- var LEFT_PANE_WIDTH3 = 44;
948
- function NewEvaluationView({
949
- state,
950
- data,
951
- visibleEvaluators
952
- }) {
953
- const selectedCount = state.selectedEvaluatorIds.length;
954
- const focusedEvaluator = visibleEvaluators[state.evaluatorMenuIndex];
955
- const leftFocused = state.focus === "left";
956
- const rightFocused = state.focus === "right";
957
- return /* @__PURE__ */ jsxs(Fragment, { children: [
958
- /* @__PURE__ */ jsxs(Pane, { width: LEFT_PANE_WIDTH3, focused: leftFocused, children: [
959
- /* @__PURE__ */ jsx(SectionHeader, { children: "Available Evaluators" }),
960
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
961
- "Search: ",
962
- state.searchQuery || "(none)"
963
- ] }),
964
- visibleEvaluators.map((evaluator, index) => {
965
- const selected = index === state.evaluatorMenuIndex;
966
- const inSelection = state.selectedEvaluatorIds.includes(evaluator.id);
967
- return /* @__PURE__ */ jsxs(
968
- Text,
969
- {
970
- color: selected ? "cyan" : "gray",
971
- bold: selected,
972
- children: [
973
- selected ? "\u25B8 " : " ",
974
- inSelection ? "[x] " : "[ ] ",
975
- evaluator.name
976
- ]
977
- },
978
- evaluator.id
979
- );
980
- })
981
- ] }),
982
- /* @__PURE__ */ jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
983
- /* @__PURE__ */ jsxs(SectionHeader, { children: [
984
- "Selected (",
985
- selectedCount,
986
- ")"
987
- ] }),
988
- state.selectedEvaluatorIds.map((id, index) => {
989
- const evaluator = data.evaluators.find((item) => item.id === id);
990
- if (!evaluator)
991
- return null;
992
- return /* @__PURE__ */ jsxs(Text, { children: [
993
- index + 1,
994
- ") ",
995
- evaluator.name
996
- ] }, id);
997
- }),
998
- /* @__PURE__ */ jsx(SectionHeader, { children: "Config preview" }),
999
- /* @__PURE__ */ jsx(Text, { color: "gray", children: focusedEvaluator?.configPreview ?? "Select an evaluator to inspect config." })
1000
- ] })
1001
- ] });
871
+ };
872
+ function getScoreById(id) {
873
+ return registry2.get(id);
1002
874
  }
1003
- function clampCursor(state, filteredDatasetsLength, selectedRunCount) {
1004
- const datasetMax = filteredDatasetsLength;
1005
- const runMax = selectedRunCount;
1006
- const evaluatorMax = 3;
1007
- return {
1008
- ...state,
1009
- datasetMenuIndex: Math.max(0, Math.min(state.datasetMenuIndex, datasetMax)),
1010
- runMenuIndex: Math.max(0, Math.min(state.runMenuIndex, runMax)),
1011
- evaluatorMenuIndex: Math.max(
1012
- 0,
1013
- Math.min(state.evaluatorMenuIndex, evaluatorMax)
1014
- )
875
+
876
+ // src/evals/aggregators.ts
877
+ function aggregateAverage(values) {
878
+ if (values.length === 0) {
879
+ return { value: 0 };
880
+ }
881
+ const sum = values.reduce((s, v) => s + v.value, 0);
882
+ return { value: sum / values.length };
883
+ }
884
+ function aggregateAll(values) {
885
+ return { passed: values.length > 0 && values.every((v) => v.passed) };
886
+ }
887
+ function aggregateTokenCountSum(values) {
888
+ const initial = {
889
+ input: 0,
890
+ output: 0,
891
+ inputCached: 0,
892
+ outputCached: 0
1015
893
  };
1016
- }
1017
- function EvalsCliApp({
1018
- data,
1019
- args,
1020
- runner
1021
- }) {
1022
- const { exit } = useApp();
1023
- const { width: stdoutWidth, height: stdoutHeight } = useScreenSize();
1024
- const [liveData, setLiveData] = useState(data);
1025
- const [runtimeMessage, setRuntimeMessage] = useState();
1026
- const [state, dispatch] = useReducer(
1027
- reduceCliState,
1028
- createInitialState(data, args)
894
+ return values.reduce(
895
+ (acc, v) => ({
896
+ input: acc.input + (v.input ?? 0),
897
+ output: acc.output + (v.output ?? 0),
898
+ inputCached: acc.inputCached + (v.inputCached ?? 0),
899
+ outputCached: acc.outputCached + (v.outputCached ?? 0)
900
+ }),
901
+ initial
1029
902
  );
1030
- useEffect(() => {
1031
- setLiveData(data);
1032
- }, [data]);
1033
- useEffect(() => {
1034
- if (!runner) {
1035
- return void 0;
1036
- }
1037
- return runner.subscribeRunEvents((event) => {
1038
- setLiveData((current) => applyRunnerEvent(current, event, runner));
1039
- if (event.type === "RunQueued") {
1040
- setRuntimeMessage(`Queued ${event.runId} with ${event.totalTestCases} test cases.`);
1041
- }
1042
- if (event.type === "RunCompleted") {
1043
- setRuntimeMessage(
1044
- `Completed ${event.runId}: ${event.passedTestCases}/${event.totalTestCases} passed.`
1045
- );
1046
- }
1047
- if (event.type === "RunFailed") {
1048
- setRuntimeMessage(`Run failed: ${event.errorMessage}`);
1049
- }
1050
- });
1051
- }, [runner]);
1052
- const filteredDatasets = useMemo(
1053
- () => getFilteredDatasets(liveData, state.searchQuery),
1054
- [liveData, state.searchQuery]
1055
- );
1056
- const clampedState = clampCursor(
1057
- state,
1058
- filteredDatasets.length,
1059
- getDatasetByMenuIndex(filteredDatasets, state.datasetMenuIndex)?.runs.length ?? 0
1060
- );
1061
- const selectedDataset = getDatasetByMenuIndex(
1062
- filteredDatasets,
1063
- clampedState.datasetMenuIndex
1064
- );
1065
- const selectedRun = getRunByMenuIndex(
1066
- selectedDataset,
1067
- clampedState.runMenuIndex
1068
- );
1069
- const visibleEvaluators = liveData.evaluators.filter(
1070
- (evaluator) => evaluator.name.toLowerCase().includes(clampedState.searchQuery.toLowerCase())
1071
- );
1072
- useInput((input, key) => {
1073
- if (isQuitInput(input) || key.escape) {
1074
- exit();
1075
- return;
1076
- }
1077
- if (key.tab) {
1078
- dispatch({ type: "TOGGLE_FOCUS" });
1079
- return;
1080
- }
1081
- if (isSearchInput(input)) {
1082
- dispatch({ type: "START_SEARCH" });
1083
- return;
1084
- }
1085
- if (clampedState.searchMode) {
1086
- if (key.return) {
1087
- dispatch({ type: "END_SEARCH" });
1088
- return;
1089
- }
1090
- if (isBackKey(key)) {
1091
- dispatch({ type: "REMOVE_SEARCH_CHAR" });
1092
- return;
1093
- }
1094
- if (isPrintableCharacter(input)) {
1095
- dispatch({ type: "APPEND_SEARCH", value: input });
1096
- }
1097
- return;
1098
- }
1099
- if (key.upArrow) {
1100
- const max = clampedState.level === "details" ? 100 : clampedState.level === "new-evaluation" ? visibleEvaluators.length - 1 : 100;
1101
- dispatch({ type: "MOVE_UP", max });
1102
- return;
1103
- }
1104
- if (key.downArrow) {
1105
- const max = clampedState.level === "datasets" ? filteredDatasets.length : clampedState.level === "runs" ? selectedDataset?.runs.length ?? 0 : clampedState.level === "new-evaluation" ? Math.max(0, visibleEvaluators.length - 1) : 100;
1106
- dispatch({ type: "MOVE_DOWN", max });
1107
- return;
1108
- }
1109
- if (key.return) {
1110
- dispatch({
1111
- type: "ENTER",
1112
- hasDataset: Boolean(selectedDataset),
1113
- hasRun: Boolean(selectedRun)
1114
- });
1115
- if (clampedState.level === "new-evaluation") {
1116
- const evaluator = visibleEvaluators[clampedState.evaluatorMenuIndex];
1117
- if (evaluator) {
1118
- dispatch({ type: "TOGGLE_EVALUATOR", evaluatorId: evaluator.id });
1119
- }
1120
- }
1121
- return;
1122
- }
1123
- if (isBackKey(key)) {
1124
- dispatch({ type: "BACK" });
1125
- return;
1126
- }
1127
- if (input.toLowerCase() === "c") {
1128
- dispatch({ type: "CLEAR_WARNINGS" });
1129
- setRuntimeMessage(void 0);
1130
- return;
903
+ }
904
+ function aggregateLatencyAverage(values) {
905
+ if (values.length === 0) {
906
+ return { ms: 0 };
907
+ }
908
+ const sum = values.reduce((s, v) => s + v.ms, 0);
909
+ return { ms: sum / values.length };
910
+ }
911
+
912
+ // src/evals/metrics/standard.ts
913
+ Metric.of({
914
+ id: "token-count",
915
+ name: "Tokens",
916
+ aggregate: aggregateTokenCountSum,
917
+ format: (data, options) => {
918
+ const input = data.input ?? 0;
919
+ const output = data.output ?? 0;
920
+ const inputCached = data.inputCached ?? 0;
921
+ const outputCached = data.outputCached ?? 0;
922
+ const cached = inputCached + outputCached;
923
+ const base = `in:${input} out:${output} cached:${cached}`;
924
+ return options?.isAggregated ? `Total: ${base}` : base;
925
+ }
926
+ });
927
+ Metric.of({
928
+ id: "latency",
929
+ name: "Latency",
930
+ aggregate: aggregateLatencyAverage,
931
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
932
+ });
933
+
934
+ // src/evals/scores/standard.ts
935
+ Score.of({
936
+ id: "percent",
937
+ name: "Score",
938
+ displayStrategy: "bar",
939
+ format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
940
+ aggregate: aggregateAverage
941
+ });
942
+ Score.of({
943
+ id: "binary",
944
+ name: "Result",
945
+ displayStrategy: "passFail",
946
+ format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
947
+ aggregate: aggregateAll
948
+ });
949
+ function createDiffLogEntry(expected, actual, options) {
950
+ const diff = diffString(expected, actual, { color: false });
951
+ return {
952
+ type: "diff",
953
+ label: options?.label,
954
+ expected,
955
+ actual,
956
+ diff: diff || "(no differences)"
957
+ };
958
+ }
959
+ function getDiffLines(entry) {
960
+ const raw = diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
961
+ return raw.split("\n").map((line) => {
962
+ const trimmed = line.trimStart();
963
+ if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
964
+ return { type: "remove", line };
1131
965
  }
1132
- if (input.toLowerCase() === "s" && clampedState.level === "new-evaluation") {
1133
- if (!runner) {
1134
- setRuntimeMessage("Runner unavailable: cannot start evaluation.");
1135
- return;
1136
- }
1137
- if (!selectedDataset) {
1138
- setRuntimeMessage("Select a dataset before starting a new evaluation.");
1139
- return;
1140
- }
1141
- if (clampedState.selectedEvaluatorIds.length === 0) {
1142
- setRuntimeMessage("Select at least one evaluator before starting.");
1143
- return;
1144
- }
1145
- void runner.runDatasetWith({
1146
- datasetId: selectedDataset.id,
1147
- evaluatorIds: clampedState.selectedEvaluatorIds
1148
- }).then((snapshot) => {
1149
- setRuntimeMessage(
1150
- `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
1151
- );
1152
- }).catch((error) => {
1153
- setRuntimeMessage(
1154
- error instanceof Error ? error.message : "Failed to start evaluation."
1155
- );
1156
- });
966
+ if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
967
+ return { type: "add", line };
1157
968
  }
969
+ return { type: "context", line };
1158
970
  });
1159
- const renderContent = () => {
1160
- if (clampedState.level === "new-evaluation") {
1161
- return /* @__PURE__ */ jsx(
1162
- NewEvaluationView,
1163
- {
1164
- state: clampedState,
1165
- data: liveData,
1166
- visibleEvaluators
1167
- }
1168
- );
1169
- }
1170
- if (clampedState.level === "datasets") {
1171
- return /* @__PURE__ */ jsx(
1172
- DatasetsView,
1173
- {
1174
- state: clampedState,
1175
- filteredDatasets,
1176
- selectedDataset
1177
- }
1178
- );
1179
- }
1180
- if (clampedState.level === "runs") {
1181
- return /* @__PURE__ */ jsx(
1182
- RunsView,
1183
- {
1184
- state: clampedState,
1185
- dataset: selectedDataset,
1186
- selectedRun
1187
- }
1188
- );
1189
- }
1190
- return /* @__PURE__ */ jsx(
1191
- RunDetailsView,
1192
- {
1193
- state: clampedState,
1194
- dataset: selectedDataset,
1195
- selectedRun
971
+ }
972
+
973
+ // src/runner/score-utils.ts
974
+ function toNumericScoreFromScores(scores) {
975
+ for (const item of scores) {
976
+ const def = getScoreById(item.id);
977
+ if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
978
+ const value = item.data.value;
979
+ if (typeof value === "number" && Number.isFinite(value)) {
980
+ return value;
1196
981
  }
1197
- );
1198
- };
1199
- return /* @__PURE__ */ jsxs(
1200
- Box,
1201
- {
1202
- flexDirection: "column",
1203
- flexGrow: 1,
1204
- width: stdoutWidth,
1205
- height: stdoutHeight,
1206
- children: [
1207
- /* @__PURE__ */ jsx(
1208
- Box,
1209
- {
1210
- borderStyle: "round",
1211
- borderColor: "cyan",
1212
- paddingX: 1,
1213
- width: stdoutWidth,
1214
- children: /* @__PURE__ */ jsx(Text, { children: getBreadcrumbText(
1215
- clampedState,
1216
- selectedDataset?.name,
1217
- selectedRun?.label
1218
- ) })
1219
- }
1220
- ),
1221
- clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxs(
1222
- Box,
1223
- {
1224
- marginTop: 1,
1225
- borderStyle: "round",
1226
- borderColor: "yellow",
1227
- paddingX: 1,
1228
- flexDirection: "column",
1229
- width: stdoutWidth,
1230
- children: [
1231
- /* @__PURE__ */ jsx(Text, { color: "yellow", children: "Startup warnings:" }),
1232
- clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsx(Text, { children: warning }, `${warning}-${index}`))
1233
- ]
1234
- }
1235
- ),
1236
- clampedState.searchMode && /* @__PURE__ */ jsxs(
1237
- Box,
1238
- {
1239
- marginTop: 1,
1240
- borderStyle: "round",
1241
- borderColor: "magenta",
1242
- paddingX: 1,
1243
- width: stdoutWidth,
1244
- children: [
1245
- /* @__PURE__ */ jsx(Text, { color: "magenta", bold: true, children: "Search: " }),
1246
- /* @__PURE__ */ jsx(Text, { color: "white", children: clampedState.searchQuery })
1247
- ]
1248
- }
1249
- ),
1250
- runtimeMessage && /* @__PURE__ */ jsx(
1251
- Box,
1252
- {
1253
- marginTop: 1,
1254
- borderStyle: "round",
1255
- borderColor: "blue",
1256
- paddingX: 1,
1257
- width: stdoutWidth,
1258
- children: /* @__PURE__ */ jsx(Text, { color: "blue", children: runtimeMessage })
1259
- }
1260
- ),
1261
- /* @__PURE__ */ jsx(
1262
- Box,
1263
- {
1264
- marginTop: 1,
1265
- flexGrow: 1,
1266
- width: stdoutWidth,
1267
- flexDirection: "row",
1268
- children: renderContent()
1269
- }
1270
- ),
1271
- /* @__PURE__ */ jsx(Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: getFooterText(clampedState) }) })
1272
- ]
1273
982
  }
983
+ const numeric = toNumericScore(item.data);
984
+ if (numeric !== void 0) {
985
+ return numeric;
986
+ }
987
+ }
988
+ return void 0;
989
+ }
990
+ function toNumericScore(value) {
991
+ if (typeof value === "number" && Number.isFinite(value)) {
992
+ return value;
993
+ }
994
+ if (typeof value !== "object" || value === null) {
995
+ return void 0;
996
+ }
997
+ const obj = value;
998
+ if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
999
+ return obj.score;
1000
+ }
1001
+ const numberValues = Object.values(value).filter(
1002
+ (entry) => typeof entry === "number" && Number.isFinite(entry)
1274
1003
  );
1004
+ if (numberValues.length === 0) {
1005
+ return void 0;
1006
+ }
1007
+ return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
1275
1008
  }
1276
1009
 
1277
1010
  // src/runner/config.ts
@@ -1293,7 +1026,8 @@ var defaultRunnerConfig = {
1293
1026
  ],
1294
1027
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
1295
1028
  },
1296
- artifactDirectory: ".eval-results"
1029
+ artifactDirectory: ".eval-results",
1030
+ maxConcurrency: 1
1297
1031
  };
1298
1032
  function toRunnerConfigOverrides(config) {
1299
1033
  if (!config) {
@@ -1326,6 +1060,9 @@ function toRunnerConfigOverrides(config) {
1326
1060
  if (config.artifactDirectory !== void 0) {
1327
1061
  overrides.artifactDirectory = config.artifactDirectory;
1328
1062
  }
1063
+ if (config.maxConcurrency !== void 0) {
1064
+ overrides.maxConcurrency = config.maxConcurrency;
1065
+ }
1329
1066
  if (Object.keys(discovery).length > 0) {
1330
1067
  overrides.discovery = discovery;
1331
1068
  }
@@ -1519,129 +1256,6 @@ async function collectTestCasesFromFiles(config) {
1519
1256
  );
1520
1257
  return found.flat();
1521
1258
  }
1522
- function createDiffLogEntry(expected, actual, options) {
1523
- const diff = diffString(expected, actual, { color: false });
1524
- return {
1525
- type: "diff",
1526
- label: options?.label,
1527
- expected,
1528
- actual,
1529
- diff: diff || "(no differences)"
1530
- };
1531
- }
1532
-
1533
- // src/evals/metric.ts
1534
- var registry = /* @__PURE__ */ new Map();
1535
- var Metric = {
1536
- of(config) {
1537
- const def = {
1538
- id: config.id,
1539
- name: config.name,
1540
- format: config.format,
1541
- make: (data) => ({ id: config.id, data })
1542
- };
1543
- registry.set(config.id, def);
1544
- return def;
1545
- }
1546
- };
1547
-
1548
- // src/evals/score.ts
1549
- var registry2 = /* @__PURE__ */ new Map();
1550
- var Score = {
1551
- of(config) {
1552
- const def = {
1553
- id: config.id,
1554
- name: config.name,
1555
- displayStrategy: config.displayStrategy,
1556
- format: config.format,
1557
- make: (data, options) => {
1558
- const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
1559
- return {
1560
- id: config.id,
1561
- data,
1562
- ...passed !== void 0 && { passed }
1563
- };
1564
- }
1565
- };
1566
- registry2.set(config.id, def);
1567
- return def;
1568
- }
1569
- };
1570
- function getScoreById(id) {
1571
- return registry2.get(id);
1572
- }
1573
-
1574
- // src/evals/metrics/standard.ts
1575
- Metric.of({
1576
- id: "token-count",
1577
- name: "Tokens",
1578
- format: (data) => {
1579
- const input = data.input ?? 0;
1580
- const output = data.output ?? 0;
1581
- const inputCached = data.inputCached ?? 0;
1582
- const outputCached = data.outputCached ?? 0;
1583
- const cached = inputCached + outputCached;
1584
- return `in:${input} out:${output} cached:${cached}`;
1585
- }
1586
- });
1587
- Metric.of({
1588
- id: "latency",
1589
- name: "Latency",
1590
- format: (data) => `${data.ms}ms`
1591
- });
1592
-
1593
- // src/evals/scores/standard.ts
1594
- Score.of({
1595
- id: "percent",
1596
- name: "Score",
1597
- displayStrategy: "bar",
1598
- format: (data) => data.value.toFixed(2)
1599
- });
1600
- Score.of({
1601
- id: "binary",
1602
- name: "Result",
1603
- displayStrategy: "passFail",
1604
- format: (data) => data.passed ? "PASSED" : "NOT PASSED"
1605
- });
1606
-
1607
- // src/runner/score-utils.ts
1608
- function toNumericScoreFromScores(scores) {
1609
- for (const item of scores) {
1610
- const def = getScoreById(item.id);
1611
- if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
1612
- const value = item.data.value;
1613
- if (typeof value === "number" && Number.isFinite(value)) {
1614
- return value;
1615
- }
1616
- }
1617
- const numeric = toNumericScore(item.data);
1618
- if (numeric !== void 0) {
1619
- return numeric;
1620
- }
1621
- }
1622
- return void 0;
1623
- }
1624
- function toNumericScore(value) {
1625
- if (typeof value === "number" && Number.isFinite(value)) {
1626
- return value;
1627
- }
1628
- if (typeof value !== "object" || value === null) {
1629
- return void 0;
1630
- }
1631
- const obj = value;
1632
- if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
1633
- return obj.score;
1634
- }
1635
- const numberValues = Object.values(value).filter(
1636
- (entry) => typeof entry === "number" && Number.isFinite(entry)
1637
- );
1638
- if (numberValues.length === 0) {
1639
- return void 0;
1640
- }
1641
- return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
1642
- }
1643
-
1644
- // src/runner/execution.ts
1645
1259
  function computeEvaluatorPassed(evaluator, result, scores) {
1646
1260
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
1647
1261
  if (scoresWithPassed.length > 0) {
@@ -1683,6 +1297,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1683
1297
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1684
1298
  );
1685
1299
  }
1300
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1301
+ return Effect.gen(function* () {
1302
+ const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1303
+ const rerunPassed = [];
1304
+ for (let r = 0; r < reruns; r++) {
1305
+ const started = Date.now();
1306
+ const evaluatorScores = [];
1307
+ let testCaseError;
1308
+ const output = readOutput(testCaseItem.testCase);
1309
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
1310
+ const evaluateFn = evaluator.getEvaluateFn();
1311
+ if (!evaluateFn) {
1312
+ continue;
1313
+ }
1314
+ try {
1315
+ const logs = [];
1316
+ const logDiff = (expected, actual, options) => {
1317
+ logs.push(createDiffLogEntry(expected, actual, options));
1318
+ };
1319
+ const ctx = yield* Effect.promise(
1320
+ () => Promise.resolve(evaluator.resolveContext())
1321
+ );
1322
+ const result = yield* Effect.promise(
1323
+ () => Promise.resolve(
1324
+ evaluateFn({
1325
+ input: testCaseItem.testCase.getInput(),
1326
+ ctx,
1327
+ output,
1328
+ logDiff
1329
+ })
1330
+ )
1331
+ );
1332
+ const { scores, metrics } = normalizeResult(result);
1333
+ const passed2 = computeEvaluatorPassed(evaluator, result, scores);
1334
+ evaluatorScores.push({
1335
+ evaluatorId,
1336
+ scores,
1337
+ passed: passed2,
1338
+ metrics,
1339
+ logs: logs.length > 0 ? logs : void 0
1340
+ });
1341
+ } catch (error) {
1342
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
1343
+ evaluatorScores.push({
1344
+ evaluatorId,
1345
+ scores: [],
1346
+ passed: false
1347
+ });
1348
+ }
1349
+ }
1350
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
1351
+ rerunPassed.push(rerunPassedThis);
1352
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
1353
+ n + 1,
1354
+ n + 1
1355
+ ]);
1356
+ const progressEvent = {
1357
+ type: "TestCaseProgress",
1358
+ runId: task.runId,
1359
+ testCaseId: testCaseItem.id,
1360
+ testCaseName: testCaseItem.testCase.getName(),
1361
+ completedTestCases: completedEvaluations,
1362
+ totalTestCases: totalEvaluations,
1363
+ rerunIndex: r + 1,
1364
+ rerunTotal: reruns,
1365
+ passed: rerunPassedThis,
1366
+ durationMs: Date.now() - started,
1367
+ evaluatorScores,
1368
+ output,
1369
+ errorMessage: testCaseError
1370
+ };
1371
+ updateSnapshot(task.runId, (snapshot) => ({
1372
+ ...snapshot,
1373
+ completedTestCases: completedEvaluations
1374
+ }));
1375
+ yield* publishEvent(progressEvent);
1376
+ yield* Queue.offer(persistenceQueue, {
1377
+ runId: task.runId,
1378
+ artifactPath: task.snapshot.artifactPath,
1379
+ payload: progressEvent
1380
+ });
1381
+ }
1382
+ const testCasePassed = rerunPassed.every(Boolean);
1383
+ if (testCasePassed) {
1384
+ yield* Ref.update(passedRef, (n) => n + 1);
1385
+ } else {
1386
+ yield* Ref.update(failedRef, (n) => n + 1);
1387
+ }
1388
+ const [passed, failed] = yield* Effect.all([
1389
+ Ref.get(passedRef),
1390
+ Ref.get(failedRef)
1391
+ ]);
1392
+ updateSnapshot(task.runId, (snapshot) => ({
1393
+ ...snapshot,
1394
+ passedTestCases: passed,
1395
+ failedTestCases: failed
1396
+ }));
1397
+ });
1398
+ }
1686
1399
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
1687
1400
  const startedAt = Date.now();
1688
1401
  updateSnapshot(task.runId, (snapshot) => ({
@@ -1695,118 +1408,215 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1695
1408
  runId: task.runId,
1696
1409
  startedAt
1697
1410
  });
1698
- let completedTestCases = 0;
1699
- let passedTestCases = 0;
1700
- let failedTestCases = 0;
1701
- for (const testCaseItem of task.testCases) {
1702
- const started = Date.now();
1703
- const evaluatorScores = [];
1704
- let testCaseError;
1705
- const output = readOutput(testCaseItem.testCase);
1706
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
1707
- const evaluateFn = evaluator.getEvaluateFn();
1708
- if (!evaluateFn) {
1709
- continue;
1411
+ const totalEvaluations = task.testCases.reduce(
1412
+ (sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
1413
+ 0
1414
+ );
1415
+ const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1416
+ const completedRef = yield* Ref.make(0);
1417
+ const passedRef = yield* Ref.make(0);
1418
+ const failedRef = yield* Ref.make(0);
1419
+ const processTestCase = (testCaseItem) => processOneTestCase(
1420
+ task,
1421
+ testCaseItem,
1422
+ totalEvaluations,
1423
+ publishEvent,
1424
+ persistenceQueue,
1425
+ updateSnapshot,
1426
+ completedRef,
1427
+ passedRef,
1428
+ failedRef
1429
+ );
1430
+ yield* Effect.forEach(
1431
+ task.testCases,
1432
+ processTestCase,
1433
+ maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
1434
+ );
1435
+ const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
1436
+ Ref.get(completedRef),
1437
+ Ref.get(passedRef),
1438
+ Ref.get(failedRef)
1439
+ ]);
1440
+ const finishedAt = Date.now();
1441
+ const completedEvent = {
1442
+ type: "RunCompleted",
1443
+ runId: task.runId,
1444
+ finishedAt,
1445
+ passedTestCases: passedUniqueTestCases,
1446
+ failedTestCases: failedUniqueTestCases,
1447
+ totalTestCases: task.testCases.length,
1448
+ artifactPath: task.snapshot.artifactPath
1449
+ };
1450
+ updateSnapshot(task.runId, (snapshot) => ({
1451
+ ...snapshot,
1452
+ status: "completed",
1453
+ completedTestCases: completedEvaluations,
1454
+ passedTestCases: passedUniqueTestCases,
1455
+ failedTestCases: failedUniqueTestCases,
1456
+ finishedAt
1457
+ }));
1458
+ yield* publishEvent(completedEvent);
1459
+ yield* Queue.offer(persistenceQueue, {
1460
+ runId: task.runId,
1461
+ artifactPath: task.snapshot.artifactPath,
1462
+ payload: completedEvent
1463
+ });
1464
+ yield* publishEvent({
1465
+ type: "ArtifactFlushed",
1466
+ runId: task.runId,
1467
+ artifactPath: task.snapshot.artifactPath
1468
+ });
1469
+ });
1470
/**
 * Rebuilds run snapshots from the `.jsonl` artifacts found directly inside
 * `config.artifactDirectory`.
 *
 * The scan is best-effort: an unreadable directory yields an empty list, and
 * an artifact that fails to parse (or yields no snapshot) is skipped.
 *
 * @param config Runner configuration; `artifactDirectory` is read here and
 *   the whole object is forwarded to `parseArtifactToSnapshot`.
 * @returns {Promise<Array>} Snapshots ordered by `queuedAt`, newest first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = resolve(config.artifactDirectory);
  const dirEntries = await readdir(artifactRoot).catch(() => null);
  if (dirEntries === null) {
    return [];
  }

  const collected = [];
  for (const entryName of dirEntries) {
    if (!entryName.endsWith(".jsonl")) {
      continue;
    }
    let parsed = null;
    try {
      parsed = await parseArtifactToSnapshot(join(artifactRoot, entryName), config);
    } catch {
      // A broken artifact must not hide the remaining runs.
    }
    if (parsed) {
      collected.push(parsed);
    }
  }

  collected.sort((left, right) => right.queuedAt - left.queuedAt);
  return collected;
}
1492
/**
 * Reads one JSON-lines artifact and condenses its run lifecycle events into a
 * single snapshot object.
 *
 * Lines that are blank or not valid JSON are ignored. When an event type
 * occurs more than once, the last occurrence wins.
 *
 * @param {string} filePath Path of the artifact; also reported as the
 *   snapshot's `artifactPath`.
 * @param _config Unused.
 * @returns {Promise<object|null>} The snapshot, or `null` when the file has
 *   no non-blank lines or contains no `RunQueued` event.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const raw = await readFile(filePath, "utf8");
  const lines = raw.split("\n").filter((text) => text.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }

  const seen = { queued: null, started: null, completed: null, failed: null };
  for (const text of lines) {
    try {
      const event = JSON.parse(text);
      switch (event.type) {
        case "RunQueued":
          seen.queued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            artifactPath: event.artifactPath ?? filePath,
            ts: event.ts
          };
          break;
        case "RunStarted":
          seen.started = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          seen.completed = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            totalTestCases: event.totalTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          seen.failed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Malformed line: skip it and keep reading.
    }
  }

  const { queued, started, completed, failed } = seen;
  if (!queued) {
    return null;
  }

  // Precedence: failed > completed > running > queued.
  let status = "queued";
  if (failed) {
    status = "failed";
  } else if (completed) {
    status = "completed";
  } else if (started) {
    status = "running";
  }

  const progress = aggregateTestCaseProgress(lines);
  return {
    runId: queued.runId,
    datasetId: queued.datasetId,
    datasetName: queued.datasetName,
    evaluatorIds: queued.evaluatorIds,
    queuedAt: queued.ts ?? 0,
    startedAt: started?.startedAt,
    finishedAt: completed?.finishedAt ?? failed?.finishedAt,
    totalTestCases: queued.totalTestCases,
    // A completed run reports the queued total; otherwise fall back to the
    // progress events seen so far.
    completedTestCases: completed ? queued.totalTestCases : progress.completedTestCases,
    passedTestCases: completed?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: completed?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: failed?.errorMessage
  };
}
1563
/**
 * Derives progress counters from the `TestCaseProgress` events contained in
 * an artifact's lines.
 *
 * - `completedTestCases` is the highest `completedTestCases` value seen.
 *   Taking the maximum rather than the last line keeps the counter from going
 *   backwards when events were written out of counter order.
 * - A test case counts as passed only when every event recorded for its
 *   `testCaseId` has `passed === true`; otherwise it counts as failed.
 *
 * Lines that are not valid JSON, or that are other event types, are ignored.
 *
 * @param {string[]} lines Raw JSON-lines content, one event per entry.
 * @returns {{completedTestCases: number, passedTestCases: number, failedTestCases: number}}
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const passedByTestCase = new Map();

  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      continue; // malformed line: best-effort parsing
    }
    if (event === null || typeof event !== "object" || event.type !== "TestCaseProgress") {
      continue;
    }

    const reported = event.completedTestCases;
    if (typeof reported === "number" && Number.isFinite(reported)) {
      completedTestCases = Math.max(completedTestCases, reported);
    }

    // Coerce to a strict boolean so a missing `passed` field is remembered as
    // a failure instead of being treated as "no event seen yet".
    const passedNow = event.passed === true;
    const passedSoFar = passedByTestCase.get(event.testCaseId);
    passedByTestCase.set(
      event.testCaseId,
      passedSoFar === undefined ? passedNow : passedSoFar && passedNow
    );
  }

  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const passed of passedByTestCase.values()) {
    if (passed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
  }
  return { completedTestCases, passedTestCases, failedTestCases };
}
1590
/**
 * Extracts the per-test-case results (`TestCaseProgress` events) from a
 * JSON-lines artifact.
 *
 * Never rejects: an unreadable file yields an empty list, and blank or
 * malformed lines are skipped.
 *
 * @param {string} artifactPath Path of the artifact file.
 * @returns {Promise<Array<object>>} One entry per progress event, in file order.
 */
async function parseArtifactFile(artifactPath) {
  let raw;
  try {
    raw = await readFile(artifactPath, "utf8");
  } catch {
    return [];
  }

  const collected = [];
  raw.split("\n").forEach((text) => {
    if (text.trim().length === 0) {
      return;
    }
    try {
      const record = JSON.parse(text);
      if (record.type !== "TestCaseProgress") {
        return;
      }
      collected.push({
        testCaseId: record.testCaseId,
        testCaseName: record.testCaseName,
        completedTestCases: record.completedTestCases,
        totalTestCases: record.totalTestCases,
        rerunIndex: record.rerunIndex,
        rerunTotal: record.rerunTotal,
        passed: record.passed,
        durationMs: record.durationMs,
        evaluatorScores: record.evaluatorScores ?? []
      });
    } catch {
      // Not JSON (or not an object): ignore this line.
    }
  });
  return collected;
}
1810
1620
  async function appendJsonLine(artifactPath, payload) {
1811
1621
  await mkdir(dirname(artifactPath), { recursive: true });
1812
1622
  await appendFile(artifactPath, `${JSON.stringify(payload)}
@@ -1822,291 +1632,873 @@ var createPersistenceWorker = (queue) => Effect.forever(
1822
1632
  ...message.payload
1823
1633
  })
1824
1634
  );
1825
- })
1826
- );
1827
-
1828
- // src/runner/search.ts
1829
- function matchesAny(value, matchers) {
1830
- if (!matchers || matchers.length === 0) {
1831
- return true;
1635
+ })
1636
+ );
1637
+
1638
+ // src/runner/search.ts
1639
/**
 * Tests a value against a list of matchers.
 *
 * @param {string} value Value to test.
 * @param {Array<string|RegExp>} [matchers] String matchers must equal `value`
 *   exactly; RegExp matchers are applied with `test`.
 * @returns {boolean} `true` when any matcher accepts `value`, or when
 *   `matchers` is missing or empty.
 */
function matchesAny(value, matchers) {
  if (!matchers || matchers.length === 0) {
    return true;
  }
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return matcher === value;
    }
    // `test` on a global/sticky regex resumes from `lastIndex`; reset it so
    // reusing the same matcher for many values stays deterministic.
    matcher.lastIndex = 0;
    return matcher.test(value);
  });
}
1647
/**
 * Tests a file path against a list of matchers.
 *
 * @param {string} value Path to test.
 * @param {Array<string|RegExp>} [matchers] String matchers match when they
 *   occur anywhere in `value`; RegExp matchers are applied with `test`.
 * @returns {boolean} `true` when any matcher accepts `value`, or when
 *   `matchers` is missing or empty.
 */
function matchesPath(value, matchers) {
  if (!matchers || matchers.length === 0) {
    return true;
  }
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value.includes(matcher);
    }
    // Reset the cursor of global/sticky regexes so repeated calls with the
    // same matcher do not alternate between true and false.
    matcher.lastIndex = 0;
    return matcher.test(value);
  });
}
1658
/**
 * Filters collected test cases by tag and path.
 *
 * Exclusions are applied first and win over inclusions. A filter list that is
 * missing or empty places no constraint. In particular an empty exclusion
 * list excludes nothing; it must be checked here because `matchesAny` and
 * `matchesPath` return `true` for an empty matcher list.
 *
 * @param {Array<object>} all Collected items; `testCase.getTags()` and
 *   `filePath` are read from each.
 * @param {object} [query] Optional `includedTags`, `excludedTags`,
 *   `includedPaths` and `excludedPaths` lists of string/RegExp matchers.
 * @returns {Array<object>} `all` itself when `query` is falsy, otherwise a
 *   filtered copy.
 */
function searchCollectedTestCases(all, query) {
  if (!query) {
    return all;
  }
  const hasEntries = (list) => list != null && list.length > 0;

  return all.filter((item) => {
    const tags = item.testCase.getTags();
    if (hasEntries(query.excludedTags) && tags.some((tag) => matchesAny(tag, query.excludedTags))) {
      return false;
    }
    if (hasEntries(query.excludedPaths) && matchesPath(item.filePath, query.excludedPaths)) {
      return false;
    }
    const includedTagsMatch =
      !hasEntries(query.includedTags) || tags.some((tag) => matchesAny(tag, query.includedTags));
    const includedPathsMatch =
      !hasEntries(query.includedPaths) || matchesPath(item.filePath, query.includedPaths);
    return includedTagsMatch && includedPathsMatch;
  });
}
1675
+
1676
+ // src/runner/api.ts
1677
/**
 * Splits a `/source/flags` string into its parts.
 *
 * Returns `undefined` when the string is not shaped like a regex literal, or
 * when the text after the last slash is not a plausible flag set. That keeps
 * path-like names such as `/path/to/thing` from being reported as a regex
 * with flags `thing`.
 *
 * @param {string} pattern Candidate pattern.
 * @returns {{source: string, flags: string}|undefined}
 */
function parseRegexLiteral(pattern) {
  if (!pattern.startsWith("/")) {
    return void 0;
  }
  const lastSlash = pattern.lastIndexOf("/");
  if (lastSlash <= 0) {
    return void 0;
  }
  const flags = pattern.slice(lastSlash + 1);
  const onlyKnownFlags = /^[dgimsuvy]*$/.test(flags);
  const noRepeatedFlag = new Set(flags).size === flags.length;
  const unicodeModesExclusive = !(flags.includes("u") && flags.includes("v"));
  if (!onlyKnownFlags || !noRepeatedFlag || !unicodeModesExclusive) {
    return void 0;
  }
  return {
    source: pattern.slice(1, lastSlash),
    flags
  };
}
1690
/**
 * Builds a predicate that tests names against a user-supplied pattern.
 *
 * The pattern is trimmed and then read, in this order, as:
 * 1. a regex literal (`/source/flags`), if `parseRegexLiteral` recognises it;
 * 2. a wildcard pattern when it contains `*` (each `*` matches any run of
 *    characters; the whole name must match, case-insensitively);
 * 3. otherwise an exact, case-insensitive name.
 *
 * @param {string} pattern Pattern text.
 * @returns {(value: string) => boolean} Matcher for a single name.
 * @throws {SyntaxError} When a regex-literal pattern cannot be compiled.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
    return (value) => {
      // With a `g` or `y` flag `test` continues from `lastIndex`; reset it so
      // the matcher gives the same answer every time it is called.
      regex.lastIndex = 0;
      return regex.test(value);
    };
  }
  if (normalizedPattern.includes("*")) {
    // Escape every regex metacharacter except `*` (including `?`), then turn
    // each `*` into "any run of characters".
    const escaped = normalizedPattern
      .replace(/[.+?^${}()|[\]\\]/g, "\\$&")
      .replace(/\*/g, ".*");
    const regex = new RegExp(`^${escaped}$`, "i");
    return (value) => regex.test(value);
  }
  const expected = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === expected;
}
1704
/**
 * Layers one set of runner overrides on top of another.
 *
 * When either side is missing, the other is returned as-is. Otherwise the
 * result is a shallow merge in which `next` wins, except that `discovery` is
 * itself merged key-by-key (`next.discovery` over `base.discovery`). The
 * merged object always carries a `discovery` property, which is `undefined`
 * when neither side supplied one.
 *
 * @param {object} [base] Lower-priority overrides.
 * @param {object} [next] Higher-priority overrides.
 * @returns {object|undefined}
 */
function mergeRunnerOverrides(base, next) {
  if (!base || !next) {
    return base ? base : next;
  }

  let discovery;
  if (base.discovery || next.discovery) {
    const lowerPriority = base.discovery ?? {};
    const higherPriority = next.discovery ?? {};
    discovery = { ...lowerPriority, ...higherPriority };
  }

  const merged = { ...base, ...next };
  merged.discovery = discovery;
  return merged;
}
1721
/**
 * Creates a runner from the config-file overrides (loaded first) with the
 * caller's `overrides` merged on top.
 *
 * @param {object} [overrides] Overrides that take precedence over the file's.
 * @returns {EffectRunner}
 */
function createRunner(overrides) {
  return new EffectRunner(
    withRunnerConfig(mergeRunnerOverrides(loadRunnerConfigFile(), overrides))
  );
}
1726
/**
 * Runner that discovers datasets and evaluators, queues evaluation runs and
 * reports run events to subscribers.
 *
 * Two fibers are forked at construction: a scheduler that takes tasks from
 * `runQueue` and forks `executeRunTask` for each, and a persistence worker
 * fed by `persistenceQueue`.
 */
var EffectRunner = class {
  /**
   * @param config Runner configuration. This class reads `discovery`,
   *   `artifactDirectory` and `maxConcurrency` from it.
   */
  constructor(config) {
    this.eventBus = Effect.runSync(PubSub.unbounded());
    this.runQueue = Effect.runSync(Queue.unbounded());
    this.persistenceQueue = Effect.runSync(Queue.unbounded());
    this.snapshots = new Map();
    this.listeners = new Set();
    this.datasetsById = new Map();
    this.evaluatorsById = new Map();
    this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
    this.persistenceFiber = Effect.runFork(createPersistenceWorker(this.persistenceQueue));
    this.config = config;
  }

  /** Re-discovers datasets and replaces the id → dataset cache. */
  async collectDatasets() {
    const discovered = await collectDatasetsFromFiles(this.config.discovery);
    this.datasetsById.clear();
    discovered.forEach((entry) => {
      this.datasetsById.set(entry.id, entry);
    });
    return discovered;
  }

  /** Re-discovers evaluators and replaces the id → evaluator cache. */
  async collectEvaluators() {
    const discovered = await collectEvaluatorsFromFiles(this.config.discovery);
    this.evaluatorsById.clear();
    discovered.forEach((entry) => {
      this.evaluatorsById.set(entry.id, entry);
    });
    return discovered;
  }

  /**
   * Finds the first cached dataset whose name equals `name`, ignoring case
   * and surrounding whitespace in `name`. Discovers datasets first when the
   * cache is empty.
   */
  async resolveDatasetByName(name) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    const wanted = name.trim().toLowerCase();
    for (const candidate of this.datasetsById.values()) {
      if (candidate.dataset.getName().toLowerCase() === wanted) {
        return candidate;
      }
    }
    return undefined;
  }

  /**
   * Returns every cached evaluator whose name satisfies `pattern` (see
   * `createNameMatcher`). Evaluators without a name are matched as "".
   */
  async resolveEvaluatorsByNamePattern(pattern) {
    if (this.evaluatorsById.size === 0) {
      await this.collectEvaluators();
    }
    const isMatch = createNameMatcher(pattern);
    const hits = [];
    for (const candidate of this.evaluatorsById.values()) {
      if (isMatch(candidate.evaluator.getName() ?? "")) {
        hits.push(candidate);
      }
    }
    return hits;
  }

  /** Discovers all test cases and filters them with `query`. */
  async searchTestCases(query) {
    const discovered = await collectTestCasesFromFiles(this.config.discovery);
    return searchCollectedTestCases(discovered, query);
  }

  /**
   * Returns the discovered test cases that the given dataset accepts.
   * @throws {Error} When `datasetId` is not a known dataset.
   */
  async collectDatasetTestCases(datasetId) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    const entry = this.datasetsById.get(datasetId);
    if (!entry) {
      throw new Error(`Unknown dataset: ${datasetId}`);
    }
    const discovered = await collectTestCasesFromFiles(this.config.discovery);
    return discovered.filter(
      (candidate) => entry.dataset.matchesTestCase(candidate.testCase, candidate.filePath)
    );
  }

  /**
   * Queues a run of `request.datasetId` with the evaluators named in
   * `request.evaluatorIds` (unknown ids are dropped) and returns the initial
   * "queued" snapshot.
   *
   * The snapshot's `totalTestCases` is the number of evaluations: each test
   * case contributes `getReruns()` when it has that method, otherwise 1.
   *
   * @throws {Error} When the dataset is unknown or no requested evaluator is known.
   */
  async runDatasetWith(request) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    if (this.evaluatorsById.size === 0) {
      await this.collectEvaluators();
    }
    const entry = this.datasetsById.get(request.datasetId);
    if (!entry) {
      throw new Error(`Unknown dataset: ${request.datasetId}`);
    }

    const selectedEvaluators = [];
    for (const evaluatorId of request.evaluatorIds) {
      const known = this.evaluatorsById.get(evaluatorId);
      if (known) {
        selectedEvaluators.push({ id: known.id, evaluator: known.evaluator });
      }
    }
    if (selectedEvaluators.length === 0) {
      throw new Error("No evaluators selected for run");
    }

    const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
    let totalEvaluations = 0;
    for (const { testCase } of selectedTestCases) {
      totalEvaluations += typeof testCase.getReruns === "function" ? testCase.getReruns() : 1;
    }

    const runId = `run-${randomUUID()}`;
    const artifactPath = createArtifactPath(this.config.artifactDirectory, request.datasetId, runId);

    const snapshot = {
      runId,
      datasetId: request.datasetId,
      datasetName: entry.dataset.getName(),
      evaluatorIds: selectedEvaluators.map(({ id }) => id),
      queuedAt: Date.now(),
      totalTestCases: totalEvaluations,
      completedTestCases: 0,
      passedTestCases: 0,
      failedTestCases: 0,
      status: "queued",
      artifactPath
    };
    this.snapshots.set(runId, snapshot);

    const queuedEvent = {
      type: "RunQueued",
      runId,
      datasetId: request.datasetId,
      datasetName: entry.dataset.getName(),
      evaluatorIds: selectedEvaluators.map(({ id }) => id),
      totalTestCases: totalEvaluations,
      artifactPath
    };
    await Effect.runPromise(this.publishEvent(queuedEvent));

    const persistRequest = { runId, artifactPath, payload: queuedEvent };
    await Effect.runPromise(Queue.offer(this.persistenceQueue, persistRequest));

    // Per-request concurrency wins over the configured default; 1 otherwise.
    const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
    const task = {
      runId,
      datasetId: request.datasetId,
      dataset: entry.dataset,
      evaluators: selectedEvaluators,
      testCases: selectedTestCases,
      snapshot,
      maxConcurrency
    };
    await Effect.runPromise(Queue.offer(this.runQueue, task));
    return snapshot;
  }

  /**
   * Registers a listener for run events; with `options.runId` set, only that
   * run's events are delivered.
   * @returns {() => void} Function that removes the listener.
   */
  subscribeRunEvents(listener, options) {
    const subscription = { runId: options?.runId, listener };
    this.listeners.add(subscription);
    return () => {
      this.listeners.delete(subscription);
    };
  }

  /** Returns the in-memory snapshot for `runId`, if any. */
  getRunSnapshot(runId) {
    return this.snapshots.get(runId);
  }

  /** Returns all in-memory snapshots, most recently queued first. */
  getAllRunSnapshots() {
    const all = [...this.snapshots.values()];
    return all.sort((left, right) => right.queuedAt - left.queuedAt);
  }

  /** Loads snapshots from the artifacts on disk (see the module-level function). */
  async loadRunSnapshotsFromArtifacts() {
    return loadRunSnapshotsFromArtifacts(this.config);
  }

  /**
   * Interrupts the scheduler and persistence fibers, then shuts down the run
   * queue, the persistence queue and the event bus, one step at a time.
   */
  async shutdown() {
    const teardownSteps = [
      () => Fiber.interrupt(this.schedulerFiber),
      () => Fiber.interrupt(this.persistenceFiber),
      () => Queue.shutdown(this.runQueue),
      () => Queue.shutdown(this.persistenceQueue),
      () => PubSub.shutdown(this.eventBus)
    ];
    for (const makeStep of teardownSteps) {
      await Effect.runPromise(makeStep());
    }
  }

  /** Builds the endless loop that takes queued tasks and forks their execution. */
  createSchedulerEffect() {
    const runner = this;
    return Effect.forever(
      Effect.gen(function* () {
        const task = yield* Queue.take(runner.runQueue);
        const publish = (...args) => runner.publishEvent(...args);
        const update = (...args) => runner.updateSnapshot(...args);
        yield* Effect.fork(executeRunTask(task, publish, runner.persistenceQueue, update));
      })
    );
  }

  /** Replaces the snapshot of `runId` with `updater(current)`; no-op for unknown runs. */
  updateSnapshot(runId, updater) {
    const current = this.snapshots.get(runId);
    if (current) {
      this.snapshots.set(runId, updater(current));
    }
  }

  /**
   * Returns an effect that first calls the matching listeners synchronously
   * and then publishes the event on the event bus.
   */
  publishEvent(event) {
    const notifyListeners = Effect.sync(() => {
      for (const subscription of this.listeners) {
        const wantsEvent = !subscription.runId || subscription.runId === event.runId;
        if (wantsEvent) {
          subscription.listener(event);
        }
      }
    });
    return notifyListeners.pipe(
      Effect.flatMap(() => PubSub.publish(this.eventBus, event)),
      Effect.asVoid
    );
  }
};
1929
+ var DETAILS_PAGE_SIZE = 20;
1930
/**
 * Maps a score to a display colour: "green" from 80 up, "yellow" from 50 up,
 * "red" for anything else.
 *
 * @param {number} score
 * @returns {"green"|"yellow"|"red"}
 */
function scoreColor(score) {
  return score >= 80 ? "green" : score >= 50 ? "yellow" : "red";
}
1937
/**
 * Formats one score item for display.
 *
 * - Unknown score id: the numeric value with two decimals, or "n/a".
 * - Known score id: the definition's `format` output; when the definition's
 *   `displayStrategy` is "bar" and a finite number is available, a
 *   14-character bar is appended.
 *
 * The bar treats the number as a percentage and clamps the filled part to
 * the bar width, so values outside 0–100 give an empty or full bar instead
 * of making `String.prototype.repeat` throw on a negative count.
 *
 * @param {{id: string, data: unknown}} item Score item.
 * @param scoreToColor Unused; kept for signature compatibility.
 * @returns {string}
 */
function formatScorePart(item, scoreToColor) {
  const def = getScoreById(item.id);
  if (!def) {
    const numeric = toNumericScore(item.data);
    return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
  }
  const formatted = def.format(item.data);
  if (def.displayStrategy === "bar") {
    // Prefer an explicit `value` field on object-shaped data.
    const numeric =
      typeof item.data === "object" && item.data !== null && "value" in item.data
        ? item.data.value
        : toNumericScore(item.data);
    if (typeof numeric === "number" && Number.isFinite(numeric)) {
      const barWidth = 14;
      const rawFilled = Math.round(numeric / 100 * barWidth);
      const filled = Math.min(barWidth, Math.max(0, rawFilled));
      const bar = "\u2588".repeat(filled) + "\u2591".repeat(barWidth - filled);
      return `${formatted} ${bar}`;
    }
  }
  return formatted;
}
1865
-
1866
- // src/runner/api.ts
1867
- function parseRegexLiteral(pattern) {
1868
- if (!pattern.startsWith("/")) {
1869
- return void 0;
1955
/**
 * Renders one boolean check as a single text row: the name padded to 14
 * characters, a bold PASSED/FAILED badge (green/red) and, when given, the
 * detail in parentheses.
 *
 * @param {{name: string, passed: boolean, detail?: string}} props
 */
function CheckRow({
  name,
  passed,
  detail
}) {
  const [status, color] = passed ? ["PASSED", "green"] : ["FAILED", "red"];
  const label = /* @__PURE__ */ jsx(Text, { color: "gray", children: name.padEnd(14) });
  const badge = /* @__PURE__ */ jsx(Text, { color, bold: true, children: status });
  let suffix = null;
  if (detail) {
    suffix = /* @__PURE__ */ jsxs(Text, { color: "gray", children: [" (", detail, ")"] });
  }
  return /* @__PURE__ */ jsxs(Text, { children: [label, " ", badge, suffix] });
}
1973
/**
 * Builds the flat list of keyed row elements shown in the run details pane:
 * meta, dimension scores, checks, performance, latency trend, then optional
 * failures and per-test-case results (with diff logs for failed evaluators).
 *
 * Row keys are unique within the returned list as long as dimension names,
 * check names and (testCaseId, rerunIndex, evaluatorId) triples are unique.
 * The rerun index is part of every test-case key so that reruns of the same
 * test case do not collide.
 *
 * @param run Run summary; `performance`, `dimensions`, `checks`, `failures`
 *   and `meta` are read.
 * @param {Array<object>} testCases Per-test-case results (the shape produced
 *   by `parseArtifactFile`).
 * @param {Map<string, string>} evaluatorNameById Display names by evaluator
 *   id; the id itself is shown when no name is registered.
 * @returns {Array} Row elements, in display order.
 */
function buildDetailRows(run, testCases, evaluatorNameById) {
  const { performance, dimensions, checks, failures, meta } = run;
  // Without recorded history, synthesise a five-point trend from avg and p95.
  const latencyHistory = performance.latencyHistoryMs ?? [
    performance.latencyAvgMs - 40,
    performance.latencyAvgMs - 10,
    performance.latencyAvgMs + 20,
    performance.latencyP95Ms - 80,
    performance.latencyP95Ms
  ];
  const rows = [
    /* @__PURE__ */ jsx(SectionHeader, { children: "Meta" }, "meta-h"),
    /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
      "Model: ",
      meta.model,
      " Provider: ",
      meta.provider
    ] }, "meta-1"),
    /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
      "Commit: ",
      meta.commit,
      " Branch: ",
      meta.branch,
      " Seed: ",
      meta.seed
    ] }, "meta-2"),
    /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
      "Duration: ",
      meta.duration,
      " Concurrency: ",
      meta.concurrency
    ] }, "meta-3"),
    /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
      "Artifact: ",
      meta.artifact
    ] }, "meta-4"),
    /* @__PURE__ */ jsx(Text, { children: " " }, "sp1"),
    /* @__PURE__ */ jsx(SectionHeader, { children: "Scores (0\u2013100)" }, "scores-h"),
    ...dimensions.map((d) => /* @__PURE__ */ jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
    /* @__PURE__ */ jsx(Text, { children: " " }, "sp2"),
    /* @__PURE__ */ jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
    ...checks.map((c) => /* @__PURE__ */ jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
    /* @__PURE__ */ jsx(Text, { children: " " }, "sp3"),
    /* @__PURE__ */ jsx(SectionHeader, { children: "Performance" }, "perf-h"),
    /* @__PURE__ */ jsx(
      TextBar,
      {
        label: "pass rate",
        value: performance.passRate,
        format: (v) => `${v}%`
      },
      "perf-rate"
    ),
    /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
      "latency avg ",
      performance.latencyAvgMs,
      "ms p95 ",
      performance.latencyP95Ms,
      "ms"
    ] }, "perf-lat"),
    /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
      "tokens avg ",
      performance.tokensAvg,
      " p95 ",
      performance.tokensP95
    ] }, "perf-tok"),
    /* @__PURE__ */ jsx(Text, { children: " " }, "sp4"),
    /* @__PURE__ */ jsx(SectionHeader, { children: "Latency trend" }, "spark-h"),
    /* @__PURE__ */ jsx(Sparkline, { data: latencyHistory, width: 20 }, "spark")
  ];
  if (failures.length > 0) {
    rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp5"));
    rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Failures (top)" }, "fail-h"));
    failures.forEach((f, i) => {
      rows.push(
        /* @__PURE__ */ jsxs(Text, { color: "red", children: [
          i + 1,
          ") ",
          f.title
        ] }, `fail-${i}`)
      );
    });
  }
  if (testCases.length > 0) {
    rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp6"));
    rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
    for (const tc of testCases) {
      const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
      // Shared key prefix for every row of this (test case, rerun) pair.
      const rowKey = `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`;
      rows.push(
        /* @__PURE__ */ jsxs(Text, { children: [
          /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
            "[",
            tc.completedTestCases,
            "/",
            tc.totalTestCases,
            "]"
          ] }),
          " ",
          tc.testCaseName,
          rerunPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: rerunPart }) : null,
          /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
            " (",
            tc.durationMs,
            "ms)"
          ] })
        ] }, rowKey)
      );
      for (const item of tc.evaluatorScores) {
        const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
        rows.push(
          /* @__PURE__ */ jsxs(Text, { children: [
            " ",
            name,
            ":",
            " ",
            /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
            " ",
            item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
              formatScorePart(s),
              " "
            ] }, s.id)),
            item.metrics?.map((m) => {
              const def = getMetricById(m.id);
              if (!def)
                return null;
              const formatted = def.format(m.data);
              return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
                "[",
                def.name ? `${def.name}: ` : "",
                formatted,
                "]",
                " "
              ] }, m.id);
            })
          ] }, `${rowKey}-${item.evaluatorId}`)
        );
        // Diff logs are only shown for evaluators that failed.
        if (!item.passed && item.logs && item.logs.length > 0) {
          for (let logIdx = 0; logIdx < item.logs.length; logIdx++) {
            const log = item.logs[logIdx];
            if (log.type === "diff") {
              const lines = getDiffLines(log);
              for (let lineIdx = 0; lineIdx < lines.length; lineIdx++) {
                const { type, line } = lines[lineIdx];
                rows.push(
                  /* @__PURE__ */ jsxs(
                    Text,
                    {
                      color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
                      children: [
                        " ",
                        line
                      ]
                    },
                    `${rowKey}-${item.evaluatorId}-${logIdx}-${lineIdx}`
                  )
                );
              }
            }
          }
        }
      }
    }
  }
  return rows;
}
1880
- function createNameMatcher(pattern) {
1881
- const normalizedPattern = pattern.trim();
1882
- const regexLiteral = parseRegexLiteral(normalizedPattern);
1883
- if (regexLiteral) {
1884
- const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
1885
- return (value) => regex.test(value);
1886
- }
1887
- if (normalizedPattern.includes("*")) {
1888
- const escaped = normalizedPattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
1889
- const regex = new RegExp(`^${escaped}$`, "i");
1890
- return (value) => regex.test(value);
2137
/**
 * Details screen: the runs sidebar plus a pane showing one page
 * (`DETAILS_PAGE_SIZE` rows) of the selected run's detail rows.
 *
 * The per-test-case results are loaded from the selected run's artifact file
 * whenever `selectedRun.meta.artifact` changes. The effect's cleanup marks the
 * pending load as stale, so a slow read for a previously selected run (or one
 * finishing after unmount) can no longer overwrite the current state.
 *
 * @param props.state UI state; `focus` and `detailsScrollOffset` are read here.
 * @param props.dataset Selected dataset (may be undefined); supplies `runs`.
 * @param props.selectedRun Run to display, or undefined.
 * @param props.evaluators Evaluator options (`id`, `name`) used to label scores.
 */
function RunDetailsView({
  state,
  dataset,
  selectedRun,
  evaluators
}) {
  const runs = dataset?.runs ?? [];
  const rightFocused = state.focus === "right";
  const [testCases, setTestCases] = useState([]);
  const evaluatorNameById = React.useMemo(
    () => new Map(evaluators.map((e) => [e.id, e.name])),
    [evaluators]
  );
  useEffect(() => {
    if (!selectedRun?.meta?.artifact) {
      setTestCases([]);
      return;
    }
    let stale = false;
    const artifactPath = resolve(selectedRun.meta.artifact);
    parseArtifactFile(artifactPath).then((loaded) => {
      if (!stale) {
        setTestCases(loaded);
      }
    });
    return () => {
      stale = true;
    };
  }, [selectedRun?.meta?.artifact]);
  if (!selectedRun) {
    return /* @__PURE__ */ jsxs(Fragment, { children: [
      /* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
      /* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "Select a run to inspect details." }) })
    ] });
  }
  const rows = buildDetailRows(selectedRun, testCases, evaluatorNameById);
  // Negative offsets are treated as the top of the list.
  const offset = Math.max(0, state.detailsScrollOffset);
  const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
  return /* @__PURE__ */ jsxs(Fragment, { children: [
    /* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
    /* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(React.Fragment, { children: row }, i)) }) })
  ] });
}
1894
- function mergeRunnerOverrides(base, next) {
1895
- if (!base) {
1896
- return next;
1897
- }
1898
- if (!next) {
1899
- return base;
1900
- }
1901
- const discovery = base.discovery || next.discovery ? {
1902
- ...base.discovery ?? {},
1903
- ...next.discovery ?? {}
1904
- } : void 0;
2172
+ var LEFT_PANE_WIDTH3 = 44;
2173
+ function NewEvaluationView({
2174
+ state,
2175
+ data,
2176
+ visibleEvaluators
2177
+ }) {
2178
+ const selectedCount = state.selectedEvaluatorIds.length;
2179
+ const focusedEvaluator = visibleEvaluators[state.evaluatorMenuIndex];
2180
+ const leftFocused = state.focus === "left";
2181
+ const rightFocused = state.focus === "right";
2182
+ return /* @__PURE__ */ jsxs(Fragment, { children: [
2183
+ /* @__PURE__ */ jsxs(Pane, { width: LEFT_PANE_WIDTH3, focused: leftFocused, children: [
2184
+ /* @__PURE__ */ jsx(SectionHeader, { children: "Available Evaluators" }),
2185
+ /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2186
+ "Search: ",
2187
+ state.searchQuery || "(none)"
2188
+ ] }),
2189
+ visibleEvaluators.map((evaluator, index) => {
2190
+ const selected = index === state.evaluatorMenuIndex;
2191
+ const inSelection = state.selectedEvaluatorIds.includes(evaluator.id);
2192
+ return /* @__PURE__ */ jsxs(
2193
+ Text,
2194
+ {
2195
+ color: selected ? "cyan" : "gray",
2196
+ bold: selected,
2197
+ children: [
2198
+ selected ? "\u25B8 " : " ",
2199
+ inSelection ? "[x] " : "[ ] ",
2200
+ evaluator.name
2201
+ ]
2202
+ },
2203
+ evaluator.id
2204
+ );
2205
+ })
2206
+ ] }),
2207
+ /* @__PURE__ */ jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
2208
+ /* @__PURE__ */ jsxs(SectionHeader, { children: [
2209
+ "Selected (",
2210
+ selectedCount,
2211
+ ")"
2212
+ ] }),
2213
+ state.selectedEvaluatorIds.map((id, index) => {
2214
+ const evaluator = data.evaluators.find((item) => item.id === id);
2215
+ if (!evaluator)
2216
+ return null;
2217
+ return /* @__PURE__ */ jsxs(Text, { children: [
2218
+ index + 1,
2219
+ ") ",
2220
+ evaluator.name
2221
+ ] }, id);
2222
+ }),
2223
+ /* @__PURE__ */ jsx(SectionHeader, { children: "Config preview" }),
2224
+ /* @__PURE__ */ jsx(Text, { color: "gray", children: focusedEvaluator?.configPreview ?? "Select an evaluator to inspect config." })
2225
+ ] })
2226
+ ] });
2227
+ }
2228
+ function clampCursor(state, filteredDatasetsLength, selectedRunCount) {
2229
+ const datasetMax = filteredDatasetsLength;
2230
+ const runMax = selectedRunCount;
2231
+ const evaluatorMax = 3;
1905
2232
  return {
1906
- ...base,
1907
- ...next,
1908
- discovery
2233
+ ...state,
2234
+ datasetMenuIndex: Math.max(0, Math.min(state.datasetMenuIndex, datasetMax)),
2235
+ runMenuIndex: Math.max(0, Math.min(state.runMenuIndex, runMax)),
2236
+ evaluatorMenuIndex: Math.max(
2237
+ 0,
2238
+ Math.min(state.evaluatorMenuIndex, evaluatorMax)
2239
+ )
1909
2240
  };
1910
2241
  }
1911
- function createRunner(overrides) {
1912
- const fileOverrides = loadRunnerConfigFile();
1913
- const merged = mergeRunnerOverrides(fileOverrides, overrides);
1914
- return new EffectRunner(withRunnerConfig(merged));
1915
- }
1916
- var EffectRunner = class {
1917
- constructor(config) {
1918
- this.eventBus = Effect.runSync(PubSub.unbounded());
1919
- this.runQueue = Effect.runSync(Queue.unbounded());
1920
- this.persistenceQueue = Effect.runSync(
1921
- Queue.unbounded()
1922
- );
1923
- this.snapshots = /* @__PURE__ */ new Map();
1924
- this.listeners = /* @__PURE__ */ new Set();
1925
- this.datasetsById = /* @__PURE__ */ new Map();
1926
- this.evaluatorsById = /* @__PURE__ */ new Map();
1927
- this.schedulerFiber = Effect.runFork(
1928
- this.createSchedulerEffect()
1929
- );
1930
- this.persistenceFiber = Effect.runFork(
1931
- createPersistenceWorker(this.persistenceQueue)
1932
- );
1933
- this.config = config;
1934
- }
1935
- async collectDatasets() {
1936
- const datasets = await collectDatasetsFromFiles(this.config.discovery);
1937
- this.datasetsById.clear();
1938
- for (const dataset of datasets) {
1939
- this.datasetsById.set(dataset.id, dataset);
2242
+ function EvalsCliApp({
2243
+ data,
2244
+ args,
2245
+ runner
2246
+ }) {
2247
+ const { exit } = useApp();
2248
+ const { width: stdoutWidth, height: stdoutHeight } = useScreenSize();
2249
+ const [liveData, setLiveData] = useState(data);
2250
+ const [runtimeMessage, setRuntimeMessage] = useState();
2251
+ const [state, dispatch] = useReducer(
2252
+ reduceCliState,
2253
+ createInitialState(data, args)
2254
+ );
2255
+ useEffect(() => {
2256
+ setLiveData(data);
2257
+ }, [data]);
2258
+ useEffect(() => {
2259
+ if (!runner) {
2260
+ return void 0;
1940
2261
  }
1941
- return datasets;
1942
- }
1943
- async collectEvaluators() {
1944
- const evaluators = await collectEvaluatorsFromFiles(this.config.discovery);
1945
- this.evaluatorsById.clear();
1946
- for (const evaluator of evaluators) {
1947
- this.evaluatorsById.set(evaluator.id, evaluator);
2262
+ return runner.subscribeRunEvents((event) => {
2263
+ setLiveData((current) => applyRunnerEvent(current, event, runner));
2264
+ if (event.type === "RunQueued") {
2265
+ setRuntimeMessage(`Queued ${event.runId} with ${event.totalTestCases} test cases.`);
2266
+ }
2267
+ if (event.type === "RunCompleted") {
2268
+ setRuntimeMessage(
2269
+ `Completed ${event.runId}: ${event.passedTestCases}/${event.totalTestCases} passed.`
2270
+ );
2271
+ }
2272
+ if (event.type === "RunFailed") {
2273
+ setRuntimeMessage(`Run failed: ${event.errorMessage}`);
2274
+ }
2275
+ });
2276
+ }, [runner]);
2277
+ const filteredDatasets = useMemo(
2278
+ () => getFilteredDatasets(liveData, state.searchQuery),
2279
+ [liveData, state.searchQuery]
2280
+ );
2281
+ const clampedState = clampCursor(
2282
+ state,
2283
+ filteredDatasets.length,
2284
+ getDatasetByMenuIndex(filteredDatasets, state.datasetMenuIndex)?.runs.length ?? 0
2285
+ );
2286
+ const selectedDataset = getDatasetByMenuIndex(
2287
+ filteredDatasets,
2288
+ clampedState.datasetMenuIndex
2289
+ );
2290
+ const selectedRun = getRunByMenuIndex(
2291
+ selectedDataset,
2292
+ clampedState.runMenuIndex
2293
+ );
2294
+ const visibleEvaluators = liveData.evaluators.filter(
2295
+ (evaluator) => evaluator.name.toLowerCase().includes(clampedState.searchQuery.toLowerCase())
2296
+ );
2297
+ useInput((input, key) => {
2298
+ if (isQuitInput(input) || key.escape) {
2299
+ exit();
2300
+ return;
1948
2301
  }
1949
- return evaluators;
1950
- }
1951
- async resolveDatasetByName(name) {
1952
- if (this.datasetsById.size === 0) {
1953
- await this.collectDatasets();
2302
+ if (key.tab) {
2303
+ dispatch({ type: "TOGGLE_FOCUS" });
2304
+ return;
1954
2305
  }
1955
- const normalized = name.trim().toLowerCase();
1956
- return Array.from(this.datasetsById.values()).find(
1957
- (item) => item.dataset.getName().toLowerCase() === normalized
1958
- );
1959
- }
1960
- async resolveEvaluatorsByNamePattern(pattern) {
1961
- if (this.evaluatorsById.size === 0) {
1962
- await this.collectEvaluators();
2306
+ if (isSearchInput(input)) {
2307
+ dispatch({ type: "START_SEARCH" });
2308
+ return;
1963
2309
  }
1964
- const matcher = createNameMatcher(pattern);
1965
- return Array.from(this.evaluatorsById.values()).filter(
1966
- (item) => matcher(item.evaluator.getName() ?? "")
1967
- );
1968
- }
1969
- async searchTestCases(query) {
1970
- const testCases = await collectTestCasesFromFiles(this.config.discovery);
1971
- return searchCollectedTestCases(testCases, query);
1972
- }
1973
- async collectDatasetTestCases(datasetId) {
1974
- if (this.datasetsById.size === 0) {
1975
- await this.collectDatasets();
2310
+ if (clampedState.searchMode) {
2311
+ if (key.return) {
2312
+ dispatch({ type: "END_SEARCH" });
2313
+ return;
2314
+ }
2315
+ if (isBackKey(key)) {
2316
+ dispatch({ type: "REMOVE_SEARCH_CHAR" });
2317
+ return;
2318
+ }
2319
+ if (isPrintableCharacter(input)) {
2320
+ dispatch({ type: "APPEND_SEARCH", value: input });
2321
+ }
2322
+ return;
1976
2323
  }
1977
- const dataset = this.datasetsById.get(datasetId);
1978
- if (!dataset) {
1979
- throw new Error(`Unknown dataset: ${datasetId}`);
2324
+ if (key.upArrow) {
2325
+ const max = clampedState.level === "details" ? 100 : clampedState.level === "new-evaluation" ? visibleEvaluators.length - 1 : 100;
2326
+ dispatch({ type: "MOVE_UP", max });
2327
+ return;
1980
2328
  }
1981
- const allTestCases = await collectTestCasesFromFiles(this.config.discovery);
1982
- return allTestCases.filter(
1983
- (testCase) => dataset.dataset.matchesTestCase(testCase.testCase, testCase.filePath)
1984
- );
1985
- }
1986
- async runDatasetWith(request) {
1987
- if (this.datasetsById.size === 0) {
1988
- await this.collectDatasets();
2329
+ if (key.downArrow) {
2330
+ const max = clampedState.level === "datasets" ? filteredDatasets.length : clampedState.level === "runs" ? selectedDataset?.runs.length ?? 0 : clampedState.level === "new-evaluation" ? Math.max(0, visibleEvaluators.length - 1) : 100;
2331
+ dispatch({ type: "MOVE_DOWN", max });
2332
+ return;
1989
2333
  }
1990
- if (this.evaluatorsById.size === 0) {
1991
- await this.collectEvaluators();
2334
+ if (key.return) {
2335
+ dispatch({
2336
+ type: "ENTER",
2337
+ hasDataset: Boolean(selectedDataset),
2338
+ hasRun: Boolean(selectedRun)
2339
+ });
2340
+ if (clampedState.level === "new-evaluation") {
2341
+ const evaluator = visibleEvaluators[clampedState.evaluatorMenuIndex];
2342
+ if (evaluator) {
2343
+ dispatch({ type: "TOGGLE_EVALUATOR", evaluatorId: evaluator.id });
2344
+ }
2345
+ }
2346
+ return;
1992
2347
  }
1993
- const dataset = this.datasetsById.get(request.datasetId);
1994
- if (!dataset) {
1995
- throw new Error(`Unknown dataset: ${request.datasetId}`);
2348
+ if (isBackKey(key)) {
2349
+ dispatch({ type: "BACK" });
2350
+ return;
1996
2351
  }
1997
- const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1998
- if (selectedEvaluators.length === 0) {
1999
- throw new Error("No evaluators selected for run");
2352
+ if (input.toLowerCase() === "c") {
2353
+ dispatch({ type: "CLEAR_WARNINGS" });
2354
+ setRuntimeMessage(void 0);
2355
+ return;
2000
2356
  }
2001
- const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
2002
- const runId = `run-${randomUUID()}`;
2003
- const artifactPath = createArtifactPath(
2004
- this.config.artifactDirectory,
2005
- request.datasetId,
2006
- runId
2007
- );
2008
- const snapshot = {
2009
- runId,
2010
- datasetId: request.datasetId,
2011
- datasetName: dataset.dataset.getName(),
2012
- evaluatorIds: selectedEvaluators.map((item) => item.id),
2013
- queuedAt: Date.now(),
2014
- totalTestCases: selectedTestCases.length,
2015
- completedTestCases: 0,
2016
- passedTestCases: 0,
2017
- failedTestCases: 0,
2018
- status: "queued",
2019
- artifactPath
2020
- };
2021
- this.snapshots.set(runId, snapshot);
2022
- const queuedEvent = {
2023
- type: "RunQueued",
2024
- runId,
2025
- datasetId: request.datasetId,
2026
- datasetName: dataset.dataset.getName(),
2027
- evaluatorIds: selectedEvaluators.map((item) => item.id),
2028
- totalTestCases: selectedTestCases.length,
2029
- artifactPath
2030
- };
2031
- await Effect.runPromise(this.publishEvent(queuedEvent));
2032
- await Effect.runPromise(
2033
- Queue.offer(this.persistenceQueue, {
2034
- runId,
2035
- artifactPath,
2036
- payload: queuedEvent
2037
- })
2038
- );
2039
- await Effect.runPromise(
2040
- Queue.offer(this.runQueue, {
2041
- runId,
2042
- datasetId: request.datasetId,
2043
- dataset: dataset.dataset,
2044
- evaluators: selectedEvaluators,
2045
- testCases: selectedTestCases,
2046
- snapshot
2047
- })
2048
- );
2049
- return snapshot;
2050
- }
2051
- subscribeRunEvents(listener, options) {
2052
- const entry = { runId: options?.runId, listener };
2053
- this.listeners.add(entry);
2054
- return () => {
2055
- this.listeners.delete(entry);
2056
- };
2057
- }
2058
- getRunSnapshot(runId) {
2059
- return this.snapshots.get(runId);
2060
- }
2061
- getAllRunSnapshots() {
2062
- return Array.from(this.snapshots.values()).sort(
2063
- (a, b) => b.queuedAt - a.queuedAt
2064
- );
2065
- }
2066
- async shutdown() {
2067
- await Effect.runPromise(Fiber.interrupt(this.schedulerFiber));
2068
- await Effect.runPromise(Fiber.interrupt(this.persistenceFiber));
2069
- await Effect.runPromise(Queue.shutdown(this.runQueue));
2070
- await Effect.runPromise(Queue.shutdown(this.persistenceQueue));
2071
- await Effect.runPromise(PubSub.shutdown(this.eventBus));
2072
- }
2073
- createSchedulerEffect() {
2074
- const self = this;
2075
- return Effect.forever(
2076
- Effect.gen(function* () {
2077
- const task = yield* Queue.take(self.runQueue);
2078
- yield* Effect.fork(
2079
- executeRunTask(
2080
- task,
2081
- self.publishEvent.bind(self),
2082
- self.persistenceQueue,
2083
- self.updateSnapshot.bind(self)
2084
- )
2357
+ if (input.toLowerCase() === "s" && clampedState.level === "new-evaluation") {
2358
+ if (!runner) {
2359
+ setRuntimeMessage("Runner unavailable: cannot start evaluation.");
2360
+ return;
2361
+ }
2362
+ if (!selectedDataset) {
2363
+ setRuntimeMessage("Select a dataset before starting a new evaluation.");
2364
+ return;
2365
+ }
2366
+ if (clampedState.selectedEvaluatorIds.length === 0) {
2367
+ setRuntimeMessage("Select at least one evaluator before starting.");
2368
+ return;
2369
+ }
2370
+ void runner.runDatasetWith({
2371
+ datasetId: selectedDataset.id,
2372
+ evaluatorIds: clampedState.selectedEvaluatorIds
2373
+ }).then((snapshot) => {
2374
+ setRuntimeMessage(
2375
+ `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
2085
2376
  );
2086
- })
2087
- );
2088
- }
2089
- updateSnapshot(runId, updater) {
2090
- const existing = this.snapshots.get(runId);
2091
- if (!existing) {
2092
- return;
2377
+ }).catch((error) => {
2378
+ setRuntimeMessage(
2379
+ error instanceof Error ? error.message : "Failed to start evaluation."
2380
+ );
2381
+ });
2093
2382
  }
2094
- this.snapshots.set(runId, updater(existing));
2095
- }
2096
- publishEvent(event) {
2097
- return Effect.sync(() => {
2098
- for (const entry of this.listeners) {
2099
- if (entry.runId && entry.runId !== event.runId) {
2100
- continue;
2383
+ });
2384
+ const renderContent = () => {
2385
+ if (clampedState.level === "new-evaluation") {
2386
+ return /* @__PURE__ */ jsx(
2387
+ NewEvaluationView,
2388
+ {
2389
+ state: clampedState,
2390
+ data: liveData,
2391
+ visibleEvaluators
2101
2392
  }
2102
- entry.listener(event);
2393
+ );
2394
+ }
2395
+ if (clampedState.level === "datasets") {
2396
+ return /* @__PURE__ */ jsx(
2397
+ DatasetsView,
2398
+ {
2399
+ state: clampedState,
2400
+ filteredDatasets,
2401
+ selectedDataset
2402
+ }
2403
+ );
2404
+ }
2405
+ if (clampedState.level === "runs") {
2406
+ return /* @__PURE__ */ jsx(
2407
+ RunsView,
2408
+ {
2409
+ state: clampedState,
2410
+ dataset: selectedDataset,
2411
+ selectedRun
2412
+ }
2413
+ );
2414
+ }
2415
+ return /* @__PURE__ */ jsx(
2416
+ RunDetailsView,
2417
+ {
2418
+ state: clampedState,
2419
+ dataset: selectedDataset,
2420
+ selectedRun,
2421
+ evaluators: liveData.evaluators
2103
2422
  }
2104
- }).pipe(
2105
- Effect.flatMap(() => PubSub.publish(this.eventBus, event)),
2106
- Effect.asVoid
2107
2423
  );
2108
- }
2109
- };
2424
+ };
2425
+ return /* @__PURE__ */ jsxs(
2426
+ Box,
2427
+ {
2428
+ flexDirection: "column",
2429
+ flexGrow: 1,
2430
+ width: stdoutWidth,
2431
+ height: stdoutHeight,
2432
+ children: [
2433
+ /* @__PURE__ */ jsx(
2434
+ Box,
2435
+ {
2436
+ borderStyle: "round",
2437
+ borderColor: "cyan",
2438
+ paddingX: 1,
2439
+ width: stdoutWidth,
2440
+ children: /* @__PURE__ */ jsx(Text, { children: getBreadcrumbText(
2441
+ clampedState,
2442
+ selectedDataset?.name,
2443
+ selectedRun?.label
2444
+ ) })
2445
+ }
2446
+ ),
2447
+ clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxs(
2448
+ Box,
2449
+ {
2450
+ marginTop: 1,
2451
+ borderStyle: "round",
2452
+ borderColor: "yellow",
2453
+ paddingX: 1,
2454
+ flexDirection: "column",
2455
+ width: stdoutWidth,
2456
+ children: [
2457
+ /* @__PURE__ */ jsx(Text, { color: "yellow", children: "Startup warnings:" }),
2458
+ clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsx(Text, { children: warning }, `${warning}-${index}`))
2459
+ ]
2460
+ }
2461
+ ),
2462
+ clampedState.searchMode && /* @__PURE__ */ jsxs(
2463
+ Box,
2464
+ {
2465
+ marginTop: 1,
2466
+ borderStyle: "round",
2467
+ borderColor: "magenta",
2468
+ paddingX: 1,
2469
+ width: stdoutWidth,
2470
+ children: [
2471
+ /* @__PURE__ */ jsx(Text, { color: "magenta", bold: true, children: "Search: " }),
2472
+ /* @__PURE__ */ jsx(Text, { color: "white", children: clampedState.searchQuery })
2473
+ ]
2474
+ }
2475
+ ),
2476
+ runtimeMessage && /* @__PURE__ */ jsx(
2477
+ Box,
2478
+ {
2479
+ marginTop: 1,
2480
+ borderStyle: "round",
2481
+ borderColor: "blue",
2482
+ paddingX: 1,
2483
+ width: stdoutWidth,
2484
+ children: /* @__PURE__ */ jsx(Text, { color: "blue", children: runtimeMessage })
2485
+ }
2486
+ ),
2487
+ /* @__PURE__ */ jsx(
2488
+ Box,
2489
+ {
2490
+ marginTop: 1,
2491
+ flexGrow: 1,
2492
+ width: stdoutWidth,
2493
+ flexDirection: "row",
2494
+ children: renderContent()
2495
+ }
2496
+ ),
2497
+ /* @__PURE__ */ jsx(Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: getFooterText(clampedState) }) })
2498
+ ]
2499
+ }
2500
+ );
2501
+ }
2110
2502
  async function main() {
2111
2503
  const args = parseStartupArgs(process.argv.slice(2));
2112
2504
  const runner = createRunner();