@m4trix/evals 0.9.0 → 0.10.0

@@ -9,8 +9,13 @@ var jitiModule = require('jiti');
  var promises = require('fs/promises');
  var url = require('url');
  require('json-diff');
+ var React2 = require('react');
+ var ink = require('ink');
+ var jsxRuntime = require('react/jsx-runtime');
 
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
+ function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
+
  function _interopNamespace(e) {
  if (e && e.__esModule) return e;
  var n = Object.create(null);
@@ -30,6 +35,7 @@ function _interopNamespace(e) {
  }
 
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
+ var React2__default = /*#__PURE__*/_interopDefault(React2);
 
  // src/runner/config.ts
  var defaultRunnerConfig = {
@@ -890,6 +896,107 @@ function getSimpleCliUsage() {
  ' "/score/i" regex literal'
  ].join("\n");
  }
+
+ // src/cli-simple/banner.ts
+ var ansi = {
+ reset: "\x1B[0m",
+ dim: "\x1B[2m",
+ cyan: "\x1B[36m"
+ };
+ function printBanner() {
+ const c = (s) => `${ansi.cyan}${s}${ansi.reset}`;
+ const d = (s) => `${ansi.dim}${s}${ansi.reset}`;
+ const lines = [
+ "",
+ ` ${c("\u256D\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256E")}`,
+ ` ${c("\u2502")} ${d("@m4trix/evals")} ${c("\xB7")} ${d("eval-agents-simple")} ${c("\u2502")}`,
+ ` ${c("\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256F")}`,
+ ""
+ ];
+ console.log(lines.join("\n"));
+ }
+ function Banner() {
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { borderStyle: "round", borderColor: "cyan", paddingX: 1, paddingY: 0, children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "@m4trix/evals" }),
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: " \xB7 " }),
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "eval-agents-simple" })
+ ] });
+ }
+ function GenerateView({
+ runner,
+ datasetName,
+ onComplete
+ }) {
+ const [result, setResult] = React2.useState(null);
+ const [error, setError] = React2.useState(null);
+ React2.useEffect(() => {
+ let cancelled = false;
+ async function run() {
+ const dataset = await runner.resolveDatasetByName(datasetName);
+ if (!dataset) {
+ setError(new Error(`Dataset "${datasetName}" not found.`));
+ onComplete(new Error(`Dataset "${datasetName}" not found.`));
+ return;
+ }
+ const { writeFile: writeFile2 } = await import('fs/promises');
+ const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
+ const testCases = await runner.collectDatasetTestCases(dataset.id);
+ const payload = testCases.map((item) => {
+ const tc = item.testCase;
+ return {
+ name: item.testCase.getName(),
+ input: item.testCase.getInput(),
+ output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
+ };
+ });
+ const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
+ const parsed = parse2(absoluteDatasetPath);
+ const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
+ await writeFile2(
+ outputPath,
+ `${JSON.stringify(payload, null, 2)}
+ `,
+ "utf8"
+ );
+ if (!cancelled) {
+ setResult({
+ count: payload.length,
+ datasetName: dataset.dataset.getName(),
+ outputPath
+ });
+ setTimeout(() => onComplete(), 200);
+ }
+ }
+ void run();
+ return () => {
+ cancelled = true;
+ };
+ }, [runner, datasetName, onComplete]);
+ if (error) {
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
+ /* @__PURE__ */ jsxRuntime.jsx(Banner, {}),
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: error.message })
+ ] });
+ }
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
+ result && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "green", children: [
+ "Generated ",
+ result.count,
+ ' test cases for dataset "',
+ result.datasetName,
+ '".'
+ ] }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
+ "Wrote ",
+ result.outputPath
+ ] })
+ ] })
+ ] });
+ }
+
+ // src/cli-simple/generate.ts
  function readOutput2(testCase) {
  if (typeof testCase.getOutput !== "function") {
  return void 0;
@@ -900,7 +1007,7 @@ function createOutputPath(datasetFilePath) {
  const parsed = path.parse(datasetFilePath);
  return path.join(parsed.dir, `${parsed.name}.cases.json`);
  }
- async function generateDatasetJsonCommand(runner, datasetName) {
+ async function generateDatasetJsonCommandPlain(runner, datasetName) {
  const dataset = await runner.resolveDatasetByName(datasetName);
  if (!dataset) {
  throw new Error(`Dataset "${datasetName}" not found.`);
@@ -918,9 +1025,383 @@ async function generateDatasetJsonCommand(runner, datasetName) {
  console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
  console.log(`Wrote ${outputPath}`);
  }
+ async function generateDatasetJsonCommandInk(runner, datasetName) {
+ return new Promise((resolve4, reject) => {
+ const app = ink.render(
+ React2__default.default.createElement(GenerateView, {
+ runner,
+ datasetName,
+ onComplete: (err) => {
+ app.unmount();
+ if (err) {
+ reject(err);
+ } else {
+ resolve4();
+ }
+ }
+ })
+ );
+ });
+ }
+ function barColor(pct) {
+ if (pct >= 70)
+ return "green";
+ if (pct >= 40)
+ return "yellow";
+ return "red";
+ }
+ function TextBar({
+ label,
+ value,
+ max = 100,
+ labelWidth = 14,
+ barWidth = 20,
+ format = (v) => String(v),
+ colorByValue = true
+ }) {
+ const clamped = Math.max(0, Math.min(max, value));
+ const pct = max > 0 ? clamped / max * 100 : 0;
+ const filled = Math.round(clamped / max * barWidth);
+ const filledBar = "\u2588".repeat(filled);
+ const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
+ const color = colorByValue ? barColor(pct) : void 0;
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: label.padEnd(labelWidth) }),
+ " [",
+ color ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color, children: filledBar }),
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: emptyBar })
+ ] }) : filledBar + emptyBar,
+ "] ",
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: color ?? "white", bold: true, children: format(value) })
+ ] });
+ }
+ var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
+ function Spinner({ label = "Running" }) {
+ const [frame, setFrame] = React2.useState(0);
+ React2.useEffect(() => {
+ const timer = setInterval(() => {
+ setFrame((f) => (f + 1) % FRAMES.length);
+ }, 100);
+ return () => clearInterval(timer);
+ }, []);
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
+ FRAMES[frame],
+ " ",
+ label
+ ] });
+ }
+ function scoreColor(score) {
+ if (score >= 80)
+ return "green";
+ if (score >= 50)
+ return "yellow";
+ return "red";
+ }
+ function createBar(value, max = 100, width = 20) {
+ const safe = Math.max(0, Math.min(max, value));
+ const filled = Math.round(safe / max * width);
+ return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
+ }
+ function formatScorePart(item, scoreToColor2) {
+ const def = getScoreById(item.id);
+ if (!def) {
+ const numeric = toNumericScore(item.data);
+ return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
+ }
+ const formatted = def.format(item.data);
+ if (def.displayStrategy === "bar") {
+ const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
+ if (typeof numeric === "number" && Number.isFinite(numeric)) {
+ return `${formatted} ${createBar(numeric)}`;
+ }
+ }
+ return formatted;
+ }
+ function RunView({
+ runner,
+ datasetName,
+ evaluatorPattern,
+ onComplete
+ }) {
+ const [phase, setPhase] = React2.useState(
+ "loading"
+ );
+ const [runInfo, setRunInfo] = React2.useState(null);
+ const [testCases, setTestCases] = React2.useState([]);
+ const [summary, setSummary] = React2.useState(null);
+ const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
+ const runEval = React2.useCallback(async () => {
+ const dataset = await runner.resolveDatasetByName(datasetName);
+ if (!dataset) {
+ const known = await runner.collectDatasets();
+ const available = known.map((item) => item.dataset.getName()).sort();
+ onComplete(
+ new Error(
+ available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
+ )
+ );
+ return;
+ }
+ const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
+ if (evaluators.length === 0) {
+ const known = await runner.collectEvaluators();
+ const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
+ onComplete(
+ new Error(
+ available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
+ )
+ );
+ return;
+ }
+ const nameById = new Map(
+ evaluators.map((item) => [
+ item.id,
+ item.evaluator.getName() ?? item.id
+ ])
+ );
+ setEvaluatorNameById(nameById);
+ const aggregates = /* @__PURE__ */ new Map();
+ let overallScoreTotal = 0;
+ let overallScoreCount = 0;
+ const done = new Promise((resolve4) => {
+ const unsubscribe = runner.subscribeRunEvents((event) => {
+ if (event.type === "TestCaseProgress") {
+ const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
+ const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
+ for (const item of event.evaluatorScores) {
+ const numeric = toNumericScoreFromScores(item.scores);
+ if (numeric !== void 0) {
+ const current = aggregates.get(item.evaluatorId) ?? {
+ total: 0,
+ count: 0,
+ passed: 0,
+ failed: 0
+ };
+ aggregates.set(item.evaluatorId, {
+ total: current.total + numeric,
+ count: current.count + 1,
+ passed: current.passed + (item.passed ? 1 : 0),
+ failed: current.failed + (item.passed ? 0 : 1)
+ });
+ overallScoreTotal += numeric;
+ overallScoreCount += 1;
+ }
+ }
+ setTestCases((prev) => [
+ ...prev,
+ {
+ name: event.testCaseName,
+ completedTestCases: event.completedTestCases,
+ totalTestCases: event.totalTestCases,
+ durationMs: event.durationMs,
+ passed: event.passed,
+ averageScore,
+ evaluatorScores: event.evaluatorScores.map((item) => ({
+ evaluatorId: item.evaluatorId,
+ evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
+ scores: item.scores,
+ passed: item.passed,
+ metrics: item.metrics
+ }))
+ }
+ ]);
+ }
+ if (event.type === "RunCompleted" || event.type === "RunFailed") {
+ unsubscribe();
+ resolve4(event);
+ }
+ });
+ });
+ const snapshot = await runner.runDatasetWith({
+ datasetId: dataset.id,
+ evaluatorIds: evaluators.map((item) => item.id)
+ });
+ setRunInfo({
+ runId: snapshot.runId,
+ datasetName: snapshot.datasetName,
+ evaluatorNames: evaluators.map(
+ (e) => e.evaluator.getName() ?? e.id
+ ),
+ totalTestCases: snapshot.totalTestCases
+ });
+ setPhase("running");
+ const finalEvent = await done;
+ if (finalEvent.type === "RunFailed") {
+ onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
+ return;
+ }
+ setSummary({
+ passedTestCases: finalEvent.passedTestCases,
+ failedTestCases: finalEvent.failedTestCases,
+ totalTestCases: finalEvent.totalTestCases,
+ overallScoreTotal,
+ overallScoreCount,
+ aggregates: new Map(aggregates),
+ artifactPath: finalEvent.artifactPath
+ });
+ setPhase("completed");
+ setTimeout(() => onComplete(), 200);
+ }, [runner, datasetName, evaluatorPattern, onComplete]);
+ React2.useEffect(() => {
+ void runEval();
+ }, [runEval]);
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
+ runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run " }),
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
+ ] }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Dataset " }),
+ runInfo.datasetName
+ ] }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Evaluators " }),
+ runInfo.evaluatorNames.join(", ")
+ ] }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Test cases " }),
+ runInfo.totalTestCases
+ ] })
+ ] }),
+ phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
+ Spinner,
+ {
+ label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
+ }
+ ) }),
+ testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
+ "[",
+ tc.completedTestCases,
+ "/",
+ tc.totalTestCases,
+ "]"
+ ] }),
+ " ",
+ tc.name,
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
+ " (",
+ tc.durationMs,
+ "ms)"
+ ] })
+ ] }),
+ tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
+ item.evaluatorName,
+ ":",
+ " ",
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
+ " ",
+ item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
+ formatScorePart(s),
+ " "
+ ] }, s.id)),
+ item.metrics?.map((m) => {
+ const def = getMetricById(m.id);
+ if (!def)
+ return null;
+ const formatted = def.format(m.data);
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
+ "[",
+ def.name ? `${def.name}: ` : "",
+ formatted,
+ "]",
+ " "
+ ] }, m.id);
+ })
+ ] }) }, item.evaluatorId))
+ ] }, i)) }),
+ phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "green", children: "passed" }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
+ " ",
+ summary.passedTestCases,
+ "/",
+ summary.totalTestCases
+ ] })
+ ] }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
+ " ",
+ summary.failedTestCases,
+ "/",
+ summary.totalTestCases
+ ] })
+ ] }),
+ summary.overallScoreCount > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
+ TextBar,
+ {
+ label: "overall avg",
+ value: summary.overallScoreTotal / summary.overallScoreCount,
+ barWidth: 20,
+ format: (v) => v.toFixed(2)
+ }
+ ) }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
+ Array.from(evaluatorNameById.entries()).map(([id, name]) => {
+ const agg = summary.aggregates.get(id);
+ if (!agg || agg.count === 0) {
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
+ "- ",
+ name.padEnd(28),
+ " no numeric scores"
+ ] }, id);
+ }
+ const mean = agg.total / agg.count;
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
+ "- ",
+ name.padEnd(28),
+ " avg=",
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
+ " passed=",
+ agg.passed,
+ " failed=",
+ agg.failed
+ ] }, id);
+ })
+ ] }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
+ testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
+ " ",
+ tc.name.padEnd(24)
+ ] }),
+ tc.averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(tc.averageScore), children: [
+ "score=",
+ tc.averageScore.toFixed(2)
+ ] }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
+ " ",
+ createBar(tc.averageScore, 100, 14)
+ ] })
+ ] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
+ " (",
+ tc.durationMs,
+ "ms)"
+ ] })
+ ] }, i))
+ ] }),
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
+ "artifact: ",
+ summary.artifactPath
+ ] }) })
+ ] })
+ ] });
+ }
 
  // src/cli-simple/run.ts
- var ansi = {
+ var ansi2 = {
  reset: "\x1B[0m",
  bold: "\x1B[1m",
  dim: "\x1B[2m",
@@ -931,16 +1412,16 @@ var ansi = {
  magenta: "\x1B[35m"
  };
  function colorize(text, color) {
- return `${color}${text}${ansi.reset}`;
+ return `${color}${text}${ansi2.reset}`;
  }
  function scoreToColor(score) {
  if (score >= 80) {
- return ansi.green;
+ return ansi2.green;
  }
  if (score >= 50) {
- return ansi.yellow;
+ return ansi2.yellow;
  }
- return ansi.red;
+ return ansi2.red;
  }
  function getEvaluatorSummaryLine(evaluatorName, aggregate) {
  if (!aggregate || aggregate.count === 0) {
@@ -949,13 +1430,13 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
  const mean = aggregate.total / aggregate.count;
  return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
  }
- function createBar(value, max = 100, width = 20) {
+ function createBar2(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  const filled = Math.round(safe / max * width);
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
  }
  function formatEvaluatorScoreLine(name, scores, passed, metrics) {
- const passLabel = passed ? colorize("PASS", `${ansi.bold}${ansi.green}`) : colorize("FAIL", `${ansi.bold}${ansi.red}`);
+ const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
  const scoreParts = [];
  for (const item of scores) {
  const def = getScoreById(item.id);
@@ -972,7 +1453,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
  scoreParts.push(
- `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar(numeric), ansi.dim)}`
+ `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
  );
  } else {
  scoreParts.push(formatted);
@@ -986,7 +1467,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
  scoreParts.push(
  colorize(
  formatted,
- item.passed === true ? `${ansi.bold}${ansi.green}` : item.passed === false ? `${ansi.bold}${ansi.red}` : ansi.dim
+ item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
  )
  );
  break;
@@ -1011,7 +1492,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
  }
  return line;
  }
- async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
+ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
  const dataset = await runner.resolveDatasetByName(datasetName);
  if (!dataset) {
  const known = await runner.collectDatasets();
@@ -1053,10 +1534,10 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
  const frame = spinnerFrames[spinnerIndex % spinnerFrames.length];
  spinnerIndex += 1;
  process.stdout.write(
- `\r${colorize(frame, ansi.cyan)} Running evaluations ${colorize(
+ `\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
  `${completedCount}/${totalCount}`,
- ansi.bold
- )} ${colorize("(live)", ansi.dim)}`
+ ansi2.bold
+ )} ${colorize("(live)", ansi2.dim)}`
  );
  }
  let spinnerTimer;
@@ -1068,7 +1549,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
  clearLine();
  console.log(
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi.dim)}`
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi2.dim)}`
  );
  for (const item of event.evaluatorScores) {
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
@@ -1119,14 +1600,14 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
  evaluatorIds: evaluators.map((item) => item.id)
  });
  totalCount = snapshot.totalTestCases;
- console.log(colorize("=== Eval Run Started ===", `${ansi.bold}${ansi.cyan}`));
- console.log(`Run: ${colorize(snapshot.runId, ansi.cyan)}`);
- console.log(`Dataset: ${colorize(snapshot.datasetName, ansi.bold)}`);
+ console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
+ console.log(`Run: ${colorize(snapshot.runId, ansi2.cyan)}`);
+ console.log(`Dataset: ${colorize(snapshot.datasetName, ansi2.bold)}`);
  console.log(
  `Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
  );
  console.log(
- `Total test cases: ${colorize(String(snapshot.totalTestCases), ansi.bold)}`
+ `Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`
  );
  console.log("");
  drawSpinner();
@@ -1139,17 +1620,17 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
  throw new Error(`Run failed: ${finalEvent.errorMessage}`);
  }
  console.log("");
- console.log(colorize("=== Run Summary ===", `${ansi.bold}${ansi.cyan}`));
+ console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
  console.log(
  `- passed: ${colorize(
  `${finalEvent.passedTestCases}/${finalEvent.totalTestCases}`,
- ansi.green
+ ansi2.green
  )}`
  );
  console.log(
  `- failed: ${colorize(
  `${finalEvent.failedTestCases}/${finalEvent.totalTestCases}`,
- finalEvent.failedTestCases > 0 ? ansi.red : ansi.dim
+ finalEvent.failedTestCases > 0 ? ansi2.red : ansi2.dim
  )}`
  );
  if (overallScoreCount > 0) {
@@ -1158,22 +1639,22 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
  `- overall avg score: ${colorize(
  overallAverage.toFixed(2),
  scoreToColor(overallAverage)
- )} ${colorize(createBar(overallAverage), ansi.dim)}`
+ )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
  );
  }
- console.log(colorize("- evaluator averages:", ansi.magenta));
+ console.log(colorize("- evaluator averages:", ansi2.magenta));
  for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
  console.log(
  getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
  );
  }
  if (testCaseSummaries.length > 0) {
- console.log(colorize("- test case scores:", ansi.magenta));
+ console.log(colorize("- test case scores:", ansi2.magenta));
  for (const summary of testCaseSummaries) {
- const status = summary.passed ? colorize("PASS", ansi.green) : colorize("FAIL", ansi.red);
+ const status = summary.passed ? colorize("PASS", ansi2.green) : colorize("FAIL", ansi2.red);
  if (summary.averageScore === void 0) {
  console.log(
- ` ${status} ${summary.name.padEnd(24)} score=n/a ${colorize(`(${summary.durationMs}ms)`, ansi.dim)}`
+ ` ${status} ${summary.name.padEnd(24)} score=n/a ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
  );
  continue;
  }
@@ -1181,11 +1662,30 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
  summary.averageScore.toFixed(2),
  scoreToColor(summary.averageScore)
- )} ${colorize(createBar(summary.averageScore, 100, 14), ansi.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi.dim)}`
+ )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
  );
  }
  }
- console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi.dim)}`);
+ console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
+ }
+ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
+ return new Promise((resolve4, reject) => {
+ const app = ink.render(
+ React2__default.default.createElement(RunView, {
+ runner,
+ datasetName,
+ evaluatorPattern,
+ onComplete: (err) => {
+ app.unmount();
+ if (err) {
+ reject(err);
+ } else {
+ resolve4();
+ }
+ }
+ })
+ );
+ });
  }
 
  // src/cli-simple/index.ts
@@ -1210,17 +1710,28 @@ async function main() {
  console.error("Missing required --dataset <datasetName> argument.");
  printUsageAndExit(1);
  }
+ if (args.command === "run" && !args.evaluatorPattern) {
+ console.error("Missing required --evaluator <name-or-pattern> argument.");
+ printUsageAndExit(1);
+ }
+ const useInk = process.stdout.isTTY === true;
+ if (!useInk) {
+ printBanner();
+ }
  const runner = createRunner();
  try {
  if (args.command === "run") {
- if (!args.evaluatorPattern) {
- console.error("Missing required --evaluator <name-or-pattern> argument.");
- printUsageAndExit(1);
- }
- await runSimpleEvalCommand(runner, args.datasetName, args.evaluatorPattern);
+ await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
+ runner,
+ args.datasetName,
+ args.evaluatorPattern
+ );
  return;
  }
- await generateDatasetJsonCommand(runner, args.datasetName);
+ await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
+ runner,
+ args.datasetName
+ );
  } finally {
  await runner.shutdown();
  }