@m4trix/evals 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,8 +9,13 @@ var jitiModule = require('jiti');
9
9
  var promises = require('fs/promises');
10
10
  var url = require('url');
11
11
  require('json-diff');
12
+ var React2 = require('react');
13
+ var ink = require('ink');
14
+ var jsxRuntime = require('react/jsx-runtime');
12
15
 
13
16
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
17
/**
 * Normalizes a required module so it can be read through a `default` property.
 *
 * @param {*} e - The value returned by `require`.
 * @returns {*} `e` itself when it is truthy and flagged with `__esModule`;
 *   otherwise a fresh `{ default: e }` wrapper (this includes falsy values).
 */
function _interopDefault (e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
18
+
14
19
  function _interopNamespace(e) {
15
20
  if (e && e.__esModule) return e;
16
21
  var n = Object.create(null);
@@ -30,6 +35,7 @@ function _interopNamespace(e) {
30
35
  }
31
36
 
32
37
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
38
+ var React2__default = /*#__PURE__*/_interopDefault(React2);
33
39
 
34
40
  // src/runner/config.ts
35
41
  var defaultRunnerConfig = {
@@ -909,6 +915,88 @@ function printBanner() {
909
915
  ];
910
916
  console.log(lines.join("\n"));
911
917
  }
918
/**
 * Ink component for the CLI banner: a rounded, cyan-bordered box containing
 * the package name, a middle-dot separator and the tool name.
 *
 * @returns {*} The element produced by the JSX runtime.
 */
function Banner() {
  const packageLabel = jsxRuntime.jsx(ink.Text, { color: "gray", children: "@m4trix/evals" });
  const separator = jsxRuntime.jsx(ink.Text, { color: "cyan", children: " \xB7 " });
  const toolLabel = jsxRuntime.jsx(ink.Text, { color: "gray", children: "eval-agents-simple" });
  return jsxRuntime.jsxs(ink.Box, {
    borderStyle: "round",
    borderColor: "cyan",
    paddingX: 1,
    paddingY: 0,
    children: [packageLabel, separator, toolLabel]
  });
}
925
/**
 * Ink component that exports a dataset's test cases to a JSON file and shows
 * the outcome.
 *
 * On mount it resolves the dataset by name, collects its test cases, and
 * writes `{ name, input, output }` records as pretty-printed JSON to
 * `<dataset file name>.cases.json` in the dataset file's directory.
 *
 * @param {object} props
 * @param {object} props.runner - Must provide `resolveDatasetByName` and
 *   `collectDatasetTestCases`.
 * @param {string} props.datasetName - Name of the dataset to export.
 * @param {(err?: Error) => void} props.onComplete - Called with an error on
 *   failure, or with no argument about 200ms after a successful write.
 * @returns {*} The element produced by the JSX runtime.
 */
function GenerateView({
  runner,
  datasetName,
  onComplete
}) {
  const [result, setResult] = React2.useState(null);
  const [error, setError] = React2.useState(null);
  React2.useEffect(() => {
    let cancelled = false;
    // Single failure path: show the error while still mounted and always
    // report it, so the caller's promise cannot be left pending.
    function fail(err) {
      const failure = err instanceof Error ? err : new Error(String(err));
      if (!cancelled) {
        setError(failure);
      }
      onComplete(failure);
    }
    async function run() {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        fail(new Error(`Dataset "${datasetName}" not found.`));
        return;
      }
      const { writeFile: writeFile2 } = await import('fs/promises');
      const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
      const testCases = await runner.collectDatasetTestCases(dataset.id);
      const payload = testCases.map((item) => {
        const tc = item.testCase;
        return {
          name: tc.getName(),
          input: tc.getInput(),
          // getOutput is optional on a test case.
          output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
        };
      });
      const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
      const parsed = parse2(absoluteDatasetPath);
      const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
      await writeFile2(
        outputPath,
        `${JSON.stringify(payload, null, 2)}\n`,
        "utf8"
      );
      if (!cancelled) {
        setResult({
          count: payload.length,
          datasetName: dataset.dataset.getName(),
          outputPath
        });
        // Short delay before completing, as in the original, so the result
        // is rendered before the caller unmounts the app.
        setTimeout(() => onComplete(), 200);
      }
    }
    // Rejections (resolution, collection or write failures) are routed to
    // `fail` instead of becoming unhandled promise rejections.
    run().catch(fail);
    return () => {
      cancelled = true;
    };
  }, [runner, datasetName, onComplete]);
  if (error) {
    return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
      /* @__PURE__ */ jsxRuntime.jsx(Banner, {}),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: error.message })
    ] });
  }
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
    result && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "green", children: [
        "Generated ",
        result.count,
        ' test cases for dataset "',
        result.datasetName,
        '".'
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
        "Wrote ",
        result.outputPath
      ] })
    ] })
  ] });
}
998
+
999
+ // src/cli-simple/generate.ts
912
1000
  function readOutput2(testCase) {
913
1001
  if (typeof testCase.getOutput !== "function") {
914
1002
  return void 0;
@@ -919,7 +1007,7 @@ function createOutputPath(datasetFilePath) {
919
1007
  const parsed = path.parse(datasetFilePath);
920
1008
  return path.join(parsed.dir, `${parsed.name}.cases.json`);
921
1009
  }
922
- async function generateDatasetJsonCommand(runner, datasetName) {
1010
+ async function generateDatasetJsonCommandPlain(runner, datasetName) {
923
1011
  const dataset = await runner.resolveDatasetByName(datasetName);
924
1012
  if (!dataset) {
925
1013
  throw new Error(`Dataset "${datasetName}" not found.`);
@@ -937,6 +1025,380 @@ async function generateDatasetJsonCommand(runner, datasetName) {
937
1025
  console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
938
1026
  console.log(`Wrote ${outputPath}`);
939
1027
  }
1028
/**
 * Renders `GenerateView` with Ink and settles once the view reports completion.
 *
 * @param {object} runner - Passed through to `GenerateView`.
 * @param {string} datasetName - Passed through to `GenerateView`.
 * @returns {Promise<void>} Resolves when the view completes without an error;
 *   rejects with the error the view passes to `onComplete`.
 */
async function generateDatasetJsonCommandInk(runner, datasetName) {
  return new Promise((settle, fail) => {
    // `app` is assigned below; this handler only runs after render returns.
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        fail(err);
        return;
      }
      settle();
    };
    const element = React2__default.default.createElement(GenerateView, {
      runner,
      datasetName,
      onComplete: handleComplete
    });
    const app = ink.render(element);
  });
}
1046
/**
 * Picks the colour name for a percentage bar.
 *
 * @param {number} pct - Percentage value.
 * @returns {"green"|"yellow"|"red"} "green" at 70 or above, "yellow" at 40 or
 *   above, otherwise "red" (including NaN, which fails both comparisons).
 */
function barColor(pct) {
  const tiers = [
    [70, "green"],
    [40, "yellow"]
  ];
  const match = tiers.find(([floor]) => pct >= floor);
  return match ? match[1] : "red";
}
1053
/**
 * Ink component rendering `label [████░░░░] value` as a single text line.
 *
 * @param {object} props
 * @param {string} props.label - Left-hand label, padded to `labelWidth`.
 * @param {number} props.value - Value to plot; clamped to `[0, max]` for the bar.
 * @param {number} [props.max=100] - Value that corresponds to a full bar.
 * @param {number} [props.labelWidth=14] - Minimum label width (`padEnd`).
 * @param {number} [props.barWidth=20] - Number of bar cells.
 * @param {(v: number) => string} [props.format] - Formats the unclamped value.
 * @param {boolean} [props.colorByValue=true] - Colour the bar via `barColor`.
 * @returns {*} The element produced by the JSX runtime.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  // A non-positive `max` or a NaN value has no meaningful ratio; treat it as
  // an empty bar so the bar keeps its full width instead of collapsing
  // (`"x".repeat(NaN)` silently yields "").
  const rawRatio = max > 0 ? clamped / max : 0;
  const ratio = Number.isNaN(rawRatio) ? 0 : rawRatio;
  const pct = ratio * 100;
  // Both counts are floored at 0 because String.prototype.repeat throws on
  // negative counts.
  const filled = Math.max(0, Math.round(ratio * barWidth));
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
    /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color, children: filledBar }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
1079
// Braille glyphs cycled by Spinner, one per animation step.
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];

/**
 * Ink component showing an animated glyph followed by a label, in cyan.
 * The glyph advances every 100ms; the interval is cleared on unmount.
 *
 * @param {object} props
 * @param {string} [props.label="Running"] - Text shown after the glyph.
 * @returns {*} The element produced by the JSX runtime.
 */
function Spinner({ label = "Running" }) {
  const [frame, setFrame] = React2.useState(0);
  React2.useEffect(() => {
    const advance = () => {
      setFrame((current) => (current + 1) % FRAMES.length);
    };
    const handle = setInterval(advance, 100);
    return () => {
      clearInterval(handle);
    };
  }, []);
  const glyph = FRAMES[frame];
  return jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [glyph, " ", label] });
}
1094
/**
 * Picks the colour name for a score.
 *
 * @param {number} score - Score value.
 * @returns {"green"|"yellow"|"red"} "green" at 80 or above, "yellow" at 50 or
 *   above, otherwise "red" (including NaN).
 */
function scoreColor(score) {
  return score >= 80 ? "green" : score >= 50 ? "yellow" : "red";
}
1101
/**
 * Builds a fixed-width text bar of filled (█) and empty (░) cells.
 *
 * @param {number} value - Value to plot; clamped to `[0, max]`.
 * @param {number} [max=100] - Value that corresponds to a full bar.
 * @param {number} [width=20] - Total number of cells.
 * @returns {string} The bar. A non-positive `max` or a NaN `value` gives an
 *   all-empty bar of `width` cells; a negative `width` gives "".
 */
function createBar(value, max = 100, width = 20) {
  // Guard the division: 0 / 0 would make the repeat counts NaN, which
  // `repeat` silently treats as 0 and the bar would vanish.
  const rawRatio = max > 0 ? Math.max(0, Math.min(max, value)) / max : 0;
  const ratio = Number.isNaN(rawRatio) ? 0 : rawRatio;
  // Counts are floored at 0 because String.prototype.repeat throws on
  // negative counts.
  const filled = Math.max(0, Math.round(ratio * width));
  const empty = Math.max(0, width - filled);
  return "\u2588".repeat(filled) + "\u2591".repeat(empty);
}
1106
/**
 * Formats one score entry for display.
 *
 * Without a registered score definition the numeric value (two decimals) or
 * "n/a" is returned. With a definition, its `format` output is returned,
 * followed by a text bar when the definition's display strategy is "bar" and
 * a finite number can be read from the data.
 *
 * @param {{ id: string, data: * }} item - Score entry.
 * @param {*} scoreToColor2 - Unused; kept for signature compatibility.
 * @returns {string} The formatted score text.
 */
function formatScorePart(item, scoreToColor2) {
  const definition = getScoreById(item.id);
  if (!definition) {
    const fallback = toNumericScore(item.data);
    if (fallback === void 0) {
      return "n/a";
    }
    return `${fallback.toFixed(2)}`;
  }
  const text = definition.format(item.data);
  if (definition.displayStrategy !== "bar") {
    return text;
  }
  // Prefer an explicit `value` field on object data; otherwise derive a number.
  const hasValueField = typeof item.data === "object" && item.data !== null && "value" in item.data;
  const barValue = hasValueField ? item.data.value : toNumericScore(item.data);
  const drawable = typeof barValue === "number" && Number.isFinite(barValue);
  return drawable ? `${text} ${createBar(barValue)}` : text;
}
1121
/**
 * Ink component that runs a dataset against the matching evaluators and
 * renders live progress followed by a run summary.
 *
 * Phases: "loading" (before the run starts), "running" (spinner plus one entry
 * per finished test case) and "completed" (summary with averages and the
 * artifact path).
 *
 * @param {object} props
 * @param {object} props.runner - Must provide `resolveDatasetByName`,
 *   `collectDatasets`, `resolveEvaluatorsByNamePattern`, `collectEvaluators`,
 *   `subscribeRunEvents` and `runDatasetWith`.
 * @param {string} props.datasetName - Dataset to run.
 * @param {string} props.evaluatorPattern - Evaluator name or pattern.
 * @param {(err?: Error) => void} props.onComplete - Called with an error when
 *   the run cannot start, fails or throws; called with no argument about
 *   200ms after a completed run.
 * @returns {*} The element produced by the JSX runtime.
 */
function RunView({
  runner,
  datasetName,
  evaluatorPattern,
  onComplete
}) {
  const [phase, setPhase] = React2.useState(
    "loading"
  );
  const [runInfo, setRunInfo] = React2.useState(null);
  const [testCases, setTestCases] = React2.useState([]);
  const [summary, setSummary] = React2.useState(null);
  const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
  const runEval = React2.useCallback(async () => {
    let unsubscribe;
    let finished = false;
    // Idempotent release of the run-event subscription.
    const stopListening = () => {
      if (unsubscribe) {
        const release = unsubscribe;
        unsubscribe = void 0;
        release();
      }
    };
    try {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        const known = await runner.collectDatasets();
        const available = known.map((item) => item.dataset.getName()).sort();
        onComplete(
          new Error(
            available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
          )
        );
        return;
      }
      const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
      if (evaluators.length === 0) {
        const known = await runner.collectEvaluators();
        const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
        onComplete(
          new Error(
            available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
          )
        );
        return;
      }
      const nameById = new Map(
        evaluators.map((item) => [
          item.id,
          item.evaluator.getName() ?? item.id
        ])
      );
      setEvaluatorNameById(nameById);
      // Totals are kept in plain locals and only copied into React state
      // once the run has finished.
      const aggregates = /* @__PURE__ */ new Map();
      let overallScoreTotal = 0;
      let overallScoreCount = 0;
      // Subscribe before starting the run so no event is missed.
      const done = new Promise((resolve4) => {
        unsubscribe = runner.subscribeRunEvents((event) => {
          if (event.type === "TestCaseProgress") {
            const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
            const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
            for (const item of event.evaluatorScores) {
              const numeric = toNumericScoreFromScores(item.scores);
              if (numeric !== void 0) {
                const current = aggregates.get(item.evaluatorId) ?? {
                  total: 0,
                  count: 0,
                  passed: 0,
                  failed: 0
                };
                aggregates.set(item.evaluatorId, {
                  total: current.total + numeric,
                  count: current.count + 1,
                  passed: current.passed + (item.passed ? 1 : 0),
                  failed: current.failed + (item.passed ? 0 : 1)
                });
                overallScoreTotal += numeric;
                overallScoreCount += 1;
              }
            }
            setTestCases((prev) => [
              ...prev,
              {
                name: event.testCaseName,
                completedTestCases: event.completedTestCases,
                totalTestCases: event.totalTestCases,
                durationMs: event.durationMs,
                passed: event.passed,
                averageScore,
                evaluatorScores: event.evaluatorScores.map((item) => ({
                  evaluatorId: item.evaluatorId,
                  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
                  scores: item.scores,
                  passed: item.passed,
                  metrics: item.metrics
                }))
              }
            ]);
          }
          if (event.type === "RunCompleted" || event.type === "RunFailed") {
            finished = true;
            stopListening();
            resolve4(event);
          }
        });
        // Covers a terminal event delivered synchronously while subscribing,
        // before `unsubscribe` had been assigned.
        if (finished) {
          stopListening();
        }
      });
      const snapshot = await runner.runDatasetWith({
        datasetId: dataset.id,
        evaluatorIds: evaluators.map((item) => item.id)
      });
      setRunInfo({
        runId: snapshot.runId,
        datasetName: snapshot.datasetName,
        evaluatorNames: evaluators.map(
          (e) => e.evaluator.getName() ?? e.id
        ),
        totalTestCases: snapshot.totalTestCases
      });
      setPhase("running");
      const finalEvent = await done;
      if (finalEvent.type === "RunFailed") {
        onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
        return;
      }
      setSummary({
        passedTestCases: finalEvent.passedTestCases,
        failedTestCases: finalEvent.failedTestCases,
        totalTestCases: finalEvent.totalTestCases,
        overallScoreTotal,
        overallScoreCount,
        aggregates: new Map(aggregates),
        artifactPath: finalEvent.artifactPath
      });
      setPhase("completed");
      // Short delay before completing, as in the original, so the summary
      // is rendered before the caller unmounts the app.
      setTimeout(() => onComplete(), 200);
    } catch (err) {
      // Any rejection from the runner would otherwise be unhandled and leave
      // the caller waiting forever; release the subscription and report it.
      stopListening();
      onComplete(err instanceof Error ? err : new Error(String(err)));
    }
  }, [runner, datasetName, evaluatorPattern, onComplete]);
  React2.useEffect(() => {
    void runEval();
  }, [runEval]);
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
    runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run " }),
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Dataset " }),
        runInfo.datasetName
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Evaluators " }),
        runInfo.evaluatorNames.join(", ")
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Test cases " }),
        runInfo.totalTestCases
      ] })
    ] }),
    phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
      Spinner,
      {
        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
      }
    ) }),
    testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
          "[",
          tc.completedTestCases,
          "/",
          tc.totalTestCases,
          "]"
        ] }),
        " ",
        tc.name,
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
          " (",
          tc.durationMs,
          "ms)"
        ] })
      ] }),
      tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        item.evaluatorName,
        ":",
        " ",
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
        " ",
        item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
          formatScorePart(s),
          " "
        ] }, s.id)),
        item.metrics?.map((m) => {
          const def = getMetricById(m.id);
          // Metrics without a registered definition are not rendered.
          if (!def)
            return null;
          const formatted = def.format(m.data);
          return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
            "[",
            def.name ? `${def.name}: ` : "",
            formatted,
            "]",
            " "
          ] }, m.id);
        })
      ] }) }, item.evaluatorId))
    ] }, i)) }),
    phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "green", children: "passed" }),
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
          " ",
          summary.passedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
          " ",
          summary.failedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      summary.overallScoreCount > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
        TextBar,
        {
          label: "overall avg",
          value: summary.overallScoreTotal / summary.overallScoreCount,
          barWidth: 20,
          format: (v) => v.toFixed(2)
        }
      ) }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
        Array.from(evaluatorNameById.entries()).map(([id, name]) => {
          const agg = summary.aggregates.get(id);
          if (!agg || agg.count === 0) {
            return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
              "- ",
              name.padEnd(28),
              " no numeric scores"
            ] }, id);
          }
          const mean = agg.total / agg.count;
          return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
            "- ",
            name.padEnd(28),
            " avg=",
            /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
            " passed=",
            agg.passed,
            " failed=",
            agg.failed
          ] }, id);
        })
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
        testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
          /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
          /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
            " ",
            tc.name.padEnd(24)
          ] }),
          tc.averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
            /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(tc.averageScore), children: [
              "score=",
              tc.averageScore.toFixed(2)
            ] }),
            /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
              " ",
              createBar(tc.averageScore, 100, 14)
            ] })
          ] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
          /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
            " (",
            tc.durationMs,
            "ms)"
          ] })
        ] }, i))
      ] }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
        "artifact: ",
        summary.artifactPath
      ] }) })
    ] })
  ] });
}
940
1402
 
941
1403
  // src/cli-simple/run.ts
942
1404
  var ansi2 = {
@@ -968,7 +1430,7 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
968
1430
  const mean = aggregate.total / aggregate.count;
969
1431
  return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
970
1432
  }
971
- function createBar(value, max = 100, width = 20) {
1433
+ function createBar2(value, max = 100, width = 20) {
972
1434
  const safe = Math.max(0, Math.min(max, value));
973
1435
  const filled = Math.round(safe / max * width);
974
1436
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
@@ -991,7 +1453,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
991
1453
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
992
1454
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
993
1455
  scoreParts.push(
994
- `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar(numeric), ansi2.dim)}`
1456
+ `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
995
1457
  );
996
1458
  } else {
997
1459
  scoreParts.push(formatted);
@@ -1030,7 +1492,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1030
1492
  }
1031
1493
  return line;
1032
1494
  }
1033
- async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1495
+ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
1034
1496
  const dataset = await runner.resolveDatasetByName(datasetName);
1035
1497
  if (!dataset) {
1036
1498
  const known = await runner.collectDatasets();
@@ -1177,7 +1639,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1177
1639
  `- overall avg score: ${colorize(
1178
1640
  overallAverage.toFixed(2),
1179
1641
  scoreToColor(overallAverage)
1180
- )} ${colorize(createBar(overallAverage), ansi2.dim)}`
1642
+ )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
1181
1643
  );
1182
1644
  }
1183
1645
  console.log(colorize("- evaluator averages:", ansi2.magenta));
@@ -1200,12 +1662,31 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1200
1662
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
1201
1663
  summary.averageScore.toFixed(2),
1202
1664
  scoreToColor(summary.averageScore)
1203
- )} ${colorize(createBar(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1665
+ )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1204
1666
  );
1205
1667
  }
1206
1668
  }
1207
1669
  console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
1208
1670
  }
1671
/**
 * Renders `RunView` with Ink and settles once the view reports completion.
 *
 * @param {object} runner - Passed through to `RunView`.
 * @param {string} datasetName - Passed through to `RunView`.
 * @param {string} evaluatorPattern - Passed through to `RunView`.
 * @returns {Promise<void>} Resolves when the view completes without an error;
 *   rejects with the error the view passes to `onComplete`.
 */
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
  return new Promise((settle, fail) => {
    // `app` is assigned below; this handler only runs after render returns.
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        fail(err);
        return;
      }
      settle();
    };
    const element = React2__default.default.createElement(RunView, {
      runner,
      datasetName,
      evaluatorPattern,
      onComplete: handleComplete
    });
    const app = ink.render(element);
  });
}
1209
1690
 
1210
1691
  // src/cli-simple/index.ts
1211
1692
  function printUsageAndExit(exitCode) {
@@ -1233,14 +1714,24 @@ async function main() {
1233
1714
  console.error("Missing required --evaluator <name-or-pattern> argument.");
1234
1715
  printUsageAndExit(1);
1235
1716
  }
1236
- printBanner();
1717
+ const useInk = process.stdout.isTTY === true;
1718
+ if (!useInk) {
1719
+ printBanner();
1720
+ }
1237
1721
  const runner = createRunner();
1238
1722
  try {
1239
1723
  if (args.command === "run") {
1240
- await runSimpleEvalCommand(runner, args.datasetName, args.evaluatorPattern);
1724
+ await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
1725
+ runner,
1726
+ args.datasetName,
1727
+ args.evaluatorPattern
1728
+ );
1241
1729
  return;
1242
1730
  }
1243
- await generateDatasetJsonCommand(runner, args.datasetName);
1731
+ await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
1732
+ runner,
1733
+ args.datasetName
1734
+ );
1244
1735
  } finally {
1245
1736
  await runner.shutdown();
1246
1737
  }