@m4trix/evals 0.9.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,9 @@ import * as jitiModule from 'jiti';
7
7
  import { writeFile, mkdir, appendFile, readdir } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
9
  import 'json-diff';
10
+ import React2, { useState, useEffect, useCallback } from 'react';
11
+ import { render, Box, Text } from 'ink';
12
+ import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
10
13
 
11
14
  // src/runner/config.ts
12
15
  var defaultRunnerConfig = {
@@ -886,6 +889,88 @@ function printBanner() {
886
889
  ];
887
890
  console.log(lines.join("\n"));
888
891
  }
892
/**
 * Ink component that renders the CLI banner: the package name and the tool
 * name separated by a middle dot, inside a rounded cyan border.
 */
function Banner() {
  // The three text segments are laid out side by side by the enclosing Box.
  const segments = [
    jsx(Text, { color: "gray", children: "@m4trix/evals" }),
    jsx(Text, { color: "cyan", children: " \xB7 " }),
    jsx(Text, { color: "gray", children: "eval-agents-simple" })
  ];
  return jsxs(Box, {
    borderStyle: "round",
    borderColor: "cyan",
    paddingX: 1,
    paddingY: 0,
    children: segments
  });
}
899
/**
 * Ink component that exports a dataset's test cases to a `<dataset>.cases.json`
 * file next to the dataset file and renders the outcome.
 *
 * @param {object} props
 * @param {object} props.runner - Runner used to resolve the dataset and collect its test cases.
 * @param {string} props.datasetName - Name of the dataset to export.
 * @param {(err?: Error) => void} props.onComplete - Called with an Error on failure,
 *   or with no argument (after a 200ms delay) on success.
 */
function GenerateView({
  runner,
  datasetName,
  onComplete
}) {
  const [result, setResult] = useState(null);
  const [error, setError] = useState(null);
  useEffect(() => {
    let cancelled = false;
    // Does the actual work; resolves with the summary to display or rejects with the failure.
    async function generate() {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        throw new Error(`Dataset "${datasetName}" not found.`);
      }
      const { writeFile: writeFile2 } = await import('fs/promises');
      const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
      const testCases = await runner.collectDatasetTestCases(dataset.id);
      const payload = testCases.map(({ testCase }) => ({
        name: testCase.getName(),
        input: testCase.getInput(),
        // Not every test case exposes an expected output.
        output: typeof testCase.getOutput === "function" ? testCase.getOutput() : void 0
      }));
      const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
      const parsed = parse2(absoluteDatasetPath);
      const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
      await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
      return {
        count: payload.length,
        datasetName: dataset.dataset.getName(),
        outputPath
      };
    }
    // Two-argument then(): a throw inside the success handler must not be
    // re-reported through the failure handler.
    void generate().then(
      (summary) => {
        if (cancelled) {
          return;
        }
        setResult(summary);
        // Short delay before signalling completion so the success output can render.
        setTimeout(() => onComplete(), 200);
      },
      (cause) => {
        if (cancelled) {
          return;
        }
        // Every failure (lookup, collection, file write) reaches the caller,
        // so the command promise always settles instead of hanging.
        const failure = cause instanceof Error ? cause : new Error(String(cause));
        setError(failure);
        onComplete(failure);
      }
    );
    return () => {
      cancelled = true;
    };
  }, [runner, datasetName, onComplete]);
  if (error) {
    return jsxs(Box, { flexDirection: "column", padding: 1, children: [
      jsx(Banner, {}),
      jsx(Text, { color: "red", children: error.message })
    ] });
  }
  return jsxs(Box, { flexDirection: "column", padding: 1, children: [
    jsx(Box, { marginBottom: 1, children: jsx(Banner, {}) }),
    result && jsxs(Box, { flexDirection: "column", children: [
      jsxs(Text, { color: "green", children: [
        "Generated ",
        result.count,
        ' test cases for dataset "',
        result.datasetName,
        '".'
      ] }),
      jsxs(Text, { color: "gray", children: [
        "Wrote ",
        result.outputPath
      ] })
    ] })
  ] });
}
972
+
973
+ // src/cli-simple/generate.ts
889
974
  function readOutput2(testCase) {
890
975
  if (typeof testCase.getOutput !== "function") {
891
976
  return void 0;
@@ -896,7 +981,7 @@ function createOutputPath(datasetFilePath) {
896
981
  const parsed = parse(datasetFilePath);
897
982
  return join(parsed.dir, `${parsed.name}.cases.json`);
898
983
  }
899
- async function generateDatasetJsonCommand(runner, datasetName) {
984
+ async function generateDatasetJsonCommandPlain(runner, datasetName) {
900
985
  const dataset = await runner.resolveDatasetByName(datasetName);
901
986
  if (!dataset) {
902
987
  throw new Error(`Dataset "${datasetName}" not found.`);
@@ -914,6 +999,380 @@ async function generateDatasetJsonCommand(runner, datasetName) {
914
999
  console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
915
1000
  console.log(`Wrote ${outputPath}`);
916
1001
  }
1002
/**
 * Renders `GenerateView` with Ink and settles once the view reports completion.
 *
 * @param {object} runner - Runner forwarded to the view.
 * @param {string} datasetName - Dataset to export.
 * @returns {Promise<void>} Resolves on success; rejects with the error the view reports.
 */
async function generateDatasetJsonCommandInk(runner, datasetName) {
  return new Promise((resolve4, reject) => {
    // Unmount the Ink app first, then settle the promise.
    const finish = (err) => {
      app.unmount();
      if (err) {
        reject(err);
        return;
      }
      resolve4();
    };
    const element = React2.createElement(GenerateView, {
      runner,
      datasetName,
      onComplete: finish
    });
    const app = render(element);
  });
}
1020
/**
 * Maps a percentage to a bar color.
 *
 * @param {number} pct - Percentage value.
 * @returns {"green" | "yellow" | "red"} "green" at 70 and above, "yellow" at 40 and above, otherwise "red".
 */
function barColor(pct) {
  if (pct >= 70) {
    return "green";
  }
  return pct >= 40 ? "yellow" : "red";
}
1027
/**
 * Ink component that renders a labelled horizontal bar: `label [████░░░░] value`.
 *
 * @param {object} props
 * @param {string} props.label - Text shown before the bar, padded to `labelWidth`.
 * @param {number} props.value - Value to plot; clamped to `[0, max]` for the bar only.
 * @param {number} [props.max=100] - Value that corresponds to a full bar.
 * @param {number} [props.labelWidth=14] - Minimum width of the label column.
 * @param {number} [props.barWidth=20] - Number of cells in the bar.
 * @param {(v: number) => string} [props.format] - Formats the unclamped value shown after the bar.
 * @param {boolean} [props.colorByValue=true] - Color the bar and value by percentage.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  // One guarded ratio feeds both the percentage and the fill width, so a zero
  // `max` or a NaN value yields an empty bar of full width rather than `[]`.
  const rawRatio = clamped / max;
  const ratio = Number.isFinite(rawRatio) && rawRatio > 0 ? rawRatio : 0;
  const pct = ratio * 100;
  const filled = Math.round(ratio * barWidth);
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return jsxs(Text, { children: [
    jsx(Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? jsxs(Fragment, { children: [
      jsx(Text, { color, children: filledBar }),
      jsx(Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    jsx(Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
1053
// Braille glyphs cycled by Spinner, one per tick.
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];

/**
 * Ink component that shows an animated spinner glyph followed by a label.
 *
 * @param {object} props
 * @param {string} [props.label="Running"] - Text shown after the spinner glyph.
 */
function Spinner({ label = "Running" }) {
  const [frameIndex, setFrameIndex] = useState(0);
  useEffect(() => {
    // Advance one frame every 100ms, wrapping at the end of FRAMES.
    const advance = () => {
      setFrameIndex((current) => (current + 1) % FRAMES.length);
    };
    const intervalId = setInterval(advance, 100);
    return () => {
      clearInterval(intervalId);
    };
  }, []);
  const glyph = FRAMES[frameIndex];
  return jsxs(Text, { color: "cyan", children: [glyph, " ", label] });
}
1068
/**
 * Maps a score to a display color.
 *
 * @param {number} score - Score to classify.
 * @returns {"green" | "yellow" | "red"} "green" at 80 and above, "yellow" at 50 and above, otherwise "red".
 */
function scoreColor(score) {
  // Tiers are ordered from highest floor to lowest; the first match wins.
  const tiers = [
    [80, "green"],
    [50, "yellow"]
  ];
  const tier = tiers.find(([floor]) => score >= floor);
  return tier ? tier[1] : "red";
}
1075
/**
 * Builds a fixed-width text bar of filled (█) and empty (░) cells.
 *
 * @param {number} value - Value to plot; clamped to `[0, max]`.
 * @param {number} [max=100] - Value that corresponds to a full bar.
 * @param {number} [width=20] - Total number of cells.
 * @returns {string} A string of exactly `width` cells. A non-positive `max` or
 *   a NaN value yields an all-empty bar.
 */
function createBar(value, max = 100, width = 20) {
  const clamped = Math.max(0, Math.min(max, value));
  // Guard the division: 0/0 or NaN would otherwise make both repeat() counts
  // NaN and collapse the bar to an empty string.
  const rawRatio = clamped / max;
  const ratio = Number.isFinite(rawRatio) && rawRatio > 0 ? rawRatio : 0;
  const filled = Math.round(ratio * width);
  return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
}
1080
/**
 * Formats one score entry for display.
 *
 * With a registered score definition, its `format` output is used, followed by
 * a text bar when the definition's display strategy is "bar" and a finite
 * numeric value is available. Without a definition, the numeric score is shown
 * with two decimals, or "n/a" when there is none.
 *
 * @param {{ id: string, data: unknown }} item - Score entry to format.
 * @param {unknown} scoreToColor2 - Unused; kept for signature compatibility.
 * @returns {string} The display text.
 */
function formatScorePart(item, scoreToColor2) {
  const definition = getScoreById(item.id);
  if (definition) {
    const text = definition.format(item.data);
    if (definition.displayStrategy !== "bar") {
      return text;
    }
    // Prefer an explicit `value` field on object payloads over the generic conversion.
    const hasValueField = typeof item.data === "object" && item.data !== null && "value" in item.data;
    const barValue = hasValueField ? item.data.value : toNumericScore(item.data);
    const drawable = typeof barValue === "number" && Number.isFinite(barValue);
    return drawable ? `${text} ${createBar(barValue)}` : text;
  }
  const fallback = toNumericScore(item.data);
  return fallback === void 0 ? "n/a" : `${fallback.toFixed(2)}`;
}
1095
/**
 * Ink component that runs a dataset against the evaluators matching a pattern,
 * streams per-test-case results while the run is in progress, and renders a
 * summary once it completes.
 *
 * @param {object} props
 * @param {object} props.runner - Runner used to resolve, execute and observe the run.
 * @param {string} props.datasetName - Name of the dataset to run.
 * @param {string} props.evaluatorPattern - Name or pattern selecting the evaluators.
 * @param {(err?: Error) => void} props.onComplete - Called with an Error on failure,
 *   or with no argument (after a 200ms delay) on success.
 */
function RunView({
  runner,
  datasetName,
  evaluatorPattern,
  onComplete
}) {
  const [phase, setPhase] = useState(
    "loading"
  );
  const [runInfo, setRunInfo] = useState(null);
  const [testCases, setTestCases] = useState([]);
  const [summary, setSummary] = useState(null);
  // Lazy initializer: avoids allocating a throwaway Map on every render.
  const [evaluatorNameById, setEvaluatorNameById] = useState(() => /* @__PURE__ */ new Map());
  const runEval = useCallback(async () => {
    let unsubscribe;
    // Releases the run-event subscription at most once.
    const stopListening = () => {
      if (unsubscribe) {
        const stop = unsubscribe;
        unsubscribe = void 0;
        stop();
      }
    };
    // Performs the run. Resolves with an Error for expected failures (unknown
    // dataset, no matching evaluator, failed run) and with undefined on success.
    const execute = async () => {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        const known = await runner.collectDatasets();
        const available = known.map((item) => item.dataset.getName()).sort();
        return new Error(
          available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
        );
      }
      const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
      if (evaluators.length === 0) {
        const known = await runner.collectEvaluators();
        const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
        return new Error(
          available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
        );
      }
      const nameById = new Map(
        evaluators.map((item) => [
          item.id,
          item.evaluator.getName() ?? item.id
        ])
      );
      setEvaluatorNameById(nameById);
      const aggregates = /* @__PURE__ */ new Map();
      let overallScoreTotal = 0;
      let overallScoreCount = 0;
      // Subscribe before starting the run so no progress event is missed.
      const done = new Promise((resolve4) => {
        unsubscribe = runner.subscribeRunEvents((event) => {
          if (event.type === "TestCaseProgress") {
            const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
            const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
            for (const item of event.evaluatorScores) {
              const numeric = toNumericScoreFromScores(item.scores);
              if (numeric !== void 0) {
                const current = aggregates.get(item.evaluatorId) ?? {
                  total: 0,
                  count: 0,
                  passed: 0,
                  failed: 0
                };
                aggregates.set(item.evaluatorId, {
                  total: current.total + numeric,
                  count: current.count + 1,
                  passed: current.passed + (item.passed ? 1 : 0),
                  failed: current.failed + (item.passed ? 0 : 1)
                });
                overallScoreTotal += numeric;
                overallScoreCount += 1;
              }
            }
            setTestCases((prev) => [
              ...prev,
              {
                name: event.testCaseName,
                completedTestCases: event.completedTestCases,
                totalTestCases: event.totalTestCases,
                durationMs: event.durationMs,
                passed: event.passed,
                averageScore,
                evaluatorScores: event.evaluatorScores.map((item) => ({
                  evaluatorId: item.evaluatorId,
                  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
                  scores: item.scores,
                  passed: item.passed,
                  metrics: item.metrics
                }))
              }
            ]);
          }
          if (event.type === "RunCompleted" || event.type === "RunFailed") {
            stopListening();
            resolve4(event);
          }
        });
      });
      const snapshot = await runner.runDatasetWith({
        datasetId: dataset.id,
        evaluatorIds: evaluators.map((item) => item.id)
      });
      setRunInfo({
        runId: snapshot.runId,
        datasetName: snapshot.datasetName,
        evaluatorNames: evaluators.map(
          (e) => e.evaluator.getName() ?? e.id
        ),
        totalTestCases: snapshot.totalTestCases
      });
      setPhase("running");
      const finalEvent = await done;
      if (finalEvent.type === "RunFailed") {
        return new Error(`Run failed: ${finalEvent.errorMessage}`);
      }
      setSummary({
        passedTestCases: finalEvent.passedTestCases,
        failedTestCases: finalEvent.failedTestCases,
        totalTestCases: finalEvent.totalTestCases,
        overallScoreTotal,
        overallScoreCount,
        aggregates: new Map(aggregates),
        artifactPath: finalEvent.artifactPath
      });
      setPhase("completed");
      return void 0;
    };
    let failure;
    try {
      failure = await execute();
    } catch (cause) {
      // Unexpected rejections from the runner must still reach the caller;
      // otherwise the command promise never settles.
      failure = cause instanceof Error ? cause : new Error(String(cause));
    } finally {
      // Covers the path where the run fails before a terminal event arrives.
      stopListening();
    }
    if (failure) {
      onComplete(failure);
      return;
    }
    // Short delay before signalling completion so the summary can render.
    setTimeout(() => onComplete(), 200);
  }, [runner, datasetName, evaluatorPattern, onComplete]);
  useEffect(() => {
    void runEval();
  }, [runEval]);

  // Run header: id, dataset, evaluators and test-case count.
  const header = runInfo && jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
    jsxs(Text, { children: [
      jsx(Text, { color: "cyan", bold: true, children: "Run " }),
      jsx(Text, { color: "gray", children: runInfo.runId })
    ] }),
    jsxs(Text, { children: [
      jsx(Text, { color: "cyan", bold: true, children: "Dataset " }),
      runInfo.datasetName
    ] }),
    jsxs(Text, { children: [
      jsx(Text, { color: "cyan", bold: true, children: "Evaluators " }),
      runInfo.evaluatorNames.join(", ")
    ] }),
    jsxs(Text, { children: [
      jsx(Text, { color: "cyan", bold: true, children: "Test cases " }),
      runInfo.totalTestCases
    ] })
  ] });

  // Progress spinner, shown only while the run is in flight.
  const progress = phase === "running" && jsx(Box, { marginBottom: 1, children: jsx(
    Spinner,
    {
      label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
    }
  ) });

  // One entry per finished test case, with a line per evaluator.
  const caseLog = testCases.length > 0 && jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
    jsxs(Text, { children: [
      jsxs(Text, { color: "cyan", children: [
        "[",
        tc.completedTestCases,
        "/",
        tc.totalTestCases,
        "]"
      ] }),
      " ",
      tc.name,
      jsxs(Text, { color: "gray", children: [
        " (",
        tc.durationMs,
        "ms)"
      ] })
    ] }),
    tc.evaluatorScores.map((item) => jsx(Box, { marginLeft: 2, children: jsxs(Text, { children: [
      item.evaluatorName,
      ":",
      " ",
      jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
      " ",
      item.scores.map((s) => jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
        formatScorePart(s),
        " "
      ] }, s.id)),
      item.metrics?.map((m) => {
        const def = getMetricById(m.id);
        // Metrics without a registered definition are not displayed.
        if (!def) {
          return null;
        }
        const formatted = def.format(m.data);
        return jsxs(Text, { color: "gray", children: [
          "[",
          def.name ? `${def.name}: ` : "",
          formatted,
          "]",
          " "
        ] }, m.id);
      })
    ] }) }, item.evaluatorId))
  ] }, i)) });

  // Final summary: pass/fail counts, overall average, per-evaluator and per-case scores.
  const summaryView = phase === "completed" && summary && jsxs(Box, { flexDirection: "column", children: [
    jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
    jsxs(Box, { marginTop: 1, children: [
      jsx(Text, { color: "green", children: "passed" }),
      jsxs(Text, { children: [
        " ",
        summary.passedTestCases,
        "/",
        summary.totalTestCases
      ] })
    ] }),
    jsxs(Box, { children: [
      jsx(Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
      jsxs(Text, { children: [
        " ",
        summary.failedTestCases,
        "/",
        summary.totalTestCases
      ] })
    ] }),
    summary.overallScoreCount > 0 && jsx(Box, { marginTop: 1, children: jsx(
      TextBar,
      {
        label: "overall avg",
        value: summary.overallScoreTotal / summary.overallScoreCount,
        barWidth: 20,
        format: (v) => v.toFixed(2)
      }
    ) }),
    jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
      jsx(Text, { color: "magenta", children: "evaluator averages" }),
      Array.from(evaluatorNameById.entries()).map(([id, name]) => {
        const agg = summary.aggregates.get(id);
        if (!agg || agg.count === 0) {
          return jsxs(Text, { color: "gray", children: [
            "- ",
            name.padEnd(28),
            " no numeric scores"
          ] }, id);
        }
        const mean = agg.total / agg.count;
        return jsxs(Text, { children: [
          "- ",
          name.padEnd(28),
          " avg=",
          jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
          " passed=",
          agg.passed,
          " failed=",
          agg.failed
        ] }, id);
      })
    ] }),
    jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
      jsx(Text, { color: "magenta", children: "test case scores" }),
      testCases.map((tc, i) => jsxs(Box, { children: [
        jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
        jsxs(Text, { children: [
          " ",
          tc.name.padEnd(24)
        ] }),
        tc.averageScore !== void 0 ? jsxs(Fragment, { children: [
          jsxs(Text, { color: scoreColor(tc.averageScore), children: [
            "score=",
            tc.averageScore.toFixed(2)
          ] }),
          jsxs(Text, { color: "gray", children: [
            " ",
            createBar(tc.averageScore, 100, 14)
          ] })
        ] }) : jsx(Text, { color: "gray", children: "score=n/a" }),
        jsxs(Text, { color: "gray", children: [
          " (",
          tc.durationMs,
          "ms)"
        ] })
      ] }, i))
    ] }),
    jsx(Box, { marginTop: 1, children: jsxs(Text, { color: "gray", children: [
      "artifact: ",
      summary.artifactPath
    ] }) })
  ] });

  return jsxs(Box, { flexDirection: "column", padding: 1, children: [
    jsx(Box, { marginBottom: 1, children: jsx(Banner, {}) }),
    header,
    progress,
    caseLog,
    summaryView
  ] });
}
917
1376
 
918
1377
  // src/cli-simple/run.ts
919
1378
  var ansi2 = {
@@ -945,7 +1404,7 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
945
1404
  const mean = aggregate.total / aggregate.count;
946
1405
  return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
947
1406
  }
948
- function createBar(value, max = 100, width = 20) {
1407
+ function createBar2(value, max = 100, width = 20) {
949
1408
  const safe = Math.max(0, Math.min(max, value));
950
1409
  const filled = Math.round(safe / max * width);
951
1410
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
@@ -968,7 +1427,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
968
1427
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
969
1428
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
970
1429
  scoreParts.push(
971
- `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar(numeric), ansi2.dim)}`
1430
+ `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
972
1431
  );
973
1432
  } else {
974
1433
  scoreParts.push(formatted);
@@ -1007,7 +1466,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1007
1466
  }
1008
1467
  return line;
1009
1468
  }
1010
- async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1469
+ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
1011
1470
  const dataset = await runner.resolveDatasetByName(datasetName);
1012
1471
  if (!dataset) {
1013
1472
  const known = await runner.collectDatasets();
@@ -1154,7 +1613,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1154
1613
  `- overall avg score: ${colorize(
1155
1614
  overallAverage.toFixed(2),
1156
1615
  scoreToColor(overallAverage)
1157
- )} ${colorize(createBar(overallAverage), ansi2.dim)}`
1616
+ )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
1158
1617
  );
1159
1618
  }
1160
1619
  console.log(colorize("- evaluator averages:", ansi2.magenta));
@@ -1177,12 +1636,31 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1177
1636
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
1178
1637
  summary.averageScore.toFixed(2),
1179
1638
  scoreToColor(summary.averageScore)
1180
- )} ${colorize(createBar(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1639
+ )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1181
1640
  );
1182
1641
  }
1183
1642
  }
1184
1643
  console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
1185
1644
  }
1645
/**
 * Renders `RunView` with Ink and settles once the view reports completion.
 *
 * @param {object} runner - Runner forwarded to the view.
 * @param {string} datasetName - Dataset to run.
 * @param {string} evaluatorPattern - Name or pattern selecting the evaluators.
 * @returns {Promise<void>} Resolves on success; rejects with the error the view reports.
 */
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
  return new Promise((resolve4, reject) => {
    // Unmount the Ink app first, then settle the promise.
    const finish = (err) => {
      app.unmount();
      if (err) {
        reject(err);
        return;
      }
      resolve4();
    };
    const element = React2.createElement(RunView, {
      runner,
      datasetName,
      evaluatorPattern,
      onComplete: finish
    });
    const app = render(element);
  });
}
1186
1664
 
1187
1665
  // src/cli-simple/index.ts
1188
1666
  function printUsageAndExit(exitCode) {
@@ -1210,14 +1688,24 @@ async function main() {
1210
1688
  console.error("Missing required --evaluator <name-or-pattern> argument.");
1211
1689
  printUsageAndExit(1);
1212
1690
  }
1213
- printBanner();
1691
+ const useInk = process.stdout.isTTY === true;
1692
+ if (!useInk) {
1693
+ printBanner();
1694
+ }
1214
1695
  const runner = createRunner();
1215
1696
  try {
1216
1697
  if (args.command === "run") {
1217
- await runSimpleEvalCommand(runner, args.datasetName, args.evaluatorPattern);
1698
+ await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
1699
+ runner,
1700
+ args.datasetName,
1701
+ args.evaluatorPattern
1702
+ );
1218
1703
  return;
1219
1704
  }
1220
- await generateDatasetJsonCommand(runner, args.datasetName);
1705
+ await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
1706
+ runner,
1707
+ args.datasetName
1708
+ );
1221
1709
  } finally {
1222
1710
  await runner.shutdown();
1223
1711
  }