@m4trix/evals 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,9 @@ import * as jitiModule from 'jiti';
7
7
  import { writeFile, mkdir, appendFile, readdir } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
9
  import 'json-diff';
10
+ import React2, { useState, useEffect, useCallback } from 'react';
11
+ import { render, Box, Text } from 'ink';
12
+ import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
10
13
 
11
14
  // src/runner/config.ts
12
15
  var defaultRunnerConfig = {
@@ -867,6 +870,107 @@ function getSimpleCliUsage() {
867
870
  ' "/score/i" regex literal'
868
871
  ].join("\n");
869
872
  }
873
+
874
+ // src/cli-simple/banner.ts
875
+ var ansi = {
876
+ reset: "\x1B[0m",
877
+ dim: "\x1B[2m",
878
+ cyan: "\x1B[36m"
879
+ };
880
+ function printBanner() {
881
+ const c = (s) => `${ansi.cyan}${s}${ansi.reset}`;
882
+ const d = (s) => `${ansi.dim}${s}${ansi.reset}`;
883
+ const lines = [
884
+ "",
885
+ ` ${c("\u256D\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256E")}`,
886
+ ` ${c("\u2502")} ${d("@m4trix/evals")} ${c("\xB7")} ${d("eval-agents-simple")} ${c("\u2502")}`,
887
+ ` ${c("\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256F")}`,
888
+ ""
889
+ ];
890
+ console.log(lines.join("\n"));
891
+ }
892
/**
 * Ink component rendering the product banner: a round, cyan-bordered box
 * holding the package name, a cyan dot separator and the CLI name.
 */
function Banner() {
  const packageLabel = /* @__PURE__ */ jsx(Text, { color: "gray", children: "@m4trix/evals" });
  const separator = /* @__PURE__ */ jsx(Text, { color: "cyan", children: " \xB7 " });
  const cliLabel = /* @__PURE__ */ jsx(Text, { color: "gray", children: "eval-agents-simple" });
  return /* @__PURE__ */ jsxs(Box, {
    borderStyle: "round",
    borderColor: "cyan",
    paddingX: 1,
    paddingY: 0,
    children: [packageLabel, separator, cliLabel]
  });
}
899
/**
 * Ink view for the "generate" command.
 *
 * On mount it resolves the dataset by name, collects its test cases, writes
 * them as pretty-printed JSON to `<dataset file name>.cases.json` next to the
 * dataset file, and then renders a short success summary.
 *
 * @param props.runner Runner used to resolve the dataset and collect its test cases.
 * @param props.datasetName Name of the dataset to export.
 * @param props.onComplete Called with no argument about 200ms after a successful
 *   write (leaving time for the final frame to render), or with an Error as soon
 *   as any step fails.
 */
function GenerateView({
  runner,
  datasetName,
  onComplete
}) {
  const [result, setResult] = useState(null);
  const [error, setError] = useState(null);
  useEffect(() => {
    // Set by the cleanup function; guards state updates after unmount or a dependency change.
    let cancelled = false;
    async function run() {
      try {
        const dataset = await runner.resolveDatasetByName(datasetName);
        if (!dataset) {
          throw new Error(`Dataset "${datasetName}" not found.`);
        }
        const { writeFile: writeFile2 } = await import('fs/promises');
        const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
        const testCases = await runner.collectDatasetTestCases(dataset.id);
        const payload = testCases.map((item) => {
          const tc = item.testCase;
          return {
            name: tc.getName(),
            input: tc.getInput(),
            // getOutput is optional on test cases, so probe for it before calling.
            output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
          };
        });
        const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
        const parsed = parse2(absoluteDatasetPath);
        const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
        await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
        if (!cancelled) {
          setResult({
            count: payload.length,
            datasetName: dataset.dataset.getName(),
            outputPath
          });
          setTimeout(() => onComplete(), 200);
        }
      } catch (cause) {
        // Every failure (missing dataset, runner error, file-system error) is
        // reported through onComplete so the caller's promise always settles
        // instead of hanging on an unhandled rejection.
        const failure = cause instanceof Error ? cause : new Error(String(cause));
        if (!cancelled) {
          setError(failure);
        }
        onComplete(failure);
      }
    }
    void run();
    return () => {
      cancelled = true;
    };
  }, [runner, datasetName, onComplete]);
  if (error) {
    return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
      /* @__PURE__ */ jsx(Banner, {}),
      /* @__PURE__ */ jsx(Text, { color: "red", children: error.message })
    ] });
  }
  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
    result && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsxs(Text, { color: "green", children: [
        "Generated ",
        result.count,
        ' test cases for dataset "',
        result.datasetName,
        '".'
      ] }),
      /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
        "Wrote ",
        result.outputPath
      ] })
    ] })
  ] });
}
972
+
973
+ // src/cli-simple/generate.ts
870
974
  function readOutput2(testCase) {
871
975
  if (typeof testCase.getOutput !== "function") {
872
976
  return void 0;
@@ -877,7 +981,7 @@ function createOutputPath(datasetFilePath) {
877
981
  const parsed = parse(datasetFilePath);
878
982
  return join(parsed.dir, `${parsed.name}.cases.json`);
879
983
  }
880
- async function generateDatasetJsonCommand(runner, datasetName) {
984
+ async function generateDatasetJsonCommandPlain(runner, datasetName) {
881
985
  const dataset = await runner.resolveDatasetByName(datasetName);
882
986
  if (!dataset) {
883
987
  throw new Error(`Dataset "${datasetName}" not found.`);
@@ -895,9 +999,383 @@ async function generateDatasetJsonCommand(runner, datasetName) {
895
999
  console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
896
1000
  console.log(`Wrote ${outputPath}`);
897
1001
  }
1002
/**
 * Runs the "generate" command through the Ink UI.
 *
 * Mounts GenerateView and returns a promise that settles when the view
 * reports completion: it rejects with the reported error, or resolves with
 * no value on success. The Ink app is unmounted before the promise settles.
 */
async function generateDatasetJsonCommandInk(runner, datasetName) {
  return new Promise((settle, fail) => {
    let inkApp;
    const handleComplete = (err) => {
      inkApp.unmount();
      if (err) {
        fail(err);
        return;
      }
      settle();
    };
    const view = React2.createElement(GenerateView, {
      runner,
      datasetName,
      onComplete: handleComplete
    });
    inkApp = render(view);
  });
}
1020
/**
 * Maps a percentage to a bar colour name: "green" at 70 or above,
 * "yellow" at 40 or above, otherwise "red".
 */
function barColor(pct) {
  return pct >= 70 ? "green" : pct >= 40 ? "yellow" : "red";
}
1027
/**
 * Ink component rendering `label [████░░░░] value` on a single line.
 *
 * @param props.label Text shown before the bar, right-padded to `labelWidth`.
 * @param props.value Value to plot; clamped to the range [0, max] for the bar only.
 * @param props.max Value that corresponds to a completely filled bar (default 100).
 * @param props.labelWidth Column width reserved for the label (default 14).
 * @param props.barWidth Number of cells in the bar (default 20).
 * @param props.format Formats the unclamped value shown after the bar.
 * @param props.colorByValue When true (default) the filled part and the value
 *   are coloured via barColor(percentage); otherwise the bar is left uncoloured
 *   and the value is white.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  // One guarded ratio feeds both the percentage and the fill count. Dividing
  // directly would produce NaN for max = 0 or a NaN value, and repeat(NaN)
  // returns "", which collapses the bar to zero width.
  const ratio = max > 0 && Number.isFinite(clamped) ? clamped / max : 0;
  const pct = ratio * 100;
  const filled = Math.round(ratio * barWidth);
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return /* @__PURE__ */ jsxs(Text, { children: [
    /* @__PURE__ */ jsx(Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? /* @__PURE__ */ jsxs(Fragment, { children: [
      /* @__PURE__ */ jsx(Text, { color, children: filledBar }),
      /* @__PURE__ */ jsx(Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    /* @__PURE__ */ jsx(Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
1053
// Braille glyphs cycled through by Spinner, one per tick.
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
/**
 * Ink component showing an animated cyan spinner glyph followed by a label.
 *
 * @param props.label Text shown after the glyph (default "Running").
 */
function Spinner({ label = "Running" }) {
  const [frame, setFrame] = useState(0);
  useEffect(() => {
    // Advance one frame every 100ms, wrapping at the end of FRAMES.
    const advance = () => {
      setFrame((current) => (current + 1) % FRAMES.length);
    };
    const timer = setInterval(advance, 100);
    return () => {
      clearInterval(timer);
    };
  }, []);
  const glyph = FRAMES[frame];
  return /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [glyph, " ", label] });
}
1068
/**
 * Maps a score to an Ink colour name: "green" at 80 or above,
 * "yellow" at 50 or above, otherwise "red".
 */
function scoreColor(score) {
  return score >= 80 ? "green" : score >= 50 ? "yellow" : "red";
}
1075
/**
 * Builds a fixed-width text bar of filled and empty block characters.
 *
 * @param value Value to plot; clamped to the range [0, max].
 * @param max Value that corresponds to a completely filled bar (default 100).
 * @param width Total number of cells in the bar (default 20).
 * @returns A string of `width` cells. When the fill ratio cannot be computed
 *   (max is 0 or negative, or value is NaN) the bar is rendered empty rather
 *   than collapsing to a zero-length string.
 */
function createBar(value, max = 100, width = 20) {
  const clamped = Math.max(0, Math.min(max, value));
  // Guard the division: 0 / 0 and NaN inputs would make `filled` NaN, and
  // repeat(NaN) silently returns "", which breaks column alignment.
  const ratio = max > 0 && Number.isFinite(clamped) ? clamped / max : 0;
  const filled = Math.round(ratio * width);
  return "\u2588".repeat(filled) + "\u2591".repeat(Math.max(0, width - filled));
}
1080
/**
 * Formats one score entry as plain text for the Ink run view.
 *
 * With no registered definition for `item.id`, falls back to the numeric
 * score with two decimals, or "n/a" when no numeric score is available.
 * With a definition, returns its formatted text, followed by a text bar when
 * the definition's display strategy is "bar" and a finite number is available.
 *
 * @param item Score entry carrying `id` and `data`.
 * @param scoreToColor2 Unused; kept so the signature stays unchanged.
 */
function formatScorePart(item, scoreToColor2) {
  const definition = getScoreById(item.id);
  if (definition) {
    const text = definition.format(item.data);
    if (definition.displayStrategy !== "bar") {
      return text;
    }
    const payload = item.data;
    // Prefer an explicit `value` field on object payloads; otherwise derive a number.
    const carriesValue = typeof payload === "object" && payload !== null && "value" in payload;
    const barValue = carriesValue ? payload.value : toNumericScore(payload);
    const plottable = typeof barValue === "number" && Number.isFinite(barValue);
    return plottable ? `${text} ${createBar(barValue)}` : text;
  }
  const fallback = toNumericScore(item.data);
  if (fallback === void 0) {
    return "n/a";
  }
  return `${fallback.toFixed(2)}`;
}
1095
/**
 * Ink view for the "run" command.
 *
 * On mount it resolves the dataset and the evaluators matching
 * `evaluatorPattern`, subscribes to run events, starts the run, and renders
 * live per-test-case results followed by a summary once the run completes.
 *
 * @param props.runner Runner used to resolve datasets/evaluators, start the run
 *   and subscribe to its events.
 * @param props.datasetName Name of the dataset to evaluate.
 * @param props.evaluatorPattern Name or pattern selecting the evaluators to run.
 * @param props.onComplete Called with no argument about 200ms after a successful
 *   run (leaving time for the final frame to render), or with an Error as soon
 *   as anything fails.
 */
function RunView({
  runner,
  datasetName,
  evaluatorPattern,
  onComplete
}) {
  const [phase, setPhase] = useState("loading");
  const [runInfo, setRunInfo] = useState(null);
  const [testCases, setTestCases] = useState([]);
  const [summary, setSummary] = useState(null);
  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
  const runEval = useCallback(async () => {
    let unsubscribe;
    // Releases the run-event subscription at most once; safe to call repeatedly.
    const stopListening = () => {
      const release = unsubscribe;
      unsubscribe = void 0;
      release?.();
    };
    let failure;
    try {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        const known = await runner.collectDatasets();
        const available = known.map((item) => item.dataset.getName()).sort();
        throw new Error(
          available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
        );
      }
      const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
      if (evaluators.length === 0) {
        const known = await runner.collectEvaluators();
        const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
        throw new Error(
          available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
        );
      }
      const nameById = new Map(
        evaluators.map((item) => [
          item.id,
          item.evaluator.getName() ?? item.id
        ])
      );
      setEvaluatorNameById(nameById);
      // Running totals, accumulated outside React state and copied into the summary at the end.
      const aggregates = /* @__PURE__ */ new Map();
      let overallScoreTotal = 0;
      let overallScoreCount = 0;
      // Subscribe before starting the run so no early progress event is missed.
      const done = new Promise((resolveDone) => {
        unsubscribe = runner.subscribeRunEvents((event) => {
          if (event.type === "TestCaseProgress") {
            const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
            const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
            for (const item of event.evaluatorScores) {
              const numeric = toNumericScoreFromScores(item.scores);
              if (numeric !== void 0) {
                const current = aggregates.get(item.evaluatorId) ?? {
                  total: 0,
                  count: 0,
                  passed: 0,
                  failed: 0
                };
                aggregates.set(item.evaluatorId, {
                  total: current.total + numeric,
                  count: current.count + 1,
                  passed: current.passed + (item.passed ? 1 : 0),
                  failed: current.failed + (item.passed ? 0 : 1)
                });
                overallScoreTotal += numeric;
                overallScoreCount += 1;
              }
            }
            setTestCases((prev) => [
              ...prev,
              {
                name: event.testCaseName,
                completedTestCases: event.completedTestCases,
                totalTestCases: event.totalTestCases,
                durationMs: event.durationMs,
                passed: event.passed,
                averageScore,
                evaluatorScores: event.evaluatorScores.map((item) => ({
                  evaluatorId: item.evaluatorId,
                  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
                  scores: item.scores,
                  passed: item.passed,
                  metrics: item.metrics
                }))
              }
            ]);
          }
          if (event.type === "RunCompleted" || event.type === "RunFailed") {
            stopListening();
            resolveDone(event);
          }
        });
      });
      const snapshot = await runner.runDatasetWith({
        datasetId: dataset.id,
        evaluatorIds: evaluators.map((item) => item.id)
      });
      setRunInfo({
        runId: snapshot.runId,
        datasetName: snapshot.datasetName,
        evaluatorNames: evaluators.map(
          (e) => e.evaluator.getName() ?? e.id
        ),
        totalTestCases: snapshot.totalTestCases
      });
      setPhase("running");
      const finalEvent = await done;
      if (finalEvent.type === "RunFailed") {
        throw new Error(`Run failed: ${finalEvent.errorMessage}`);
      }
      setSummary({
        passedTestCases: finalEvent.passedTestCases,
        failedTestCases: finalEvent.failedTestCases,
        totalTestCases: finalEvent.totalTestCases,
        overallScoreTotal,
        overallScoreCount,
        aggregates: new Map(aggregates),
        artifactPath: finalEvent.artifactPath
      });
      setPhase("completed");
    } catch (cause) {
      // Every failure, including rejections from the runner itself, is funnelled
      // to onComplete below so the caller's promise always settles.
      failure = cause instanceof Error ? cause : new Error(String(cause));
    } finally {
      // Covers the case where the run failed to start and no terminal event will arrive.
      stopListening();
    }
    if (failure) {
      onComplete(failure);
      return;
    }
    setTimeout(() => onComplete(), 200);
  }, [runner, datasetName, evaluatorPattern, onComplete]);
  useEffect(() => {
    void runEval();
  }, [runEval]);
  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
    // Run header: shown once the run has started.
    runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run " }),
        /* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
      ] }),
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Dataset " }),
        runInfo.datasetName
      ] }),
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Evaluators " }),
        runInfo.evaluatorNames.join(", ")
      ] }),
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Test cases " }),
        runInfo.totalTestCases
      ] })
    ] }),
    // Live progress indicator while the run is in flight.
    phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
      Spinner,
      {
        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
      }
    ) }),
    // One entry per finished test case, with a line per evaluator.
    testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
          "[",
          tc.completedTestCases,
          "/",
          tc.totalTestCases,
          "]"
        ] }),
        " ",
        tc.name,
        /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
          " (",
          tc.durationMs,
          "ms)"
        ] })
      ] }),
      tc.evaluatorScores.map((item) => /* @__PURE__ */ jsx(Box, { marginLeft: 2, children: /* @__PURE__ */ jsxs(Text, { children: [
        item.evaluatorName,
        ":",
        " ",
        /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
        " ",
        item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
          formatScorePart(s),
          " "
        ] }, s.id)),
        item.metrics?.map((m) => {
          const def = getMetricById(m.id);
          // Metrics without a registered definition are not rendered.
          if (!def)
            return null;
          const formatted = def.format(m.data);
          return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
            "[",
            def.name ? `${def.name}: ` : "",
            formatted,
            "]",
            " "
          ] }, m.id);
        })
      ] }) }, item.evaluatorId))
    ] }, i)) }),
    // Final summary, rendered only after a successful run.
    phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
      /* @__PURE__ */ jsxs(Box, { marginTop: 1, children: [
        /* @__PURE__ */ jsx(Text, { color: "green", children: "passed" }),
        /* @__PURE__ */ jsxs(Text, { children: [
          " ",
          summary.passedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      /* @__PURE__ */ jsxs(Box, { children: [
        /* @__PURE__ */ jsx(Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
        /* @__PURE__ */ jsxs(Text, { children: [
          " ",
          summary.failedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      summary.overallScoreCount > 0 && /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsx(
        TextBar,
        {
          label: "overall avg",
          value: summary.overallScoreTotal / summary.overallScoreCount,
          barWidth: 20,
          format: (v) => v.toFixed(2)
        }
      ) }),
      /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
        Array.from(evaluatorNameById.entries()).map(([id, name]) => {
          const agg = summary.aggregates.get(id);
          if (!agg || agg.count === 0) {
            return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
              "- ",
              name.padEnd(28),
              " no numeric scores"
            ] }, id);
          }
          const mean = agg.total / agg.count;
          return /* @__PURE__ */ jsxs(Text, { children: [
            "- ",
            name.padEnd(28),
            " avg=",
            /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
            " passed=",
            agg.passed,
            " failed=",
            agg.failed
          ] }, id);
        })
      ] }),
      /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
        testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { children: [
          /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
          /* @__PURE__ */ jsxs(Text, { children: [
            " ",
            tc.name.padEnd(24)
          ] }),
          tc.averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
            /* @__PURE__ */ jsxs(Text, { color: scoreColor(tc.averageScore), children: [
              "score=",
              tc.averageScore.toFixed(2)
            ] }),
            /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
              " ",
              createBar(tc.averageScore, 100, 14)
            ] })
          ] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
          /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
            " (",
            tc.durationMs,
            "ms)"
          ] })
        ] }, i))
      ] }),
      /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
        "artifact: ",
        summary.artifactPath
      ] }) })
    ] })
  ] });
}
898
1376
 
899
1377
  // src/cli-simple/run.ts
900
- var ansi = {
1378
+ var ansi2 = {
901
1379
  reset: "\x1B[0m",
902
1380
  bold: "\x1B[1m",
903
1381
  dim: "\x1B[2m",
@@ -908,16 +1386,16 @@ var ansi = {
908
1386
  magenta: "\x1B[35m"
909
1387
  };
910
1388
  function colorize(text, color) {
911
- return `${color}${text}${ansi.reset}`;
1389
+ return `${color}${text}${ansi2.reset}`;
912
1390
  }
913
1391
  function scoreToColor(score) {
914
1392
  if (score >= 80) {
915
- return ansi.green;
1393
+ return ansi2.green;
916
1394
  }
917
1395
  if (score >= 50) {
918
- return ansi.yellow;
1396
+ return ansi2.yellow;
919
1397
  }
920
- return ansi.red;
1398
+ return ansi2.red;
921
1399
  }
922
1400
  function getEvaluatorSummaryLine(evaluatorName, aggregate) {
923
1401
  if (!aggregate || aggregate.count === 0) {
@@ -926,13 +1404,13 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
926
1404
  const mean = aggregate.total / aggregate.count;
927
1405
  return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
928
1406
  }
929
- function createBar(value, max = 100, width = 20) {
1407
/**
 * Builds a fixed-width text bar of filled and empty block characters for the
 * plain-text (non-Ink) output.
 *
 * @param value Value to plot; clamped to the range [0, max].
 * @param max Value that corresponds to a completely filled bar (default 100).
 * @param width Total number of cells in the bar (default 20).
 * @returns A string of `width` cells. When the fill ratio cannot be computed
 *   (max is 0 or negative, or value is NaN) the bar is rendered empty rather
 *   than collapsing to a zero-length string.
 */
function createBar2(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  // Guard the division: 0 / 0 and NaN inputs would make `filled` NaN, and
  // repeat(NaN) silently returns "", which breaks column alignment.
  const ratio = max > 0 && Number.isFinite(safe) ? safe / max : 0;
  const filled = Math.round(ratio * width);
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(Math.max(0, width - filled))}`;
}
934
1412
  function formatEvaluatorScoreLine(name, scores, passed, metrics) {
935
- const passLabel = passed ? colorize("PASS", `${ansi.bold}${ansi.green}`) : colorize("FAIL", `${ansi.bold}${ansi.red}`);
1413
+ const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
936
1414
  const scoreParts = [];
937
1415
  for (const item of scores) {
938
1416
  const def = getScoreById(item.id);
@@ -949,7 +1427,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
949
1427
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
950
1428
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
951
1429
  scoreParts.push(
952
- `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar(numeric), ansi.dim)}`
1430
+ `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
953
1431
  );
954
1432
  } else {
955
1433
  scoreParts.push(formatted);
@@ -963,7 +1441,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
963
1441
  scoreParts.push(
964
1442
  colorize(
965
1443
  formatted,
966
- item.passed === true ? `${ansi.bold}${ansi.green}` : item.passed === false ? `${ansi.bold}${ansi.red}` : ansi.dim
1444
+ item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
967
1445
  )
968
1446
  );
969
1447
  break;
@@ -988,7 +1466,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
988
1466
  }
989
1467
  return line;
990
1468
  }
991
- async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1469
+ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
992
1470
  const dataset = await runner.resolveDatasetByName(datasetName);
993
1471
  if (!dataset) {
994
1472
  const known = await runner.collectDatasets();
@@ -1030,10 +1508,10 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1030
1508
  const frame = spinnerFrames[spinnerIndex % spinnerFrames.length];
1031
1509
  spinnerIndex += 1;
1032
1510
  process.stdout.write(
1033
- `\r${colorize(frame, ansi.cyan)} Running evaluations ${colorize(
1511
+ `\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
1034
1512
  `${completedCount}/${totalCount}`,
1035
- ansi.bold
1036
- )} ${colorize("(live)", ansi.dim)}`
1513
+ ansi2.bold
1514
+ )} ${colorize("(live)", ansi2.dim)}`
1037
1515
  );
1038
1516
  }
1039
1517
  let spinnerTimer;
@@ -1045,7 +1523,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1045
1523
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
1046
1524
  clearLine();
1047
1525
  console.log(
1048
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi.dim)}`
1526
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi2.dim)}`
1049
1527
  );
1050
1528
  for (const item of event.evaluatorScores) {
1051
1529
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
@@ -1096,14 +1574,14 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1096
1574
  evaluatorIds: evaluators.map((item) => item.id)
1097
1575
  });
1098
1576
  totalCount = snapshot.totalTestCases;
1099
- console.log(colorize("=== Eval Run Started ===", `${ansi.bold}${ansi.cyan}`));
1100
- console.log(`Run: ${colorize(snapshot.runId, ansi.cyan)}`);
1101
- console.log(`Dataset: ${colorize(snapshot.datasetName, ansi.bold)}`);
1577
+ console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
1578
+ console.log(`Run: ${colorize(snapshot.runId, ansi2.cyan)}`);
1579
+ console.log(`Dataset: ${colorize(snapshot.datasetName, ansi2.bold)}`);
1102
1580
  console.log(
1103
1581
  `Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
1104
1582
  );
1105
1583
  console.log(
1106
- `Total test cases: ${colorize(String(snapshot.totalTestCases), ansi.bold)}`
1584
+ `Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`
1107
1585
  );
1108
1586
  console.log("");
1109
1587
  drawSpinner();
@@ -1116,17 +1594,17 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1116
1594
  throw new Error(`Run failed: ${finalEvent.errorMessage}`);
1117
1595
  }
1118
1596
  console.log("");
1119
- console.log(colorize("=== Run Summary ===", `${ansi.bold}${ansi.cyan}`));
1597
+ console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
1120
1598
  console.log(
1121
1599
  `- passed: ${colorize(
1122
1600
  `${finalEvent.passedTestCases}/${finalEvent.totalTestCases}`,
1123
- ansi.green
1601
+ ansi2.green
1124
1602
  )}`
1125
1603
  );
1126
1604
  console.log(
1127
1605
  `- failed: ${colorize(
1128
1606
  `${finalEvent.failedTestCases}/${finalEvent.totalTestCases}`,
1129
- finalEvent.failedTestCases > 0 ? ansi.red : ansi.dim
1607
+ finalEvent.failedTestCases > 0 ? ansi2.red : ansi2.dim
1130
1608
  )}`
1131
1609
  );
1132
1610
  if (overallScoreCount > 0) {
@@ -1135,22 +1613,22 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1135
1613
  `- overall avg score: ${colorize(
1136
1614
  overallAverage.toFixed(2),
1137
1615
  scoreToColor(overallAverage)
1138
- )} ${colorize(createBar(overallAverage), ansi.dim)}`
1616
+ )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
1139
1617
  );
1140
1618
  }
1141
- console.log(colorize("- evaluator averages:", ansi.magenta));
1619
+ console.log(colorize("- evaluator averages:", ansi2.magenta));
1142
1620
  for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
1143
1621
  console.log(
1144
1622
  getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
1145
1623
  );
1146
1624
  }
1147
1625
  if (testCaseSummaries.length > 0) {
1148
- console.log(colorize("- test case scores:", ansi.magenta));
1626
+ console.log(colorize("- test case scores:", ansi2.magenta));
1149
1627
  for (const summary of testCaseSummaries) {
1150
- const status = summary.passed ? colorize("PASS", ansi.green) : colorize("FAIL", ansi.red);
1628
+ const status = summary.passed ? colorize("PASS", ansi2.green) : colorize("FAIL", ansi2.red);
1151
1629
  if (summary.averageScore === void 0) {
1152
1630
  console.log(
1153
- ` ${status} ${summary.name.padEnd(24)} score=n/a ${colorize(`(${summary.durationMs}ms)`, ansi.dim)}`
1631
+ ` ${status} ${summary.name.padEnd(24)} score=n/a ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1154
1632
  );
1155
1633
  continue;
1156
1634
  }
@@ -1158,11 +1636,30 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1158
1636
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
1159
1637
  summary.averageScore.toFixed(2),
1160
1638
  scoreToColor(summary.averageScore)
1161
- )} ${colorize(createBar(summary.averageScore, 100, 14), ansi.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi.dim)}`
1639
+ )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1162
1640
  );
1163
1641
  }
1164
1642
  }
1165
- console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi.dim)}`);
1643
+ console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
1644
+ }
1645
/**
 * Runs the "run" command through the Ink UI.
 *
 * Mounts RunView and returns a promise that settles when the view reports
 * completion: it rejects with the reported error, or resolves with no value
 * on success. The Ink app is unmounted before the promise settles.
 */
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
  return new Promise((settle, fail) => {
    let inkApp;
    const handleComplete = (err) => {
      inkApp.unmount();
      if (err) {
        fail(err);
        return;
      }
      settle();
    };
    const view = React2.createElement(RunView, {
      runner,
      datasetName,
      evaluatorPattern,
      onComplete: handleComplete
    });
    inkApp = render(view);
  });
}
1167
1664
 
1168
1665
  // src/cli-simple/index.ts
@@ -1187,17 +1684,28 @@ async function main() {
1187
1684
  console.error("Missing required --dataset <datasetName> argument.");
1188
1685
  printUsageAndExit(1);
1189
1686
  }
1687
+ if (args.command === "run" && !args.evaluatorPattern) {
1688
+ console.error("Missing required --evaluator <name-or-pattern> argument.");
1689
+ printUsageAndExit(1);
1690
+ }
1691
+ const useInk = process.stdout.isTTY === true;
1692
+ if (!useInk) {
1693
+ printBanner();
1694
+ }
1190
1695
  const runner = createRunner();
1191
1696
  try {
1192
1697
  if (args.command === "run") {
1193
- if (!args.evaluatorPattern) {
1194
- console.error("Missing required --evaluator <name-or-pattern> argument.");
1195
- printUsageAndExit(1);
1196
- }
1197
- await runSimpleEvalCommand(runner, args.datasetName, args.evaluatorPattern);
1698
+ await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
1699
+ runner,
1700
+ args.datasetName,
1701
+ args.evaluatorPattern
1702
+ );
1198
1703
  return;
1199
1704
  }
1200
- await generateDatasetJsonCommand(runner, args.datasetName);
1705
+ await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
1706
+ runner,
1707
+ args.datasetName
1708
+ );
1201
1709
  } finally {
1202
1710
  await runner.shutdown();
1203
1711
  }