@m4trix/evals 0.9.1 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,9 +8,14 @@ var path = require('path');
8
8
  var jitiModule = require('jiti');
9
9
  var promises = require('fs/promises');
10
10
  var url = require('url');
11
- require('json-diff');
11
+ var jsonDiff = require('json-diff');
12
+ var React2 = require('react');
13
+ var ink = require('ink');
14
+ var jsxRuntime = require('react/jsx-runtime');
12
15
 
13
16
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
17
/**
 * Bundler interop helper.
 *
 * Values flagged with `__esModule` are returned unchanged; any other value
 * (including falsy ones) is wrapped so it is reachable as `.default`.
 */
function _interopDefault(e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
18
+
14
19
  function _interopNamespace(e) {
15
20
  if (e && e.__esModule) return e;
16
21
  var n = Object.create(null);
@@ -30,6 +35,7 @@ function _interopNamespace(e) {
30
35
  }
31
36
 
32
37
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
38
+ var React2__default = /*#__PURE__*/_interopDefault(React2);
33
39
 
34
40
  // src/runner/config.ts
35
41
  var defaultRunnerConfig = {
@@ -276,6 +282,29 @@ async function collectTestCasesFromFiles(config) {
276
282
  );
277
283
  return found.flat();
278
284
  }
285
/**
 * Builds a log entry of type "diff" for an expected/actual pair.
 *
 * The textual diff is rendered without colors; when the renderer returns an
 * empty/falsy string the placeholder "(no differences)" is stored instead.
 *
 * @param {*} expected - Reference value.
 * @param {*} actual - Value that was produced.
 * @param {{ label?: string }} [options] - Optional label copied onto the entry.
 * @returns {{ type: "diff", label: (string|undefined), expected: *, actual: *, diff: string }}
 */
function createDiffLogEntry(expected, actual, options) {
  const rendered = jsonDiff.diffString(expected, actual, { color: false });
  const entry = {
    type: "diff",
    label: options?.label,
    expected,
    actual,
    diff: rendered || "(no differences)"
  };
  return entry;
}
295
/**
 * Splits a diff log entry into lines tagged as "remove", "add" or "context".
 *
 * The diff text stored on the entry is reused when present, so the diff is
 * not recomputed on every call; otherwise it is rendered from
 * `entry.expected` / `entry.actual` without colors.
 *
 * Classification looks at the first column only. json-diff writes its
 * marker (" ", "+", "-") as the first character of each line, so trimming
 * leading whitespace first would mislabel context lines whose content begins
 * with "-" or "+" (for example a key such as `-k`).
 *
 * @param {{ expected: *, actual: *, diff?: string }} entry - Diff log entry.
 * @returns {Array<{ type: "remove"|"add"|"context", line: string }>}
 */
function getDiffLines(entry) {
  const stored = typeof entry.diff === "string" && entry.diff.length > 0 ? entry.diff : void 0;
  const raw = stored ?? (jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)");
  return raw.split("\n").map((line) => {
    // The "---" / "+++" exclusions are kept from the previous implementation.
    if (line.startsWith("-") && !line.startsWith("---")) {
      return { type: "remove", line };
    }
    if (line.startsWith("+") && !line.startsWith("+++")) {
      return { type: "add", line };
    }
    return { type: "context", line };
  });
}
279
308
 
280
309
  // src/evals/metric.ts
281
310
  var registry = /* @__PURE__ */ new Map();
@@ -459,6 +488,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
459
488
  continue;
460
489
  }
461
490
  try {
491
+ const logs = [];
492
+ const logDiff = (expected, actual, options) => {
493
+ logs.push(createDiffLogEntry(expected, actual, options));
494
+ };
462
495
  const ctx = yield* effect.Effect.promise(
463
496
  () => Promise.resolve(evaluator.resolveContext())
464
497
  );
@@ -467,13 +500,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
467
500
  evaluateFn({
468
501
  input: testCaseItem.testCase.getInput(),
469
502
  ctx,
470
- output
503
+ output,
504
+ logDiff
471
505
  })
472
506
  )
473
507
  );
474
508
  const { scores, metrics } = normalizeResult(result);
475
509
  const passed = computeEvaluatorPassed(evaluator, result, scores);
476
- evaluatorScores.push({ evaluatorId, scores, passed, metrics });
510
+ evaluatorScores.push({
511
+ evaluatorId,
512
+ scores,
513
+ passed,
514
+ metrics,
515
+ logs: logs.length > 0 ? logs : void 0
516
+ });
477
517
  } catch (error) {
478
518
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
479
519
  evaluatorScores.push({
@@ -909,6 +949,88 @@ function printBanner() {
909
949
  ];
910
950
  console.log(lines.join("\n"));
911
951
  }
952
/**
 * Ink component rendering the CLI banner: package name, a separator dot and
 * the CLI name inside a rounded cyan box.
 */
function Banner() {
  const packageLabel = jsxRuntime.jsx(ink.Text, { color: "gray", children: "@m4trix/evals" });
  const separator = jsxRuntime.jsx(ink.Text, { color: "cyan", children: " \xB7 " });
  const cliLabel = jsxRuntime.jsx(ink.Text, { color: "gray", children: "eval-agents-simple" });
  return jsxRuntime.jsxs(ink.Box, {
    borderStyle: "round",
    borderColor: "cyan",
    paddingX: 1,
    paddingY: 0,
    children: [packageLabel, separator, cliLabel]
  });
}
959
/**
 * Ink view for the "generate" command.
 *
 * On mount it resolves the dataset by name, collects its test cases and
 * writes them as `<dataset file name>.cases.json` next to the dataset file.
 * `onComplete` is called with no argument about 200ms after a successful
 * write, or with an Error as soon as anything fails.
 *
 * @param {object} props
 * @param {object} props.runner - Runner used to resolve the dataset and collect its test cases.
 * @param {string} props.datasetName - Name of the dataset to export.
 * @param {(err?: Error) => void} props.onComplete - Completion callback.
 */
function GenerateView({
  runner,
  datasetName,
  onComplete
}) {
  const [result, setResult] = React2.useState(null);
  const [error, setError] = React2.useState(null);
  React2.useEffect(() => {
    let cancelled = false;
    async function run() {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        throw new Error(`Dataset "${datasetName}" not found.`);
      }
      const { writeFile: writeFile2 } = await import('fs/promises');
      const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
      const testCases = await runner.collectDatasetTestCases(dataset.id);
      const payload = testCases.map((item) => {
        const tc = item.testCase;
        return {
          name: tc.getName(),
          input: tc.getInput(),
          // Test cases are not required to expose getOutput().
          output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
        };
      });
      const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
      const parsed = parse2(absoluteDatasetPath);
      const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
      await writeFile2(
        outputPath,
        `${JSON.stringify(payload, null, 2)}\n`,
        "utf8"
      );
      if (!cancelled) {
        setResult({
          count: payload.length,
          datasetName: dataset.dataset.getName(),
          outputPath
        });
        // Short delay before signalling completion so the result is rendered first.
        setTimeout(() => onComplete(), 200);
      }
    }
    // Every failure (missing dataset, collection error, file write error) is
    // routed to onComplete so the caller's promise always settles instead of
    // hanging on an unhandled rejection.
    run().catch((err) => {
      const failure = err instanceof Error ? err : new Error(String(err));
      if (!cancelled) {
        setError(failure);
      }
      onComplete(failure);
    });
    return () => {
      cancelled = true;
    };
  }, [runner, datasetName, onComplete]);
  if (error) {
    return jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
      jsxRuntime.jsx(Banner, {}),
      jsxRuntime.jsx(ink.Text, { color: "red", children: error.message })
    ] });
  }
  return jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
    jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: jsxRuntime.jsx(Banner, {}) }),
    result && jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      jsxRuntime.jsxs(ink.Text, { color: "green", children: [
        "Generated ",
        result.count,
        ' test cases for dataset "',
        result.datasetName,
        '".'
      ] }),
      jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
        "Wrote ",
        result.outputPath
      ] })
    ] })
  ] });
}
1032
+
1033
+ // src/cli-simple/generate.ts
912
1034
  function readOutput2(testCase) {
913
1035
  if (typeof testCase.getOutput !== "function") {
914
1036
  return void 0;
@@ -919,7 +1041,7 @@ function createOutputPath(datasetFilePath) {
919
1041
  const parsed = path.parse(datasetFilePath);
920
1042
  return path.join(parsed.dir, `${parsed.name}.cases.json`);
921
1043
  }
922
- async function generateDatasetJsonCommand(runner, datasetName) {
1044
+ async function generateDatasetJsonCommandPlain(runner, datasetName) {
923
1045
  const dataset = await runner.resolveDatasetByName(datasetName);
924
1046
  if (!dataset) {
925
1047
  throw new Error(`Dataset "${datasetName}" not found.`);
@@ -937,6 +1059,393 @@ async function generateDatasetJsonCommand(runner, datasetName) {
937
1059
  console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
938
1060
  console.log(`Wrote ${outputPath}`);
939
1061
  }
1062
/**
 * Runs the "generate" command through the Ink UI.
 *
 * Renders GenerateView and settles once the view reports completion: the Ink
 * app is unmounted first, then the promise rejects with the reported error
 * or resolves with no value.
 *
 * @param {object} runner - Runner handed to the view.
 * @param {string} datasetName - Dataset to export.
 * @returns {Promise<void>}
 */
async function generateDatasetJsonCommandInk(runner, datasetName) {
  return new Promise((resolve4, reject) => {
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        reject(err);
        return;
      }
      resolve4();
    };
    const element = React2__default.default.createElement(GenerateView, {
      runner,
      datasetName,
      onComplete: handleComplete
    });
    const app = ink.render(element);
  });
}
1080
/**
 * Maps a percentage to a bar color: "green" from 70, "yellow" from 40,
 * otherwise "red".
 *
 * @param {number} pct - Percentage value.
 * @returns {"green"|"yellow"|"red"}
 */
function barColor(pct) {
  if (pct >= 70) {
    return "green";
  }
  return pct >= 40 ? "yellow" : "red";
}
1087
/**
 * Ink component rendering `label [████░░░░] value` as a single text line.
 *
 * The filled portion is proportional to `value / max`, with `value` clamped
 * to `[0, max]`. When `max` is not positive or the value is NaN the ratio is
 * treated as 0, so an empty bar of full width is drawn (previously the NaN
 * result collapsed the bar to `[]`).
 *
 * @param {object} props
 * @param {string} props.label - Text shown before the bar, padded to `labelWidth`.
 * @param {number} props.value - Value to plot; also passed to `format`.
 * @param {number} [props.max=100] - Value that corresponds to a full bar.
 * @param {number} [props.labelWidth=14] - Width the label is padded to.
 * @param {number} [props.barWidth=20] - Number of bar cells.
 * @param {(v: number) => string} [props.format] - Formatter for the trailing value.
 * @param {boolean} [props.colorByValue=true] - Color the bar via barColor().
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  // One guarded ratio drives both the percentage and the cell count.
  const ratio = max > 0 && !Number.isNaN(clamped) ? clamped / max : 0;
  const pct = ratio * 100;
  const filled = Math.max(0, Math.round(ratio * barWidth));
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return jsxRuntime.jsxs(ink.Text, { children: [
    jsxRuntime.jsx(ink.Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
      jsxRuntime.jsx(ink.Text, { color, children: filledBar }),
      jsxRuntime.jsx(ink.Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    jsxRuntime.jsx(ink.Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
1113
// Animation frames cycled by Spinner.
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];

/**
 * Ink component showing an animated spinner glyph followed by a label.
 * The frame advances every 100ms; the interval is cleared on unmount.
 *
 * @param {object} props
 * @param {string} [props.label="Running"] - Text shown after the glyph.
 */
function Spinner({ label = "Running" }) {
  const [frame, setFrame] = React2.useState(0);
  React2.useEffect(() => {
    const advance = () => {
      setFrame((current) => (current + 1) % FRAMES.length);
    };
    const intervalId = setInterval(advance, 100);
    return () => {
      clearInterval(intervalId);
    };
  }, []);
  const glyph = FRAMES[frame];
  return jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [glyph, " ", label] });
}
1128
/**
 * Maps a score to a display color: "green" from 80, "yellow" from 50,
 * otherwise "red".
 *
 * @param {number} score - Numeric score.
 * @returns {"green"|"yellow"|"red"}
 */
function scoreColor(score) {
  if (score >= 80) {
    return "green";
  }
  return score >= 50 ? "yellow" : "red";
}
1135
/**
 * Renders a fixed-width text bar made of filled and empty cells.
 *
 * `value` is clamped to `[0, max]`. When `max` is not positive or the value
 * is NaN the bar is drawn empty but still `width` cells wide (previously the
 * NaN result produced an empty string, breaking column alignment).
 *
 * @param {number} value - Value to plot.
 * @param {number} [max=100] - Value that corresponds to a full bar.
 * @param {number} [width=20] - Total number of cells.
 * @returns {string} Bar consisting of exactly `width` cells.
 */
function createBar(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  const ratio = max > 0 && !Number.isNaN(safe) ? safe / max : 0;
  const filled = Math.round(ratio * width);
  return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
}
1140
/**
 * Formats one score item for display.
 *
 * Without a registered score definition the numeric value is shown with two
 * decimals, or "n/a" when none can be derived. With a definition, its own
 * formatter is used; definitions whose display strategy is "bar" also get a
 * text bar appended when a finite number is available.
 *
 * @param {{ id: string, data: * }} item - Score item to format.
 * @param {*} scoreToColor2 - Unused; kept so the signature stays unchanged.
 * @returns {string}
 */
function formatScorePart(item, scoreToColor2) {
  const def = getScoreById(item.id);
  if (!def) {
    const fallback = toNumericScore(item.data);
    if (fallback === void 0) {
      return "n/a";
    }
    return `${fallback.toFixed(2)}`;
  }
  const formatted = def.format(item.data);
  if (def.displayStrategy !== "bar") {
    return formatted;
  }
  const data = item.data;
  const hasValueField = typeof data === "object" && data !== null && "value" in data;
  const numeric = hasValueField ? data.value : toNumericScore(data);
  const isDrawable = typeof numeric === "number" && Number.isFinite(numeric);
  return isDrawable ? `${formatted} ${createBar(numeric)}` : formatted;
}
1155
/**
 * Ink view for the "run" command.
 *
 * On mount it resolves the dataset and the evaluators matching
 * `evaluatorPattern`, subscribes to run events, starts the run and renders
 * per-test-case results as progress events arrive, followed by a summary
 * once the run completes.
 *
 * `onComplete` is called with no argument about 200ms after the summary is
 * set, or with an Error when the dataset/evaluators cannot be resolved, the
 * run reports failure, or any step throws. The run-event subscription is
 * always released, including on error paths.
 *
 * @param {object} props
 * @param {object} props.runner - Runner used to resolve, subscribe and execute.
 * @param {string} props.datasetName - Dataset to run.
 * @param {string} props.evaluatorPattern - Name or pattern selecting evaluators.
 * @param {(err?: Error) => void} props.onComplete - Completion callback.
 */
function RunView({
  runner,
  datasetName,
  evaluatorPattern,
  onComplete
}) {
  const [phase, setPhase] = React2.useState(
    "loading"
  );
  const [runInfo, setRunInfo] = React2.useState(null);
  const [testCases, setTestCases] = React2.useState([]);
  const [summary, setSummary] = React2.useState(null);
  const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
  const runEval = React2.useCallback(async () => {
    let unsubscribe;
    // Releases the run-event subscription at most once.
    const stopListening = () => {
      const release = unsubscribe;
      unsubscribe = void 0;
      release?.();
    };
    try {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        const known = await runner.collectDatasets();
        const available = known.map((item) => item.dataset.getName()).sort();
        onComplete(
          new Error(
            available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
          )
        );
        return;
      }
      const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
      if (evaluators.length === 0) {
        const known = await runner.collectEvaluators();
        const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
        onComplete(
          new Error(
            available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
          )
        );
        return;
      }
      const nameById = new Map(
        evaluators.map((item) => [
          item.id,
          item.evaluator.getName() ?? item.id
        ])
      );
      setEvaluatorNameById(nameById);
      const aggregates = /* @__PURE__ */ new Map();
      let overallScoreTotal = 0;
      let overallScoreCount = 0;
      // Subscribe before starting the run so no progress event is missed.
      const done = new Promise((resolve4) => {
        unsubscribe = runner.subscribeRunEvents((event) => {
          if (event.type === "TestCaseProgress") {
            const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
            const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
            for (const item of event.evaluatorScores) {
              const numeric = toNumericScoreFromScores(item.scores);
              // Only evaluators that produced a numeric score enter the aggregates.
              if (numeric !== void 0) {
                const current = aggregates.get(item.evaluatorId) ?? {
                  total: 0,
                  count: 0,
                  passed: 0,
                  failed: 0
                };
                aggregates.set(item.evaluatorId, {
                  total: current.total + numeric,
                  count: current.count + 1,
                  passed: current.passed + (item.passed ? 1 : 0),
                  failed: current.failed + (item.passed ? 0 : 1)
                });
                overallScoreTotal += numeric;
                overallScoreCount += 1;
              }
            }
            setTestCases((prev) => [
              ...prev,
              {
                name: event.testCaseName,
                completedTestCases: event.completedTestCases,
                totalTestCases: event.totalTestCases,
                durationMs: event.durationMs,
                passed: event.passed,
                averageScore,
                evaluatorScores: event.evaluatorScores.map((item) => ({
                  evaluatorId: item.evaluatorId,
                  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
                  scores: item.scores,
                  passed: item.passed,
                  metrics: item.metrics,
                  logs: item.logs
                }))
              }
            ]);
          }
          if (event.type === "RunCompleted" || event.type === "RunFailed") {
            stopListening();
            resolve4(event);
          }
        });
      });
      const snapshot = await runner.runDatasetWith({
        datasetId: dataset.id,
        evaluatorIds: evaluators.map((item) => item.id)
      });
      setRunInfo({
        runId: snapshot.runId,
        datasetName: snapshot.datasetName,
        evaluatorNames: evaluators.map(
          (e) => e.evaluator.getName() ?? e.id
        ),
        totalTestCases: snapshot.totalTestCases
      });
      setPhase("running");
      const finalEvent = await done;
      if (finalEvent.type === "RunFailed") {
        onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
        return;
      }
      setSummary({
        passedTestCases: finalEvent.passedTestCases,
        failedTestCases: finalEvent.failedTestCases,
        totalTestCases: finalEvent.totalTestCases,
        overallScoreTotal,
        overallScoreCount,
        aggregates: new Map(aggregates),
        artifactPath: finalEvent.artifactPath
      });
      setPhase("completed");
      // Short delay before signalling completion so the summary is rendered first.
      setTimeout(() => onComplete(), 200);
    } catch (err) {
      // Without this, a rejection from the runner would never reach
      // onComplete and the CLI promise would hang.
      onComplete(err instanceof Error ? err : new Error(String(err)));
    } finally {
      stopListening();
    }
  }, [runner, datasetName, evaluatorPattern, onComplete]);
  React2.useEffect(() => {
    void runEval();
  }, [runEval]);
  return jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
    jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: jsxRuntime.jsx(Banner, {}) }),
    runInfo && jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
      jsxRuntime.jsxs(ink.Text, { children: [
        jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run " }),
        jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
      ] }),
      jsxRuntime.jsxs(ink.Text, { children: [
        jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Dataset " }),
        runInfo.datasetName
      ] }),
      jsxRuntime.jsxs(ink.Text, { children: [
        jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Evaluators " }),
        runInfo.evaluatorNames.join(", ")
      ] }),
      jsxRuntime.jsxs(ink.Text, { children: [
        jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Test cases " }),
        runInfo.totalTestCases
      ] })
    ] }),
    phase === "running" && jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: jsxRuntime.jsx(
      Spinner,
      {
        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
      }
    ) }),
    testCases.length > 0 && jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
      jsxRuntime.jsxs(ink.Text, { children: [
        jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
          "[",
          tc.completedTestCases,
          "/",
          tc.totalTestCases,
          "]"
        ] }),
        " ",
        tc.name,
        jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
          " (",
          tc.durationMs,
          "ms)"
        ] })
      ] }),
      tc.evaluatorScores.map((item) => jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
        jsxRuntime.jsxs(ink.Text, { children: [
          item.evaluatorName,
          ":",
          " ",
          jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
          " ",
          item.scores.map((s) => jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
            formatScorePart(s),
            " "
          ] }, s.id)),
          item.metrics?.map((m) => {
            const def = getMetricById(m.id);
            if (!def)
              return null;
            const formatted = def.format(m.data);
            return jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
              "[",
              def.name ? `${def.name}: ` : "",
              formatted,
              "]",
              " "
            ] }, m.id);
          })
        ] }),
        // Diff logs are only shown for evaluators that failed.
        !item.passed && item.logs && item.logs.length > 0 && jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
          (log, logIdx) => log.type === "diff" ? jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => jsxRuntime.jsx(
            ink.Text,
            {
              color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
              children: line
            },
            lineIdx
          )) }, logIdx) : null
        ) })
      ] }, item.evaluatorId))
    ] }, i)) }),
    phase === "completed" && summary && jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
      jsxRuntime.jsxs(ink.Box, { marginTop: 1, children: [
        jsxRuntime.jsx(ink.Text, { color: "green", children: "passed" }),
        jsxRuntime.jsxs(ink.Text, { children: [
          " ",
          summary.passedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      jsxRuntime.jsxs(ink.Box, { children: [
        jsxRuntime.jsx(ink.Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
        jsxRuntime.jsxs(ink.Text, { children: [
          " ",
          summary.failedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      summary.overallScoreCount > 0 && jsxRuntime.jsx(ink.Box, { marginTop: 1, children: jsxRuntime.jsx(
        TextBar,
        {
          label: "overall avg",
          value: summary.overallScoreTotal / summary.overallScoreCount,
          barWidth: 20,
          format: (v) => v.toFixed(2)
        }
      ) }),
      jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
        jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
        Array.from(evaluatorNameById.entries()).map(([id, name]) => {
          const agg = summary.aggregates.get(id);
          if (!agg || agg.count === 0) {
            return jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
              "- ",
              name.padEnd(28),
              " no numeric scores"
            ] }, id);
          }
          const mean = agg.total / agg.count;
          return jsxRuntime.jsxs(ink.Text, { children: [
            "- ",
            name.padEnd(28),
            " avg=",
            jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
            " passed=",
            agg.passed,
            " failed=",
            agg.failed
          ] }, id);
        })
      ] }),
      jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
        jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
        testCases.map((tc, i) => jsxRuntime.jsxs(ink.Box, { children: [
          jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
          jsxRuntime.jsxs(ink.Text, { children: [
            " ",
            tc.name.padEnd(24)
          ] }),
          tc.averageScore !== void 0 ? jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
            jsxRuntime.jsxs(ink.Text, { color: scoreColor(tc.averageScore), children: [
              "score=",
              tc.averageScore.toFixed(2)
            ] }),
            jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
              " ",
              createBar(tc.averageScore, 100, 14)
            ] })
          ] }) : jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
          jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
            " (",
            tc.durationMs,
            "ms)"
          ] })
        ] }, i))
      ] }),
      jsxRuntime.jsx(ink.Box, { marginTop: 1, children: jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
        "artifact: ",
        summary.artifactPath
      ] }) })
    ] })
  ] });
}
940
1449
 
941
1450
  // src/cli-simple/run.ts
942
1451
  var ansi2 = {
@@ -968,7 +1477,7 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
968
1477
  const mean = aggregate.total / aggregate.count;
969
1478
  return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
970
1479
  }
971
- function createBar(value, max = 100, width = 20) {
1480
+ function createBar2(value, max = 100, width = 20) {
972
1481
  const safe = Math.max(0, Math.min(max, value));
973
1482
  const filled = Math.round(safe / max * width);
974
1483
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
@@ -991,7 +1500,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
991
1500
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
992
1501
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
993
1502
  scoreParts.push(
994
- `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar(numeric), ansi2.dim)}`
1503
+ `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
995
1504
  );
996
1505
  } else {
997
1506
  scoreParts.push(formatted);
@@ -1030,7 +1539,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1030
1539
  }
1031
1540
  return line;
1032
1541
  }
1033
- async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1542
+ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
1034
1543
  const dataset = await runner.resolveDatasetByName(datasetName);
1035
1544
  if (!dataset) {
1036
1545
  const known = await runner.collectDatasets();
@@ -1099,6 +1608,17 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1099
1608
  item.metrics
1100
1609
  )
1101
1610
  );
1611
+ if (!item.passed && item.logs && item.logs.length > 0) {
1612
+ for (const log of item.logs) {
1613
+ if (log.type === "diff") {
1614
+ const useColor = process.stdout.isTTY;
1615
+ for (const { type, line } of getDiffLines(log)) {
1616
+ const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
1617
+ console.log(colored);
1618
+ }
1619
+ }
1620
+ }
1621
+ }
1102
1622
  const numeric = toNumericScoreFromScores(item.scores);
1103
1623
  if (numeric !== void 0) {
1104
1624
  const current = aggregates.get(item.evaluatorId) ?? {
@@ -1177,7 +1697,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1177
1697
  `- overall avg score: ${colorize(
1178
1698
  overallAverage.toFixed(2),
1179
1699
  scoreToColor(overallAverage)
1180
- )} ${colorize(createBar(overallAverage), ansi2.dim)}`
1700
+ )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
1181
1701
  );
1182
1702
  }
1183
1703
  console.log(colorize("- evaluator averages:", ansi2.magenta));
@@ -1200,12 +1720,31 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1200
1720
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
1201
1721
  summary.averageScore.toFixed(2),
1202
1722
  scoreToColor(summary.averageScore)
1203
- )} ${colorize(createBar(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1723
+ )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1204
1724
  );
1205
1725
  }
1206
1726
  }
1207
1727
  console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
1208
1728
  }
1729
/**
 * Runs the "run" command through the Ink UI.
 *
 * Renders RunView and settles once the view reports completion: the Ink app
 * is unmounted first, then the promise rejects with the reported error or
 * resolves with no value.
 *
 * @param {object} runner - Runner handed to the view.
 * @param {string} datasetName - Dataset to run.
 * @param {string} evaluatorPattern - Name or pattern selecting evaluators.
 * @returns {Promise<void>}
 */
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
  return new Promise((resolve4, reject) => {
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        reject(err);
        return;
      }
      resolve4();
    };
    const element = React2__default.default.createElement(RunView, {
      runner,
      datasetName,
      evaluatorPattern,
      onComplete: handleComplete
    });
    const app = ink.render(element);
  });
}
1209
1748
 
1210
1749
  // src/cli-simple/index.ts
1211
1750
  function printUsageAndExit(exitCode) {
@@ -1233,14 +1772,24 @@ async function main() {
1233
1772
  console.error("Missing required --evaluator <name-or-pattern> argument.");
1234
1773
  printUsageAndExit(1);
1235
1774
  }
1236
- printBanner();
1775
+ const useInk = process.stdout.isTTY === true;
1776
+ if (!useInk) {
1777
+ printBanner();
1778
+ }
1237
1779
  const runner = createRunner();
1238
1780
  try {
1239
1781
  if (args.command === "run") {
1240
- await runSimpleEvalCommand(runner, args.datasetName, args.evaluatorPattern);
1782
+ await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
1783
+ runner,
1784
+ args.datasetName,
1785
+ args.evaluatorPattern
1786
+ );
1241
1787
  return;
1242
1788
  }
1243
- await generateDatasetJsonCommand(runner, args.datasetName);
1789
+ await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
1790
+ runner,
1791
+ args.datasetName
1792
+ );
1244
1793
  } finally {
1245
1794
  await runner.shutdown();
1246
1795
  }