@m4trix/evals 0.9.1 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,10 @@ import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
7
7
  import { writeFile, mkdir, appendFile, readdir } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
- import 'json-diff';
9
+ import { diffString } from 'json-diff';
10
+ import React2, { useState, useEffect, useCallback } from 'react';
11
+ import { render, Box, Text } from 'ink';
12
+ import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
10
13
 
11
14
  // src/runner/config.ts
12
15
  var defaultRunnerConfig = {
@@ -253,6 +256,29 @@ async function collectTestCasesFromFiles(config) {
253
256
  );
254
257
  return found.flat();
255
258
  }
259
/**
 * Builds a log entry of type "diff" for a pair of values.
 *
 * The textual diff is rendered with `diffString` (colour disabled). When the
 * rendered text is empty/falsy the placeholder "(no differences)" is stored.
 *
 * @param {*} expected - Reference value.
 * @param {*} actual - Value to compare against `expected`.
 * @param {{ label?: string }} [options] - Optional settings; only `label` is read.
 * @returns {{ type: "diff", label: (string|undefined), expected: *, actual: *, diff: string }}
 */
function createDiffLogEntry(expected, actual, options) {
  const rendered = diffString(expected, actual, { color: false });
  const text = rendered ? rendered : "(no differences)";
  return {
    type: "diff",
    label: options?.label,
    expected,
    actual,
    diff: text
  };
}
269
/**
 * Splits the textual diff of a "diff" log entry into classified lines.
 *
 * The diff text already stored on the entry (`entry.diff`) is reused when it
 * is a non-empty string, so the diff is not recomputed on every call;
 * otherwise it is rendered from `entry.expected` / `entry.actual` with
 * `diffString` (colour disabled), falling back to "(no differences)".
 *
 * Classification looks at the FIRST character of the untrimmed line: the
 * change marker sits in column 0, so an indented context line whose content
 * happens to start with "-" or "+" (for example a negative number inside an
 * array) stays a context line. Lines starting with "---" / "+++" are treated
 * as context.
 *
 * @param {{ expected?: *, actual?: *, diff?: string }} entry - Diff log entry.
 * @returns {Array<{ type: "remove" | "add" | "context", line: string }>}
 *   One item per line of the diff text, with `line` left unmodified.
 */
function getDiffLines(entry) {
  const stored = typeof entry.diff === "string" && entry.diff.length > 0 ? entry.diff : void 0;
  const raw = stored ?? (diffString(entry.expected, entry.actual, { color: false }) || "(no differences)");
  return raw.split("\n").map((line) => {
    if (line.startsWith("-") && !line.startsWith("---")) {
      return { type: "remove", line };
    }
    if (line.startsWith("+") && !line.startsWith("+++")) {
      return { type: "add", line };
    }
    return { type: "context", line };
  });
}
256
282
 
257
283
  // src/evals/metric.ts
258
284
  var registry = /* @__PURE__ */ new Map();
@@ -436,6 +462,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
436
462
  continue;
437
463
  }
438
464
  try {
465
+ const logs = [];
466
+ const logDiff = (expected, actual, options) => {
467
+ logs.push(createDiffLogEntry(expected, actual, options));
468
+ };
439
469
  const ctx = yield* Effect.promise(
440
470
  () => Promise.resolve(evaluator.resolveContext())
441
471
  );
@@ -444,13 +474,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
444
474
  evaluateFn({
445
475
  input: testCaseItem.testCase.getInput(),
446
476
  ctx,
447
- output
477
+ output,
478
+ logDiff
448
479
  })
449
480
  )
450
481
  );
451
482
  const { scores, metrics } = normalizeResult(result);
452
483
  const passed = computeEvaluatorPassed(evaluator, result, scores);
453
- evaluatorScores.push({ evaluatorId, scores, passed, metrics });
484
+ evaluatorScores.push({
485
+ evaluatorId,
486
+ scores,
487
+ passed,
488
+ metrics,
489
+ logs: logs.length > 0 ? logs : void 0
490
+ });
454
491
  } catch (error) {
455
492
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
456
493
  evaluatorScores.push({
@@ -886,6 +923,88 @@ function printBanner() {
886
923
  ];
887
924
  console.log(lines.join("\n"));
888
925
  }
926
/**
 * Ink component rendering the CLI banner: a round, cyan-bordered box that
 * contains the package name, a separator dot and the tool name.
 *
 * @returns {*} The element produced by `jsxs(Box, ...)`.
 */
function Banner() {
  const packageLabel = /* @__PURE__ */ jsx(Text, { color: "gray", children: "@m4trix/evals" });
  const separator = /* @__PURE__ */ jsx(Text, { color: "cyan", children: " \xB7 " });
  const toolLabel = /* @__PURE__ */ jsx(Text, { color: "gray", children: "eval-agents-simple" });
  const boxProps = {
    borderStyle: "round",
    borderColor: "cyan",
    paddingX: 1,
    paddingY: 0,
    children: [packageLabel, separator, toolLabel]
  };
  return /* @__PURE__ */ jsxs(Box, boxProps);
}
933
/**
 * Ink view for the "generate" command.
 *
 * On mount it resolves the dataset by name, collects its test cases, writes
 * them as pretty-printed JSON to `<dataset file name>.cases.json` next to the
 * dataset file, shows a short success summary and then calls `onComplete()`
 * after 200 ms.
 *
 * Any failure (dataset not found, or a rejection from the runner / file
 * system) is reported through `onComplete(error)` so the caller's promise
 * always settles instead of leaving an unhandled rejection behind.
 *
 * @param {object} props
 * @param {object} props.runner - Object providing `resolveDatasetByName` and
 *   `collectDatasetTestCases`.
 * @param {string} props.datasetName - Name of the dataset to export.
 * @param {(err?: Error) => void} props.onComplete - Called once with an error
 *   on failure, or without arguments on success.
 * @returns {*} The rendered Ink element tree.
 */
function GenerateView({
  runner,
  datasetName,
  onComplete
}) {
  const [result, setResult] = useState(null);
  const [error, setError] = useState(null);
  useEffect(() => {
    // Set by the effect cleanup; guards state updates after unmount/re-run.
    let cancelled = false;
    async function run() {
      try {
        const dataset = await runner.resolveDatasetByName(datasetName);
        if (!dataset) {
          throw new Error(`Dataset "${datasetName}" not found.`);
        }
        const { writeFile: writeFile2 } = await import('fs/promises');
        const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
        const testCases = await runner.collectDatasetTestCases(dataset.id);
        const payload = testCases.map((item) => {
          const tc = item.testCase;
          return {
            name: tc.getName(),
            input: tc.getInput(),
            // getOutput is optional on a test case.
            output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
          };
        });
        const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
        const parsed = parse2(absoluteDatasetPath);
        const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
        await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
        if (!cancelled) {
          setResult({
            count: payload.length,
            datasetName: dataset.dataset.getName(),
            outputPath
          });
          // Short delay before signalling completion (the caller unmounts the app).
          setTimeout(() => onComplete(), 200);
        }
      } catch (err) {
        const failure = err instanceof Error ? err : new Error(String(err));
        if (!cancelled) {
          setError(failure);
        }
        // Always report the failure so the awaiting caller is released.
        onComplete(failure);
      }
    }
    void run();
    return () => {
      cancelled = true;
    };
  }, [runner, datasetName, onComplete]);
  if (error) {
    return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
      /* @__PURE__ */ jsx(Banner, {}),
      /* @__PURE__ */ jsx(Text, { color: "red", children: error.message })
    ] });
  }
  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
    result && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsxs(Text, { color: "green", children: [
        "Generated ",
        result.count,
        ' test cases for dataset "',
        result.datasetName,
        '".'
      ] }),
      /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
        "Wrote ",
        result.outputPath
      ] })
    ] })
  ] });
}
1006
+
1007
+ // src/cli-simple/generate.ts
889
1008
  function readOutput2(testCase) {
890
1009
  if (typeof testCase.getOutput !== "function") {
891
1010
  return void 0;
@@ -896,7 +1015,7 @@ function createOutputPath(datasetFilePath) {
896
1015
  const parsed = parse(datasetFilePath);
897
1016
  return join(parsed.dir, `${parsed.name}.cases.json`);
898
1017
  }
899
- async function generateDatasetJsonCommand(runner, datasetName) {
1018
+ async function generateDatasetJsonCommandPlain(runner, datasetName) {
900
1019
  const dataset = await runner.resolveDatasetByName(datasetName);
901
1020
  if (!dataset) {
902
1021
  throw new Error(`Dataset "${datasetName}" not found.`);
@@ -914,6 +1033,393 @@ async function generateDatasetJsonCommand(runner, datasetName) {
914
1033
  console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
915
1034
  console.log(`Wrote ${outputPath}`);
916
1035
  }
1036
/**
 * Runs the "generate" command through the Ink UI.
 *
 * Renders `GenerateView` and returns a promise that settles when the view
 * invokes its `onComplete` callback: the app is unmounted first, then the
 * promise rejects with the supplied error or resolves when none was given.
 *
 * @param {object} runner - Runner instance forwarded to `GenerateView`.
 * @param {string} datasetName - Dataset name forwarded to `GenerateView`.
 * @returns {Promise<void>}
 */
async function generateDatasetJsonCommandInk(runner, datasetName) {
  return new Promise((settle, fail) => {
    // `app` is assigned below; the callback only runs after render() returned.
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        fail(err);
        return;
      }
      settle();
    };
    const view = React2.createElement(GenerateView, {
      runner,
      datasetName,
      onComplete: handleComplete
    });
    const app = render(view);
  });
}
1054
/**
 * Maps a percentage to a bar colour name.
 *
 * @param {number} pct - Percentage value.
 * @returns {"green" | "yellow" | "red"} "green" for 70 and above, "yellow"
 *   for 40 and above, otherwise "red".
 */
function barColor(pct) {
  if (pct >= 70) {
    return "green";
  }
  return pct >= 40 ? "yellow" : "red";
}
1061
/**
 * Ink component rendering a labelled text progress bar:
 * `<label> [████░░░░] <formatted value>`.
 *
 * The fill ratio is derived once from the clamped value and reused for both
 * the colour percentage and the number of filled cells. When `max` is not
 * positive, or the ratio is not a finite number (e.g. a NaN value), the ratio
 * is treated as 0 so the bar is still drawn with `barWidth` empty cells.
 *
 * @param {object} props
 * @param {string} props.label - Text shown before the bar, padded to `labelWidth`.
 * @param {number} props.value - Value to display; clamped to [0, max] for the bar.
 * @param {number} [props.max=100] - Value that corresponds to a full bar.
 * @param {number} [props.labelWidth=14] - Width the label is padded to.
 * @param {number} [props.barWidth=20] - Number of cells in the bar.
 * @param {(v: number) => string} [props.format] - Formats the unclamped value
 *   shown after the bar; defaults to `String`.
 * @param {boolean} [props.colorByValue=true] - Colour the filled part and the
 *   value via `barColor`.
 * @returns {*} The rendered Ink element.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  const rawRatio = max > 0 ? clamped / max : 0;
  // Guard against NaN (non-numeric value, or Infinity / Infinity).
  const ratio = Number.isFinite(rawRatio) ? rawRatio : 0;
  const pct = ratio * 100;
  const filled = Math.round(ratio * barWidth);
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return /* @__PURE__ */ jsxs(Text, { children: [
    /* @__PURE__ */ jsx(Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? /* @__PURE__ */ jsxs(Fragment, { children: [
      /* @__PURE__ */ jsx(Text, { color, children: filledBar }),
      /* @__PURE__ */ jsx(Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    /* @__PURE__ */ jsx(Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
1087
// Glyphs cycled through by Spinner, one per animation step.
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];

/**
 * Ink component showing an animated spinner glyph followed by a label.
 *
 * A 100 ms interval advances the frame index (wrapping at the end of
 * `FRAMES`); the interval is cleared by the effect cleanup.
 *
 * @param {object} props
 * @param {string} [props.label="Running"] - Text shown after the glyph.
 * @returns {*} The rendered Ink element.
 */
function Spinner({ label = "Running" }) {
  const [frame, setFrame] = useState(0);
  useEffect(() => {
    const advance = () => {
      setFrame((current) => (current + 1) % FRAMES.length);
    };
    const timer = setInterval(advance, 100);
    return () => {
      clearInterval(timer);
    };
  }, []);
  const glyph = FRAMES[frame];
  return /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [glyph, " ", label] });
}
1102
/**
 * Maps a score to a colour name.
 *
 * @param {number} score - Score value.
 * @returns {"green" | "yellow" | "red"} "green" for 80 and above, "yellow"
 *   for 50 and above, otherwise "red".
 */
function scoreColor(score) {
  if (score >= 80) {
    return "green";
  }
  return score >= 50 ? "yellow" : "red";
}
1109
/**
 * Renders a fixed-width text bar made of filled ("█") and empty ("░") cells.
 *
 * `value` is clamped to [0, max] before the fill ratio is computed. When
 * `max` is not positive, or the ratio is not finite (e.g. a NaN value), the
 * bar is drawn completely empty instead of collapsing to an empty string.
 *
 * @param {number} value - Value to visualise.
 * @param {number} [max=100] - Value that corresponds to a full bar.
 * @param {number} [width=20] - Total number of cells.
 * @returns {string} The bar, `width` cells long for finite, non-negative widths.
 */
function createBar(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  const ratio = max > 0 ? safe / max : 0;
  const filled = Number.isFinite(ratio) ? Math.max(0, Math.round(ratio * width)) : 0;
  const empty = Math.max(0, width - filled);
  return "\u2588".repeat(filled) + "\u2591".repeat(empty);
}
1114
/**
 * Formats a single score item as display text.
 *
 * - No score definition registered for `item.id`: the numeric value from
 *   `toNumericScore` with two decimals, or "n/a" when there is none.
 * - Definition found: the definition's own formatting; when its
 *   `displayStrategy` is "bar" and a finite number is available (taken from
 *   `item.data.value` if present, else `toNumericScore`), a bar from
 *   `createBar` is appended after a space.
 *
 * @param {{ id: string, data: * }} item - Score item.
 * @param {*} [scoreToColor2] - Not used by this function.
 * @returns {string} The formatted score text.
 */
function formatScorePart(item, scoreToColor2) {
  const definition = getScoreById(item.id);
  if (definition) {
    const text = definition.format(item.data);
    if (definition.displayStrategy !== "bar") {
      return text;
    }
    const carriesValue = typeof item.data === "object" && item.data !== null && "value" in item.data;
    const barValue = carriesValue ? item.data.value : toNumericScore(item.data);
    const drawable = typeof barValue === "number" && Number.isFinite(barValue);
    return drawable ? `${text} ${createBar(barValue)}` : text;
  }
  const fallback = toNumericScore(item.data);
  if (fallback === void 0) {
    return "n/a";
  }
  return `${fallback.toFixed(2)}`;
}
1129
/**
 * Ink view for the "run" command.
 *
 * On mount it resolves the dataset and the evaluators matching
 * `evaluatorPattern`, subscribes to run events, starts the run and renders:
 * run metadata, a spinner while running, one block per finished test case
 * (diff logs are shown only for evaluators that did not pass) and, once the
 * run completed, a summary with overall/evaluator averages and per-test-case
 * scores.
 *
 * Every failure path (dataset or evaluator not found, a "RunFailed" event, or
 * a rejection thrown by the runner) ends in `onComplete(error)` and releases
 * the event subscription, so the awaiting caller is never left hanging. On
 * success `onComplete()` is called 200 ms after the summary is set.
 *
 * @param {object} props
 * @param {object} props.runner - Object providing `resolveDatasetByName`,
 *   `collectDatasets`, `resolveEvaluatorsByNamePattern`, `collectEvaluators`,
 *   `subscribeRunEvents` and `runDatasetWith`.
 * @param {string} props.datasetName - Name of the dataset to run.
 * @param {string} props.evaluatorPattern - Evaluator name or pattern.
 * @param {(err?: Error) => void} props.onComplete - Completion callback.
 * @returns {*} The rendered Ink element tree.
 */
function RunView({
  runner,
  datasetName,
  evaluatorPattern,
  onComplete
}) {
  const [phase, setPhase] = useState(
    "loading"
  );
  const [runInfo, setRunInfo] = useState(null);
  const [testCases, setTestCases] = useState([]);
  const [summary, setSummary] = useState(null);
  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
  const runEval = useCallback(async () => {
    let unsubscribe;
    // Releases the run-event subscription at most once.
    const stopListening = () => {
      const stop = unsubscribe;
      unsubscribe = void 0;
      stop?.();
    };
    try {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        const known = await runner.collectDatasets();
        const available = known.map((item) => item.dataset.getName()).sort();
        throw new Error(
          available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
        );
      }
      const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
      if (evaluators.length === 0) {
        const known = await runner.collectEvaluators();
        const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
        throw new Error(
          available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
        );
      }
      const nameById = new Map(
        evaluators.map((item) => [
          item.id,
          item.evaluator.getName() ?? item.id
        ])
      );
      setEvaluatorNameById(nameById);
      // Running totals, updated from TestCaseProgress events.
      const aggregates = /* @__PURE__ */ new Map();
      let overallScoreTotal = 0;
      let overallScoreCount = 0;
      // Subscribe before starting the run; resolves with the terminal event.
      const done = new Promise((resolve4) => {
        unsubscribe = runner.subscribeRunEvents((event) => {
          if (event.type === "TestCaseProgress") {
            const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
            const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
            for (const item of event.evaluatorScores) {
              const numeric = toNumericScoreFromScores(item.scores);
              if (numeric !== void 0) {
                const current = aggregates.get(item.evaluatorId) ?? {
                  total: 0,
                  count: 0,
                  passed: 0,
                  failed: 0
                };
                aggregates.set(item.evaluatorId, {
                  total: current.total + numeric,
                  count: current.count + 1,
                  passed: current.passed + (item.passed ? 1 : 0),
                  failed: current.failed + (item.passed ? 0 : 1)
                });
                overallScoreTotal += numeric;
                overallScoreCount += 1;
              }
            }
            setTestCases((prev) => [
              ...prev,
              {
                name: event.testCaseName,
                completedTestCases: event.completedTestCases,
                totalTestCases: event.totalTestCases,
                durationMs: event.durationMs,
                passed: event.passed,
                averageScore,
                evaluatorScores: event.evaluatorScores.map((item) => ({
                  evaluatorId: item.evaluatorId,
                  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
                  scores: item.scores,
                  passed: item.passed,
                  metrics: item.metrics,
                  logs: item.logs
                }))
              }
            ]);
          }
          if (event.type === "RunCompleted" || event.type === "RunFailed") {
            stopListening();
            resolve4(event);
          }
        });
      });
      const snapshot = await runner.runDatasetWith({
        datasetId: dataset.id,
        evaluatorIds: evaluators.map((item) => item.id)
      });
      setRunInfo({
        runId: snapshot.runId,
        datasetName: snapshot.datasetName,
        evaluatorNames: evaluators.map(
          (e) => e.evaluator.getName() ?? e.id
        ),
        totalTestCases: snapshot.totalTestCases
      });
      setPhase("running");
      const finalEvent = await done;
      if (finalEvent.type === "RunFailed") {
        throw new Error(`Run failed: ${finalEvent.errorMessage}`);
      }
      setSummary({
        passedTestCases: finalEvent.passedTestCases,
        failedTestCases: finalEvent.failedTestCases,
        totalTestCases: finalEvent.totalTestCases,
        overallScoreTotal,
        overallScoreCount,
        aggregates: new Map(aggregates),
        artifactPath: finalEvent.artifactPath
      });
      setPhase("completed");
      // Short delay before signalling completion (the caller unmounts the app).
      setTimeout(() => onComplete(), 200);
    } catch (error) {
      // Single failure exit: drop the subscription and report the error.
      stopListening();
      onComplete(error instanceof Error ? error : new Error(String(error)));
    }
  }, [runner, datasetName, evaluatorPattern, onComplete]);
  useEffect(() => {
    void runEval();
  }, [runEval]);
  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
    runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run " }),
        /* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
      ] }),
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Dataset " }),
        runInfo.datasetName
      ] }),
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Evaluators " }),
        runInfo.evaluatorNames.join(", ")
      ] }),
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Test cases " }),
        runInfo.totalTestCases
      ] })
    ] }),
    phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
      Spinner,
      {
        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
      }
    ) }),
    testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
          "[",
          tc.completedTestCases,
          "/",
          tc.totalTestCases,
          "]"
        ] }),
        " ",
        tc.name,
        /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
          " (",
          tc.durationMs,
          "ms)"
        ] })
      ] }),
      tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [
        /* @__PURE__ */ jsxs(Text, { children: [
          item.evaluatorName,
          ":",
          " ",
          /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
          " ",
          item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
            formatScorePart(s),
            " "
          ] }, s.id)),
          item.metrics?.map((m) => {
            const def = getMetricById(m.id);
            if (!def) {
              return null;
            }
            const formatted = def.format(m.data);
            return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
              "[",
              def.name ? `${def.name}: ` : "",
              formatted,
              "]",
              " "
            ] }, m.id);
          })
        ] }),
        // Diff logs are only rendered for evaluators that did not pass.
        !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
          (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
            Text,
            {
              color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
              children: line
            },
            lineIdx
          )) }, logIdx) : null
        ) })
      ] }, item.evaluatorId))
    ] }, i)) }),
    phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
      /* @__PURE__ */ jsxs(Box, { marginTop: 1, children: [
        /* @__PURE__ */ jsx(Text, { color: "green", children: "passed" }),
        /* @__PURE__ */ jsxs(Text, { children: [
          " ",
          summary.passedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      /* @__PURE__ */ jsxs(Box, { children: [
        /* @__PURE__ */ jsx(Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
        /* @__PURE__ */ jsxs(Text, { children: [
          " ",
          summary.failedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      summary.overallScoreCount > 0 && /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsx(
        TextBar,
        {
          label: "overall avg",
          value: summary.overallScoreTotal / summary.overallScoreCount,
          barWidth: 20,
          format: (v) => v.toFixed(2)
        }
      ) }),
      /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
        Array.from(evaluatorNameById.entries()).map(([id, name]) => {
          const agg = summary.aggregates.get(id);
          if (!agg || agg.count === 0) {
            return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
              "- ",
              name.padEnd(28),
              " no numeric scores"
            ] }, id);
          }
          const mean = agg.total / agg.count;
          return /* @__PURE__ */ jsxs(Text, { children: [
            "- ",
            name.padEnd(28),
            " avg=",
            /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
            " passed=",
            agg.passed,
            " failed=",
            agg.failed
          ] }, id);
        })
      ] }),
      /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
        testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { children: [
          /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
          /* @__PURE__ */ jsxs(Text, { children: [
            " ",
            tc.name.padEnd(24)
          ] }),
          tc.averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
            /* @__PURE__ */ jsxs(Text, { color: scoreColor(tc.averageScore), children: [
              "score=",
              tc.averageScore.toFixed(2)
            ] }),
            /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
              " ",
              createBar(tc.averageScore, 100, 14)
            ] })
          ] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
          /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
            " (",
            tc.durationMs,
            "ms)"
          ] })
        ] }, i))
      ] }),
      /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
        "artifact: ",
        summary.artifactPath
      ] }) })
    ] })
  ] });
}
917
1423
 
918
1424
  // src/cli-simple/run.ts
919
1425
  var ansi2 = {
@@ -945,7 +1451,7 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
945
1451
  const mean = aggregate.total / aggregate.count;
946
1452
  return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
947
1453
  }
948
- function createBar(value, max = 100, width = 20) {
1454
+ function createBar2(value, max = 100, width = 20) {
949
1455
  const safe = Math.max(0, Math.min(max, value));
950
1456
  const filled = Math.round(safe / max * width);
951
1457
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
@@ -968,7 +1474,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
968
1474
  const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
969
1475
  if (typeof numeric === "number" && Number.isFinite(numeric)) {
970
1476
  scoreParts.push(
971
- `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar(numeric), ansi2.dim)}`
1477
+ `${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
972
1478
  );
973
1479
  } else {
974
1480
  scoreParts.push(formatted);
@@ -1007,7 +1513,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
1007
1513
  }
1008
1514
  return line;
1009
1515
  }
1010
- async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1516
+ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
1011
1517
  const dataset = await runner.resolveDatasetByName(datasetName);
1012
1518
  if (!dataset) {
1013
1519
  const known = await runner.collectDatasets();
@@ -1076,6 +1582,17 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1076
1582
  item.metrics
1077
1583
  )
1078
1584
  );
1585
+ if (!item.passed && item.logs && item.logs.length > 0) {
1586
+ for (const log of item.logs) {
1587
+ if (log.type === "diff") {
1588
+ const useColor = process.stdout.isTTY;
1589
+ for (const { type, line } of getDiffLines(log)) {
1590
+ const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
1591
+ console.log(colored);
1592
+ }
1593
+ }
1594
+ }
1595
+ }
1079
1596
  const numeric = toNumericScoreFromScores(item.scores);
1080
1597
  if (numeric !== void 0) {
1081
1598
  const current = aggregates.get(item.evaluatorId) ?? {
@@ -1154,7 +1671,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1154
1671
  `- overall avg score: ${colorize(
1155
1672
  overallAverage.toFixed(2),
1156
1673
  scoreToColor(overallAverage)
1157
- )} ${colorize(createBar(overallAverage), ansi2.dim)}`
1674
+ )} ${colorize(createBar2(overallAverage), ansi2.dim)}`
1158
1675
  );
1159
1676
  }
1160
1677
  console.log(colorize("- evaluator averages:", ansi2.magenta));
@@ -1177,12 +1694,31 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
1177
1694
  ` ${status} ${summary.name.padEnd(24)} score=${colorize(
1178
1695
  summary.averageScore.toFixed(2),
1179
1696
  scoreToColor(summary.averageScore)
1180
- )} ${colorize(createBar(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1697
+ )} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
1181
1698
  );
1182
1699
  }
1183
1700
  }
1184
1701
  console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
1185
1702
  }
1703
/**
 * Runs the "run" command through the Ink UI.
 *
 * Renders `RunView` and returns a promise that settles when the view invokes
 * its `onComplete` callback: the app is unmounted first, then the promise
 * rejects with the supplied error or resolves when none was given.
 *
 * @param {object} runner - Runner instance forwarded to `RunView`.
 * @param {string} datasetName - Dataset name forwarded to `RunView`.
 * @param {string} evaluatorPattern - Evaluator pattern forwarded to `RunView`.
 * @returns {Promise<void>}
 */
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
  return new Promise((settle, fail) => {
    // `app` is assigned below; the callback only runs after render() returned.
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        fail(err);
        return;
      }
      settle();
    };
    const view = React2.createElement(RunView, {
      runner,
      datasetName,
      evaluatorPattern,
      onComplete: handleComplete
    });
    const app = render(view);
  });
}
1186
1722
 
1187
1723
  // src/cli-simple/index.ts
1188
1724
  function printUsageAndExit(exitCode) {
@@ -1210,14 +1746,24 @@ async function main() {
1210
1746
  console.error("Missing required --evaluator <name-or-pattern> argument.");
1211
1747
  printUsageAndExit(1);
1212
1748
  }
1213
- printBanner();
1749
+ const useInk = process.stdout.isTTY === true;
1750
+ if (!useInk) {
1751
+ printBanner();
1752
+ }
1214
1753
  const runner = createRunner();
1215
1754
  try {
1216
1755
  if (args.command === "run") {
1217
- await runSimpleEvalCommand(runner, args.datasetName, args.evaluatorPattern);
1756
+ await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
1757
+ runner,
1758
+ args.datasetName,
1759
+ args.evaluatorPattern
1760
+ );
1218
1761
  return;
1219
1762
  }
1220
- await generateDatasetJsonCommand(runner, args.datasetName);
1763
+ await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
1764
+ runner,
1765
+ args.datasetName
1766
+ );
1221
1767
  } finally {
1222
1768
  await runner.shutdown();
1223
1769
  }