@m4trix/evals 0.9.1 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +561 -12
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +558 -12
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +24 -3
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +24 -3
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +23 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +25 -13
- package/dist/index.js +23 -2
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -8,9 +8,14 @@ var path = require('path');
|
|
|
8
8
|
var jitiModule = require('jiti');
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
|
-
require('json-diff');
|
|
11
|
+
var jsonDiff = require('json-diff');
|
|
12
|
+
var React2 = require('react');
|
|
13
|
+
var ink = require('ink');
|
|
14
|
+
var jsxRuntime = require('react/jsx-runtime');
|
|
12
15
|
|
|
13
16
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
17
|
+
/**
 * Interop helper: returns ES-module namespaces untouched and wraps anything
 * else so that it is reachable through a `default` property.
 */
function _interopDefault(e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
|
|
18
|
+
|
|
14
19
|
function _interopNamespace(e) {
|
|
15
20
|
if (e && e.__esModule) return e;
|
|
16
21
|
var n = Object.create(null);
|
|
@@ -30,6 +35,7 @@ function _interopNamespace(e) {
|
|
|
30
35
|
}
|
|
31
36
|
|
|
32
37
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
38
|
+
var React2__default = /*#__PURE__*/_interopDefault(React2);
|
|
33
39
|
|
|
34
40
|
// src/runner/config.ts
|
|
35
41
|
var defaultRunnerConfig = {
|
|
@@ -276,6 +282,29 @@ async function collectTestCasesFromFiles(config) {
|
|
|
276
282
|
);
|
|
277
283
|
return found.flat();
|
|
278
284
|
}
|
|
285
|
+
/**
 * Builds a log entry of type "diff" for an expected/actual pair.
 *
 * @param {*} expected - The reference value.
 * @param {*} actual - The value that was produced.
 * @param {{ label?: string }} [options] - Optional label attached to the entry.
 * @returns {{ type: "diff", label: (string|undefined), expected: *, actual: *, diff: string }}
 *   The entry, with an uncoloured textual diff (or "(no differences)" when
 *   the diff renders to an empty string).
 */
function createDiffLogEntry(expected, actual, options) {
  const rendered = jsonDiff.diffString(expected, actual, { color: false });
  const entry = {
    type: "diff",
    label: options?.label,
    expected,
    actual,
    diff: rendered ? rendered : "(no differences)"
  };
  return entry;
}
|
|
295
|
+
/**
 * Splits a diff log entry into classified lines for coloured rendering.
 *
 * The textual diff already stored on the entry (`entry.diff`) is reused when
 * present, so the JSON diff is not recomputed on every call; this function is
 * invoked from render paths. When the entry carries no diff text, the diff is
 * computed from `entry.expected` / `entry.actual` as before.
 *
 * @param {{ expected?: *, actual?: *, diff?: string }} entry - Diff log entry.
 * @returns {Array<{ type: "remove"|"add"|"context", line: string }>} One item
 *   per line of the diff text, in order.
 */
function getDiffLines(entry) {
  const stored = typeof entry.diff === "string" && entry.diff.length > 0 ? entry.diff : void 0;
  const raw = stored ?? (jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)");
  return raw.split("\n").map((line) => {
    const trimmed = line.trimStart();
    // "---" / "+++" prefixes are treated as context, not as removals/additions.
    if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
      return { type: "remove", line };
    }
    if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
      return { type: "add", line };
    }
    return { type: "context", line };
  });
}
|
|
279
308
|
|
|
280
309
|
// src/evals/metric.ts
|
|
281
310
|
var registry = /* @__PURE__ */ new Map();
|
|
@@ -459,6 +488,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
459
488
|
continue;
|
|
460
489
|
}
|
|
461
490
|
try {
|
|
491
|
+
const logs = [];
|
|
492
|
+
const logDiff = (expected, actual, options) => {
|
|
493
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
494
|
+
};
|
|
462
495
|
const ctx = yield* effect.Effect.promise(
|
|
463
496
|
() => Promise.resolve(evaluator.resolveContext())
|
|
464
497
|
);
|
|
@@ -467,13 +500,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
467
500
|
evaluateFn({
|
|
468
501
|
input: testCaseItem.testCase.getInput(),
|
|
469
502
|
ctx,
|
|
470
|
-
output
|
|
503
|
+
output,
|
|
504
|
+
logDiff
|
|
471
505
|
})
|
|
472
506
|
)
|
|
473
507
|
);
|
|
474
508
|
const { scores, metrics } = normalizeResult(result);
|
|
475
509
|
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
476
|
-
evaluatorScores.push({
|
|
510
|
+
evaluatorScores.push({
|
|
511
|
+
evaluatorId,
|
|
512
|
+
scores,
|
|
513
|
+
passed,
|
|
514
|
+
metrics,
|
|
515
|
+
logs: logs.length > 0 ? logs : void 0
|
|
516
|
+
});
|
|
477
517
|
} catch (error) {
|
|
478
518
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
479
519
|
evaluatorScores.push({
|
|
@@ -909,6 +949,88 @@ function printBanner() {
|
|
|
909
949
|
];
|
|
910
950
|
console.log(lines.join("\n"));
|
|
911
951
|
}
|
|
952
|
+
/**
 * Ink component rendering the rounded, cyan-bordered banner
 * "@m4trix/evals · eval-agents-simple".
 */
function Banner() {
  const packageLabel = jsxRuntime.jsx(ink.Text, { color: "gray", children: "@m4trix/evals" });
  const separator = jsxRuntime.jsx(ink.Text, { color: "cyan", children: " \xB7 " });
  const commandLabel = jsxRuntime.jsx(ink.Text, { color: "gray", children: "eval-agents-simple" });
  return jsxRuntime.jsxs(ink.Box, {
    borderStyle: "round",
    borderColor: "cyan",
    paddingX: 1,
    paddingY: 0,
    children: [packageLabel, separator, commandLabel]
  });
}
|
|
959
|
+
/**
 * Ink view for the "generate" command: resolves a dataset by name, writes its
 * test cases to `<dataset file name>.cases.json` next to the dataset file and
 * shows the outcome.
 *
 * Every failure (dataset not found, rejected runner call, failed file write)
 * is forwarded to `onComplete(error)`. Previously a rejected promise inside
 * the effect was dropped, so `onComplete` was never invoked and the caller
 * waiting on it never settled.
 *
 * @param {object} props
 * @param {object} props.runner - Runner exposing `resolveDatasetByName` and
 *   `collectDatasetTestCases`.
 * @param {string} props.datasetName - Name of the dataset to export.
 * @param {(error?: Error) => void} props.onComplete - Called with an error on
 *   failure, or without arguments shortly after a successful write.
 */
function GenerateView({
  runner,
  datasetName,
  onComplete
}) {
  const [result, setResult] = React2.useState(null);
  const [error, setError] = React2.useState(null);
  React2.useEffect(() => {
    let cancelled = false;
    // Single failure path: show the error (if still mounted) and notify the caller.
    const fail = (cause) => {
      const failure = cause instanceof Error ? cause : new Error(String(cause));
      if (!cancelled) {
        setError(failure);
      }
      onComplete(failure);
    };
    async function run() {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        fail(new Error(`Dataset "${datasetName}" not found.`));
        return;
      }
      const { writeFile: writeFile2 } = await import('fs/promises');
      const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
      const testCases = await runner.collectDatasetTestCases(dataset.id);
      const payload = testCases.map((item) => {
        const tc = item.testCase;
        return {
          name: tc.getName(),
          input: tc.getInput(),
          // getOutput is optional on test cases.
          output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
        };
      });
      const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
      const parsed = parse2(absoluteDatasetPath);
      const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
      await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
      if (!cancelled) {
        setResult({
          count: payload.length,
          datasetName: dataset.dataset.getName(),
          outputPath
        });
        // Short delay before signalling completion (the caller may unmount on completion).
        setTimeout(() => onComplete(), 200);
      }
    }
    run().catch(fail);
    return () => {
      cancelled = true;
    };
  }, [runner, datasetName, onComplete]);
  if (error) {
    return jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
      jsxRuntime.jsx(Banner, {}),
      jsxRuntime.jsx(ink.Text, { color: "red", children: error.message })
    ] });
  }
  return jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
    jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: jsxRuntime.jsx(Banner, {}) }),
    result && jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      jsxRuntime.jsxs(ink.Text, { color: "green", children: [
        "Generated ",
        result.count,
        ' test cases for dataset "',
        result.datasetName,
        '".'
      ] }),
      jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
        "Wrote ",
        result.outputPath
      ] })
    ] })
  ] });
}
|
|
1032
|
+
|
|
1033
|
+
// src/cli-simple/generate.ts
|
|
912
1034
|
function readOutput2(testCase) {
|
|
913
1035
|
if (typeof testCase.getOutput !== "function") {
|
|
914
1036
|
return void 0;
|
|
@@ -919,7 +1041,7 @@ function createOutputPath(datasetFilePath) {
|
|
|
919
1041
|
const parsed = path.parse(datasetFilePath);
|
|
920
1042
|
return path.join(parsed.dir, `${parsed.name}.cases.json`);
|
|
921
1043
|
}
|
|
922
|
-
async function
|
|
1044
|
+
async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
923
1045
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
924
1046
|
if (!dataset) {
|
|
925
1047
|
throw new Error(`Dataset "${datasetName}" not found.`);
|
|
@@ -937,6 +1059,393 @@ async function generateDatasetJsonCommand(runner, datasetName) {
|
|
|
937
1059
|
console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
|
|
938
1060
|
console.log(`Wrote ${outputPath}`);
|
|
939
1061
|
}
|
|
1062
|
+
/**
 * Runs the "generate" command through the Ink UI.
 *
 * Renders `GenerateView` and settles once the view reports completion: the
 * app is unmounted first, then the promise rejects with the reported error
 * or resolves with no value.
 *
 * @param {object} runner - Runner instance handed to the view.
 * @param {string} datasetName - Dataset to export.
 * @returns {Promise<void>}
 */
async function generateDatasetJsonCommandInk(runner, datasetName) {
  return new Promise((resolve4, reject) => {
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        reject(err);
        return;
      }
      resolve4();
    };
    const element = React2__default.default.createElement(GenerateView, {
      runner,
      datasetName,
      onComplete: handleComplete
    });
    const app = ink.render(element);
  });
}
|
|
1080
|
+
/**
 * Maps a percentage to a bar colour: "green" from 70 up, "yellow" from 40 up,
 * "red" otherwise.
 *
 * @param {number} pct - Percentage value.
 * @returns {"green"|"yellow"|"red"}
 */
function barColor(pct) {
  return pct >= 70 ? "green" : pct >= 40 ? "yellow" : "red";
}
|
|
1087
|
+
/**
 * Ink component rendering `label [████░░░░] value`.
 *
 * The fill ratio is computed once and guarded, so a non-positive `max` or a
 * non-finite `value` yields an empty bar of the full `barWidth`. Previously
 * only the percentage was guarded and the bar collapsed to zero width in
 * those cases.
 *
 * @param {object} props
 * @param {string} props.label - Text shown before the bar, padded to `labelWidth`.
 * @param {number} props.value - Value to display; clamped to [0, max] for the bar.
 * @param {number} [props.max=100] - Value that corresponds to a full bar.
 * @param {number} [props.labelWidth=14] - Minimum width of the label column.
 * @param {number} [props.barWidth=20] - Bar width in characters.
 * @param {(v: number) => string} [props.format] - Formats the unclamped value.
 * @param {boolean} [props.colorByValue=true] - Colour the bar via `barColor`.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  // Shared guarded ratio keeps the percentage and the fill width consistent.
  const ratio = max > 0 && Number.isFinite(clamped) ? clamped / max : 0;
  const pct = ratio * 100;
  const filled = Math.round(ratio * barWidth);
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return jsxRuntime.jsxs(ink.Text, { children: [
    jsxRuntime.jsx(ink.Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
      jsxRuntime.jsx(ink.Text, { color, children: filledBar }),
      jsxRuntime.jsx(ink.Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    jsxRuntime.jsx(ink.Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
|
|
1113
|
+
// Glyphs cycled by the spinner, one per tick.
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
/**
 * Ink component showing an animated cyan spinner followed by a label.
 * The frame advances every 100 ms; the interval is cleared on unmount.
 *
 * @param {object} props
 * @param {string} [props.label="Running"] - Text shown after the spinner glyph.
 */
function Spinner({ label = "Running" }) {
  const [frame, setFrame] = React2.useState(0);
  React2.useEffect(() => {
    const advance = () => {
      setFrame((current) => (current + 1) % FRAMES.length);
    };
    const handle = setInterval(advance, 100);
    return () => clearInterval(handle);
  }, []);
  const glyph = FRAMES[frame];
  return jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [glyph, " ", label] });
}
|
|
1128
|
+
/**
 * Maps a score to a colour name: "green" from 80 up, "yellow" from 50 up,
 * "red" otherwise.
 *
 * @param {number} score - Numeric score.
 * @returns {"green"|"yellow"|"red"}
 */
function scoreColor(score) {
  const tiers = [
    [80, "green"],
    [50, "yellow"]
  ];
  for (const [minimum, color] of tiers) {
    if (score >= minimum) {
      return color;
    }
  }
  return "red";
}
|
|
1135
|
+
/**
 * Renders a fixed-width text bar made of filled and empty block characters.
 *
 * The value is clamped to [0, max]. A non-finite value or a non-positive
 * `max` produces an entirely empty bar of `width` characters. Previously
 * those inputs turned the fill count into NaN and the function returned an
 * empty string, which broke column alignment.
 *
 * @param {number} value - Value to visualise.
 * @param {number} [max=100] - Value that corresponds to a full bar.
 * @param {number} [width=20] - Total bar width in characters.
 * @returns {string} A string of exactly `width` block characters.
 */
function createBar(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  const ratio = max > 0 && Number.isFinite(safe) ? safe / max : 0;
  const filled = Math.round(ratio * width);
  return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
}
|
|
1140
|
+
/**
 * Formats one score item for display.
 *
 * With a registered score definition the definition's own formatter is used,
 * and for the "bar" display strategy a text bar is appended when a finite
 * numeric value is available. Without a definition the numeric score is
 * shown with two decimals, or "n/a" when there is none.
 *
 * @param {{ id: string, data: * }} item - Score item.
 * @param {*} scoreToColor2 - Accepted for signature compatibility; not used here.
 * @returns {string}
 */
function formatScorePart(item, scoreToColor2) {
  const definition = getScoreById(item.id);
  if (definition) {
    const text = definition.format(item.data);
    if (definition.displayStrategy !== "bar") {
      return text;
    }
    const data = item.data;
    const hasValueField = typeof data === "object" && data !== null && "value" in data;
    const barValue = hasValueField ? data.value : toNumericScore(data);
    const drawable = typeof barValue === "number" && Number.isFinite(barValue);
    return drawable ? `${text} ${createBar(barValue)}` : text;
  }
  const fallback = toNumericScore(item.data);
  if (fallback === void 0) {
    return "n/a";
  }
  return `${fallback.toFixed(2)}`;
}
|
|
1155
|
+
/**
 * Ink view for the "run" command: resolves the dataset and evaluators, starts
 * the run, streams per-test-case results and finally shows a summary.
 *
 * Any failure while resolving or running is reported through
 * `onComplete(error)`, and the run-event subscription is always released.
 * Previously a rejected runner call left the subscription active and never
 * invoked `onComplete`, so the caller waiting on it never settled.
 *
 * @param {object} props
 * @param {object} props.runner - Runner used to resolve datasets/evaluators,
 *   subscribe to run events and start the run.
 * @param {string} props.datasetName - Name of the dataset to run.
 * @param {string} props.evaluatorPattern - Name or pattern selecting evaluators.
 * @param {(error?: Error) => void} props.onComplete - Called with an error on
 *   failure, or without arguments shortly after the summary is shown.
 */
function RunView({
  runner,
  datasetName,
  evaluatorPattern,
  onComplete
}) {
  const [phase, setPhase] = React2.useState(
    "loading"
  );
  const [runInfo, setRunInfo] = React2.useState(null);
  const [testCases, setTestCases] = React2.useState([]);
  const [summary, setSummary] = React2.useState(null);
  const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
  const runEval = React2.useCallback(async () => {
    let unsubscribe = null;
    // Releases the run-event subscription at most once.
    const stopListening = () => {
      if (unsubscribe) {
        const release = unsubscribe;
        unsubscribe = null;
        release();
      }
    };
    try {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        const known = await runner.collectDatasets();
        const available = known.map((item) => item.dataset.getName()).sort();
        onComplete(
          new Error(
            available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
          )
        );
        return;
      }
      const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
      if (evaluators.length === 0) {
        const known = await runner.collectEvaluators();
        const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
        onComplete(
          new Error(
            available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
          )
        );
        return;
      }
      const nameById = new Map(
        evaluators.map((item) => [
          item.id,
          item.evaluator.getName() ?? item.id
        ])
      );
      setEvaluatorNameById(nameById);
      const aggregates = /* @__PURE__ */ new Map();
      let overallScoreTotal = 0;
      let overallScoreCount = 0;
      // Subscribe before starting the run so no progress event is missed.
      const done = new Promise((resolve4) => {
        unsubscribe = runner.subscribeRunEvents((event) => {
          if (event.type === "TestCaseProgress") {
            const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
            const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
            for (const item of event.evaluatorScores) {
              const numeric = toNumericScoreFromScores(item.scores);
              if (numeric !== void 0) {
                const current = aggregates.get(item.evaluatorId) ?? {
                  total: 0,
                  count: 0,
                  passed: 0,
                  failed: 0
                };
                aggregates.set(item.evaluatorId, {
                  total: current.total + numeric,
                  count: current.count + 1,
                  passed: current.passed + (item.passed ? 1 : 0),
                  failed: current.failed + (item.passed ? 0 : 1)
                });
                overallScoreTotal += numeric;
                overallScoreCount += 1;
              }
            }
            setTestCases((prev) => [
              ...prev,
              {
                name: event.testCaseName,
                completedTestCases: event.completedTestCases,
                totalTestCases: event.totalTestCases,
                durationMs: event.durationMs,
                passed: event.passed,
                averageScore,
                evaluatorScores: event.evaluatorScores.map((item) => ({
                  evaluatorId: item.evaluatorId,
                  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
                  scores: item.scores,
                  passed: item.passed,
                  metrics: item.metrics,
                  logs: item.logs
                }))
              }
            ]);
          }
          if (event.type === "RunCompleted" || event.type === "RunFailed") {
            stopListening();
            resolve4(event);
          }
        });
      });
      const snapshot = await runner.runDatasetWith({
        datasetId: dataset.id,
        evaluatorIds: evaluators.map((item) => item.id)
      });
      setRunInfo({
        runId: snapshot.runId,
        datasetName: snapshot.datasetName,
        evaluatorNames: evaluators.map(
          (e) => e.evaluator.getName() ?? e.id
        ),
        totalTestCases: snapshot.totalTestCases
      });
      setPhase("running");
      const finalEvent = await done;
      if (finalEvent.type === "RunFailed") {
        onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
        return;
      }
      setSummary({
        passedTestCases: finalEvent.passedTestCases,
        failedTestCases: finalEvent.failedTestCases,
        totalTestCases: finalEvent.totalTestCases,
        overallScoreTotal,
        overallScoreCount,
        aggregates: new Map(aggregates),
        artifactPath: finalEvent.artifactPath
      });
      setPhase("completed");
      // Short delay before signalling completion (the caller may unmount on completion).
      setTimeout(() => onComplete(), 200);
    } catch (cause) {
      // Surface unexpected failures instead of leaving the caller waiting forever.
      onComplete(cause instanceof Error ? cause : new Error(String(cause)));
    } finally {
      stopListening();
    }
  }, [runner, datasetName, evaluatorPattern, onComplete]);
  React2.useEffect(() => {
    void runEval();
  }, [runEval]);
  return jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
    jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: jsxRuntime.jsx(Banner, {}) }),
    runInfo && jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
      jsxRuntime.jsxs(ink.Text, { children: [
        jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run " }),
        jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
      ] }),
      jsxRuntime.jsxs(ink.Text, { children: [
        jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Dataset " }),
        runInfo.datasetName
      ] }),
      jsxRuntime.jsxs(ink.Text, { children: [
        jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Evaluators " }),
        runInfo.evaluatorNames.join(", ")
      ] }),
      jsxRuntime.jsxs(ink.Text, { children: [
        jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Test cases " }),
        runInfo.totalTestCases
      ] })
    ] }),
    phase === "running" && jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: jsxRuntime.jsx(
      Spinner,
      {
        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
      }
    ) }),
    testCases.length > 0 && jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
      jsxRuntime.jsxs(ink.Text, { children: [
        jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
          "[",
          tc.completedTestCases,
          "/",
          tc.totalTestCases,
          "]"
        ] }),
        " ",
        tc.name,
        jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
          " (",
          tc.durationMs,
          "ms)"
        ] })
      ] }),
      tc.evaluatorScores.map((item) => jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
        jsxRuntime.jsxs(ink.Text, { children: [
          item.evaluatorName,
          ":",
          " ",
          jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
          " ",
          item.scores.map((s) => jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
            formatScorePart(s),
            " "
          ] }, s.id)),
          item.metrics?.map((m) => {
            const def = getMetricById(m.id);
            if (!def)
              return null;
            const formatted = def.format(m.data);
            return jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
              "[",
              def.name ? `${def.name}: ` : "",
              formatted,
              "]",
              " "
            ] }, m.id);
          })
        ] }),
        // Diff logs are only shown for failing evaluators.
        !item.passed && item.logs && item.logs.length > 0 && jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
          (log, logIdx) => log.type === "diff" ? jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => jsxRuntime.jsx(
            ink.Text,
            {
              color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
              children: line
            },
            lineIdx
          )) }, logIdx) : null
        ) })
      ] }, item.evaluatorId))
    ] }, i)) }),
    phase === "completed" && summary && jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
      jsxRuntime.jsxs(ink.Box, { marginTop: 1, children: [
        jsxRuntime.jsx(ink.Text, { color: "green", children: "passed" }),
        jsxRuntime.jsxs(ink.Text, { children: [
          " ",
          summary.passedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      jsxRuntime.jsxs(ink.Box, { children: [
        jsxRuntime.jsx(ink.Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
        jsxRuntime.jsxs(ink.Text, { children: [
          " ",
          summary.failedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      summary.overallScoreCount > 0 && jsxRuntime.jsx(ink.Box, { marginTop: 1, children: jsxRuntime.jsx(
        TextBar,
        {
          label: "overall avg",
          value: summary.overallScoreTotal / summary.overallScoreCount,
          barWidth: 20,
          format: (v) => v.toFixed(2)
        }
      ) }),
      jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
        jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
        Array.from(evaluatorNameById.entries()).map(([id, name]) => {
          const agg = summary.aggregates.get(id);
          if (!agg || agg.count === 0) {
            return jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
              "- ",
              name.padEnd(28),
              " no numeric scores"
            ] }, id);
          }
          const mean = agg.total / agg.count;
          return jsxRuntime.jsxs(ink.Text, { children: [
            "- ",
            name.padEnd(28),
            " avg=",
            jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
            " passed=",
            agg.passed,
            " failed=",
            agg.failed
          ] }, id);
        })
      ] }),
      jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
        jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
        testCases.map((tc, i) => jsxRuntime.jsxs(ink.Box, { children: [
          jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
          jsxRuntime.jsxs(ink.Text, { children: [
            " ",
            tc.name.padEnd(24)
          ] }),
          tc.averageScore !== void 0 ? jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
            jsxRuntime.jsxs(ink.Text, { color: scoreColor(tc.averageScore), children: [
              "score=",
              tc.averageScore.toFixed(2)
            ] }),
            jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
              " ",
              createBar(tc.averageScore, 100, 14)
            ] })
          ] }) : jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
          jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
            " (",
            tc.durationMs,
            "ms)"
          ] })
        ] }, i))
      ] }),
      jsxRuntime.jsx(ink.Box, { marginTop: 1, children: jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
        "artifact: ",
        summary.artifactPath
      ] }) })
    ] })
  ] });
}
|
|
940
1449
|
|
|
941
1450
|
// src/cli-simple/run.ts
|
|
942
1451
|
var ansi2 = {
|
|
@@ -968,7 +1477,7 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
|
|
|
968
1477
|
const mean = aggregate.total / aggregate.count;
|
|
969
1478
|
return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
970
1479
|
}
|
|
971
|
-
function
|
|
1480
|
+
function createBar2(value, max = 100, width = 20) {
|
|
972
1481
|
const safe = Math.max(0, Math.min(max, value));
|
|
973
1482
|
const filled = Math.round(safe / max * width);
|
|
974
1483
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
@@ -991,7 +1500,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
991
1500
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
992
1501
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
993
1502
|
scoreParts.push(
|
|
994
|
-
`${colorize(formatted, scoreToColor(numeric))} ${colorize(
|
|
1503
|
+
`${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
|
|
995
1504
|
);
|
|
996
1505
|
} else {
|
|
997
1506
|
scoreParts.push(formatted);
|
|
@@ -1030,7 +1539,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1030
1539
|
}
|
|
1031
1540
|
return line;
|
|
1032
1541
|
}
|
|
1033
|
-
async function
|
|
1542
|
+
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
1034
1543
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
1035
1544
|
if (!dataset) {
|
|
1036
1545
|
const known = await runner.collectDatasets();
|
|
@@ -1099,6 +1608,17 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1099
1608
|
item.metrics
|
|
1100
1609
|
)
|
|
1101
1610
|
);
|
|
1611
|
+
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
1612
|
+
for (const log of item.logs) {
|
|
1613
|
+
if (log.type === "diff") {
|
|
1614
|
+
const useColor = process.stdout.isTTY;
|
|
1615
|
+
for (const { type, line } of getDiffLines(log)) {
|
|
1616
|
+
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
1617
|
+
console.log(colored);
|
|
1618
|
+
}
|
|
1619
|
+
}
|
|
1620
|
+
}
|
|
1621
|
+
}
|
|
1102
1622
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1103
1623
|
if (numeric !== void 0) {
|
|
1104
1624
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
@@ -1177,7 +1697,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1177
1697
|
`- overall avg score: ${colorize(
|
|
1178
1698
|
overallAverage.toFixed(2),
|
|
1179
1699
|
scoreToColor(overallAverage)
|
|
1180
|
-
)} ${colorize(
|
|
1700
|
+
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
1181
1701
|
);
|
|
1182
1702
|
}
|
|
1183
1703
|
console.log(colorize("- evaluator averages:", ansi2.magenta));
|
|
@@ -1200,12 +1720,31 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1200
1720
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
1201
1721
|
summary.averageScore.toFixed(2),
|
|
1202
1722
|
scoreToColor(summary.averageScore)
|
|
1203
|
-
)} ${colorize(
|
|
1723
|
+
)} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1204
1724
|
);
|
|
1205
1725
|
}
|
|
1206
1726
|
}
|
|
1207
1727
|
console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
|
|
1208
1728
|
}
|
|
1729
|
+
/**
 * Runs the "run" command through the Ink UI.
 *
 * Renders `RunView` and settles once the view reports completion: the app is
 * unmounted first, then the promise rejects with the reported error or
 * resolves with no value.
 *
 * @param {object} runner - Runner instance handed to the view.
 * @param {string} datasetName - Dataset to evaluate.
 * @param {string} evaluatorPattern - Name or pattern selecting evaluators.
 * @returns {Promise<void>}
 */
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
  return new Promise((resolve4, reject) => {
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        reject(err);
        return;
      }
      resolve4();
    };
    const element = React2__default.default.createElement(RunView, {
      runner,
      datasetName,
      evaluatorPattern,
      onComplete: handleComplete
    });
    const app = ink.render(element);
  });
}
|
|
1209
1748
|
|
|
1210
1749
|
// src/cli-simple/index.ts
|
|
1211
1750
|
function printUsageAndExit(exitCode) {
|
|
@@ -1233,14 +1772,24 @@ async function main() {
|
|
|
1233
1772
|
console.error("Missing required --evaluator <name-or-pattern> argument.");
|
|
1234
1773
|
printUsageAndExit(1);
|
|
1235
1774
|
}
|
|
1236
|
-
|
|
1775
|
+
const useInk = process.stdout.isTTY === true;
|
|
1776
|
+
if (!useInk) {
|
|
1777
|
+
printBanner();
|
|
1778
|
+
}
|
|
1237
1779
|
const runner = createRunner();
|
|
1238
1780
|
try {
|
|
1239
1781
|
if (args.command === "run") {
|
|
1240
|
-
await
|
|
1782
|
+
await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
|
|
1783
|
+
runner,
|
|
1784
|
+
args.datasetName,
|
|
1785
|
+
args.evaluatorPattern
|
|
1786
|
+
);
|
|
1241
1787
|
return;
|
|
1242
1788
|
}
|
|
1243
|
-
await
|
|
1789
|
+
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|
|
1790
|
+
runner,
|
|
1791
|
+
args.datasetName
|
|
1792
|
+
);
|
|
1244
1793
|
} finally {
|
|
1245
1794
|
await runner.shutdown();
|
|
1246
1795
|
}
|