@m4trix/evals 0.9.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +500 -9
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +497 -9
- package/dist/cli-simple.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -7,6 +7,9 @@ import * as jitiModule from 'jiti';
|
|
|
7
7
|
import { writeFile, mkdir, appendFile, readdir } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
9
|
import 'json-diff';
|
|
10
|
+
import React2, { useState, useEffect, useCallback } from 'react';
|
|
11
|
+
import { render, Box, Text } from 'ink';
|
|
12
|
+
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
10
13
|
|
|
11
14
|
// src/runner/config.ts
|
|
12
15
|
var defaultRunnerConfig = {
|
|
@@ -886,6 +889,88 @@ function printBanner() {
|
|
|
886
889
|
];
|
|
887
890
|
console.log(lines.join("\n"));
|
|
888
891
|
}
|
|
892
|
+
/**
 * Ink header shown at the top of every interactive view: a rounded cyan box
 * holding the package name and the CLI name, separated by a middle dot.
 */
function Banner() {
  const frameProps = {
    borderStyle: "round",
    borderColor: "cyan",
    paddingX: 1,
    paddingY: 0
  };
  const packageLabel = jsx(Text, { color: "gray", children: "@m4trix/evals" });
  const separator = jsx(Text, { color: "cyan", children: " \xB7 " });
  const cliLabel = jsx(Text, { color: "gray", children: "eval-agents-simple" });
  return jsxs(Box, { ...frameProps, children: [packageLabel, separator, cliLabel] });
}
|
|
899
|
+
/**
 * Ink view for the "generate" command.
 *
 * On mount it resolves the dataset by name, collects its test cases, and
 * writes them as pretty-printed JSON to `<dataset file name>.cases.json` in
 * the same directory as the dataset file. A short success summary is rendered
 * once the file has been written.
 *
 * @param {object} props
 * @param {object} props.runner - Runner exposing `resolveDatasetByName` and
 *   `collectDatasetTestCases`.
 * @param {string} props.datasetName - Name of the dataset to export.
 * @param {(err?: Error) => void} props.onComplete - Called with no argument
 *   on success (after a 200 ms delay) or with the failure on error.
 */
function GenerateView({
  runner,
  datasetName,
  onComplete
}) {
  const [result, setResult] = useState(null);
  const [error, setError] = useState(null);
  useEffect(() => {
    let cancelled = false;
    async function run() {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        throw new Error(`Dataset "${datasetName}" not found.`);
      }
      const { writeFile: writeFile2 } = await import('fs/promises');
      const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
      const testCases = await runner.collectDatasetTestCases(dataset.id);
      const payload = testCases.map(({ testCase }) => ({
        name: testCase.getName(),
        input: testCase.getInput(),
        // getOutput is optional on a test case; omit the field when absent.
        output: typeof testCase.getOutput === "function" ? testCase.getOutput() : void 0
      }));
      const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
      const parsed = parse2(absoluteDatasetPath);
      const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
      await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
      if (!cancelled) {
        setResult({
          count: payload.length,
          datasetName: dataset.dataset.getName(),
          outputPath
        });
        // Delay completion slightly; presumably so the success frame is
        // rendered before the caller unmounts the app.
        setTimeout(() => onComplete(), 200);
      }
    }
    // Every failure (missing dataset, runner error, file-system error) must
    // reach onComplete; otherwise the promise wrapping this view never
    // settles and the rejection goes unhandled.
    run().catch((cause) => {
      const failure = cause instanceof Error ? cause : new Error(String(cause));
      if (!cancelled) {
        setError(failure);
      }
      onComplete(failure);
    });
    return () => {
      cancelled = true;
    };
  }, [runner, datasetName, onComplete]);
  if (error) {
    return jsxs(Box, { flexDirection: "column", padding: 1, children: [
      jsx(Banner, {}),
      jsx(Text, { color: "red", children: error.message })
    ] });
  }
  return jsxs(Box, { flexDirection: "column", padding: 1, children: [
    jsx(Box, { marginBottom: 1, children: jsx(Banner, {}) }),
    result && jsxs(Box, { flexDirection: "column", children: [
      jsxs(Text, { color: "green", children: [
        "Generated ",
        result.count,
        ' test cases for dataset "',
        result.datasetName,
        '".'
      ] }),
      jsxs(Text, { color: "gray", children: [
        "Wrote ",
        result.outputPath
      ] })
    ] })
  ] });
}
|
|
972
|
+
|
|
973
|
+
// src/cli-simple/generate.ts
|
|
889
974
|
function readOutput2(testCase) {
|
|
890
975
|
if (typeof testCase.getOutput !== "function") {
|
|
891
976
|
return void 0;
|
|
@@ -896,7 +981,7 @@ function createOutputPath(datasetFilePath) {
|
|
|
896
981
|
const parsed = parse(datasetFilePath);
|
|
897
982
|
return join(parsed.dir, `${parsed.name}.cases.json`);
|
|
898
983
|
}
|
|
899
|
-
async function
|
|
984
|
+
async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
900
985
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
901
986
|
if (!dataset) {
|
|
902
987
|
throw new Error(`Dataset "${datasetName}" not found.`);
|
|
@@ -914,6 +999,380 @@ async function generateDatasetJsonCommand(runner, datasetName) {
|
|
|
914
999
|
console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
|
|
915
1000
|
console.log(`Wrote ${outputPath}`);
|
|
916
1001
|
}
|
|
1002
|
+
/**
 * Runs the "generate" command through the Ink UI.
 *
 * Renders a GenerateView and returns a promise that settles when the view
 * reports completion: it rejects with the reported error, or resolves with no
 * value on success. The Ink app is unmounted before the promise settles.
 *
 * @param {object} runner - Runner instance handed to the view.
 * @param {string} datasetName - Dataset whose cases should be exported.
 * @returns {Promise<void>}
 */
async function generateDatasetJsonCommandInk(runner, datasetName) {
  return new Promise((settle, fail) => {
    // Declared before `app` exists; it only runs once the view calls back.
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        fail(err);
        return;
      }
      settle();
    };
    const view = React2.createElement(GenerateView, {
      runner,
      datasetName,
      onComplete: handleComplete
    });
    const app = render(view);
  });
}
|
|
1020
|
+
/**
 * Maps a percentage to the colour used for a bar's filled segment.
 *
 * @param {number} pct - Percentage value.
 * @returns {"green" | "yellow" | "red"} "green" from 70 upwards, "yellow"
 *   from 40 upwards, otherwise "red".
 */
function barColor(pct) {
  if (pct >= 70) {
    return "green";
  }
  return pct >= 40 ? "yellow" : "red";
}
|
|
1027
|
+
/**
 * Ink component that renders one labelled horizontal bar:
 * `<label> [<filled cells><empty cells>] <formatted value>`.
 *
 * @param {object} props
 * @param {string} props.label - Text shown left of the bar, padded to `labelWidth`.
 * @param {number} props.value - Value to plot; clamped to `[0, max]` for the bar only.
 * @param {number} [props.max=100] - Value that corresponds to a full bar.
 * @param {number} [props.labelWidth=14] - Minimum label column width.
 * @param {number} [props.barWidth=20] - Total number of bar cells.
 * @param {(v: number) => string} [props.format] - Formats the unclamped `value`
 *   for the trailing text.
 * @param {boolean} [props.colorByValue=true] - When true, the filled cells and
 *   the value text are coloured via `barColor`.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  // One guarded ratio feeds both the percentage and the cell count, so a
  // non-positive `max` or a NaN value draws an empty bar of the right width
  // rather than collapsing the bar to nothing.
  const ratio = max > 0 && !Number.isNaN(clamped) ? clamped / max : 0;
  const pct = ratio * 100;
  // String.prototype.repeat throws on negative counts.
  const cells = barWidth > 0 ? barWidth : 0;
  const filled = Math.round(ratio * cells);
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, cells - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return jsxs(Text, { children: [
    jsx(Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? jsxs(Fragment, { children: [
      jsx(Text, { color, children: filledBar }),
      jsx(Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    jsx(Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
|
|
1053
|
+
// Braille glyphs cycled by Spinner, one per animation tick.
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];

/**
 * Ink component showing an animated spinner glyph followed by a label.
 * The glyph advances every 100 ms while the component is mounted.
 *
 * @param {object} props
 * @param {string} [props.label="Running"] - Text displayed after the glyph.
 */
function Spinner({ label = "Running" }) {
  const [frameIndex, setFrameIndex] = useState(0);
  useEffect(() => {
    const advance = () => {
      setFrameIndex((current) => (current + 1) % FRAMES.length);
    };
    const intervalId = setInterval(advance, 100);
    return () => {
      clearInterval(intervalId);
    };
  }, []);
  const glyph = FRAMES[frameIndex];
  return jsxs(Text, { color: "cyan", children: [glyph, " ", label] });
}
|
|
1068
|
+
/**
 * Maps a score to the colour used when printing it.
 *
 * @param {number} score - Score value.
 * @returns {"green" | "yellow" | "red"} "green" from 80 upwards, "yellow"
 *   from 50 upwards, otherwise "red".
 */
function scoreColor(score) {
  if (score >= 80) {
    return "green";
  }
  return score >= 50 ? "yellow" : "red";
}
|
|
1075
|
+
/**
 * Builds a fixed-width text bar made of filled and empty block characters.
 *
 * The result always has `width` cells for a non-negative `width`: a NaN
 * value or a non-positive `max` yields an all-empty bar instead of an empty
 * string, and a negative `width` yields "" instead of throwing.
 *
 * @param {number} value - Value to plot; clamped to `[0, max]`.
 * @param {number} [max=100] - Value that corresponds to a full bar.
 * @param {number} [width=20] - Total number of cells in the bar.
 * @returns {string} `filled` full-block characters followed by light-shade
 *   characters for the remainder.
 */
function createBar(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  // Guard the division: 0 / 0 and NaN inputs would otherwise make both
  // repeat() counts NaN and silently drop the whole bar.
  const ratio = max > 0 && !Number.isNaN(safe) ? safe / max : 0;
  // String.prototype.repeat throws RangeError on negative counts.
  const cells = width > 0 ? width : 0;
  const filled = Math.round(ratio * cells);
  return "\u2588".repeat(filled) + "\u2591".repeat(cells - filled);
}
|
|
1080
|
+
/**
 * Produces the display text for a single score entry.
 *
 * If no score definition is registered for `item.id`, the numeric form of the
 * data is printed with two decimals, or "n/a" when there is none. Otherwise
 * the definition's own formatter is used, and definitions with the "bar"
 * display strategy additionally get a text bar appended when a finite number
 * can be extracted from the data.
 *
 * @param {{ id: string, data: unknown }} item - Score entry to format.
 * @param {unknown} scoreToColor2 - Not read by this function; kept so the
 *   signature stays unchanged.
 * @returns {string}
 */
function formatScorePart(item, scoreToColor2) {
  const def = getScoreById(item.id);
  if (def) {
    const formatted = def.format(item.data);
    if (def.displayStrategy !== "bar") {
      return formatted;
    }
    const data = item.data;
    const carriesValue = typeof data === "object" && data !== null && "value" in data;
    const numeric = carriesValue ? data.value : toNumericScore(data);
    const drawable = typeof numeric === "number" && Number.isFinite(numeric);
    return drawable ? `${formatted} ${createBar(numeric)}` : formatted;
  }
  const fallback = toNumericScore(item.data);
  if (fallback === void 0) {
    return "n/a";
  }
  return fallback.toFixed(2);
}
|
|
1095
|
+
/**
 * Ink view for the "run" command.
 *
 * On mount it resolves the dataset and the evaluators matching
 * `evaluatorPattern`, subscribes to run events, starts the run, and renders
 * live per-test-case results followed by a summary when the run completes.
 *
 * Every failure is forwarded to `onComplete` and the event subscription is
 * always released. Failures covered: unknown dataset, no matching evaluator,
 * a "RunFailed" event, and any exception thrown by the runner.
 *
 * @param {object} props
 * @param {object} props.runner - Runner exposing `resolveDatasetByName`,
 *   `collectDatasets`, `resolveEvaluatorsByNamePattern`, `collectEvaluators`,
 *   `subscribeRunEvents` and `runDatasetWith`.
 * @param {string} props.datasetName - Name of the dataset to evaluate.
 * @param {string} props.evaluatorPattern - Name or pattern selecting evaluators.
 * @param {(err?: Error) => void} props.onComplete - Called with no argument
 *   on success (after a 200 ms delay) or with the failure on error.
 */
function RunView({
  runner,
  datasetName,
  evaluatorPattern,
  onComplete
}) {
  const [phase, setPhase] = useState("loading");
  const [runInfo, setRunInfo] = useState(null);
  const [testCases, setTestCases] = useState([]);
  const [summary, setSummary] = useState(null);
  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
  const runEval = useCallback(async () => {
    let unsubscribe;
    // Releases the run-event subscription at most once.
    const stopListening = () => {
      if (unsubscribe) {
        const off = unsubscribe;
        unsubscribe = void 0;
        off();
      }
    };
    try {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        const known = await runner.collectDatasets();
        const available = known.map((item) => item.dataset.getName()).sort();
        throw new Error(
          available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
        );
      }
      const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
      if (evaluators.length === 0) {
        const known = await runner.collectEvaluators();
        const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
        throw new Error(
          available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
        );
      }
      const nameById = new Map(
        evaluators.map((item) => [
          item.id,
          item.evaluator.getName() ?? item.id
        ])
      );
      setEvaluatorNameById(nameById);
      const aggregates = /* @__PURE__ */ new Map();
      let overallScoreTotal = 0;
      let overallScoreCount = 0;
      // Subscribe before starting the run so no progress event is missed.
      const done = new Promise((resolve4) => {
        unsubscribe = runner.subscribeRunEvents((event) => {
          if (event.type === "TestCaseProgress") {
            const numericScores = [];
            for (const item of event.evaluatorScores) {
              const numeric = toNumericScoreFromScores(item.scores);
              if (numeric === void 0) {
                continue;
              }
              numericScores.push(numeric);
              const current = aggregates.get(item.evaluatorId) ?? {
                total: 0,
                count: 0,
                passed: 0,
                failed: 0
              };
              aggregates.set(item.evaluatorId, {
                total: current.total + numeric,
                count: current.count + 1,
                passed: current.passed + (item.passed ? 1 : 0),
                failed: current.failed + (item.passed ? 0 : 1)
              });
              overallScoreTotal += numeric;
              overallScoreCount += 1;
            }
            const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
            setTestCases((prev) => [
              ...prev,
              {
                name: event.testCaseName,
                completedTestCases: event.completedTestCases,
                totalTestCases: event.totalTestCases,
                durationMs: event.durationMs,
                passed: event.passed,
                averageScore,
                evaluatorScores: event.evaluatorScores.map((item) => ({
                  evaluatorId: item.evaluatorId,
                  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
                  scores: item.scores,
                  passed: item.passed,
                  metrics: item.metrics
                }))
              }
            ]);
          }
          if (event.type === "RunCompleted" || event.type === "RunFailed") {
            stopListening();
            resolve4(event);
          }
        });
      });
      const snapshot = await runner.runDatasetWith({
        datasetId: dataset.id,
        evaluatorIds: evaluators.map((item) => item.id)
      });
      setRunInfo({
        runId: snapshot.runId,
        datasetName: snapshot.datasetName,
        evaluatorNames: evaluators.map(
          (e) => e.evaluator.getName() ?? e.id
        ),
        totalTestCases: snapshot.totalTestCases
      });
      setPhase("running");
      const finalEvent = await done;
      if (finalEvent.type === "RunFailed") {
        throw new Error(`Run failed: ${finalEvent.errorMessage}`);
      }
      setSummary({
        passedTestCases: finalEvent.passedTestCases,
        failedTestCases: finalEvent.failedTestCases,
        totalTestCases: finalEvent.totalTestCases,
        overallScoreTotal,
        overallScoreCount,
        aggregates: new Map(aggregates),
        artifactPath: finalEvent.artifactPath
      });
      setPhase("completed");
      // Delay completion slightly; presumably so the summary frame is
      // rendered before the caller unmounts the app.
      setTimeout(() => onComplete(), 200);
    } catch (cause) {
      // Without this, a rejected runner call would leave the promise that
      // wraps this view pending forever.
      onComplete(cause instanceof Error ? cause : new Error(String(cause)));
    } finally {
      // No-op after a terminal event; releases the listener when the run
      // could not be started.
      stopListening();
    }
  }, [runner, datasetName, evaluatorPattern, onComplete]);
  useEffect(() => {
    // runEval handles its own failures, so the promise can be discarded.
    void runEval();
  }, [runEval]);
  return jsxs(Box, { flexDirection: "column", padding: 1, children: [
    jsx(Box, { marginBottom: 1, children: jsx(Banner, {}) }),
    runInfo && jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
      jsxs(Text, { children: [
        jsx(Text, { color: "cyan", bold: true, children: "Run " }),
        jsx(Text, { color: "gray", children: runInfo.runId })
      ] }),
      jsxs(Text, { children: [
        jsx(Text, { color: "cyan", bold: true, children: "Dataset " }),
        runInfo.datasetName
      ] }),
      jsxs(Text, { children: [
        jsx(Text, { color: "cyan", bold: true, children: "Evaluators " }),
        runInfo.evaluatorNames.join(", ")
      ] }),
      jsxs(Text, { children: [
        jsx(Text, { color: "cyan", bold: true, children: "Test cases " }),
        runInfo.totalTestCases
      ] })
    ] }),
    phase === "running" && jsx(Box, { marginBottom: 1, children: jsx(
      Spinner,
      {
        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
      }
    ) }),
    testCases.length > 0 && jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
      jsxs(Text, { children: [
        jsxs(Text, { color: "cyan", children: [
          "[",
          tc.completedTestCases,
          "/",
          tc.totalTestCases,
          "]"
        ] }),
        " ",
        tc.name,
        jsxs(Text, { color: "gray", children: [
          " (",
          tc.durationMs,
          "ms)"
        ] })
      ] }),
      tc.evaluatorScores.map((item) => jsx(Box, { marginLeft: 2, children: jsxs(Text, { children: [
        item.evaluatorName,
        ":",
        " ",
        jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
        " ",
        item.scores.map((s) => jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
          formatScorePart(s),
          " "
        ] }, s.id)),
        item.metrics?.map((m) => {
          const def = getMetricById(m.id);
          if (!def) {
            // Metrics without a registered definition are not rendered.
            return null;
          }
          const formatted = def.format(m.data);
          return jsxs(Text, { color: "gray", children: [
            "[",
            def.name ? `${def.name}: ` : "",
            formatted,
            "]",
            " "
          ] }, m.id);
        })
      ] }) }, item.evaluatorId))
    ] }, i)) }),
    phase === "completed" && summary && jsxs(Box, { flexDirection: "column", children: [
      jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
      jsxs(Box, { marginTop: 1, children: [
        jsx(Text, { color: "green", children: "passed" }),
        jsxs(Text, { children: [
          " ",
          summary.passedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      jsxs(Box, { children: [
        jsx(Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
        jsxs(Text, { children: [
          " ",
          summary.failedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      summary.overallScoreCount > 0 && jsx(Box, { marginTop: 1, children: jsx(
        TextBar,
        {
          label: "overall avg",
          value: summary.overallScoreTotal / summary.overallScoreCount,
          barWidth: 20,
          format: (v) => v.toFixed(2)
        }
      ) }),
      jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
        jsx(Text, { color: "magenta", children: "evaluator averages" }),
        Array.from(evaluatorNameById.entries()).map(([id, name]) => {
          const agg = summary.aggregates.get(id);
          if (!agg || agg.count === 0) {
            return jsxs(Text, { color: "gray", children: [
              "- ",
              name.padEnd(28),
              " no numeric scores"
            ] }, id);
          }
          const mean = agg.total / agg.count;
          return jsxs(Text, { children: [
            "- ",
            name.padEnd(28),
            " avg=",
            jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
            " passed=",
            agg.passed,
            " failed=",
            agg.failed
          ] }, id);
        })
      ] }),
      jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
        jsx(Text, { color: "magenta", children: "test case scores" }),
        testCases.map((tc, i) => jsxs(Box, { children: [
          jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
          jsxs(Text, { children: [
            " ",
            tc.name.padEnd(24)
          ] }),
          tc.averageScore !== void 0 ? jsxs(Fragment, { children: [
            jsxs(Text, { color: scoreColor(tc.averageScore), children: [
              "score=",
              tc.averageScore.toFixed(2)
            ] }),
            jsxs(Text, { color: "gray", children: [
              " ",
              createBar(tc.averageScore, 100, 14)
            ] })
          ] }) : jsx(Text, { color: "gray", children: "score=n/a" }),
          jsxs(Text, { color: "gray", children: [
            " (",
            tc.durationMs,
            "ms)"
          ] })
        ] }, i))
      ] }),
      jsx(Box, { marginTop: 1, children: jsxs(Text, { color: "gray", children: [
        "artifact: ",
        summary.artifactPath
      ] }) })
    ] })
  ] });
}
|
|
917
1376
|
|
|
918
1377
|
// src/cli-simple/run.ts
|
|
919
1378
|
var ansi2 = {
|
|
@@ -945,7 +1404,7 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
|
|
|
945
1404
|
const mean = aggregate.total / aggregate.count;
|
|
946
1405
|
return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
947
1406
|
}
|
|
948
|
-
function
|
|
1407
|
+
function createBar2(value, max = 100, width = 20) {
|
|
949
1408
|
const safe = Math.max(0, Math.min(max, value));
|
|
950
1409
|
const filled = Math.round(safe / max * width);
|
|
951
1410
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
@@ -968,7 +1427,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
968
1427
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
969
1428
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
970
1429
|
scoreParts.push(
|
|
971
|
-
`${colorize(formatted, scoreToColor(numeric))} ${colorize(
|
|
1430
|
+
`${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
|
|
972
1431
|
);
|
|
973
1432
|
} else {
|
|
974
1433
|
scoreParts.push(formatted);
|
|
@@ -1007,7 +1466,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1007
1466
|
}
|
|
1008
1467
|
return line;
|
|
1009
1468
|
}
|
|
1010
|
-
async function
|
|
1469
|
+
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
1011
1470
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
1012
1471
|
if (!dataset) {
|
|
1013
1472
|
const known = await runner.collectDatasets();
|
|
@@ -1154,7 +1613,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1154
1613
|
`- overall avg score: ${colorize(
|
|
1155
1614
|
overallAverage.toFixed(2),
|
|
1156
1615
|
scoreToColor(overallAverage)
|
|
1157
|
-
)} ${colorize(
|
|
1616
|
+
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
1158
1617
|
);
|
|
1159
1618
|
}
|
|
1160
1619
|
console.log(colorize("- evaluator averages:", ansi2.magenta));
|
|
@@ -1177,12 +1636,31 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1177
1636
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
1178
1637
|
summary.averageScore.toFixed(2),
|
|
1179
1638
|
scoreToColor(summary.averageScore)
|
|
1180
|
-
)} ${colorize(
|
|
1639
|
+
)} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1181
1640
|
);
|
|
1182
1641
|
}
|
|
1183
1642
|
}
|
|
1184
1643
|
console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
|
|
1185
1644
|
}
|
|
1645
|
+
/**
 * Runs the "run" command through the Ink UI.
 *
 * Renders a RunView and returns a promise that settles when the view reports
 * completion: it rejects with the reported error, or resolves with no value
 * on success. The Ink app is unmounted before the promise settles.
 *
 * @param {object} runner - Runner instance handed to the view.
 * @param {string} datasetName - Dataset to evaluate.
 * @param {string} evaluatorPattern - Name or pattern selecting evaluators.
 * @returns {Promise<void>}
 */
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
  return new Promise((settle, fail) => {
    // Declared before `app` exists; it only runs once the view calls back.
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        fail(err);
        return;
      }
      settle();
    };
    const view = React2.createElement(RunView, {
      runner,
      datasetName,
      evaluatorPattern,
      onComplete: handleComplete
    });
    const app = render(view);
  });
}
|
|
1186
1664
|
|
|
1187
1665
|
// src/cli-simple/index.ts
|
|
1188
1666
|
function printUsageAndExit(exitCode) {
|
|
@@ -1210,14 +1688,24 @@ async function main() {
|
|
|
1210
1688
|
console.error("Missing required --evaluator <name-or-pattern> argument.");
|
|
1211
1689
|
printUsageAndExit(1);
|
|
1212
1690
|
}
|
|
1213
|
-
|
|
1691
|
+
const useInk = process.stdout.isTTY === true;
|
|
1692
|
+
if (!useInk) {
|
|
1693
|
+
printBanner();
|
|
1694
|
+
}
|
|
1214
1695
|
const runner = createRunner();
|
|
1215
1696
|
try {
|
|
1216
1697
|
if (args.command === "run") {
|
|
1217
|
-
await
|
|
1698
|
+
await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
|
|
1699
|
+
runner,
|
|
1700
|
+
args.datasetName,
|
|
1701
|
+
args.evaluatorPattern
|
|
1702
|
+
);
|
|
1218
1703
|
return;
|
|
1219
1704
|
}
|
|
1220
|
-
await
|
|
1705
|
+
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|
|
1706
|
+
runner,
|
|
1707
|
+
args.datasetName
|
|
1708
|
+
);
|
|
1221
1709
|
} finally {
|
|
1222
1710
|
await runner.shutdown();
|
|
1223
1711
|
}
|