@m4trix/evals 0.9.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +500 -9
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +497 -9
- package/dist/cli-simple.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -9,8 +9,13 @@ var jitiModule = require('jiti');
|
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
11
|
require('json-diff');
|
|
12
|
+
var React2 = require('react');
|
|
13
|
+
var ink = require('ink');
|
|
14
|
+
var jsxRuntime = require('react/jsx-runtime');
|
|
12
15
|
|
|
13
16
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
17
|
+
/**
 * CommonJS/ESM interop helper.
 * Returns the module unchanged when it is flagged as an ES module,
 * otherwise wraps the value so it is reachable through `.default`.
 */
function _interopDefault(e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
|
|
18
|
+
|
|
14
19
|
function _interopNamespace(e) {
|
|
15
20
|
if (e && e.__esModule) return e;
|
|
16
21
|
var n = Object.create(null);
|
|
@@ -30,6 +35,7 @@ function _interopNamespace(e) {
|
|
|
30
35
|
}
|
|
31
36
|
|
|
32
37
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
38
|
+
var React2__default = /*#__PURE__*/_interopDefault(React2);
|
|
33
39
|
|
|
34
40
|
// src/runner/config.ts
|
|
35
41
|
var defaultRunnerConfig = {
|
|
@@ -909,6 +915,88 @@ function printBanner() {
|
|
|
909
915
|
];
|
|
910
916
|
console.log(lines.join("\n"));
|
|
911
917
|
}
|
|
918
|
+
/**
 * Header element for the Ink views: a round, cyan-bordered box holding the
 * package name, a middle-dot separator and the CLI name.
 */
function Banner() {
  const packageLabel = /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "@m4trix/evals" });
  const separator = /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: " \xB7 " });
  const commandLabel = /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "eval-agents-simple" });
  const boxProps = {
    borderStyle: "round",
    borderColor: "cyan",
    paddingX: 1,
    paddingY: 0,
    children: [packageLabel, separator, commandLabel]
  };
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, boxProps);
}
|
|
925
|
+
/**
 * Ink view for the "generate" command.
 *
 * On mount it resolves the dataset by name, collects its test cases, writes
 * them as `<dataset file name>.cases.json` next to the dataset file and then
 * renders a short confirmation.
 *
 * @param {object} props
 * @param {object} props.runner - Runner exposing `resolveDatasetByName` and
 *   `collectDatasetTestCases`.
 * @param {string} props.datasetName - Name of the dataset to export.
 * @param {(err?: Error) => void} props.onComplete - Called with no argument on
 *   success (after a 200 ms delay so the final frame is painted) or with an
 *   Error on failure.
 */
function GenerateView({
  runner,
  datasetName,
  onComplete
}) {
  const [result, setResult] = React2.useState(null);
  const [error, setError] = React2.useState(null);
  React2.useEffect(() => {
    let cancelled = false;
    // Reports a failure exactly once, using the same Error instance for the
    // rendered message and for the completion callback.
    const fail = (failure) => {
      if (!cancelled) {
        setError(failure);
      }
      onComplete(failure);
    };
    async function run() {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        fail(new Error(`Dataset "${datasetName}" not found.`));
        return;
      }
      const { writeFile: writeFile2 } = await import('fs/promises');
      const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
      const testCases = await runner.collectDatasetTestCases(dataset.id);
      const payload = testCases.map((item) => {
        const tc = item.testCase;
        return {
          name: tc.getName(),
          input: tc.getInput(),
          // getOutput is optional on a test case; omit the output when absent.
          output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
        };
      });
      const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
      const parsed = parse2(absoluteDatasetPath);
      const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
      await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
      if (!cancelled) {
        setResult({
          count: payload.length,
          datasetName: dataset.dataset.getName(),
          outputPath
        });
        // Give Ink a moment to paint the result before the caller unmounts.
        setTimeout(() => onComplete(), 200);
      }
    }
    // Any rejection (dataset lookup, test-case collection, file write) must
    // reach onComplete; otherwise the caller's promise would never settle.
    run().catch((err) => {
      fail(err instanceof Error ? err : new Error(String(err)));
    });
    return () => {
      cancelled = true;
    };
  }, [runner, datasetName, onComplete]);
  if (error) {
    return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
      /* @__PURE__ */ jsxRuntime.jsx(Banner, {}),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: error.message })
    ] });
  }
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
    result && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "green", children: [
        "Generated ",
        result.count,
        ' test cases for dataset "',
        result.datasetName,
        '".'
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
        "Wrote ",
        result.outputPath
      ] })
    ] })
  ] });
}
|
|
998
|
+
|
|
999
|
+
// src/cli-simple/generate.ts
|
|
912
1000
|
function readOutput2(testCase) {
|
|
913
1001
|
if (typeof testCase.getOutput !== "function") {
|
|
914
1002
|
return void 0;
|
|
@@ -919,7 +1007,7 @@ function createOutputPath(datasetFilePath) {
|
|
|
919
1007
|
const parsed = path.parse(datasetFilePath);
|
|
920
1008
|
return path.join(parsed.dir, `${parsed.name}.cases.json`);
|
|
921
1009
|
}
|
|
922
|
-
async function
|
|
1010
|
+
async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
923
1011
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
924
1012
|
if (!dataset) {
|
|
925
1013
|
throw new Error(`Dataset "${datasetName}" not found.`);
|
|
@@ -937,6 +1025,380 @@ async function generateDatasetJsonCommand(runner, datasetName) {
|
|
|
937
1025
|
console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
|
|
938
1026
|
console.log(`Wrote ${outputPath}`);
|
|
939
1027
|
}
|
|
1028
|
+
/**
 * Runs the "generate" command through the Ink UI.
 *
 * Mounts GenerateView and settles when the view calls `onComplete`: the Ink
 * app is unmounted first, then the promise rejects with the reported error or
 * resolves with no value.
 */
async function generateDatasetJsonCommandInk(runner, datasetName) {
  return new Promise((resolve4, reject) => {
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        reject(err);
        return;
      }
      resolve4();
    };
    const element = React2__default.default.createElement(GenerateView, {
      runner,
      datasetName,
      onComplete: handleComplete
    });
    const app = ink.render(element);
  });
}
|
|
1046
|
+
/**
 * Maps a percentage to a bar color: "green" from 70 up, "yellow" from 40 up,
 * "red" for everything else.
 */
function barColor(pct) {
  return pct >= 70 ? "green" : pct >= 40 ? "yellow" : "red";
}
|
|
1053
|
+
/**
 * Renders a labelled text progress bar: `label [████░░░░] value`.
 *
 * @param {object} props
 * @param {string} props.label - Text shown before the bar, padded to `labelWidth`.
 * @param {number} props.value - Value to plot; clamped to `[0, max]` for the bar,
 *   passed unclamped to `format` for the trailing text.
 * @param {number} [props.max=100] - Value that corresponds to a full bar.
 * @param {number} [props.labelWidth=14] - Width the label is padded to.
 * @param {number} [props.barWidth=20] - Number of cells in the bar.
 * @param {(v: number) => string} [props.format] - Formats the trailing value text.
 * @param {boolean} [props.colorByValue=true] - Color the filled part via `barColor`.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  // One guarded ratio drives both the percentage and the cell count, so a
  // non-positive `max` or a NaN `value` yields an empty bar of the full width
  // instead of NaN cell counts (which rendered as a bar with no cells at all).
  const rawRatio = max > 0 ? clamped / max : 0;
  const ratio = Number.isFinite(rawRatio) ? rawRatio : 0;
  const pct = ratio * 100;
  const filled = Math.max(0, Math.min(barWidth, Math.round(ratio * barWidth)));
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
    /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color, children: filledBar }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
|
|
1079
|
+
// Braille glyphs cycled by Spinner, one per tick.
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
/**
 * Animated cyan spinner followed by a label.
 * Advances one frame every 100 ms and stops its timer on unmount.
 */
function Spinner({ label = "Running" }) {
  const [frame, setFrame] = React2.useState(0);
  React2.useEffect(() => {
    const advance = () => {
      setFrame((current) => (current + 1) % FRAMES.length);
    };
    const timer = setInterval(advance, 100);
    return () => {
      clearInterval(timer);
    };
  }, []);
  const parts = [FRAMES[frame], " ", label];
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: parts });
}
|
|
1094
|
+
/**
 * Maps a score to a display color: "green" from 80 up, "yellow" from 50 up,
 * "red" for everything else.
 */
function scoreColor(score) {
  return score >= 80 ? "green" : score >= 50 ? "yellow" : "red";
}
|
|
1101
|
+
/**
 * Builds a plain-text bar of `width` cells: filled blocks followed by light
 * shade blocks.
 *
 * @param {number} value - Value to plot; clamped to `[0, max]`.
 * @param {number} [max=100] - Value that corresponds to a full bar.
 * @param {number} [width=20] - Total number of cells; expected to be non-negative.
 * @returns {string} The bar text.
 */
function createBar(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  // A non-positive `max` or a NaN `value` would make the cell count NaN, and
  // `repeat(NaN)` yields "" for both halves; fall back to an empty bar of the
  // full width instead.
  const ratio = max > 0 ? safe / max : 0;
  const filled = Number.isFinite(ratio) ? Math.round(ratio * width) : 0;
  return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
}
|
|
1106
|
+
/**
 * Formats one score entry for display.
 *
 * Without a registered score definition (`getScoreById`), the numeric value
 * from `toNumericScore` is printed with two decimals, or "n/a" when there is
 * none. With a definition, its `format` output is used, and for the "bar"
 * display strategy a text bar is appended when a finite number is available
 * (taken from `data.value` when present, otherwise from `toNumericScore`).
 *
 * `scoreToColor2` is accepted but not used by this function.
 */
function formatScorePart(item, scoreToColor2) {
  const definition = getScoreById(item.id);
  if (!definition) {
    const fallback = toNumericScore(item.data);
    if (fallback === void 0) {
      return "n/a";
    }
    return fallback.toFixed(2);
  }
  const text = definition.format(item.data);
  if (definition.displayStrategy !== "bar") {
    return text;
  }
  const hasValueField = typeof item.data === "object" && item.data !== null && "value" in item.data;
  const barValue = hasValueField ? item.data.value : toNumericScore(item.data);
  const drawable = typeof barValue === "number" && Number.isFinite(barValue);
  return drawable ? `${text} ${createBar(barValue)}` : text;
}
|
|
1121
|
+
/**
 * Ink view for the "run" command.
 *
 * On mount it resolves the dataset and the evaluators matching
 * `evaluatorPattern`, starts the run via `runner.runDatasetWith`, streams
 * per-test-case results from the runner's run events and finally renders a
 * summary (pass/fail counts, overall average, per-evaluator averages,
 * per-test-case scores and the artifact path).
 *
 * @param {object} props
 * @param {object} props.runner - Runner used to resolve datasets/evaluators,
 *   subscribe to run events and start the run.
 * @param {string} props.datasetName - Name of the dataset to evaluate.
 * @param {string} props.evaluatorPattern - Name or pattern selecting evaluators.
 * @param {(err?: Error) => void} props.onComplete - Called with no argument when
 *   the run completed (after a 200 ms delay so the summary is painted) or with
 *   an Error when the run could not be started or failed.
 */
function RunView({
  runner,
  datasetName,
  evaluatorPattern,
  onComplete
}) {
  const [phase, setPhase] = React2.useState(
    "loading"
  );
  const [runInfo, setRunInfo] = React2.useState(null);
  const [testCases, setTestCases] = React2.useState([]);
  const [summary, setSummary] = React2.useState(null);
  const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
  const runEval = React2.useCallback(async () => {
    // Holds the run-event unsubscribe function while a subscription is active.
    let stopListening;
    const releaseSubscription = () => {
      if (stopListening) {
        const stop = stopListening;
        stopListening = void 0;
        stop();
      }
    };
    try {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        const known = await runner.collectDatasets();
        const available = known.map((item) => item.dataset.getName()).sort();
        onComplete(
          new Error(
            available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
          )
        );
        return;
      }
      const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
      if (evaluators.length === 0) {
        const known = await runner.collectEvaluators();
        const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
        onComplete(
          new Error(
            available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
          )
        );
        return;
      }
      // Display name per evaluator id, falling back to the id itself.
      const nameById = new Map(
        evaluators.map((item) => [
          item.id,
          item.evaluator.getName() ?? item.id
        ])
      );
      setEvaluatorNameById(nameById);
      const aggregates = /* @__PURE__ */ new Map();
      let overallScoreTotal = 0;
      let overallScoreCount = 0;
      // Subscribe before starting the run so no progress event is missed; the
      // promise resolves with the terminal RunCompleted/RunFailed event.
      const done = new Promise((resolve4) => {
        stopListening = runner.subscribeRunEvents((event) => {
          if (event.type === "TestCaseProgress") {
            const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
            const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
            for (const item of event.evaluatorScores) {
              const numeric = toNumericScoreFromScores(item.scores);
              if (numeric !== void 0) {
                const current = aggregates.get(item.evaluatorId) ?? {
                  total: 0,
                  count: 0,
                  passed: 0,
                  failed: 0
                };
                aggregates.set(item.evaluatorId, {
                  total: current.total + numeric,
                  count: current.count + 1,
                  passed: current.passed + (item.passed ? 1 : 0),
                  failed: current.failed + (item.passed ? 0 : 1)
                });
                overallScoreTotal += numeric;
                overallScoreCount += 1;
              }
            }
            setTestCases((prev) => [
              ...prev,
              {
                name: event.testCaseName,
                completedTestCases: event.completedTestCases,
                totalTestCases: event.totalTestCases,
                durationMs: event.durationMs,
                passed: event.passed,
                averageScore,
                evaluatorScores: event.evaluatorScores.map((item) => ({
                  evaluatorId: item.evaluatorId,
                  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
                  scores: item.scores,
                  passed: item.passed,
                  metrics: item.metrics
                }))
              }
            ]);
          }
          if (event.type === "RunCompleted" || event.type === "RunFailed") {
            releaseSubscription();
            resolve4(event);
          }
        });
      });
      const snapshot = await runner.runDatasetWith({
        datasetId: dataset.id,
        evaluatorIds: evaluators.map((item) => item.id)
      });
      setRunInfo({
        runId: snapshot.runId,
        datasetName: snapshot.datasetName,
        evaluatorNames: evaluators.map(
          (e) => e.evaluator.getName() ?? e.id
        ),
        totalTestCases: snapshot.totalTestCases
      });
      setPhase("running");
      const finalEvent = await done;
      if (finalEvent.type === "RunFailed") {
        onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
        return;
      }
      setSummary({
        passedTestCases: finalEvent.passedTestCases,
        failedTestCases: finalEvent.failedTestCases,
        totalTestCases: finalEvent.totalTestCases,
        overallScoreTotal,
        overallScoreCount,
        aggregates: new Map(aggregates),
        artifactPath: finalEvent.artifactPath
      });
      setPhase("completed");
      // Give Ink a moment to paint the summary before the caller unmounts.
      setTimeout(() => onComplete(), 200);
    } catch (err) {
      // A rejection from the runner must still release the event subscription
      // and reach onComplete; otherwise the caller's promise never settles.
      releaseSubscription();
      onComplete(err instanceof Error ? err : new Error(String(err)));
    }
  }, [runner, datasetName, evaluatorPattern, onComplete]);
  React2.useEffect(() => {
    void runEval();
  }, [runEval]);
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
    // Run header: shown once the run snapshot is available.
    runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run " }),
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Dataset " }),
        runInfo.datasetName
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Evaluators " }),
        runInfo.evaluatorNames.join(", ")
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Test cases " }),
        runInfo.totalTestCases
      ] })
    ] }),
    // Progress spinner while the run is in flight.
    phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
      Spinner,
      {
        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
      }
    ) }),
    // Streaming per-test-case results, one entry per progress event.
    testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
          "[",
          tc.completedTestCases,
          "/",
          tc.totalTestCases,
          "]"
        ] }),
        " ",
        tc.name,
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
          " (",
          tc.durationMs,
          "ms)"
        ] })
      ] }),
      tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        item.evaluatorName,
        ":",
        " ",
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
        " ",
        item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
          formatScorePart(s),
          " "
        ] }, s.id)),
        item.metrics?.map((m) => {
          const def = getMetricById(m.id);
          // Metrics without a registered definition are not rendered.
          if (!def)
            return null;
          const formatted = def.format(m.data);
          return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
            "[",
            def.name ? `${def.name}: ` : "",
            formatted,
            "]",
            " "
          ] }, m.id);
        })
      ] }) }, item.evaluatorId))
    ] }, i)) }),
    // Final summary, rendered once the run has completed.
    phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "green", children: "passed" }),
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
          " ",
          summary.passedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
          " ",
          summary.failedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      summary.overallScoreCount > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
        TextBar,
        {
          label: "overall avg",
          value: summary.overallScoreTotal / summary.overallScoreCount,
          barWidth: 20,
          format: (v) => v.toFixed(2)
        }
      ) }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
        Array.from(evaluatorNameById.entries()).map(([id, name]) => {
          const agg = summary.aggregates.get(id);
          if (!agg || agg.count === 0) {
            return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
              "- ",
              name.padEnd(28),
              " no numeric scores"
            ] }, id);
          }
          const mean = agg.total / agg.count;
          return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
            "- ",
            name.padEnd(28),
            " avg=",
            /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
            " passed=",
            agg.passed,
            " failed=",
            agg.failed
          ] }, id);
        })
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
        testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
          /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
          /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
            " ",
            tc.name.padEnd(24)
          ] }),
          tc.averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
            /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(tc.averageScore), children: [
              "score=",
              tc.averageScore.toFixed(2)
            ] }),
            /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
              " ",
              createBar(tc.averageScore, 100, 14)
            ] })
          ] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
          /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
            " (",
            tc.durationMs,
            "ms)"
          ] })
        ] }, i))
      ] }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
        "artifact: ",
        summary.artifactPath
      ] }) })
    ] })
  ] });
}
|
|
940
1402
|
|
|
941
1403
|
// src/cli-simple/run.ts
|
|
942
1404
|
var ansi2 = {
|
|
@@ -968,7 +1430,7 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
|
|
|
968
1430
|
const mean = aggregate.total / aggregate.count;
|
|
969
1431
|
return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
970
1432
|
}
|
|
971
|
-
function
|
|
1433
|
+
function createBar2(value, max = 100, width = 20) {
|
|
972
1434
|
const safe = Math.max(0, Math.min(max, value));
|
|
973
1435
|
const filled = Math.round(safe / max * width);
|
|
974
1436
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
@@ -991,7 +1453,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
991
1453
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
992
1454
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
993
1455
|
scoreParts.push(
|
|
994
|
-
`${colorize(formatted, scoreToColor(numeric))} ${colorize(
|
|
1456
|
+
`${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
|
|
995
1457
|
);
|
|
996
1458
|
} else {
|
|
997
1459
|
scoreParts.push(formatted);
|
|
@@ -1030,7 +1492,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1030
1492
|
}
|
|
1031
1493
|
return line;
|
|
1032
1494
|
}
|
|
1033
|
-
async function
|
|
1495
|
+
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
1034
1496
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
1035
1497
|
if (!dataset) {
|
|
1036
1498
|
const known = await runner.collectDatasets();
|
|
@@ -1177,7 +1639,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1177
1639
|
`- overall avg score: ${colorize(
|
|
1178
1640
|
overallAverage.toFixed(2),
|
|
1179
1641
|
scoreToColor(overallAverage)
|
|
1180
|
-
)} ${colorize(
|
|
1642
|
+
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
1181
1643
|
);
|
|
1182
1644
|
}
|
|
1183
1645
|
console.log(colorize("- evaluator averages:", ansi2.magenta));
|
|
@@ -1200,12 +1662,31 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1200
1662
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
1201
1663
|
summary.averageScore.toFixed(2),
|
|
1202
1664
|
scoreToColor(summary.averageScore)
|
|
1203
|
-
)} ${colorize(
|
|
1665
|
+
)} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1204
1666
|
);
|
|
1205
1667
|
}
|
|
1206
1668
|
}
|
|
1207
1669
|
console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
|
|
1208
1670
|
}
|
|
1671
|
+
/**
 * Runs the "run" command through the Ink UI.
 *
 * Mounts RunView and settles when the view calls `onComplete`: the Ink app is
 * unmounted first, then the promise rejects with the reported error or
 * resolves with no value.
 */
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
  return new Promise((resolve4, reject) => {
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        reject(err);
        return;
      }
      resolve4();
    };
    const element = React2__default.default.createElement(RunView, {
      runner,
      datasetName,
      evaluatorPattern,
      onComplete: handleComplete
    });
    const app = ink.render(element);
  });
}
|
|
1209
1690
|
|
|
1210
1691
|
// src/cli-simple/index.ts
|
|
1211
1692
|
function printUsageAndExit(exitCode) {
|
|
@@ -1233,14 +1714,24 @@ async function main() {
|
|
|
1233
1714
|
console.error("Missing required --evaluator <name-or-pattern> argument.");
|
|
1234
1715
|
printUsageAndExit(1);
|
|
1235
1716
|
}
|
|
1236
|
-
|
|
1717
|
+
const useInk = process.stdout.isTTY === true;
|
|
1718
|
+
if (!useInk) {
|
|
1719
|
+
printBanner();
|
|
1720
|
+
}
|
|
1237
1721
|
const runner = createRunner();
|
|
1238
1722
|
try {
|
|
1239
1723
|
if (args.command === "run") {
|
|
1240
|
-
await
|
|
1724
|
+
await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
|
|
1725
|
+
runner,
|
|
1726
|
+
args.datasetName,
|
|
1727
|
+
args.evaluatorPattern
|
|
1728
|
+
);
|
|
1241
1729
|
return;
|
|
1242
1730
|
}
|
|
1243
|
-
await
|
|
1731
|
+
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|
|
1732
|
+
runner,
|
|
1733
|
+
args.datasetName
|
|
1734
|
+
);
|
|
1244
1735
|
} finally {
|
|
1245
1736
|
await runner.shutdown();
|
|
1246
1737
|
}
|