@m4trix/evals 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +546 -35
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +543 -35
- package/dist/cli-simple.js.map +1 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +5 -5
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -7,6 +7,9 @@ import * as jitiModule from 'jiti';
|
|
|
7
7
|
import { writeFile, mkdir, appendFile, readdir } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
9
|
import 'json-diff';
|
|
10
|
+
import React2, { useState, useEffect, useCallback } from 'react';
|
|
11
|
+
import { render, Box, Text } from 'ink';
|
|
12
|
+
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
10
13
|
|
|
11
14
|
// src/runner/config.ts
|
|
12
15
|
var defaultRunnerConfig = {
|
|
@@ -867,6 +870,107 @@ function getSimpleCliUsage() {
|
|
|
867
870
|
' "/score/i" regex literal'
|
|
868
871
|
].join("\n");
|
|
869
872
|
}
|
|
873
|
+
|
|
874
|
+
// src/cli-simple/banner.ts
|
|
875
|
+
// ANSI escape sequences used to style the plain-text banner.
var ansi = {
  reset: "\x1B[0m",
  dim: "\x1B[2m",
  cyan: "\x1B[36m"
};
/**
 * Print the CLI banner to stdout as a rounded box drawn with ANSI colors.
 *
 * The banner is emitted with a single `console.log` call: a blank line, the
 * three box rows, and a trailing blank line. The middle row is padded so its
 * right border lines up with the top and bottom borders.
 *
 * @returns {void}
 */
function printBanner() {
  const c = (s) => `${ansi.cyan}${s}${ansi.reset}`;
  const d = (s) => `${ansi.dim}${s}${ansi.reset}`;
  const packageLabel = "@m4trix/evals";
  const commandLabel = "eval-agents-simple";
  // Number of cells between the left and right border characters.
  const innerWidth = 45;
  const horizontal = "\u2500".repeat(innerWidth);
  // Visible (escape-free) width of the row content: a leading space, both
  // labels, and the " · " separator. ANSI escapes take no columns, so the
  // padding has to be computed from the raw text rather than the styled string.
  const visibleWidth = 1 + packageLabel.length + 3 + commandLabel.length;
  const padding = " ".repeat(Math.max(0, innerWidth - visibleWidth));
  const lines = [
    "",
    ` ${c(`\u256D${horizontal}\u256E`)}`,
    ` ${c("\u2502")} ${d(packageLabel)} ${c("\xB7")} ${d(commandLabel)}${padding}${c("\u2502")}`,
    ` ${c(`\u2570${horizontal}\u256F`)}`,
    ""
  ];
  console.log(lines.join("\n"));
}
|
|
892
|
+
/**
 * Ink component rendering the CLI banner: the package name and the command
 * name separated by a cyan middle dot, inside a rounded cyan border.
 */
function Banner() {
  const packageLabel = /* @__PURE__ */ jsx(Text, { color: "gray", children: "@m4trix/evals" });
  const separator = /* @__PURE__ */ jsx(Text, { color: "cyan", children: " \xB7 " });
  const commandLabel = /* @__PURE__ */ jsx(Text, { color: "gray", children: "eval-agents-simple" });
  const boxProps = {
    borderStyle: "round",
    borderColor: "cyan",
    paddingX: 1,
    paddingY: 0,
    children: [packageLabel, separator, commandLabel]
  };
  return /* @__PURE__ */ jsxs(Box, boxProps);
}
|
|
899
|
+
/**
 * Ink view for the "generate" command: resolves a dataset by name, writes its
 * test cases to `<dataset file name>.cases.json` next to the dataset file, and
 * renders the outcome.
 *
 * @param {object} props
 * @param {object} props.runner - Runner providing `resolveDatasetByName` and
 *   `collectDatasetTestCases`.
 * @param {string} props.datasetName - Name of the dataset to export.
 * @param {(err?: Error) => void} props.onComplete - Called with an Error when
 *   anything fails, or with no argument roughly 200ms after a successful write.
 */
function GenerateView({
  runner,
  datasetName,
  onComplete
}) {
  const [result, setResult] = useState(null);
  const [error, setError] = useState(null);
  useEffect(() => {
    let cancelled = false;
    async function run() {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        throw new Error(`Dataset "${datasetName}" not found.`);
      }
      const { writeFile: writeFile2 } = await import('fs/promises');
      const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
      const testCases = await runner.collectDatasetTestCases(dataset.id);
      const payload = testCases.map((item) => {
        const tc = item.testCase;
        return {
          name: tc.getName(),
          input: tc.getInput(),
          // getOutput is optional on a test case; omit the field when absent.
          output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
        };
      });
      const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
      const parsed = parse2(absoluteDatasetPath);
      const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
      await writeFile2(
        outputPath,
        `${JSON.stringify(payload, null, 2)}\n`,
        "utf8"
      );
      if (!cancelled) {
        setResult({
          count: payload.length,
          datasetName: dataset.dataset.getName(),
          outputPath
        });
        // Short delay before completing so the result state can be rendered.
        setTimeout(() => onComplete(), 200);
      }
    }
    // Every failure (missing dataset, runner error, file write error) is
    // routed to onComplete so the caller is never left waiting on a promise
    // that cannot settle.
    run().catch((err) => {
      const failure = err instanceof Error ? err : new Error(String(err));
      if (!cancelled) {
        setError(failure);
      }
      onComplete(failure);
    });
    return () => {
      cancelled = true;
    };
  }, [runner, datasetName, onComplete]);
  if (error) {
    return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
      /* @__PURE__ */ jsx(Banner, {}),
      /* @__PURE__ */ jsx(Text, { color: "red", children: error.message })
    ] });
  }
  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
    result && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsxs(Text, { color: "green", children: [
        "Generated ",
        result.count,
        ' test cases for dataset "',
        result.datasetName,
        '".'
      ] }),
      /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
        "Wrote ",
        result.outputPath
      ] })
    ] })
  ] });
}
|
|
972
|
+
|
|
973
|
+
// src/cli-simple/generate.ts
|
|
870
974
|
function readOutput2(testCase) {
|
|
871
975
|
if (typeof testCase.getOutput !== "function") {
|
|
872
976
|
return void 0;
|
|
@@ -877,7 +981,7 @@ function createOutputPath(datasetFilePath) {
|
|
|
877
981
|
const parsed = parse(datasetFilePath);
|
|
878
982
|
return join(parsed.dir, `${parsed.name}.cases.json`);
|
|
879
983
|
}
|
|
880
|
-
async function
|
|
984
|
+
async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
881
985
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
882
986
|
if (!dataset) {
|
|
883
987
|
throw new Error(`Dataset "${datasetName}" not found.`);
|
|
@@ -895,9 +999,383 @@ async function generateDatasetJsonCommand(runner, datasetName) {
|
|
|
895
999
|
console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
|
|
896
1000
|
console.log(`Wrote ${outputPath}`);
|
|
897
1001
|
}
|
|
1002
|
+
/**
 * Run the "generate" command through the Ink UI.
 *
 * Renders GenerateView and settles once the view reports completion: the Ink
 * app is unmounted first, then the promise rejects with the reported error or
 * resolves with no value.
 *
 * @param {object} runner - Runner instance handed to the view.
 * @param {string} datasetName - Name of the dataset to export.
 * @returns {Promise<void>}
 */
async function generateDatasetJsonCommandInk(runner, datasetName) {
  return new Promise((resolve4, reject) => {
    const handleComplete = (err) => {
      app.unmount();
      if (!err) {
        resolve4();
        return;
      }
      reject(err);
    };
    const element = React2.createElement(GenerateView, {
      runner,
      datasetName,
      onComplete: handleComplete
    });
    const app = render(element);
  });
}
|
|
1020
|
+
/**
 * Choose the color for a progress bar from its fill percentage.
 *
 * @param {number} pct - Fill percentage.
 * @returns {"green" | "yellow" | "red"} "green" at 70 and above, "yellow" at
 *   40 and above, otherwise "red".
 */
function barColor(pct) {
  return pct >= 70 ? "green" : pct >= 40 ? "yellow" : "red";
}
|
|
1027
|
+
/**
 * Ink component rendering a labelled horizontal bar: `label [████░░░░] value`.
 *
 * @param {object} props
 * @param {string} props.label - Text shown before the bar, padded to `labelWidth`.
 * @param {number} props.value - Value to plot; clamped to `[0, max]` for the bar.
 * @param {number} [props.max=100] - Value that corresponds to a full bar.
 * @param {number} [props.labelWidth=14] - Column width the label is padded to.
 * @param {number} [props.barWidth=20] - Number of cells in the bar.
 * @param {(v: number) => string} [props.format] - Formats the unclamped value
 *   shown after the bar.
 * @param {boolean} [props.colorByValue=true] - When true the filled part and
 *   the value are colored via `barColor`; otherwise the bar is uncolored and
 *   the value is white.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  // A zero or negative max, or a NaN value, gives a non-finite ratio. Treat
  // those as an empty bar so the bar keeps its full width instead of
  // collapsing (String.prototype.repeat turns a NaN count into "").
  const rawRatio = clamped / max;
  const ratio = max > 0 && Number.isFinite(rawRatio) ? rawRatio : 0;
  const pct = ratio * 100;
  const filled = Math.max(0, Math.round(ratio * barWidth));
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return /* @__PURE__ */ jsxs(Text, { children: [
    /* @__PURE__ */ jsx(Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? /* @__PURE__ */ jsxs(Fragment, { children: [
      /* @__PURE__ */ jsx(Text, { color, children: filledBar }),
      /* @__PURE__ */ jsx(Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    /* @__PURE__ */ jsx(Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
|
|
1053
|
+
// Braille glyphs cycled through by the spinner animation.
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
/**
 * Ink component showing an animated spinner glyph followed by a label.
 * The glyph advances every 100ms; the interval is cleared when the component
 * unmounts.
 *
 * @param {object} props
 * @param {string} [props.label="Running"] - Text shown after the glyph.
 */
function Spinner({ label = "Running" }) {
  const [frameIndex, setFrameIndex] = useState(0);
  useEffect(() => {
    const advance = () => {
      setFrameIndex((current) => (current + 1) % FRAMES.length);
    };
    const intervalId = setInterval(advance, 100);
    return () => {
      clearInterval(intervalId);
    };
  }, []);
  const glyph = FRAMES[frameIndex];
  return /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [glyph, " ", label] });
}
|
|
1068
|
+
/**
 * Map a numeric score to the Ink color name used to display it.
 *
 * @param {number} score - Score to classify.
 * @returns {"green" | "yellow" | "red"} "green" at 80 and above, "yellow" at
 *   50 and above, otherwise "red".
 */
function scoreColor(score) {
  return score >= 80 ? "green" : score >= 50 ? "yellow" : "red";
}
|
|
1075
|
+
/**
 * Build a fixed-width text progress bar from filled and empty block characters.
 *
 * @param {number} value - Value to plot; clamped to `[0, max]`.
 * @param {number} [max=100] - Value that corresponds to a completely filled bar.
 * @param {number} [width=20] - Total number of cells in the bar.
 * @returns {string} `width` characters: filled cells followed by empty cells.
 *   A NaN value or a `max` that is not positive yields an all-empty bar, and a
 *   negative `width` yields an empty string.
 */
function createBar(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  const ratio = safe / max;
  // NaN value or max === 0 makes the ratio NaN; repeat() would then coerce
  // both counts to 0 and silently return "", breaking column alignment.
  const filled = Number.isFinite(ratio) ? Math.max(0, Math.round(ratio * width)) : 0;
  // Clamp so a negative width cannot make repeat() throw a RangeError.
  const empty = Math.max(0, width - filled);
  return "\u2588".repeat(filled) + "\u2591".repeat(empty);
}
|
|
1080
|
+
/**
 * Format one score entry for display in the Ink run view.
 *
 * When no score definition is registered for `item.id`, the numeric value of
 * `item.data` is shown with two decimals, or "n/a" if it has none. Otherwise
 * the definition's own `format` output is used, followed by a text bar when
 * the definition's display strategy is "bar" and a finite number is available.
 *
 * @param {{ id: string, data: unknown }} item - Score entry to format.
 * @param {unknown} scoreToColor2 - Accepted but not read by this function.
 * @returns {string} Display text for the score.
 */
function formatScorePart(item, scoreToColor2) {
  const definition = getScoreById(item.id);
  if (!definition) {
    const fallback = toNumericScore(item.data);
    if (fallback === void 0) {
      return "n/a";
    }
    return `${fallback.toFixed(2)}`;
  }
  const text = definition.format(item.data);
  if (definition.displayStrategy !== "bar") {
    return text;
  }
  const data = item.data;
  // Prefer an explicit `value` field on object payloads before deriving one.
  const carriesValue = typeof data === "object" && data !== null && "value" in data;
  const numeric = carriesValue ? data.value : toNumericScore(data);
  const drawable = typeof numeric === "number" && Number.isFinite(numeric);
  return drawable ? `${text} ${createBar(numeric)}` : text;
}
|
|
1095
|
+
/**
 * Ink view for the "run" command: resolves the dataset and the evaluators
 * matching a pattern, starts the run, renders each test case result as it is
 * reported, and finally renders a summary.
 *
 * @param {object} props
 * @param {object} props.runner - Runner providing dataset/evaluator lookup,
 *   `subscribeRunEvents` and `runDatasetWith`.
 * @param {string} props.datasetName - Name of the dataset to run.
 * @param {string} props.evaluatorPattern - Name or pattern selecting evaluators.
 * @param {(err?: Error) => void} props.onComplete - Called with an Error when
 *   the dataset or evaluators cannot be resolved, the run fails, or any step
 *   throws; called with no argument roughly 200ms after the summary is set.
 */
function RunView({
  runner,
  datasetName,
  evaluatorPattern,
  onComplete
}) {
  const [phase, setPhase] = useState(
    "loading"
  );
  const [runInfo, setRunInfo] = useState(null);
  const [testCases, setTestCases] = useState([]);
  const [summary, setSummary] = useState(null);
  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
  const runEval = useCallback(async () => {
    // Holds the active run-event unsubscribe function so that a failure
    // anywhere below can release the subscription.
    let unsubscribe;
    try {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        const known = await runner.collectDatasets();
        const available = known.map((item) => item.dataset.getName()).sort();
        throw new Error(
          available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
        );
      }
      const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
      if (evaluators.length === 0) {
        const known = await runner.collectEvaluators();
        const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
        throw new Error(
          available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
        );
      }
      const nameById = new Map(
        evaluators.map((item) => [
          item.id,
          item.evaluator.getName() ?? item.id
        ])
      );
      setEvaluatorNameById(nameById);
      const aggregates = /* @__PURE__ */ new Map();
      let overallScoreTotal = 0;
      let overallScoreCount = 0;
      // Subscribe before starting the run so no progress event is missed.
      const done = new Promise((resolve4) => {
        unsubscribe = runner.subscribeRunEvents((event) => {
          if (event.type === "TestCaseProgress") {
            // Derive each evaluator's numeric score once and reuse it for the
            // per-test-case average and the running aggregates.
            const scored = event.evaluatorScores.map((item) => ({
              item,
              numeric: toNumericScoreFromScores(item.scores)
            }));
            const numericScores = scored.map((entry) => entry.numeric).filter((v) => v !== void 0);
            const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
            for (const { item, numeric } of scored) {
              if (numeric === void 0) {
                continue;
              }
              const current = aggregates.get(item.evaluatorId) ?? {
                total: 0,
                count: 0,
                passed: 0,
                failed: 0
              };
              aggregates.set(item.evaluatorId, {
                total: current.total + numeric,
                count: current.count + 1,
                passed: current.passed + (item.passed ? 1 : 0),
                failed: current.failed + (item.passed ? 0 : 1)
              });
              overallScoreTotal += numeric;
              overallScoreCount += 1;
            }
            setTestCases((prev) => [
              ...prev,
              {
                name: event.testCaseName,
                completedTestCases: event.completedTestCases,
                totalTestCases: event.totalTestCases,
                durationMs: event.durationMs,
                passed: event.passed,
                averageScore,
                evaluatorScores: event.evaluatorScores.map((item) => ({
                  evaluatorId: item.evaluatorId,
                  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
                  scores: item.scores,
                  passed: item.passed,
                  metrics: item.metrics
                }))
              }
            ]);
          }
          if (event.type === "RunCompleted" || event.type === "RunFailed") {
            unsubscribe?.();
            unsubscribe = void 0;
            resolve4(event);
          }
        });
      });
      const snapshot = await runner.runDatasetWith({
        datasetId: dataset.id,
        evaluatorIds: evaluators.map((item) => item.id)
      });
      setRunInfo({
        runId: snapshot.runId,
        datasetName: snapshot.datasetName,
        evaluatorNames: evaluators.map(
          (e) => e.evaluator.getName() ?? e.id
        ),
        totalTestCases: snapshot.totalTestCases
      });
      setPhase("running");
      const finalEvent = await done;
      if (finalEvent.type === "RunFailed") {
        throw new Error(`Run failed: ${finalEvent.errorMessage}`);
      }
      setSummary({
        passedTestCases: finalEvent.passedTestCases,
        failedTestCases: finalEvent.failedTestCases,
        totalTestCases: finalEvent.totalTestCases,
        overallScoreTotal,
        overallScoreCount,
        aggregates: new Map(aggregates),
        artifactPath: finalEvent.artifactPath
      });
      setPhase("completed");
      // Short delay before completing so the summary state can be rendered.
      setTimeout(() => onComplete(), 200);
    } catch (err) {
      // Without this, a rejection from the runner would be an unhandled
      // rejection and onComplete would never fire, leaving the caller's
      // promise pending forever.
      unsubscribe?.();
      onComplete(err instanceof Error ? err : new Error(String(err)));
    }
  }, [runner, datasetName, evaluatorPattern, onComplete]);
  useEffect(() => {
    void runEval();
  }, [runEval]);
  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
    runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run " }),
        /* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
      ] }),
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Dataset " }),
        runInfo.datasetName
      ] }),
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Evaluators " }),
        runInfo.evaluatorNames.join(", ")
      ] }),
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Test cases " }),
        runInfo.totalTestCases
      ] })
    ] }),
    phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
      Spinner,
      {
        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
      }
    ) }),
    testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
          "[",
          tc.completedTestCases,
          "/",
          tc.totalTestCases,
          "]"
        ] }),
        " ",
        tc.name,
        /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
          " (",
          tc.durationMs,
          "ms)"
        ] })
      ] }),
      tc.evaluatorScores.map((item) => /* @__PURE__ */ jsx(Box, { marginLeft: 2, children: /* @__PURE__ */ jsxs(Text, { children: [
        item.evaluatorName,
        ":",
        " ",
        /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
        " ",
        item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
          formatScorePart(s),
          " "
        ] }, s.id)),
        item.metrics?.map((m) => {
          const def = getMetricById(m.id);
          // Metrics without a registered definition are not rendered.
          if (!def)
            return null;
          const formatted = def.format(m.data);
          return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
            "[",
            def.name ? `${def.name}: ` : "",
            formatted,
            "]",
            " "
          ] }, m.id);
        })
      ] }) }, item.evaluatorId))
    ] }, i)) }),
    phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
      /* @__PURE__ */ jsxs(Box, { marginTop: 1, children: [
        /* @__PURE__ */ jsx(Text, { color: "green", children: "passed" }),
        /* @__PURE__ */ jsxs(Text, { children: [
          " ",
          summary.passedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      /* @__PURE__ */ jsxs(Box, { children: [
        /* @__PURE__ */ jsx(Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
        /* @__PURE__ */ jsxs(Text, { children: [
          " ",
          summary.failedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      summary.overallScoreCount > 0 && /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsx(
        TextBar,
        {
          label: "overall avg",
          value: summary.overallScoreTotal / summary.overallScoreCount,
          barWidth: 20,
          format: (v) => v.toFixed(2)
        }
      ) }),
      /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
        Array.from(evaluatorNameById.entries()).map(([id, name]) => {
          const agg = summary.aggregates.get(id);
          if (!agg || agg.count === 0) {
            return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
              "- ",
              name.padEnd(28),
              " no numeric scores"
            ] }, id);
          }
          const mean = agg.total / agg.count;
          return /* @__PURE__ */ jsxs(Text, { children: [
            "- ",
            name.padEnd(28),
            " avg=",
            /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
            " passed=",
            agg.passed,
            " failed=",
            agg.failed
          ] }, id);
        })
      ] }),
      /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
        testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { children: [
          /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
          /* @__PURE__ */ jsxs(Text, { children: [
            " ",
            tc.name.padEnd(24)
          ] }),
          tc.averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
            /* @__PURE__ */ jsxs(Text, { color: scoreColor(tc.averageScore), children: [
              "score=",
              tc.averageScore.toFixed(2)
            ] }),
            /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
              " ",
              createBar(tc.averageScore, 100, 14)
            ] })
          ] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
          /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
            " (",
            tc.durationMs,
            "ms)"
          ] })
        ] }, i))
      ] }),
      /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
        "artifact: ",
        summary.artifactPath
      ] }) })
    ] })
  ] });
}
|
|
898
1376
|
|
|
899
1377
|
// src/cli-simple/run.ts
|
|
900
|
-
var
|
|
1378
|
+
var ansi2 = {
|
|
901
1379
|
reset: "\x1B[0m",
|
|
902
1380
|
bold: "\x1B[1m",
|
|
903
1381
|
dim: "\x1B[2m",
|
|
@@ -908,16 +1386,16 @@ var ansi = {
|
|
|
908
1386
|
magenta: "\x1B[35m"
|
|
909
1387
|
};
|
|
910
1388
|
/**
 * Wrap text in an ANSI style sequence and reset styling afterwards.
 *
 * @param {string} text - Text to style.
 * @param {string} color - ANSI escape sequence(s) placed before the text.
 * @returns {string} `color`, then `text`, then the ANSI reset sequence.
 */
function colorize(text, color) {
  const resetSequence = ansi2.reset;
  return `${color}${text}${resetSequence}`;
}
|
|
913
1391
|
/**
 * Map a numeric score to the ANSI color sequence used in plain-text output.
 *
 * @param {number} score - Score to classify.
 * @returns {string} The green sequence at 80 and above, the yellow sequence at
 *   50 and above, otherwise the red sequence.
 */
function scoreToColor(score) {
  return score >= 80 ? ansi2.green : score >= 50 ? ansi2.yellow : ansi2.red;
}
|
|
922
1400
|
function getEvaluatorSummaryLine(evaluatorName, aggregate) {
|
|
923
1401
|
if (!aggregate || aggregate.count === 0) {
|
|
@@ -926,13 +1404,13 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
|
|
|
926
1404
|
const mean = aggregate.total / aggregate.count;
|
|
927
1405
|
return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
928
1406
|
}
|
|
929
|
-
function
|
|
1407
|
+
/**
 * Build a fixed-width text progress bar for the plain-text run output.
 *
 * @param {number} value - Value to plot; clamped to `[0, max]`.
 * @param {number} [max=100] - Value that corresponds to a completely filled bar.
 * @param {number} [width=20] - Total number of cells in the bar.
 * @returns {string} `width` characters: filled cells followed by empty cells.
 *   A NaN value or a `max` that is not positive yields an all-empty bar, and a
 *   negative `width` yields an empty string.
 */
function createBar2(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  const ratio = safe / max;
  // NaN value or max === 0 makes the ratio NaN; repeat() would then coerce
  // both counts to 0 and silently return "", breaking column alignment.
  const filled = Number.isFinite(ratio) ? Math.max(0, Math.round(ratio * width)) : 0;
  // Clamp so a negative width cannot make repeat() throw a RangeError.
  const empty = Math.max(0, width - filled);
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(empty)}`;
}
|
|
934
1412
|
function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
935
|
-
const passLabel = passed ? colorize("PASS", `${
|
|
1413
|
+
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
936
1414
|
const scoreParts = [];
|
|
937
1415
|
for (const item of scores) {
|
|
938
1416
|
const def = getScoreById(item.id);
|
|
@@ -949,7 +1427,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
949
1427
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
950
1428
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
951
1429
|
scoreParts.push(
|
|
952
|
-
`${colorize(formatted, scoreToColor(numeric))} ${colorize(
|
|
1430
|
+
`${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
|
|
953
1431
|
);
|
|
954
1432
|
} else {
|
|
955
1433
|
scoreParts.push(formatted);
|
|
@@ -963,7 +1441,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
963
1441
|
scoreParts.push(
|
|
964
1442
|
colorize(
|
|
965
1443
|
formatted,
|
|
966
|
-
item.passed === true ? `${
|
|
1444
|
+
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
967
1445
|
)
|
|
968
1446
|
);
|
|
969
1447
|
break;
|
|
@@ -988,7 +1466,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
988
1466
|
}
|
|
989
1467
|
return line;
|
|
990
1468
|
}
|
|
991
|
-
async function
|
|
1469
|
+
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
992
1470
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
993
1471
|
if (!dataset) {
|
|
994
1472
|
const known = await runner.collectDatasets();
|
|
@@ -1030,10 +1508,10 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1030
1508
|
const frame = spinnerFrames[spinnerIndex % spinnerFrames.length];
|
|
1031
1509
|
spinnerIndex += 1;
|
|
1032
1510
|
process.stdout.write(
|
|
1033
|
-
`\r${colorize(frame,
|
|
1511
|
+
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
1034
1512
|
`${completedCount}/${totalCount}`,
|
|
1035
|
-
|
|
1036
|
-
)} ${colorize("(live)",
|
|
1513
|
+
ansi2.bold
|
|
1514
|
+
)} ${colorize("(live)", ansi2.dim)}`
|
|
1037
1515
|
);
|
|
1038
1516
|
}
|
|
1039
1517
|
let spinnerTimer;
|
|
@@ -1045,7 +1523,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1045
1523
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
1046
1524
|
clearLine();
|
|
1047
1525
|
console.log(
|
|
1048
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`,
|
|
1526
|
+
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi2.dim)}`
|
|
1049
1527
|
);
|
|
1050
1528
|
for (const item of event.evaluatorScores) {
|
|
1051
1529
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
@@ -1096,14 +1574,14 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1096
1574
|
evaluatorIds: evaluators.map((item) => item.id)
|
|
1097
1575
|
});
|
|
1098
1576
|
totalCount = snapshot.totalTestCases;
|
|
1099
|
-
console.log(colorize("=== Eval Run Started ===", `${
|
|
1100
|
-
console.log(`Run: ${colorize(snapshot.runId,
|
|
1101
|
-
console.log(`Dataset: ${colorize(snapshot.datasetName,
|
|
1577
|
+
console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
1578
|
+
console.log(`Run: ${colorize(snapshot.runId, ansi2.cyan)}`);
|
|
1579
|
+
console.log(`Dataset: ${colorize(snapshot.datasetName, ansi2.bold)}`);
|
|
1102
1580
|
console.log(
|
|
1103
1581
|
`Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
|
|
1104
1582
|
);
|
|
1105
1583
|
console.log(
|
|
1106
|
-
`Total test cases: ${colorize(String(snapshot.totalTestCases),
|
|
1584
|
+
`Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`
|
|
1107
1585
|
);
|
|
1108
1586
|
console.log("");
|
|
1109
1587
|
drawSpinner();
|
|
@@ -1116,17 +1594,17 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1116
1594
|
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
1117
1595
|
}
|
|
1118
1596
|
console.log("");
|
|
1119
|
-
console.log(colorize("=== Run Summary ===", `${
|
|
1597
|
+
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
1120
1598
|
console.log(
|
|
1121
1599
|
`- passed: ${colorize(
|
|
1122
1600
|
`${finalEvent.passedTestCases}/${finalEvent.totalTestCases}`,
|
|
1123
|
-
|
|
1601
|
+
ansi2.green
|
|
1124
1602
|
)}`
|
|
1125
1603
|
);
|
|
1126
1604
|
console.log(
|
|
1127
1605
|
`- failed: ${colorize(
|
|
1128
1606
|
`${finalEvent.failedTestCases}/${finalEvent.totalTestCases}`,
|
|
1129
|
-
finalEvent.failedTestCases > 0 ?
|
|
1607
|
+
finalEvent.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
1130
1608
|
)}`
|
|
1131
1609
|
);
|
|
1132
1610
|
if (overallScoreCount > 0) {
|
|
@@ -1135,22 +1613,22 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1135
1613
|
`- overall avg score: ${colorize(
|
|
1136
1614
|
overallAverage.toFixed(2),
|
|
1137
1615
|
scoreToColor(overallAverage)
|
|
1138
|
-
)} ${colorize(
|
|
1616
|
+
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
1139
1617
|
);
|
|
1140
1618
|
}
|
|
1141
|
-
console.log(colorize("- evaluator averages:",
|
|
1619
|
+
console.log(colorize("- evaluator averages:", ansi2.magenta));
|
|
1142
1620
|
for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
|
|
1143
1621
|
console.log(
|
|
1144
1622
|
getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
|
|
1145
1623
|
);
|
|
1146
1624
|
}
|
|
1147
1625
|
if (testCaseSummaries.length > 0) {
|
|
1148
|
-
console.log(colorize("- test case scores:",
|
|
1626
|
+
console.log(colorize("- test case scores:", ansi2.magenta));
|
|
1149
1627
|
for (const summary of testCaseSummaries) {
|
|
1150
|
-
const status = summary.passed ? colorize("PASS",
|
|
1628
|
+
const status = summary.passed ? colorize("PASS", ansi2.green) : colorize("FAIL", ansi2.red);
|
|
1151
1629
|
if (summary.averageScore === void 0) {
|
|
1152
1630
|
console.log(
|
|
1153
|
-
` ${status} ${summary.name.padEnd(24)} score=n/a ${colorize(`(${summary.durationMs}ms)`,
|
|
1631
|
+
` ${status} ${summary.name.padEnd(24)} score=n/a ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1154
1632
|
);
|
|
1155
1633
|
continue;
|
|
1156
1634
|
}
|
|
@@ -1158,11 +1636,30 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1158
1636
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
1159
1637
|
summary.averageScore.toFixed(2),
|
|
1160
1638
|
scoreToColor(summary.averageScore)
|
|
1161
|
-
)} ${colorize(
|
|
1639
|
+
)} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1162
1640
|
);
|
|
1163
1641
|
}
|
|
1164
1642
|
}
|
|
1165
|
-
console.log(`- artifact: ${colorize(finalEvent.artifactPath,
|
|
1643
|
+
console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
|
|
1644
|
+
}
|
|
1645
|
+
/**
 * Run the "run" command through the Ink UI.
 *
 * Renders RunView and settles once the view reports completion: the Ink app
 * is unmounted first, then the promise rejects with the reported error or
 * resolves with no value.
 *
 * @param {object} runner - Runner instance handed to the view.
 * @param {string} datasetName - Name of the dataset to run.
 * @param {string} evaluatorPattern - Name or pattern selecting evaluators.
 * @returns {Promise<void>}
 */
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
  return new Promise((resolve4, reject) => {
    const handleComplete = (err) => {
      app.unmount();
      if (!err) {
        resolve4();
        return;
      }
      reject(err);
    };
    const element = React2.createElement(RunView, {
      runner,
      datasetName,
      evaluatorPattern,
      onComplete: handleComplete
    });
    const app = render(element);
  });
}
|
|
1167
1664
|
|
|
1168
1665
|
// src/cli-simple/index.ts
|
|
@@ -1187,17 +1684,28 @@ async function main() {
|
|
|
1187
1684
|
console.error("Missing required --dataset <datasetName> argument.");
|
|
1188
1685
|
printUsageAndExit(1);
|
|
1189
1686
|
}
|
|
1687
|
+
if (args.command === "run" && !args.evaluatorPattern) {
|
|
1688
|
+
console.error("Missing required --evaluator <name-or-pattern> argument.");
|
|
1689
|
+
printUsageAndExit(1);
|
|
1690
|
+
}
|
|
1691
|
+
const useInk = process.stdout.isTTY === true;
|
|
1692
|
+
if (!useInk) {
|
|
1693
|
+
printBanner();
|
|
1694
|
+
}
|
|
1190
1695
|
const runner = createRunner();
|
|
1191
1696
|
try {
|
|
1192
1697
|
if (args.command === "run") {
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1698
|
+
await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
|
|
1699
|
+
runner,
|
|
1700
|
+
args.datasetName,
|
|
1701
|
+
args.evaluatorPattern
|
|
1702
|
+
);
|
|
1198
1703
|
return;
|
|
1199
1704
|
}
|
|
1200
|
-
await
|
|
1705
|
+
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|
|
1706
|
+
runner,
|
|
1707
|
+
args.datasetName
|
|
1708
|
+
);
|
|
1201
1709
|
} finally {
|
|
1202
1710
|
await runner.shutdown();
|
|
1203
1711
|
}
|