@m4trix/evals 0.9.1 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +561 -12
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +558 -12
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +24 -3
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +24 -3
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +23 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +25 -13
- package/dist/index.js +23 -2
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -6,7 +6,10 @@ import { resolve, relative, join, parse, dirname } from 'path';
|
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
7
7
|
import { writeFile, mkdir, appendFile, readdir } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
|
-
import 'json-diff';
|
|
9
|
+
import { diffString } from 'json-diff';
|
|
10
|
+
import React2, { useState, useEffect, useCallback } from 'react';
|
|
11
|
+
import { render, Box, Text } from 'ink';
|
|
12
|
+
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
10
13
|
|
|
11
14
|
// src/runner/config.ts
|
|
12
15
|
var defaultRunnerConfig = {
|
|
@@ -253,6 +256,29 @@ async function collectTestCasesFromFiles(config) {
|
|
|
253
256
|
);
|
|
254
257
|
return found.flat();
|
|
255
258
|
}
|
|
259
|
+
/**
 * Build a log entry of type "diff" that records both values together with a
 * textual diff between them.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value to compare against the reference.
 * @param {{ label?: string }} [options] - Optional settings; only `label` is read.
 * @returns {{ type: "diff", label: (string|undefined), expected: unknown, actual: unknown, diff: string }}
 *   The entry; `diff` falls back to "(no differences)" when `diffString`
 *   returns an empty result.
 */
function createDiffLogEntry(expected, actual, options) {
  // Uncoloured output is requested so the stored text is plain.
  const rendered = diffString(expected, actual, { color: false });
  const entry = {
    type: "diff",
    label: options?.label,
    expected,
    actual
  };
  entry.diff = rendered ? rendered : "(no differences)";
  return entry;
}
|
|
269
|
+
/**
 * Split the textual diff of a "diff" log entry into classified lines so the
 * caller can colour them.
 *
 * The diff text already stored on the entry (`entry.diff`) is reused when it
 * is a non-empty string; only otherwise is it recomputed from
 * `entry.expected` / `entry.actual`.
 *
 * Classification looks at the first character of the *untrimmed* line: the
 * diff renderer puts its `+` / `-` / space marker in column 0, so trimming
 * first would misread an unchanged, indented value such as `   -5` as a
 * removal. Lines starting with `---` or `+++` are treated as context.
 *
 * @param {{ expected: unknown, actual: unknown, diff?: string }} entry - Diff log entry.
 * @returns {Array<{ type: "remove" | "add" | "context", line: string }>}
 *   One item per line of the diff, in order.
 */
function getDiffLines(entry) {
  const stored = typeof entry.diff === "string" && entry.diff.length > 0 ? entry.diff : void 0;
  const raw = stored ?? (diffString(entry.expected, entry.actual, { color: false }) || "(no differences)");
  // The renderer terminates every line with "\n"; drop the final one so the
  // split does not produce a trailing empty "context" line.
  const text = raw.endsWith("\n") ? raw.slice(0, -1) : raw;
  return text.split("\n").map((line) => {
    if (line.startsWith("-") && !line.startsWith("---")) {
      return { type: "remove", line };
    }
    if (line.startsWith("+") && !line.startsWith("+++")) {
      return { type: "add", line };
    }
    return { type: "context", line };
  });
}
|
|
256
282
|
|
|
257
283
|
// src/evals/metric.ts
|
|
258
284
|
var registry = /* @__PURE__ */ new Map();
|
|
@@ -436,6 +462,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
436
462
|
continue;
|
|
437
463
|
}
|
|
438
464
|
try {
|
|
465
|
+
const logs = [];
|
|
466
|
+
const logDiff = (expected, actual, options) => {
|
|
467
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
468
|
+
};
|
|
439
469
|
const ctx = yield* Effect.promise(
|
|
440
470
|
() => Promise.resolve(evaluator.resolveContext())
|
|
441
471
|
);
|
|
@@ -444,13 +474,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
444
474
|
evaluateFn({
|
|
445
475
|
input: testCaseItem.testCase.getInput(),
|
|
446
476
|
ctx,
|
|
447
|
-
output
|
|
477
|
+
output,
|
|
478
|
+
logDiff
|
|
448
479
|
})
|
|
449
480
|
)
|
|
450
481
|
);
|
|
451
482
|
const { scores, metrics } = normalizeResult(result);
|
|
452
483
|
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
453
|
-
evaluatorScores.push({
|
|
484
|
+
evaluatorScores.push({
|
|
485
|
+
evaluatorId,
|
|
486
|
+
scores,
|
|
487
|
+
passed,
|
|
488
|
+
metrics,
|
|
489
|
+
logs: logs.length > 0 ? logs : void 0
|
|
490
|
+
});
|
|
454
491
|
} catch (error) {
|
|
455
492
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
456
493
|
evaluatorScores.push({
|
|
@@ -886,6 +923,88 @@ function printBanner() {
|
|
|
886
923
|
];
|
|
887
924
|
console.log(lines.join("\n"));
|
|
888
925
|
}
|
|
926
|
+
/**
 * Ink component: a rounded cyan box showing the package name and the CLI
 * name separated by a middle dot.
 *
 * @returns {object} The element created by `jsxs(Box, ...)`.
 */
function Banner() {
  const packageLabel = /* @__PURE__ */ jsx(Text, { color: "gray", children: "@m4trix/evals" });
  const separator = /* @__PURE__ */ jsx(Text, { color: "cyan", children: " \xB7 " });
  const cliLabel = /* @__PURE__ */ jsx(Text, { color: "gray", children: "eval-agents-simple" });
  return /* @__PURE__ */ jsxs(Box, {
    borderStyle: "round",
    borderColor: "cyan",
    paddingX: 1,
    paddingY: 0,
    children: [packageLabel, separator, cliLabel]
  });
}
|
|
933
|
+
/**
 * Ink component that exports a dataset's test cases to a JSON file and
 * reports the outcome.
 *
 * On mount it resolves the dataset by name, collects its test cases, and
 * writes `{dataset file name}.cases.json` into the dataset file's directory
 * (resolved against `process.cwd()`). On success it shows a summary and calls
 * `onComplete()` after 200 ms; on any failure it shows the message and calls
 * `onComplete(error)`.
 *
 * @param {object} props
 * @param {object} props.runner - Must provide `resolveDatasetByName` and
 *   `collectDatasetTestCases`.
 * @param {string} props.datasetName - Name of the dataset to export.
 * @param {(err?: Error) => void} props.onComplete - Completion callback.
 * @returns {object} The rendered element tree.
 */
function GenerateView({
  runner,
  datasetName,
  onComplete
}) {
  const [result, setResult] = useState(null);
  const [error, setError] = useState(null);
  useEffect(() => {
    let cancelled = false;
    let completionTimer;
    async function run() {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        throw new Error(`Dataset "${datasetName}" not found.`);
      }
      const testCases = await runner.collectDatasetTestCases(dataset.id);
      const payload = testCases.map(({ testCase }) => ({
        name: testCase.getName(),
        input: testCase.getInput(),
        // Not every test case exposes an output accessor.
        output: typeof testCase.getOutput === "function" ? testCase.getOutput() : void 0
      }));
      // `resolve`, `parse`, `join` and `writeFile` are the module's static
      // imports from 'path' and 'fs/promises'.
      const { dir, name } = parse(resolve(process.cwd(), dataset.filePath));
      const outputPath = join(dir, `${name}.cases.json`);
      await writeFile(outputPath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
      if (cancelled) {
        return;
      }
      setResult({
        count: payload.length,
        datasetName: dataset.dataset.getName(),
        outputPath
      });
      // Short delay before signalling completion (the caller unmounts then).
      completionTimer = setTimeout(() => onComplete(), 200);
    }
    // Every failure (missing dataset, runner error, write error) is routed to
    // onComplete; without this a rejection would leave the caller waiting.
    run().catch((err) => {
      const failure = err instanceof Error ? err : new Error(String(err));
      if (!cancelled) {
        setError(failure);
      }
      onComplete(failure);
    });
    return () => {
      cancelled = true;
      clearTimeout(completionTimer);
    };
  }, [runner, datasetName, onComplete]);
  if (error) {
    return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
      /* @__PURE__ */ jsx(Banner, {}),
      /* @__PURE__ */ jsx(Text, { color: "red", children: error.message })
    ] });
  }
  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
    result && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsxs(Text, { color: "green", children: [
        "Generated ",
        result.count,
        ' test cases for dataset "',
        result.datasetName,
        '".'
      ] }),
      /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
        "Wrote ",
        result.outputPath
      ] })
    ] })
  ] });
}
|
|
1006
|
+
|
|
1007
|
+
// src/cli-simple/generate.ts
|
|
889
1008
|
function readOutput2(testCase) {
|
|
890
1009
|
if (typeof testCase.getOutput !== "function") {
|
|
891
1010
|
return void 0;
|
|
@@ -896,7 +1015,7 @@ function createOutputPath(datasetFilePath) {
|
|
|
896
1015
|
const parsed = parse(datasetFilePath);
|
|
897
1016
|
return join(parsed.dir, `${parsed.name}.cases.json`);
|
|
898
1017
|
}
|
|
899
|
-
async function
|
|
1018
|
+
async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
900
1019
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
901
1020
|
if (!dataset) {
|
|
902
1021
|
throw new Error(`Dataset "${datasetName}" not found.`);
|
|
@@ -914,6 +1033,393 @@ async function generateDatasetJsonCommand(runner, datasetName) {
|
|
|
914
1033
|
console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
|
|
915
1034
|
console.log(`Wrote ${outputPath}`);
|
|
916
1035
|
}
|
|
1036
|
+
/**
 * Render `GenerateView` with Ink and settle once the view reports completion.
 *
 * @param {object} runner - Passed through to `GenerateView`.
 * @param {string} datasetName - Passed through to `GenerateView`.
 * @returns {Promise<void>} Resolves when the view completes without an error;
 *   rejects with the error the view passed to `onComplete`.
 */
async function generateDatasetJsonCommandInk(runner, datasetName) {
  return new Promise((fulfil, fail) => {
    // Called by the view after `render` has returned, so `instance` is set.
    const handleComplete = (err) => {
      instance.unmount();
      if (err) {
        fail(err);
        return;
      }
      fulfil();
    };
    const element = React2.createElement(GenerateView, {
      runner,
      datasetName,
      onComplete: handleComplete
    });
    const instance = render(element);
  });
}
|
|
1054
|
+
/**
 * Pick the colour name for a percentage bar.
 *
 * @param {number} pct - Percentage value.
 * @returns {"green" | "yellow" | "red"} "green" at 70 and above, "yellow" at
 *   40 and above, otherwise "red".
 */
function barColor(pct) {
  return pct >= 70 ? "green" : pct >= 40 ? "yellow" : "red";
}
|
|
1061
|
+
/**
 * Ink component: a labelled horizontal bar of `barWidth` cells followed by
 * the formatted value.
 *
 * `value` is clamped to `[0, max]` for drawing; the unclamped `value` is what
 * gets passed to `format`. When the fill ratio cannot be computed (`max` is
 * not positive, or the division is not finite) the bar is drawn fully empty
 * rather than collapsing to zero width.
 *
 * @param {object} props
 * @param {string} props.label - Text shown before the bar, padded to `labelWidth`.
 * @param {number} props.value - Value to display.
 * @param {number} [props.max=100] - Value that corresponds to a full bar.
 * @param {number} [props.labelWidth=14] - Width the label is padded to.
 * @param {number} [props.barWidth=20] - Number of cells in the bar.
 * @param {(v: number) => string} [props.format] - Formats the value text.
 * @param {boolean} [props.colorByValue=true] - Colour bar and value via `barColor`.
 * @returns {object} The rendered element tree.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  const rawRatio = clamped / max;
  // A NaN repeat count yields an empty string, which would shrink the bar to
  // "[]"; fall back to a ratio of 0 so the bar keeps its width.
  const ratio = max > 0 && Number.isFinite(rawRatio) ? rawRatio : 0;
  const pct = ratio * 100;
  const filled = Math.round(ratio * barWidth);
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return /* @__PURE__ */ jsxs(Text, { children: [
    /* @__PURE__ */ jsx(Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? /* @__PURE__ */ jsxs(Fragment, { children: [
      /* @__PURE__ */ jsx(Text, { color, children: filledBar }),
      /* @__PURE__ */ jsx(Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    /* @__PURE__ */ jsx(Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
|
|
1087
|
+
// Glyphs cycled through by Spinner, one per tick.
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];

/**
 * Ink component: an animated glyph followed by a label, rendered in cyan.
 * The glyph advances every 100 ms and the interval is cleared on unmount.
 *
 * @param {object} props
 * @param {string} [props.label="Running"] - Text shown after the glyph.
 * @returns {object} The rendered element.
 */
function Spinner({ label = "Running" }) {
  const [frame, setFrame] = useState(0);
  useEffect(() => {
    const advance = () => {
      setFrame((current) => (current + 1) % FRAMES.length);
    };
    const timer = setInterval(advance, 100);
    return () => {
      clearInterval(timer);
    };
  }, []);
  const glyph = FRAMES[frame];
  return /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [glyph, " ", label] });
}
|
|
1102
|
+
/**
 * Pick the colour name for a score.
 *
 * @param {number} score - Score value.
 * @returns {"green" | "yellow" | "red"} "green" at 80 and above, "yellow" at
 *   50 and above, otherwise "red".
 */
function scoreColor(score) {
  return score >= 80 ? "green" : score >= 50 ? "yellow" : "red";
}
|
|
1109
|
+
/**
 * Render a fixed-width text bar made of full-block and light-shade cells.
 *
 * `value` is clamped to `[0, max]`. When the fill ratio cannot be computed
 * (`max` is not positive, or the division is not finite, e.g. `value` is NaN)
 * the bar is returned fully empty instead of as an empty string, so the
 * result is always `width` cells long.
 *
 * @param {number} value - Value to visualise.
 * @param {number} [max=100] - Value that corresponds to a full bar.
 * @param {number} [width=20] - Number of cells in the bar.
 * @returns {string} The bar.
 */
function createBar(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  const ratio = safe / max;
  // A NaN repeat count silently produces "", which used to drop the whole bar.
  const filled = max > 0 && Number.isFinite(ratio) ? Math.round(ratio * width) : 0;
  return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
}
|
|
1114
|
+
/**
 * Produce the display text for one score item.
 *
 * If no score definition is registered for `item.id`, the numeric form of
 * `item.data` is shown with two decimals, or "n/a" when there is none.
 * Otherwise the definition's own formatter is used, and for the "bar" display
 * strategy a text bar is appended when a finite number is available.
 *
 * @param {{ id: string, data: unknown }} item - Score item.
 * @param {unknown} scoreToColor2 - Not read by this function.
 * @returns {string} Text to display.
 */
function formatScorePart(item, scoreToColor2) {
  const def = getScoreById(item.id);
  if (def) {
    const formatted = def.format(item.data);
    if (def.displayStrategy !== "bar") {
      return formatted;
    }
    const data = item.data;
    // Prefer an explicit `value` field; otherwise derive a number from the data.
    const carriesValue = typeof data === "object" && data !== null && "value" in data;
    const numeric = carriesValue ? data.value : toNumericScore(data);
    const drawable = typeof numeric === "number" && Number.isFinite(numeric);
    return drawable ? `${formatted} ${createBar(numeric)}` : formatted;
  }
  const fallback = toNumericScore(item.data);
  if (fallback === void 0) {
    return "n/a";
  }
  return fallback.toFixed(2);
}
|
|
1129
|
+
/**
 * Ink component that runs a dataset against the matching evaluators and
 * renders live progress followed by a summary.
 *
 * Phases: "loading" until the run has been started, "running" while
 * test-case events arrive, "completed" once a `RunCompleted` event is seen.
 * `onComplete()` is called 200 ms after completion. `onComplete(error)` is
 * called when the dataset or evaluators cannot be resolved, when the run
 * reports `RunFailed`, or when any step throws or rejects.
 *
 * @param {object} props
 * @param {object} props.runner - Must provide `resolveDatasetByName`,
 *   `collectDatasets`, `resolveEvaluatorsByNamePattern`, `collectEvaluators`,
 *   `subscribeRunEvents` and `runDatasetWith`.
 * @param {string} props.datasetName - Dataset to run.
 * @param {string} props.evaluatorPattern - Name or pattern selecting evaluators.
 * @param {(err?: Error) => void} props.onComplete - Completion callback.
 * @returns {object} The rendered element tree.
 */
function RunView({
  runner,
  datasetName,
  evaluatorPattern,
  onComplete
}) {
  const [phase, setPhase] = useState(
    "loading"
  );
  const [runInfo, setRunInfo] = useState(null);
  const [testCases, setTestCases] = useState([]);
  const [summary, setSummary] = useState(null);
  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
  const runEval = useCallback(async () => {
    let unsubscribe;
    // Releases the run-event subscription at most once.
    const stopListening = () => {
      if (unsubscribe) {
        const release = unsubscribe;
        unsubscribe = void 0;
        release();
      }
    };
    try {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        const known = await runner.collectDatasets();
        const available = known.map((item) => item.dataset.getName()).sort();
        onComplete(
          new Error(
            available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
          )
        );
        return;
      }
      const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
      if (evaluators.length === 0) {
        const known = await runner.collectEvaluators();
        const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
        onComplete(
          new Error(
            available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
          )
        );
        return;
      }
      const nameById = new Map(
        evaluators.map((item) => [
          item.id,
          item.evaluator.getName() ?? item.id
        ])
      );
      setEvaluatorNameById(nameById);
      // Running totals, filled in as TestCaseProgress events arrive.
      const aggregates = /* @__PURE__ */ new Map();
      let overallScoreTotal = 0;
      let overallScoreCount = 0;
      // Subscribe before starting the run so no event is missed.
      const done = new Promise((resolve4) => {
        unsubscribe = runner.subscribeRunEvents((event) => {
          if (event.type === "TestCaseProgress") {
            const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
            const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
            for (const item of event.evaluatorScores) {
              const numeric = toNumericScoreFromScores(item.scores);
              if (numeric !== void 0) {
                const current = aggregates.get(item.evaluatorId) ?? {
                  total: 0,
                  count: 0,
                  passed: 0,
                  failed: 0
                };
                aggregates.set(item.evaluatorId, {
                  total: current.total + numeric,
                  count: current.count + 1,
                  passed: current.passed + (item.passed ? 1 : 0),
                  failed: current.failed + (item.passed ? 0 : 1)
                });
                overallScoreTotal += numeric;
                overallScoreCount += 1;
              }
            }
            setTestCases((prev) => [
              ...prev,
              {
                name: event.testCaseName,
                completedTestCases: event.completedTestCases,
                totalTestCases: event.totalTestCases,
                durationMs: event.durationMs,
                passed: event.passed,
                averageScore,
                evaluatorScores: event.evaluatorScores.map((item) => ({
                  evaluatorId: item.evaluatorId,
                  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
                  scores: item.scores,
                  passed: item.passed,
                  metrics: item.metrics,
                  logs: item.logs
                }))
              }
            ]);
          }
          if (event.type === "RunCompleted" || event.type === "RunFailed") {
            stopListening();
            resolve4(event);
          }
        });
      });
      const snapshot = await runner.runDatasetWith({
        datasetId: dataset.id,
        evaluatorIds: evaluators.map((item) => item.id)
      });
      setRunInfo({
        runId: snapshot.runId,
        datasetName: snapshot.datasetName,
        evaluatorNames: evaluators.map(
          (e) => e.evaluator.getName() ?? e.id
        ),
        totalTestCases: snapshot.totalTestCases
      });
      setPhase("running");
      const finalEvent = await done;
      if (finalEvent.type === "RunFailed") {
        onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
        return;
      }
      setSummary({
        passedTestCases: finalEvent.passedTestCases,
        failedTestCases: finalEvent.failedTestCases,
        totalTestCases: finalEvent.totalTestCases,
        overallScoreTotal,
        overallScoreCount,
        aggregates: new Map(aggregates),
        artifactPath: finalEvent.artifactPath
      });
      setPhase("completed");
      setTimeout(() => onComplete(), 200);
    } catch (err) {
      // runEval is started with `void`, so an escaping rejection would never
      // reach onComplete and the caller would wait forever. Release the
      // subscription and report the failure instead.
      stopListening();
      onComplete(err instanceof Error ? err : new Error(String(err)));
    }
  }, [runner, datasetName, evaluatorPattern, onComplete]);
  useEffect(() => {
    void runEval();
  }, [runEval]);
  return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(Banner, {}) }),
    runInfo && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run " }),
        /* @__PURE__ */ jsx(Text, { color: "gray", children: runInfo.runId })
      ] }),
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Dataset " }),
        runInfo.datasetName
      ] }),
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Evaluators " }),
        runInfo.evaluatorNames.join(", ")
      ] }),
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Test cases " }),
        runInfo.totalTestCases
      ] })
    ] }),
    phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
      Spinner,
      {
        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
      }
    ) }),
    testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
      /* @__PURE__ */ jsxs(Text, { children: [
        /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
          "[",
          tc.completedTestCases,
          "/",
          tc.totalTestCases,
          "]"
        ] }),
        " ",
        tc.name,
        /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
          " (",
          tc.durationMs,
          "ms)"
        ] })
      ] }),
      tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [
        /* @__PURE__ */ jsxs(Text, { children: [
          item.evaluatorName,
          ":",
          " ",
          /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
          " ",
          item.scores.map((s) => /* @__PURE__ */ jsxs(Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
            formatScorePart(s),
            " "
          ] }, s.id)),
          item.metrics?.map((m) => {
            const def = getMetricById(m.id);
            if (!def)
              return null;
            const formatted = def.format(m.data);
            return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
              "[",
              def.name ? `${def.name}: ` : "",
              formatted,
              "]",
              " "
            ] }, m.id);
          })
        ] }),
        // Diff logs are only shown for evaluators that did not pass.
        !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
          (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
            Text,
            {
              color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
              children: line
            },
            lineIdx
          )) }, logIdx) : null
        ) })
      ] }, item.evaluatorId))
    ] }, i)) }),
    phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
      /* @__PURE__ */ jsxs(Box, { marginTop: 1, children: [
        /* @__PURE__ */ jsx(Text, { color: "green", children: "passed" }),
        /* @__PURE__ */ jsxs(Text, { children: [
          " ",
          summary.passedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      /* @__PURE__ */ jsxs(Box, { children: [
        /* @__PURE__ */ jsx(Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
        /* @__PURE__ */ jsxs(Text, { children: [
          " ",
          summary.failedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      summary.overallScoreCount > 0 && /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsx(
        TextBar,
        {
          label: "overall avg",
          value: summary.overallScoreTotal / summary.overallScoreCount,
          barWidth: 20,
          format: (v) => v.toFixed(2)
        }
      ) }),
      /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
        Array.from(evaluatorNameById.entries()).map(([id, name]) => {
          const agg = summary.aggregates.get(id);
          if (!agg || agg.count === 0) {
            return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
              "- ",
              name.padEnd(28),
              " no numeric scores"
            ] }, id);
          }
          const mean = agg.total / agg.count;
          return /* @__PURE__ */ jsxs(Text, { children: [
            "- ",
            name.padEnd(28),
            " avg=",
            /* @__PURE__ */ jsx(Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
            " passed=",
            agg.passed,
            " failed=",
            agg.failed
          ] }, id);
        })
      ] }),
      /* @__PURE__ */ jsxs(Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsx(Text, { color: "magenta", children: "test case scores" }),
        testCases.map((tc, i) => /* @__PURE__ */ jsxs(Box, { children: [
          /* @__PURE__ */ jsx(Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
          /* @__PURE__ */ jsxs(Text, { children: [
            " ",
            tc.name.padEnd(24)
          ] }),
          tc.averageScore !== void 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
            /* @__PURE__ */ jsxs(Text, { color: scoreColor(tc.averageScore), children: [
              "score=",
              tc.averageScore.toFixed(2)
            ] }),
            /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
              " ",
              createBar(tc.averageScore, 100, 14)
            ] })
          ] }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "score=n/a" }),
          /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
            " (",
            tc.durationMs,
            "ms)"
          ] })
        ] }, i))
      ] }),
      /* @__PURE__ */ jsx(Box, { marginTop: 1, children: /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
        "artifact: ",
        summary.artifactPath
      ] }) })
    ] })
  ] });
}
|
|
917
1423
|
|
|
918
1424
|
// src/cli-simple/run.ts
|
|
919
1425
|
var ansi2 = {
|
|
@@ -945,7 +1451,7 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
|
|
|
945
1451
|
const mean = aggregate.total / aggregate.count;
|
|
946
1452
|
return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
947
1453
|
}
|
|
948
|
-
function
|
|
1454
|
+
function createBar2(value, max = 100, width = 20) {
|
|
949
1455
|
const safe = Math.max(0, Math.min(max, value));
|
|
950
1456
|
const filled = Math.round(safe / max * width);
|
|
951
1457
|
return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
|
|
@@ -968,7 +1474,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
968
1474
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
969
1475
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
970
1476
|
scoreParts.push(
|
|
971
|
-
`${colorize(formatted, scoreToColor(numeric))} ${colorize(
|
|
1477
|
+
`${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
|
|
972
1478
|
);
|
|
973
1479
|
} else {
|
|
974
1480
|
scoreParts.push(formatted);
|
|
@@ -1007,7 +1513,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1007
1513
|
}
|
|
1008
1514
|
return line;
|
|
1009
1515
|
}
|
|
1010
|
-
async function
|
|
1516
|
+
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
1011
1517
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
1012
1518
|
if (!dataset) {
|
|
1013
1519
|
const known = await runner.collectDatasets();
|
|
@@ -1076,6 +1582,17 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1076
1582
|
item.metrics
|
|
1077
1583
|
)
|
|
1078
1584
|
);
|
|
1585
|
+
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
1586
|
+
for (const log of item.logs) {
|
|
1587
|
+
if (log.type === "diff") {
|
|
1588
|
+
const useColor = process.stdout.isTTY;
|
|
1589
|
+
for (const { type, line } of getDiffLines(log)) {
|
|
1590
|
+
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
1591
|
+
console.log(colored);
|
|
1592
|
+
}
|
|
1593
|
+
}
|
|
1594
|
+
}
|
|
1595
|
+
}
|
|
1079
1596
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1080
1597
|
if (numeric !== void 0) {
|
|
1081
1598
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
@@ -1154,7 +1671,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1154
1671
|
`- overall avg score: ${colorize(
|
|
1155
1672
|
overallAverage.toFixed(2),
|
|
1156
1673
|
scoreToColor(overallAverage)
|
|
1157
|
-
)} ${colorize(
|
|
1674
|
+
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
1158
1675
|
);
|
|
1159
1676
|
}
|
|
1160
1677
|
console.log(colorize("- evaluator averages:", ansi2.magenta));
|
|
@@ -1177,12 +1694,31 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1177
1694
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
1178
1695
|
summary.averageScore.toFixed(2),
|
|
1179
1696
|
scoreToColor(summary.averageScore)
|
|
1180
|
-
)} ${colorize(
|
|
1697
|
+
)} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1181
1698
|
);
|
|
1182
1699
|
}
|
|
1183
1700
|
}
|
|
1184
1701
|
console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
|
|
1185
1702
|
}
|
|
1703
|
+
/**
 * Render `RunView` with Ink and settle once the view reports completion.
 *
 * @param {object} runner - Passed through to `RunView`.
 * @param {string} datasetName - Passed through to `RunView`.
 * @param {string} evaluatorPattern - Passed through to `RunView`.
 * @returns {Promise<void>} Resolves when the view completes without an error;
 *   rejects with the error the view passed to `onComplete`.
 */
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
  return new Promise((fulfil, fail) => {
    // Called by the view after `render` has returned, so `instance` is set.
    const handleComplete = (err) => {
      instance.unmount();
      if (err) {
        fail(err);
        return;
      }
      fulfil();
    };
    const element = React2.createElement(RunView, {
      runner,
      datasetName,
      evaluatorPattern,
      onComplete: handleComplete
    });
    const instance = render(element);
  });
}
|
|
1186
1722
|
|
|
1187
1723
|
// src/cli-simple/index.ts
|
|
1188
1724
|
function printUsageAndExit(exitCode) {
|
|
@@ -1210,14 +1746,24 @@ async function main() {
|
|
|
1210
1746
|
console.error("Missing required --evaluator <name-or-pattern> argument.");
|
|
1211
1747
|
printUsageAndExit(1);
|
|
1212
1748
|
}
|
|
1213
|
-
|
|
1749
|
+
const useInk = process.stdout.isTTY === true;
|
|
1750
|
+
if (!useInk) {
|
|
1751
|
+
printBanner();
|
|
1752
|
+
}
|
|
1214
1753
|
const runner = createRunner();
|
|
1215
1754
|
try {
|
|
1216
1755
|
if (args.command === "run") {
|
|
1217
|
-
await
|
|
1756
|
+
await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
|
|
1757
|
+
runner,
|
|
1758
|
+
args.datasetName,
|
|
1759
|
+
args.evaluatorPattern
|
|
1760
|
+
);
|
|
1218
1761
|
return;
|
|
1219
1762
|
}
|
|
1220
|
-
await
|
|
1763
|
+
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|
|
1764
|
+
runner,
|
|
1765
|
+
args.datasetName
|
|
1766
|
+
);
|
|
1221
1767
|
} finally {
|
|
1222
1768
|
await runner.shutdown();
|
|
1223
1769
|
}
|