@m4trix/evals 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +546 -35
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +543 -35
- package/dist/cli-simple.js.map +1 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +5 -5
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -9,8 +9,13 @@ var jitiModule = require('jiti');
|
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
11
|
require('json-diff');
|
|
12
|
+
var React2 = require('react');
|
|
13
|
+
var ink = require('ink');
|
|
14
|
+
var jsxRuntime = require('react/jsx-runtime');
|
|
12
15
|
|
|
13
16
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
17
|
+
/**
 * CommonJS/ESM interop helper.
 * Returns `e` untouched when it is truthy and flagged `__esModule`;
 * otherwise wraps it as `{ default: e }`.
 */
function _interopDefault(e) {
  const isEsModule = Boolean(e && e.__esModule);
  if (isEsModule) {
    return e;
  }
  return { default: e };
}
|
|
18
|
+
|
|
14
19
|
function _interopNamespace(e) {
|
|
15
20
|
if (e && e.__esModule) return e;
|
|
16
21
|
var n = Object.create(null);
|
|
@@ -30,6 +35,7 @@ function _interopNamespace(e) {
|
|
|
30
35
|
}
|
|
31
36
|
|
|
32
37
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
38
|
+
var React2__default = /*#__PURE__*/_interopDefault(React2);
|
|
33
39
|
|
|
34
40
|
// src/runner/config.ts
|
|
35
41
|
var defaultRunnerConfig = {
|
|
@@ -890,6 +896,107 @@ function getSimpleCliUsage() {
|
|
|
890
896
|
' "/score/i" regex literal'
|
|
891
897
|
].join("\n");
|
|
892
898
|
}
|
|
899
|
+
|
|
900
|
+
// src/cli-simple/banner.ts
|
|
901
|
+
// ANSI escape sequences used by the plain-text banner.
var ansi = {
  reset: "\x1B[0m",
  dim: "\x1B[2m",
  cyan: "\x1B[36m"
};
/**
 * Prints the boxed CLI banner with a single `console.log` call.
 * The output is five rows: blank, top border, title row, bottom border, blank.
 */
function printBanner() {
  // Wraps text in the given escape code and resets styling afterwards.
  const paint = (code, text) => `${code}${text}${ansi.reset}`;
  const cyan = (text) => paint(ansi.cyan, text);
  const dim = (text) => paint(ansi.dim, text);
  const topBorder = ` ${cyan("\u256D\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256E")}`;
  const titleRow = ` ${cyan("\u2502")} ${dim("@m4trix/evals")} ${cyan("\xB7")} ${dim("eval-agents-simple")} ${cyan("\u2502")}`;
  const bottomBorder = ` ${cyan("\u2570\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u256F")}`;
  const rows = ["", topBorder, titleRow, bottomBorder, ""];
  console.log(rows.join("\n"));
}
|
|
918
|
+
/**
 * Ink banner component: a round-bordered cyan box holding the package name,
 * a cyan separator dot and the CLI name.
 */
function Banner() {
  // Small factory for the three coloured text segments inside the box.
  const segment = (color, children) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color, children });
  const segments = [
    segment("gray", "@m4trix/evals"),
    segment("cyan", " \xB7 "),
    segment("gray", "eval-agents-simple")
  ];
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, {
    borderStyle: "round",
    borderColor: "cyan",
    paddingX: 1,
    paddingY: 0,
    children: segments
  });
}
|
|
925
|
+
/**
 * Ink view for the "generate" command.
 *
 * On mount it resolves the dataset through `runner.resolveDatasetByName`,
 * collects its test cases through `runner.collectDatasetTestCases`, and writes
 * them as pretty-printed JSON to `<dataset file name>.cases.json` in the
 * directory of the dataset file.
 *
 * @param {object} props
 * @param {object} props.runner - Runner used to resolve the dataset and its test cases.
 * @param {string} props.datasetName - Name of the dataset to export.
 * @param {(err?: Error) => void} props.onComplete - Called with an error on
 *   failure, or with no argument roughly 200ms after a successful write.
 */
function GenerateView({
  runner,
  datasetName,
  onComplete
}) {
  const [result, setResult] = React2.useState(null);
  const [error, setError] = React2.useState(null);
  React2.useEffect(() => {
    // Flipped by the effect cleanup so a stale run does not touch state.
    let cancelled = false;
    async function run() {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        throw new Error(`Dataset "${datasetName}" not found.`);
      }
      const { writeFile: writeFile2 } = await import('fs/promises');
      const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
      const testCases = await runner.collectDatasetTestCases(dataset.id);
      const payload = testCases.map((item) => {
        const tc = item.testCase;
        return {
          name: tc.getName(),
          input: tc.getInput(),
          // getOutput is optional on a test case; omit the value when absent.
          output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
        };
      });
      const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
      const parsed = parse2(absoluteDatasetPath);
      const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
      await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}\n`, "utf8");
      if (!cancelled) {
        setResult({
          count: payload.length,
          datasetName: dataset.dataset.getName(),
          outputPath
        });
        // Short delay so the success message is painted before the caller unmounts.
        setTimeout(() => onComplete(), 200);
      }
    }
    // Every failure (missing dataset, runner error, write error) is routed to
    // onComplete; otherwise the caller's promise would never settle.
    run().catch((err) => {
      const failure = err instanceof Error ? err : new Error(String(err));
      if (!cancelled) {
        setError(failure);
      }
      onComplete(failure);
    });
    return () => {
      cancelled = true;
    };
  }, [runner, datasetName, onComplete]);
  if (error) {
    return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
      /* @__PURE__ */ jsxRuntime.jsx(Banner, {}),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: error.message })
    ] });
  }
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
    result && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "green", children: [
        "Generated ",
        result.count,
        ' test cases for dataset "',
        result.datasetName,
        '".'
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
        "Wrote ",
        result.outputPath
      ] })
    ] })
  ] });
}
|
|
998
|
+
|
|
999
|
+
// src/cli-simple/generate.ts
|
|
893
1000
|
function readOutput2(testCase) {
|
|
894
1001
|
if (typeof testCase.getOutput !== "function") {
|
|
895
1002
|
return void 0;
|
|
@@ -900,7 +1007,7 @@ function createOutputPath(datasetFilePath) {
|
|
|
900
1007
|
const parsed = path.parse(datasetFilePath);
|
|
901
1008
|
return path.join(parsed.dir, `${parsed.name}.cases.json`);
|
|
902
1009
|
}
|
|
903
|
-
async function
|
|
1010
|
+
async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
904
1011
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
905
1012
|
if (!dataset) {
|
|
906
1013
|
throw new Error(`Dataset "${datasetName}" not found.`);
|
|
@@ -918,9 +1025,383 @@ async function generateDatasetJsonCommand(runner, datasetName) {
|
|
|
918
1025
|
console.log(`Generated ${payload.length} test cases for dataset "${dataset.dataset.getName()}".`);
|
|
919
1026
|
console.log(`Wrote ${outputPath}`);
|
|
920
1027
|
}
|
|
1028
|
+
/**
 * Runs the "generate" command through the Ink UI.
 * Renders `GenerateView` and settles once the view calls `onComplete`:
 * rejects with the error it passed, or resolves with no value.
 * The Ink app is unmounted before the promise settles.
 */
async function generateDatasetJsonCommandInk(runner, datasetName) {
  return new Promise((fulfil, fail) => {
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        fail(err);
        return;
      }
      fulfil();
    };
    const element = React2__default.default.createElement(GenerateView, {
      runner,
      datasetName,
      onComplete: handleComplete
    });
    const app = ink.render(element);
  });
}
|
|
1046
|
+
/**
 * Maps a percentage to a bar colour name:
 * 70 and above is "green", 40 and above is "yellow", anything else is "red".
 */
function barColor(pct) {
  // Ordered from the highest floor down; the first matching floor wins.
  const tiers = [
    [70, "green"],
    [40, "yellow"]
  ];
  const match = tiers.find(([floor]) => pct >= floor);
  return match ? match[1] : "red";
}
|
|
1053
|
+
/**
 * Ink component rendering `label [████░░░░] value` on one line.
 *
 * @param {object} props
 * @param {string} props.label - Text shown left of the bar, padded to `labelWidth`.
 * @param {number} props.value - Value to plot; clamped into `[0, max]` for the bar only.
 * @param {number} [props.max=100] - Value that corresponds to a full bar.
 * @param {number} [props.labelWidth=14] - Width the label is padded to.
 * @param {number} [props.barWidth=20] - Number of cells in the bar.
 * @param {(v: number) => string} [props.format] - Formats the unclamped value shown after the bar.
 * @param {boolean} [props.colorByValue=true] - Colour the filled part and the value via `barColor`.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  // One guarded ratio feeds both the percentage and the fill count, so a
  // non-positive `max` or a NaN value yields an empty bar of full width
  // instead of a NaN fill count (which would render as "[]").
  const ratio = max > 0 && Number.isFinite(clamped) ? clamped / max : 0;
  const pct = ratio * 100;
  const filled = Math.round(ratio * barWidth);
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
    /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color, children: filledBar }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
|
|
1079
|
+
// Braille glyphs cycled by the spinner, one per tick.
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
/**
 * Ink spinner: shows the current frame glyph followed by a label, in cyan.
 * The frame index advances every 100ms and wraps around `FRAMES`; the
 * interval is cleared by the effect cleanup.
 *
 * @param {object} props
 * @param {string} [props.label="Running"] - Text shown after the glyph.
 */
function Spinner({ label = "Running" }) {
  const [index, setIndex] = React2.useState(0);
  React2.useEffect(() => {
    const advance = () => {
      setIndex((current) => (current + 1) % FRAMES.length);
    };
    const handle = setInterval(advance, 100);
    return () => {
      clearInterval(handle);
    };
  }, []);
  const glyph = FRAMES[index];
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [glyph, " ", label] });
}
|
|
1094
|
+
/**
 * Maps a score to an Ink colour name:
 * 80 and above is "green", 50 and above is "yellow", anything else is "red".
 */
function scoreColor(score) {
  const isGood = score >= 80;
  const isFair = score >= 50;
  return isGood ? "green" : isFair ? "yellow" : "red";
}
|
|
1101
|
+
/**
 * Builds a text progress bar of `width` cells: filled cells (U+2588)
 * followed by empty cells (U+2591).
 *
 * @param {number} value - Value to plot; clamped into `[0, max]`.
 * @param {number} [max=100] - Value that corresponds to a full bar.
 * @param {number} [width=20] - Total number of cells.
 * @returns {string} The bar. A non-positive `max` or a NaN `value` yields an
 *   all-empty bar of `width` cells rather than an empty string.
 */
function createBar(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  // Guard the division: 0/0 or a NaN value would make `filled` NaN, and
  // String.prototype.repeat(NaN) silently produces "" for both halves.
  const ratio = max > 0 && Number.isFinite(safe) ? safe / max : 0;
  const filled = Math.round(ratio * width);
  return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
}
|
|
1106
|
+
/**
 * Formats one score entry for display.
 *
 * - No definition from `getScoreById`: the numeric score with two decimals, or "n/a".
 * - Definition with `displayStrategy === "bar"` and a finite number (taken from
 *   `data.value` when `data` is an object with a `value` key, otherwise from
 *   `toNumericScore`): the formatted text followed by a `createBar` bar.
 * - Otherwise: the definition's formatted text.
 *
 * @param {{ id: string, data: unknown }} item - Score entry to format.
 * @param {unknown} scoreToColor2 - Unused; kept so existing call sites stay valid.
 * @returns {string}
 */
function formatScorePart(item, scoreToColor2) {
  const { id, data } = item;
  const definition = getScoreById(id);
  if (definition) {
    const text = definition.format(data);
    if (definition.displayStrategy !== "bar") {
      return text;
    }
    const carriesValue = typeof data === "object" && data !== null && "value" in data;
    const amount = carriesValue ? data.value : toNumericScore(data);
    const drawable = typeof amount === "number" && Number.isFinite(amount);
    return drawable ? `${text} ${createBar(amount)}` : text;
  }
  const fallback = toNumericScore(data);
  if (fallback === void 0) {
    return "n/a";
  }
  return `${fallback.toFixed(2)}`;
}
|
|
1121
|
+
/**
 * Ink view for the "run" command.
 *
 * On mount it resolves the dataset and the evaluators matching
 * `evaluatorPattern`, subscribes to the runner's run events, starts the run
 * with `runner.runDatasetWith`, and renders per-test-case results as
 * "TestCaseProgress" events arrive. After "RunCompleted" it renders a summary.
 *
 * @param {object} props
 * @param {object} props.runner - Runner used to resolve, execute and observe the run.
 * @param {string} props.datasetName - Name of the dataset to evaluate.
 * @param {string} props.evaluatorPattern - Name or pattern selecting evaluators.
 * @param {(err?: Error) => void} props.onComplete - Called with an error when
 *   the dataset/evaluators cannot be resolved, the run fails, or anything
 *   throws; called with no argument roughly 200ms after a completed run.
 */
function RunView({
  runner,
  datasetName,
  evaluatorPattern,
  onComplete
}) {
  const [phase, setPhase] = React2.useState(
    "loading"
  );
  const [runInfo, setRunInfo] = React2.useState(null);
  const [testCases, setTestCases] = React2.useState([]);
  const [summary, setSummary] = React2.useState(null);
  const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
  const runEval = React2.useCallback(async () => {
    // Kept outside the try block so the catch handler can release the
    // run-event subscription when something throws mid-run.
    let unsubscribe;
    const stopListening = () => {
      const stop = unsubscribe;
      unsubscribe = void 0;
      stop?.();
    };
    try {
      const dataset = await runner.resolveDatasetByName(datasetName);
      if (!dataset) {
        const known = await runner.collectDatasets();
        const available = known.map((item) => item.dataset.getName()).sort();
        onComplete(
          new Error(
            available.length > 0 ? `Dataset "${datasetName}" not found. Available: ${available.join(", ")}` : `Dataset "${datasetName}" not found.`
          )
        );
        return;
      }
      const evaluators = await runner.resolveEvaluatorsByNamePattern(evaluatorPattern);
      if (evaluators.length === 0) {
        const known = await runner.collectEvaluators();
        const available = known.map((item) => item.evaluator.getName()).filter((name) => typeof name === "string").sort();
        onComplete(
          new Error(
            available.length > 0 ? `No evaluator matched "${evaluatorPattern}". Available: ${available.join(", ")}` : `No evaluator matched "${evaluatorPattern}".`
          )
        );
        return;
      }
      const nameById = new Map(
        evaluators.map((item) => [
          item.id,
          item.evaluator.getName() ?? item.id
        ])
      );
      setEvaluatorNameById(nameById);
      // Running totals, updated from progress events and snapshotted into the summary.
      const aggregates = /* @__PURE__ */ new Map();
      let overallScoreTotal = 0;
      let overallScoreCount = 0;
      // Subscribe before starting the run so no event is missed.
      const done = new Promise((resolve4) => {
        unsubscribe = runner.subscribeRunEvents((event) => {
          if (event.type === "TestCaseProgress") {
            const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
            const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, v) => sum + v, 0) / numericScores.length : void 0;
            for (const item of event.evaluatorScores) {
              const numeric = toNumericScoreFromScores(item.scores);
              if (numeric !== void 0) {
                const current = aggregates.get(item.evaluatorId) ?? {
                  total: 0,
                  count: 0,
                  passed: 0,
                  failed: 0
                };
                aggregates.set(item.evaluatorId, {
                  total: current.total + numeric,
                  count: current.count + 1,
                  passed: current.passed + (item.passed ? 1 : 0),
                  failed: current.failed + (item.passed ? 0 : 1)
                });
                overallScoreTotal += numeric;
                overallScoreCount += 1;
              }
            }
            setTestCases((prev) => [
              ...prev,
              {
                name: event.testCaseName,
                completedTestCases: event.completedTestCases,
                totalTestCases: event.totalTestCases,
                durationMs: event.durationMs,
                passed: event.passed,
                averageScore,
                evaluatorScores: event.evaluatorScores.map((item) => ({
                  evaluatorId: item.evaluatorId,
                  evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
                  scores: item.scores,
                  passed: item.passed,
                  metrics: item.metrics
                }))
              }
            ]);
          }
          if (event.type === "RunCompleted" || event.type === "RunFailed") {
            stopListening();
            resolve4(event);
          }
        });
      });
      const snapshot = await runner.runDatasetWith({
        datasetId: dataset.id,
        evaluatorIds: evaluators.map((item) => item.id)
      });
      setRunInfo({
        runId: snapshot.runId,
        datasetName: snapshot.datasetName,
        evaluatorNames: evaluators.map(
          (e) => e.evaluator.getName() ?? e.id
        ),
        totalTestCases: snapshot.totalTestCases
      });
      setPhase("running");
      const finalEvent = await done;
      if (finalEvent.type === "RunFailed") {
        onComplete(new Error(`Run failed: ${finalEvent.errorMessage}`));
        return;
      }
      setSummary({
        passedTestCases: finalEvent.passedTestCases,
        failedTestCases: finalEvent.failedTestCases,
        totalTestCases: finalEvent.totalTestCases,
        overallScoreTotal,
        overallScoreCount,
        aggregates: new Map(aggregates),
        artifactPath: finalEvent.artifactPath
      });
      setPhase("completed");
      // Short delay so the summary is painted before the caller unmounts.
      setTimeout(() => onComplete(), 200);
    } catch (err) {
      // Without this, a rejected runner call would be an unhandled rejection
      // and onComplete would never fire, leaving the caller's promise pending.
      stopListening();
      onComplete(err instanceof Error ? err : new Error(String(err)));
    }
  }, [runner, datasetName, evaluatorPattern, onComplete]);
  React2.useEffect(() => {
    void runEval();
  }, [runEval]);
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
    /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(Banner, {}) }),
    runInfo && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run " }),
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: runInfo.runId })
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Dataset " }),
        runInfo.datasetName
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Evaluators " }),
        runInfo.evaluatorNames.join(", ")
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Test cases " }),
        runInfo.totalTestCases
      ] })
    ] }),
    phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
      Spinner,
      {
        label: `Evaluations ${testCases.length}/${runInfo?.totalTestCases ?? 0}`
      }
    ) }),
    testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
          "[",
          tc.completedTestCases,
          "/",
          tc.totalTestCases,
          "]"
        ] }),
        " ",
        tc.name,
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
          " (",
          tc.durationMs,
          "ms)"
        ] })
      ] }),
      tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
        item.evaluatorName,
        ":",
        " ",
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
        " ",
        item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
          formatScorePart(s),
          " "
        ] }, s.id)),
        item.metrics?.map((m) => {
          const def = getMetricById(m.id);
          if (!def)
            return null;
          const formatted = def.format(m.data);
          return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
            "[",
            def.name ? `${def.name}: ` : "",
            formatted,
            "]",
            " "
          ] }, m.id);
        })
      ] }) }, item.evaluatorId))
    ] }, i)) }),
    phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "green", children: "passed" }),
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
          " ",
          summary.passedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: summary.failedTestCases > 0 ? "red" : "gray", children: "failed" }),
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
          " ",
          summary.failedTestCases,
          "/",
          summary.totalTestCases
        ] })
      ] }),
      summary.overallScoreCount > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
        TextBar,
        {
          label: "overall avg",
          value: summary.overallScoreTotal / summary.overallScoreCount,
          barWidth: 20,
          format: (v) => v.toFixed(2)
        }
      ) }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
        Array.from(evaluatorNameById.entries()).map(([id, name]) => {
          const agg = summary.aggregates.get(id);
          if (!agg || agg.count === 0) {
            return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
              "- ",
              name.padEnd(28),
              " no numeric scores"
            ] }, id);
          }
          const mean = agg.total / agg.count;
          return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
            "- ",
            name.padEnd(28),
            " avg=",
            /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: scoreColor(mean), children: mean.toFixed(2) }),
            " passed=",
            agg.passed,
            " failed=",
            agg.failed
          ] }, id);
        })
      ] }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { marginTop: 1, flexDirection: "column", children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "test case scores" }),
        testCases.map((tc, i) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { children: [
          /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: tc.passed ? "green" : "red", children: tc.passed ? "PASS" : "FAIL" }),
          /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
            " ",
            tc.name.padEnd(24)
          ] }),
          tc.averageScore !== void 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
            /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(tc.averageScore), children: [
              "score=",
              tc.averageScore.toFixed(2)
            ] }),
            /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
              " ",
              createBar(tc.averageScore, 100, 14)
            ] })
          ] }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "score=n/a" }),
          /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
            " (",
            tc.durationMs,
            "ms)"
          ] })
        ] }, i))
      ] }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, children: /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
        "artifact: ",
        summary.artifactPath
      ] }) })
    ] })
  ] });
}
|
|
921
1402
|
|
|
922
1403
|
// src/cli-simple/run.ts
|
|
923
|
-
var
|
|
1404
|
+
var ansi2 = {
|
|
924
1405
|
reset: "\x1B[0m",
|
|
925
1406
|
bold: "\x1B[1m",
|
|
926
1407
|
dim: "\x1B[2m",
|
|
@@ -931,16 +1412,16 @@ var ansi = {
|
|
|
931
1412
|
magenta: "\x1B[35m"
|
|
932
1413
|
};
|
|
933
1414
|
/**
 * Wraps `text` in an ANSI escape: the `color` code first, then the text,
 * then the reset sequence from `ansi2`.
 */
function colorize(text, color) {
  // String.prototype.concat stringifies each argument, like the template literal it replaces.
  return "".concat(color, text, ansi2.reset);
}
|
|
936
1417
|
/**
 * Maps a score to an ANSI colour code from `ansi2`:
 * 80 and above is green, 50 and above is yellow, anything else is red.
 */
function scoreToColor(score) {
  return score >= 80 ? ansi2.green : score >= 50 ? ansi2.yellow : ansi2.red;
}
|
|
945
1426
|
function getEvaluatorSummaryLine(evaluatorName, aggregate) {
|
|
946
1427
|
if (!aggregate || aggregate.count === 0) {
|
|
@@ -949,13 +1430,13 @@ function getEvaluatorSummaryLine(evaluatorName, aggregate) {
|
|
|
949
1430
|
const mean = aggregate.total / aggregate.count;
|
|
950
1431
|
return `- ${evaluatorName.padEnd(28)} avg=${colorize(mean.toFixed(2), scoreToColor(mean))} passed=${aggregate.passed} failed=${aggregate.failed}`;
|
|
951
1432
|
}
|
|
952
|
-
function
|
|
1433
|
+
/**
 * Builds a text progress bar of `width` cells for the plain-text output:
 * filled cells (U+2588) followed by empty cells (U+2591).
 *
 * @param {number} value - Value to plot; clamped into `[0, max]`.
 * @param {number} [max=100] - Value that corresponds to a full bar.
 * @param {number} [width=20] - Total number of cells.
 * @returns {string} The bar. A non-positive `max` or a NaN `value` yields an
 *   all-empty bar of `width` cells rather than an empty string.
 */
function createBar2(value, max = 100, width = 20) {
  const safe = Math.max(0, Math.min(max, value));
  // Guard the division: 0/0 or a NaN value would make `filled` NaN, and
  // String.prototype.repeat(NaN) silently produces "" for both halves.
  const ratio = max > 0 && Number.isFinite(safe) ? safe / max : 0;
  const filled = Math.round(ratio * width);
  return `${"\u2588".repeat(filled)}${"\u2591".repeat(width - filled)}`;
}
|
|
957
1438
|
function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
958
|
-
const passLabel = passed ? colorize("PASS", `${
|
|
1439
|
+
const passLabel = passed ? colorize("PASS", `${ansi2.bold}${ansi2.green}`) : colorize("FAIL", `${ansi2.bold}${ansi2.red}`);
|
|
959
1440
|
const scoreParts = [];
|
|
960
1441
|
for (const item of scores) {
|
|
961
1442
|
const def = getScoreById(item.id);
|
|
@@ -972,7 +1453,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
972
1453
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
973
1454
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
974
1455
|
scoreParts.push(
|
|
975
|
-
`${colorize(formatted, scoreToColor(numeric))} ${colorize(
|
|
1456
|
+
`${colorize(formatted, scoreToColor(numeric))} ${colorize(createBar2(numeric), ansi2.dim)}`
|
|
976
1457
|
);
|
|
977
1458
|
} else {
|
|
978
1459
|
scoreParts.push(formatted);
|
|
@@ -986,7 +1467,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
986
1467
|
scoreParts.push(
|
|
987
1468
|
colorize(
|
|
988
1469
|
formatted,
|
|
989
|
-
item.passed === true ? `${
|
|
1470
|
+
item.passed === true ? `${ansi2.bold}${ansi2.green}` : item.passed === false ? `${ansi2.bold}${ansi2.red}` : ansi2.dim
|
|
990
1471
|
)
|
|
991
1472
|
);
|
|
992
1473
|
break;
|
|
@@ -1011,7 +1492,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics) {
|
|
|
1011
1492
|
}
|
|
1012
1493
|
return line;
|
|
1013
1494
|
}
|
|
1014
|
-
async function
|
|
1495
|
+
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
1015
1496
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
1016
1497
|
if (!dataset) {
|
|
1017
1498
|
const known = await runner.collectDatasets();
|
|
@@ -1053,10 +1534,10 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1053
1534
|
const frame = spinnerFrames[spinnerIndex % spinnerFrames.length];
|
|
1054
1535
|
spinnerIndex += 1;
|
|
1055
1536
|
process.stdout.write(
|
|
1056
|
-
`\r${colorize(frame,
|
|
1537
|
+
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
1057
1538
|
`${completedCount}/${totalCount}`,
|
|
1058
|
-
|
|
1059
|
-
)} ${colorize("(live)",
|
|
1539
|
+
ansi2.bold
|
|
1540
|
+
)} ${colorize("(live)", ansi2.dim)}`
|
|
1060
1541
|
);
|
|
1061
1542
|
}
|
|
1062
1543
|
let spinnerTimer;
|
|
@@ -1068,7 +1549,7 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1068
1549
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
1069
1550
|
clearLine();
|
|
1070
1551
|
console.log(
|
|
1071
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`,
|
|
1552
|
+
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.durationMs}ms)`, ansi2.dim)}`
|
|
1072
1553
|
);
|
|
1073
1554
|
for (const item of event.evaluatorScores) {
|
|
1074
1555
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
@@ -1119,14 +1600,14 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1119
1600
|
evaluatorIds: evaluators.map((item) => item.id)
|
|
1120
1601
|
});
|
|
1121
1602
|
totalCount = snapshot.totalTestCases;
|
|
1122
|
-
console.log(colorize("=== Eval Run Started ===", `${
|
|
1123
|
-
console.log(`Run: ${colorize(snapshot.runId,
|
|
1124
|
-
console.log(`Dataset: ${colorize(snapshot.datasetName,
|
|
1603
|
+
console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
1604
|
+
console.log(`Run: ${colorize(snapshot.runId, ansi2.cyan)}`);
|
|
1605
|
+
console.log(`Dataset: ${colorize(snapshot.datasetName, ansi2.bold)}`);
|
|
1125
1606
|
console.log(
|
|
1126
1607
|
`Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
|
|
1127
1608
|
);
|
|
1128
1609
|
console.log(
|
|
1129
|
-
`Total test cases: ${colorize(String(snapshot.totalTestCases),
|
|
1610
|
+
`Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`
|
|
1130
1611
|
);
|
|
1131
1612
|
console.log("");
|
|
1132
1613
|
drawSpinner();
|
|
@@ -1139,17 +1620,17 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1139
1620
|
throw new Error(`Run failed: ${finalEvent.errorMessage}`);
|
|
1140
1621
|
}
|
|
1141
1622
|
console.log("");
|
|
1142
|
-
console.log(colorize("=== Run Summary ===", `${
|
|
1623
|
+
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
1143
1624
|
console.log(
|
|
1144
1625
|
`- passed: ${colorize(
|
|
1145
1626
|
`${finalEvent.passedTestCases}/${finalEvent.totalTestCases}`,
|
|
1146
|
-
|
|
1627
|
+
ansi2.green
|
|
1147
1628
|
)}`
|
|
1148
1629
|
);
|
|
1149
1630
|
console.log(
|
|
1150
1631
|
`- failed: ${colorize(
|
|
1151
1632
|
`${finalEvent.failedTestCases}/${finalEvent.totalTestCases}`,
|
|
1152
|
-
finalEvent.failedTestCases > 0 ?
|
|
1633
|
+
finalEvent.failedTestCases > 0 ? ansi2.red : ansi2.dim
|
|
1153
1634
|
)}`
|
|
1154
1635
|
);
|
|
1155
1636
|
if (overallScoreCount > 0) {
|
|
@@ -1158,22 +1639,22 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1158
1639
|
`- overall avg score: ${colorize(
|
|
1159
1640
|
overallAverage.toFixed(2),
|
|
1160
1641
|
scoreToColor(overallAverage)
|
|
1161
|
-
)} ${colorize(
|
|
1642
|
+
)} ${colorize(createBar2(overallAverage), ansi2.dim)}`
|
|
1162
1643
|
);
|
|
1163
1644
|
}
|
|
1164
|
-
console.log(colorize("- evaluator averages:",
|
|
1645
|
+
console.log(colorize("- evaluator averages:", ansi2.magenta));
|
|
1165
1646
|
for (const [evaluatorId, evaluatorName] of evaluatorNameById.entries()) {
|
|
1166
1647
|
console.log(
|
|
1167
1648
|
getEvaluatorSummaryLine(evaluatorName, aggregates.get(evaluatorId))
|
|
1168
1649
|
);
|
|
1169
1650
|
}
|
|
1170
1651
|
if (testCaseSummaries.length > 0) {
|
|
1171
|
-
console.log(colorize("- test case scores:",
|
|
1652
|
+
console.log(colorize("- test case scores:", ansi2.magenta));
|
|
1172
1653
|
for (const summary of testCaseSummaries) {
|
|
1173
|
-
const status = summary.passed ? colorize("PASS",
|
|
1654
|
+
const status = summary.passed ? colorize("PASS", ansi2.green) : colorize("FAIL", ansi2.red);
|
|
1174
1655
|
if (summary.averageScore === void 0) {
|
|
1175
1656
|
console.log(
|
|
1176
|
-
` ${status} ${summary.name.padEnd(24)} score=n/a ${colorize(`(${summary.durationMs}ms)`,
|
|
1657
|
+
` ${status} ${summary.name.padEnd(24)} score=n/a ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1177
1658
|
);
|
|
1178
1659
|
continue;
|
|
1179
1660
|
}
|
|
@@ -1181,11 +1662,30 @@ async function runSimpleEvalCommand(runner, datasetName, evaluatorPattern) {
|
|
|
1181
1662
|
` ${status} ${summary.name.padEnd(24)} score=${colorize(
|
|
1182
1663
|
summary.averageScore.toFixed(2),
|
|
1183
1664
|
scoreToColor(summary.averageScore)
|
|
1184
|
-
)} ${colorize(
|
|
1665
|
+
)} ${colorize(createBar2(summary.averageScore, 100, 14), ansi2.dim)} ${colorize(`(${summary.durationMs}ms)`, ansi2.dim)}`
|
|
1185
1666
|
);
|
|
1186
1667
|
}
|
|
1187
1668
|
}
|
|
1188
|
-
console.log(`- artifact: ${colorize(finalEvent.artifactPath,
|
|
1669
|
+
console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
|
|
1670
|
+
}
|
|
1671
|
+
/**
 * Runs the "run" command through the Ink UI.
 * Renders `RunView` and settles once the view calls `onComplete`:
 * rejects with the error it passed, or resolves with no value.
 * The Ink app is unmounted before the promise settles.
 */
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
  return new Promise((fulfil, fail) => {
    const handleComplete = (err) => {
      app.unmount();
      if (err) {
        fail(err);
        return;
      }
      fulfil();
    };
    const element = React2__default.default.createElement(RunView, {
      runner,
      datasetName,
      evaluatorPattern,
      onComplete: handleComplete
    });
    const app = ink.render(element);
  });
}
|
|
1190
1690
|
|
|
1191
1691
|
// src/cli-simple/index.ts
|
|
@@ -1210,17 +1710,28 @@ async function main() {
|
|
|
1210
1710
|
console.error("Missing required --dataset <datasetName> argument.");
|
|
1211
1711
|
printUsageAndExit(1);
|
|
1212
1712
|
}
|
|
1713
|
+
if (args.command === "run" && !args.evaluatorPattern) {
|
|
1714
|
+
console.error("Missing required --evaluator <name-or-pattern> argument.");
|
|
1715
|
+
printUsageAndExit(1);
|
|
1716
|
+
}
|
|
1717
|
+
const useInk = process.stdout.isTTY === true;
|
|
1718
|
+
if (!useInk) {
|
|
1719
|
+
printBanner();
|
|
1720
|
+
}
|
|
1213
1721
|
const runner = createRunner();
|
|
1214
1722
|
try {
|
|
1215
1723
|
if (args.command === "run") {
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1724
|
+
await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
|
|
1725
|
+
runner,
|
|
1726
|
+
args.datasetName,
|
|
1727
|
+
args.evaluatorPattern
|
|
1728
|
+
);
|
|
1221
1729
|
return;
|
|
1222
1730
|
}
|
|
1223
|
-
await
|
|
1731
|
+
await (useInk ? generateDatasetJsonCommandInk : generateDatasetJsonCommandPlain)(
|
|
1732
|
+
runner,
|
|
1733
|
+
args.datasetName
|
|
1734
|
+
);
|
|
1224
1735
|
} finally {
|
|
1225
1736
|
await runner.shutdown();
|
|
1226
1737
|
}
|