@m4trix/evals 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +105 -76
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +105 -76
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +79 -47
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +79 -47
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +78 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +36 -5
- package/dist/index.js +77 -45
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.cjs
CHANGED
|
@@ -1077,20 +1077,70 @@ function getMetricById(id) {
|
|
|
1077
1077
|
|
|
1078
1078
|
// src/evals/score.ts
|
|
1079
1079
|
var registry2 = /* @__PURE__ */ new Map();
|
|
1080
|
+
/**
 * Render score data as a display string using the score definition's formatters.
 *
 * @param {object} def - Score definition providing `formatValue` and `formatAggregate`.
 * @param {*} data - Score payload handed unchanged to the selected formatter.
 * @param {{ isAggregated?: boolean }} [options] - When `isAggregated` is truthy the
 *   aggregate formatter is used; otherwise the single-value formatter is used.
 * @returns {*} Whatever the selected formatter returns.
 */
function formatScoreData(def, data, options) {
  if (options?.isAggregated) {
    return def.formatAggregate(data);
  }
  return def.formatValue(data);
}
|
|
1083
|
+
/**
 * Built-in aggregation strategies that combine the score data of several runs
 * into a single score data object.
 */
var ScoreAggregate = {
  /**
   * Build an aggregator that averages the listed numeric fields.
   * Use for scores like { value, delta }.
   *
   * @param {string[]} fields - Names of the fields to average.
   * @returns {(values: object[]) => object} Aggregator whose result holds one
   *   averaged entry per listed field. A null/undefined field counts as 0, and
   *   an empty `values` array yields 0 for every field.
   */
  averageFields(fields) {
    return (values) => {
      // Divide by 1 for empty input so the averages are 0 instead of NaN.
      const count = values.length || 1;
      const result = {};
      for (const field of fields) {
        result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
      }
      return result;
    };
  },
  /**
   * Average `value` with sample std dev. Use for percent-style scores.
   *
   * @param {Array<{ value: number }>} values - Score data of the individual runs.
   * @returns {object} The first entry's fields overridden with `value` (the mean),
   *   `stdDev` (sample standard deviation; undefined with fewer than two values)
   *   and `count`. Empty input yields `{ value: 0, stdDev: undefined, count: 0 }`.
   */
  averageWithVariance(values) {
    const count = values.length;
    if (count === 0) {
      return { value: 0, stdDev: void 0, count: 0 };
    }
    const mean = values.reduce((s, v) => s + v.value, 0) / count;
    let stdDev;
    if (count >= 2) {
      // Two-pass sum of squared deviations. The one-pass form
      // (sumSq - n * mean^2) loses all precision through cancellation when the
      // values are large compared to their spread.
      const squaredDeviations = values.reduce((s, v) => {
        const deviation = v.value - mean;
        return s + deviation * deviation;
      }, 0);
      const variance = squaredDeviations / (count - 1);
      // Anything that is not a positive variance (including NaN) reports 0.
      stdDev = variance > 0 ? Math.sqrt(variance) : 0;
    }
    return { ...values[0], value: mean, stdDev, count };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param {Array<{ passed?: boolean }>} values - Score data of the individual runs.
   * @returns {object} The first entry's fields overridden with `passed` (true only
   *   when there is at least one run and every run passed), `passedCount` and
   *   `totalCount`.
   */
  all(values) {
    const total = values.length;
    const passedCount = values.filter((v) => v.passed).length;
    return {
      ...values[0],
      passed: total > 0 && passedCount === total,
      passedCount,
      totalCount: total
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param {object[]} values - Score data of the individual runs.
   * @returns {object} The last entry, or an empty object when that entry is
   *   missing (for example when `values` is empty).
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
|
|
1080
1126
|
var Score = {
|
|
1127
|
+
aggregate: ScoreAggregate,
|
|
1081
1128
|
of(config) {
|
|
1082
1129
|
const def = {
|
|
1083
1130
|
id: config.id,
|
|
1084
1131
|
name: config.name,
|
|
1085
1132
|
displayStrategy: config.displayStrategy,
|
|
1086
|
-
|
|
1087
|
-
|
|
1133
|
+
formatValue: config.formatValue,
|
|
1134
|
+
formatAggregate: config.formatAggregate,
|
|
1135
|
+
aggregateValues: config.aggregateValues,
|
|
1088
1136
|
make: (data, options) => {
|
|
1089
1137
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
1090
1138
|
return {
|
|
1091
1139
|
id: config.id,
|
|
1092
1140
|
data,
|
|
1093
|
-
...passed !== void 0 && { passed }
|
|
1141
|
+
...passed !== void 0 && { passed },
|
|
1142
|
+
def
|
|
1143
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
1094
1144
|
};
|
|
1095
1145
|
}
|
|
1096
1146
|
};
|
|
@@ -1103,29 +1153,6 @@ function getScoreById(id) {
|
|
|
1103
1153
|
}
|
|
1104
1154
|
|
|
1105
1155
|
// src/evals/aggregators.ts
|
|
1106
|
-
function aggregateAverageWithVariance(values) {
|
|
1107
|
-
if (values.length === 0) {
|
|
1108
|
-
return { value: 0, count: 0 };
|
|
1109
|
-
}
|
|
1110
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
1111
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
1112
|
-
const mean = sum / values.length;
|
|
1113
|
-
let stdDev;
|
|
1114
|
-
if (values.length >= 2) {
|
|
1115
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
1116
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1117
|
-
}
|
|
1118
|
-
return { value: mean, stdDev, count: values.length };
|
|
1119
|
-
}
|
|
1120
|
-
function aggregateAll(values) {
|
|
1121
|
-
const total = values.length;
|
|
1122
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
1123
|
-
return {
|
|
1124
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
1125
|
-
passedCount,
|
|
1126
|
-
totalCount: total
|
|
1127
|
-
};
|
|
1128
|
-
}
|
|
1129
1156
|
function aggregateTokenCountSum(values) {
|
|
1130
1157
|
const initial = {
|
|
1131
1158
|
input: 0,
|
|
@@ -1178,35 +1205,40 @@ Score.of({
|
|
|
1178
1205
|
id: "percent",
|
|
1179
1206
|
name: "Score",
|
|
1180
1207
|
displayStrategy: "bar",
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1208
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
1209
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
1210
|
+
aggregateValues: Score.aggregate.averageWithVariance
|
|
1211
|
+
});
|
|
1212
|
+
// Registers the "delta" score: a numeric value shown together with its delta.
Score.of({
  id: "delta",
  name: "Delta",
  displayStrategy: "number",
  // Single run: two-decimal value plus signed delta, e.g. "0.80 (+0.05 vs baseline)".
  formatValue: (data) => {
    const valueText = data.value.toFixed(2);
    // Negative numbers already carry "-", so only non-negative deltas get an explicit sign.
    const sign = data.delta >= 0 ? "+" : "";
    const deltaText = data.delta.toFixed(2);
    return `${valueText} (${sign}${deltaText} vs baseline)`;
  },
  // Aggregated runs: same layout with "Avg:" / "Delta:" labels.
  formatAggregate: (data) => {
    const valueText = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    const deltaText = data.delta.toFixed(2);
    return `Avg: ${valueText} (Delta: ${sign}${deltaText})`;
  },
  // Both numeric fields are averaged independently across runs.
  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
});
|
|
1189
1220
|
Score.of({
|
|
1190
1221
|
id: "binary",
|
|
1191
1222
|
name: "Result",
|
|
1192
1223
|
displayStrategy: "passFail",
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
}
|
|
1199
|
-
return base;
|
|
1224
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
1225
|
+
formatAggregate: (data) => {
|
|
1226
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
1227
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
1228
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
1200
1229
|
}
|
|
1201
|
-
return
|
|
1230
|
+
return base;
|
|
1202
1231
|
},
|
|
1203
|
-
|
|
1232
|
+
aggregateValues: Score.aggregate.all
|
|
1204
1233
|
});
|
|
1205
1234
|
|
|
1206
1235
|
// src/runner/score-utils.ts
|
|
1236
|
+
/**
 * Resolve the score definition for a score item.
 *
 * @param {{ id: string, def?: object }} item - Score item.
 * @returns {*} The definition attached to the item when it is neither null nor
 *   undefined; otherwise the result of looking the item's id up via `getScoreById`.
 */
function getScoreDef(item) {
  const attached = item.def;
  if (attached !== null && attached !== void 0) {
    return attached;
  }
  return getScoreById(item.id);
}
|
|
1207
1239
|
function toNumericScoreFromScores(scores) {
|
|
1208
1240
|
for (const item of scores) {
|
|
1209
|
-
const def =
|
|
1241
|
+
const def = getScoreDef(item);
|
|
1210
1242
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
1211
1243
|
const value = item.data.value;
|
|
1212
1244
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -1624,7 +1656,7 @@ var createPersistenceWorker = (queue) => effect.Effect.forever(
|
|
|
1624
1656
|
() => appendJsonLine(message.artifactPath, {
|
|
1625
1657
|
runId: message.runId,
|
|
1626
1658
|
ts: Date.now(),
|
|
1627
|
-
...message.payload
|
|
1659
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
1628
1660
|
})
|
|
1629
1661
|
);
|
|
1630
1662
|
})
|
|
@@ -2166,12 +2198,12 @@ function scoreColor(score) {
|
|
|
2166
2198
|
return "red";
|
|
2167
2199
|
}
|
|
2168
2200
|
function formatScorePart(item) {
|
|
2169
|
-
const def = getScoreById(item.id);
|
|
2201
|
+
const def = item.def ?? getScoreById(item.id);
|
|
2170
2202
|
if (!def) {
|
|
2171
2203
|
const numeric = toNumericScore(item.data);
|
|
2172
2204
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
2173
2205
|
}
|
|
2174
|
-
const formatted = def
|
|
2206
|
+
const formatted = formatScoreData(def, item.data);
|
|
2175
2207
|
if (def.displayStrategy === "bar") {
|
|
2176
2208
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2177
2209
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -2345,7 +2377,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2345
2377
|
if (item.scores.length > 0) {
|
|
2346
2378
|
for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
|
|
2347
2379
|
const s = item.scores[sIdx];
|
|
2348
|
-
const def = getScoreById(s.id);
|
|
2380
|
+
const def = s.def ?? getScoreById(s.id);
|
|
2349
2381
|
const scoreLabel = def ? def.name ?? def.id : s.id;
|
|
2350
2382
|
rows.push(
|
|
2351
2383
|
/* @__PURE__ */ jsxRuntime.jsxs(
|