@m4trix/evals 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +105 -76
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +105 -76
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +79 -47
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +79 -47
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +78 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +36 -5
- package/dist/index.js +77 -45
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -1051,20 +1051,70 @@ function getMetricById(id) {
|
|
|
1051
1051
|
|
|
1052
1052
|
// src/evals/score.ts
|
|
1053
1053
|
var registry2 = /* @__PURE__ */ new Map();
|
|
1054
|
+
/**
 * Render score data as text using the formatter supplied by the score definition.
 *
 * @param {object} def - Score definition exposing `formatValue` and `formatAggregate`.
 * @param {*} data - Score payload handed unchanged to the chosen formatter.
 * @param {{ isAggregated?: boolean }} [options] - When `isAggregated` is truthy the
 *   aggregate formatter is used; otherwise the single-value formatter is used.
 * @returns {*} Whatever the selected formatter returns.
 */
function formatScoreData(def, data, options) {
  if (options?.isAggregated) {
    return def.formatAggregate(data);
  }
  return def.formatValue(data);
}
|
|
1057
|
+
/**
 * Ready-made aggregation strategies for combining the score data of several
 * runs into a single value. Each strategy receives an array of score data
 * objects and returns one aggregated object.
 */
var ScoreAggregate = {
  /**
   * Average numeric fields. Use for scores like { value, delta }.
   *
   * @param {string[]} fields - Names of the fields to average.
   * @returns {(values: object[]) => object} Aggregator producing an object that
   *   holds the arithmetic mean of every listed field. A field that is
   *   null/undefined on an entry counts as 0; an empty list yields 0 per field.
   */
  averageFields(fields) {
    return (values) => {
      // Fall back to 1 so an empty list divides to 0 instead of NaN.
      const count = values.length || 1;
      const result = {};
      for (const field of fields) {
        result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
      }
      return result;
    };
  },
  /**
   * Average `value` with sample std dev. Use for percent-style scores.
   *
   * @param {{ value: number }[]} values - Score data of the individual runs.
   * @returns {object} For an empty list `{ value: 0, stdDev: undefined, count: 0 }`.
   *   Otherwise the first entry's fields overlaid with `value` (mean), `stdDev`
   *   (sample standard deviation, `undefined` when fewer than two entries) and
   *   `count` (number of entries).
   */
  averageWithVariance(values) {
    if (values.length === 0) {
      return { value: 0, stdDev: void 0, count: 0 };
    }
    const count = values.length;
    const mean = values.reduce((s, v) => s + v.value, 0) / count;
    let stdDev;
    if (count >= 2) {
      // Two-pass formula: summing squared deviations from the mean avoids the
      // catastrophic cancellation of `sumSq - n * mean^2` when the values are
      // large relative to their spread.
      const squaredDeviations = values.reduce((s, v) => {
        const deviation = v.value - mean;
        return s + deviation * deviation;
      }, 0);
      const variance = squaredDeviations / (count - 1);
      // The comparison also maps a NaN variance (non-finite input) to 0.
      stdDev = variance > 0 ? Math.sqrt(variance) : 0;
    }
    return { ...values[0], value: mean, stdDev, count };
  },
  /**
   * All runs must pass. Use for binary scores.
   *
   * @param {{ passed?: boolean }[]} values - Score data of the individual runs.
   * @returns {object} The first entry's fields overlaid with `passed` (true only
   *   when the list is non-empty and every entry has a truthy `passed`),
   *   `passedCount` and `totalCount`.
   */
  all(values) {
    const total = values.length;
    const passedCount = values.filter((v) => v.passed).length;
    return {
      ...values[0],
      passed: total > 0 && values.every((v) => v.passed),
      passedCount,
      totalCount: total
    };
  },
  /**
   * Take last value (no aggregation). Use when aggregation is not meaningful.
   *
   * @param {object[]} values - Score data of the individual runs.
   * @returns {object} The final entry, or `{}` when that entry is null/undefined
   *   (which includes the empty-list case).
   */
  last(values) {
    return values[values.length - 1] ?? {};
  }
};
|
|
1054
1100
|
var Score = {
|
|
1101
|
+
aggregate: ScoreAggregate,
|
|
1055
1102
|
of(config) {
|
|
1056
1103
|
const def = {
|
|
1057
1104
|
id: config.id,
|
|
1058
1105
|
name: config.name,
|
|
1059
1106
|
displayStrategy: config.displayStrategy,
|
|
1060
|
-
|
|
1061
|
-
|
|
1107
|
+
formatValue: config.formatValue,
|
|
1108
|
+
formatAggregate: config.formatAggregate,
|
|
1109
|
+
aggregateValues: config.aggregateValues,
|
|
1062
1110
|
make: (data, options) => {
|
|
1063
1111
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
1064
1112
|
return {
|
|
1065
1113
|
id: config.id,
|
|
1066
1114
|
data,
|
|
1067
|
-
...passed !== void 0 && { passed }
|
|
1115
|
+
...passed !== void 0 && { passed },
|
|
1116
|
+
def
|
|
1117
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
1068
1118
|
};
|
|
1069
1119
|
}
|
|
1070
1120
|
};
|
|
@@ -1077,29 +1127,6 @@ function getScoreById(id) {
|
|
|
1077
1127
|
}
|
|
1078
1128
|
|
|
1079
1129
|
// src/evals/aggregators.ts
|
|
1080
|
-
/**
 * Average the `value` field of several score entries and report the sample
 * standard deviation.
 *
 * @param {{ value: number }[]} values - Score data of the individual runs.
 * @returns {{ value: number, stdDev?: number, count: number }} For an empty
 *   list `{ value: 0, count: 0 }`. Otherwise the mean as `value`, the sample
 *   standard deviation as `stdDev` (`undefined` when fewer than two entries)
 *   and the number of entries as `count`.
 */
function aggregateAverageWithVariance(values) {
  if (values.length === 0) {
    return { value: 0, count: 0 };
  }
  const count = values.length;
  const mean = values.reduce((s, v) => s + v.value, 0) / count;
  let stdDev;
  if (count >= 2) {
    // Two-pass formula: summing squared deviations from the mean avoids the
    // catastrophic cancellation of `sumSq - n * mean^2` when the values are
    // large relative to their spread.
    const squaredDeviations = values.reduce((s, v) => {
      const deviation = v.value - mean;
      return s + deviation * deviation;
    }, 0);
    const variance = squaredDeviations / (count - 1);
    // The comparison also maps a NaN variance (non-finite input) to 0.
    stdDev = variance > 0 ? Math.sqrt(variance) : 0;
  }
  return { value: mean, stdDev, count };
}
|
|
1094
|
-
/**
 * Combine pass/fail entries into one summary.
 *
 * @param {{ passed?: boolean }[]} values - Score data of the individual runs.
 * @returns {{ passed: boolean, passedCount: number, totalCount: number }}
 *   `passed` is true only when the list is non-empty and no entry has a falsy
 *   `passed`; the counts report how many entries passed out of how many total.
 */
function aggregateAll(values) {
  const totalCount = values.length;
  const passedCount = values.reduce((tally, entry) => (entry.passed ? tally + 1 : tally), 0);
  const hasFailure = values.some((entry) => !entry.passed);
  return {
    passed: totalCount > 0 && !hasFailure,
    passedCount,
    totalCount
  };
}
|
|
1103
1130
|
function aggregateTokenCountSum(values) {
|
|
1104
1131
|
const initial = {
|
|
1105
1132
|
input: 0,
|
|
@@ -1152,35 +1179,40 @@ Score.of({
|
|
|
1152
1179
|
id: "percent",
|
|
1153
1180
|
name: "Score",
|
|
1154
1181
|
displayStrategy: "bar",
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1182
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
1183
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
1184
|
+
aggregateValues: Score.aggregate.averageWithVariance
|
|
1185
|
+
});
|
|
1186
|
+
// Declares the "delta" score: a numeric value shown together with its signed
// difference, averaged field-by-field when several runs are aggregated.
Score.of({
  id: "delta",
  name: "Delta",
  displayStrategy: "number",
  // Single run: value plus the signed delta (explicit "+" for non-negative deltas).
  formatValue: (data) => {
    const value = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    const delta = data.delta.toFixed(2);
    return `${value} (${sign}${delta} vs baseline)`;
  },
  // Aggregated runs: same layout, labelled as an average.
  formatAggregate: (data) => {
    const value = data.value.toFixed(2);
    const sign = data.delta >= 0 ? "+" : "";
    const delta = data.delta.toFixed(2);
    return `Avg: ${value} (Delta: ${sign}${delta})`;
  },
  aggregateValues: Score.aggregate.averageFields(["value", "delta"])
});
|
|
1163
1194
|
// Declares the "binary" score: a pass/fail result that, when aggregated,
// reports whether every run passed.
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  // Single run: plain pass/fail label.
  formatValue: (data) => {
    if (data.passed) {
      return "PASSED";
    }
    return "NOT PASSED";
  },
  // Aggregated runs: overall label, with a passed/total suffix only when the
  // counts are present and more than one run was aggregated.
  formatAggregate: (data) => {
    const summary = data.passed ? "All: PASSED" : "Some: FAILED";
    const hasCounts = data.passedCount != null && data.totalCount != null && data.totalCount > 1;
    return hasCounts ? `${summary} (${data.passedCount}/${data.totalCount})` : summary;
  },
  aggregateValues: Score.aggregate.all
});
|
|
1179
1208
|
|
|
1180
1209
|
// src/runner/score-utils.ts
|
|
1210
|
+
/**
 * Resolve the score definition for a score item.
 *
 * @param {{ id: string, def?: object }} item - Score item to resolve.
 * @returns {*} The definition attached to the item as `def` when it is neither
 *   null nor undefined; otherwise the result of `getScoreById(item.id)`.
 */
function getScoreDef(item) {
  const attached = item.def;
  if (attached !== null && attached !== undefined) {
    return attached;
  }
  return getScoreById(item.id);
}
|
|
1181
1213
|
function toNumericScoreFromScores(scores) {
|
|
1182
1214
|
for (const item of scores) {
|
|
1183
|
-
const def =
|
|
1215
|
+
const def = getScoreDef(item);
|
|
1184
1216
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
1185
1217
|
const value = item.data.value;
|
|
1186
1218
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -1598,7 +1630,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
|
|
|
1598
1630
|
() => appendJsonLine(message.artifactPath, {
|
|
1599
1631
|
runId: message.runId,
|
|
1600
1632
|
ts: Date.now(),
|
|
1601
|
-
...message.payload
|
|
1633
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
1602
1634
|
})
|
|
1603
1635
|
);
|
|
1604
1636
|
})
|
|
@@ -2140,12 +2172,12 @@ function scoreColor(score) {
|
|
|
2140
2172
|
return "red";
|
|
2141
2173
|
}
|
|
2142
2174
|
function formatScorePart(item) {
|
|
2143
|
-
const def = getScoreById(item.id);
|
|
2175
|
+
const def = item.def ?? getScoreById(item.id);
|
|
2144
2176
|
if (!def) {
|
|
2145
2177
|
const numeric = toNumericScore(item.data);
|
|
2146
2178
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
2147
2179
|
}
|
|
2148
|
-
const formatted = def
|
|
2180
|
+
const formatted = formatScoreData(def, item.data);
|
|
2149
2181
|
if (def.displayStrategy === "bar") {
|
|
2150
2182
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2151
2183
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -2319,7 +2351,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2319
2351
|
if (item.scores.length > 0) {
|
|
2320
2352
|
for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
|
|
2321
2353
|
const s = item.scores[sIdx];
|
|
2322
|
-
const def = getScoreById(s.id);
|
|
2354
|
+
const def = s.def ?? getScoreById(s.id);
|
|
2323
2355
|
const scoreLabel = def ? def.name ?? def.id : s.id;
|
|
2324
2356
|
rows.push(
|
|
2325
2357
|
/* @__PURE__ */ jsxs(
|