@m4trix/evals 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +113 -76
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +113 -76
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +87 -47
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +87 -47
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +86 -44
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +54 -5
- package/dist/index.js +85 -45
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -1051,20 +1051,70 @@ function getMetricById(id) {
|
|
|
1051
1051
|
|
|
1052
1052
|
// src/evals/score.ts
|
|
1053
1053
|
var registry2 = /* @__PURE__ */ new Map();
|
|
1054
|
+
function formatScoreData(def, data, options) {
|
|
1055
|
+
return options?.isAggregated ? def.formatAggregate(data) : def.formatValue(data);
|
|
1056
|
+
}
|
|
1057
|
+
var ScoreAggregate = {
|
|
1058
|
+
/** Average numeric fields. Use for scores like { value, delta }. */
|
|
1059
|
+
averageFields(fields) {
|
|
1060
|
+
return (values) => {
|
|
1061
|
+
const count = values.length || 1;
|
|
1062
|
+
const result = {};
|
|
1063
|
+
for (const field of fields) {
|
|
1064
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
1065
|
+
}
|
|
1066
|
+
return result;
|
|
1067
|
+
};
|
|
1068
|
+
},
|
|
1069
|
+
/** Average `value` with sample std dev. Use for percent-style scores. */
|
|
1070
|
+
averageWithVariance(values) {
|
|
1071
|
+
if (values.length === 0) {
|
|
1072
|
+
return { value: 0, stdDev: void 0, count: 0 };
|
|
1073
|
+
}
|
|
1074
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
1075
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
1076
|
+
const mean = sum / values.length;
|
|
1077
|
+
let stdDev;
|
|
1078
|
+
if (values.length >= 2) {
|
|
1079
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
1080
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1081
|
+
}
|
|
1082
|
+
return { ...values[0], value: mean, stdDev, count: values.length };
|
|
1083
|
+
},
|
|
1084
|
+
/** All runs must pass. Use for binary scores. */
|
|
1085
|
+
all(values) {
|
|
1086
|
+
const total = values.length;
|
|
1087
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
1088
|
+
return {
|
|
1089
|
+
...values[0],
|
|
1090
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
1091
|
+
passedCount,
|
|
1092
|
+
totalCount: total
|
|
1093
|
+
};
|
|
1094
|
+
},
|
|
1095
|
+
/** Take last value (no aggregation). Use when aggregation is not meaningful. */
|
|
1096
|
+
last(values) {
|
|
1097
|
+
return values[values.length - 1] ?? {};
|
|
1098
|
+
}
|
|
1099
|
+
};
|
|
1054
1100
|
var Score = {
|
|
1101
|
+
aggregate: ScoreAggregate,
|
|
1055
1102
|
of(config) {
|
|
1056
1103
|
const def = {
|
|
1057
1104
|
id: config.id,
|
|
1058
1105
|
name: config.name,
|
|
1059
1106
|
displayStrategy: config.displayStrategy,
|
|
1060
|
-
|
|
1061
|
-
|
|
1107
|
+
formatValue: config.formatValue,
|
|
1108
|
+
formatAggregate: config.formatAggregate,
|
|
1109
|
+
aggregateValues: config.aggregateValues,
|
|
1062
1110
|
make: (data, options) => {
|
|
1063
1111
|
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
1064
1112
|
return {
|
|
1065
1113
|
id: config.id,
|
|
1066
1114
|
data,
|
|
1067
|
-
...passed !== void 0 && { passed }
|
|
1115
|
+
...passed !== void 0 && { passed },
|
|
1116
|
+
def
|
|
1117
|
+
// Attach def so rendering/aggregation works without registry lookup
|
|
1068
1118
|
};
|
|
1069
1119
|
}
|
|
1070
1120
|
};
|
|
@@ -1077,29 +1127,6 @@ function getScoreById(id) {
|
|
|
1077
1127
|
}
|
|
1078
1128
|
|
|
1079
1129
|
// src/evals/aggregators.ts
|
|
1080
|
-
function aggregateAverageWithVariance(values) {
|
|
1081
|
-
if (values.length === 0) {
|
|
1082
|
-
return { value: 0, count: 0 };
|
|
1083
|
-
}
|
|
1084
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
1085
|
-
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
1086
|
-
const mean = sum / values.length;
|
|
1087
|
-
let stdDev;
|
|
1088
|
-
if (values.length >= 2) {
|
|
1089
|
-
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
1090
|
-
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1091
|
-
}
|
|
1092
|
-
return { value: mean, stdDev, count: values.length };
|
|
1093
|
-
}
|
|
1094
|
-
function aggregateAll(values) {
|
|
1095
|
-
const total = values.length;
|
|
1096
|
-
const passedCount = values.filter((v) => v.passed).length;
|
|
1097
|
-
return {
|
|
1098
|
-
passed: total > 0 && values.every((v) => v.passed),
|
|
1099
|
-
passedCount,
|
|
1100
|
-
totalCount: total
|
|
1101
|
-
};
|
|
1102
|
-
}
|
|
1103
1130
|
function aggregateTokenCountSum(values) {
|
|
1104
1131
|
const initial = {
|
|
1105
1132
|
input: 0,
|
|
@@ -1152,35 +1179,40 @@ Score.of({
|
|
|
1152
1179
|
id: "percent",
|
|
1153
1180
|
name: "Score",
|
|
1154
1181
|
displayStrategy: "bar",
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1182
|
+
formatValue: (data) => data.value.toFixed(2),
|
|
1183
|
+
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
1184
|
+
aggregateValues: Score.aggregate.averageWithVariance
|
|
1185
|
+
});
|
|
1186
|
+
Score.of({
|
|
1187
|
+
id: "delta",
|
|
1188
|
+
name: "Delta",
|
|
1189
|
+
displayStrategy: "number",
|
|
1190
|
+
formatValue: (data) => `${data.value.toFixed(2)} (${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)} vs baseline)`,
|
|
1191
|
+
formatAggregate: (data) => `Avg: ${data.value.toFixed(2)} (Delta: ${data.delta >= 0 ? "+" : ""}${data.delta.toFixed(2)})`,
|
|
1192
|
+
aggregateValues: Score.aggregate.averageFields(["value", "delta"])
|
|
1162
1193
|
});
|
|
1163
1194
|
Score.of({
|
|
1164
1195
|
id: "binary",
|
|
1165
1196
|
name: "Result",
|
|
1166
1197
|
displayStrategy: "passFail",
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
}
|
|
1173
|
-
return base;
|
|
1198
|
+
formatValue: (data) => data.passed ? "PASSED" : "NOT PASSED",
|
|
1199
|
+
formatAggregate: (data) => {
|
|
1200
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
1201
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
1202
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
1174
1203
|
}
|
|
1175
|
-
return
|
|
1204
|
+
return base;
|
|
1176
1205
|
},
|
|
1177
|
-
|
|
1206
|
+
aggregateValues: Score.aggregate.all
|
|
1178
1207
|
});
|
|
1179
1208
|
|
|
1180
1209
|
// src/runner/score-utils.ts
|
|
1210
|
+
function getScoreDef(item) {
|
|
1211
|
+
return item.def ?? getScoreById(item.id);
|
|
1212
|
+
}
|
|
1181
1213
|
function toNumericScoreFromScores(scores) {
|
|
1182
1214
|
for (const item of scores) {
|
|
1183
|
-
const def =
|
|
1215
|
+
const def = getScoreDef(item);
|
|
1184
1216
|
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
1185
1217
|
const value = item.data.value;
|
|
1186
1218
|
if (typeof value === "number" && Number.isFinite(value)) {
|
|
@@ -1261,6 +1293,7 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1261
1293
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1262
1294
|
const rerunPassed = [];
|
|
1263
1295
|
for (let r = 0; r < reruns; r++) {
|
|
1296
|
+
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1264
1297
|
const started = Date.now();
|
|
1265
1298
|
const evaluatorScores = [];
|
|
1266
1299
|
let testCaseError;
|
|
@@ -1287,6 +1320,11 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1287
1320
|
input: testCaseItem.testCase.getInput(),
|
|
1288
1321
|
ctx,
|
|
1289
1322
|
output,
|
|
1323
|
+
meta: {
|
|
1324
|
+
triggerId: task.triggerId,
|
|
1325
|
+
runId: evaluatorRunId,
|
|
1326
|
+
datasetId: task.datasetId
|
|
1327
|
+
},
|
|
1290
1328
|
logDiff,
|
|
1291
1329
|
log
|
|
1292
1330
|
})
|
|
@@ -1592,7 +1630,7 @@ var createPersistenceWorker = (queue) => Effect.forever(
|
|
|
1592
1630
|
() => appendJsonLine(message.artifactPath, {
|
|
1593
1631
|
runId: message.runId,
|
|
1594
1632
|
ts: Date.now(),
|
|
1595
|
-
...message.payload
|
|
1633
|
+
...typeof message.payload === "object" && message.payload !== null && !Array.isArray(message.payload) ? message.payload : {}
|
|
1596
1634
|
})
|
|
1597
1635
|
);
|
|
1598
1636
|
})
|
|
@@ -1776,6 +1814,7 @@ var EffectRunner = class {
|
|
|
1776
1814
|
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1777
1815
|
0
|
|
1778
1816
|
);
|
|
1817
|
+
const triggerId = request.triggerId ?? `trg-${randomUUID()}`;
|
|
1779
1818
|
const runId = `run-${randomUUID()}`;
|
|
1780
1819
|
const artifactPath = createArtifactPath(
|
|
1781
1820
|
this.config.artifactDirectory,
|
|
@@ -1817,6 +1856,7 @@ var EffectRunner = class {
|
|
|
1817
1856
|
await Effect.runPromise(
|
|
1818
1857
|
Queue.offer(this.runQueue, {
|
|
1819
1858
|
runId,
|
|
1859
|
+
triggerId,
|
|
1820
1860
|
datasetId: request.datasetId,
|
|
1821
1861
|
dataset: dataset.dataset,
|
|
1822
1862
|
evaluators: selectedEvaluators,
|
|
@@ -2132,12 +2172,12 @@ function scoreColor(score) {
|
|
|
2132
2172
|
return "red";
|
|
2133
2173
|
}
|
|
2134
2174
|
function formatScorePart(item) {
|
|
2135
|
-
const def = getScoreById(item.id);
|
|
2175
|
+
const def = item.def ?? getScoreById(item.id);
|
|
2136
2176
|
if (!def) {
|
|
2137
2177
|
const numeric = toNumericScore(item.data);
|
|
2138
2178
|
return numeric !== void 0 ? `${numeric.toFixed(2)}` : "n/a";
|
|
2139
2179
|
}
|
|
2140
|
-
const formatted = def
|
|
2180
|
+
const formatted = formatScoreData(def, item.data);
|
|
2141
2181
|
if (def.displayStrategy === "bar") {
|
|
2142
2182
|
const numeric = typeof item.data === "object" && item.data !== null && "value" in item.data ? item.data.value : toNumericScore(item.data);
|
|
2143
2183
|
if (typeof numeric === "number" && Number.isFinite(numeric)) {
|
|
@@ -2311,7 +2351,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2311
2351
|
if (item.scores.length > 0) {
|
|
2312
2352
|
for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
|
|
2313
2353
|
const s = item.scores[sIdx];
|
|
2314
|
-
const def = getScoreById(s.id);
|
|
2354
|
+
const def = s.def ?? getScoreById(s.id);
|
|
2315
2355
|
const scoreLabel = def ? def.name ?? def.id : s.id;
|
|
2316
2356
|
rows.push(
|
|
2317
2357
|
/* @__PURE__ */ jsxs(
|