@m4trix/evals 0.18.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +142 -42
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +142 -42
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +100 -30
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +100 -30
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +97 -28
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +21 -9
- package/dist/index.js +97 -28
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -523,7 +523,11 @@ var Metric = {
|
|
|
523
523
|
name: config.name,
|
|
524
524
|
aggregate: config.aggregate,
|
|
525
525
|
format: config.format,
|
|
526
|
-
make: (data) => ({
|
|
526
|
+
make: (data, options) => ({
|
|
527
|
+
id: config.id,
|
|
528
|
+
data,
|
|
529
|
+
...options?.name !== void 0 && { name: options.name }
|
|
530
|
+
})
|
|
527
531
|
};
|
|
528
532
|
registry.set(config.id, def);
|
|
529
533
|
return def;
|
|
@@ -545,25 +549,61 @@ var ScoreAggregate = {
|
|
|
545
549
|
const count = values.length || 1;
|
|
546
550
|
const result = {};
|
|
547
551
|
for (const field of fields) {
|
|
548
|
-
result[field] = values.reduce(
|
|
552
|
+
result[field] = values.reduce(
|
|
553
|
+
(s, v) => s + (v[field] ?? 0),
|
|
554
|
+
0
|
|
555
|
+
) / count;
|
|
549
556
|
}
|
|
550
557
|
return result;
|
|
551
558
|
};
|
|
552
559
|
},
|
|
553
|
-
/** Average
|
|
554
|
-
averageWithVariance(
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
560
|
+
/** Average selected numeric fields, with sample std dev tracked for `value`. */
|
|
561
|
+
averageWithVariance(fields) {
|
|
562
|
+
return (values) => {
|
|
563
|
+
const count = values.length;
|
|
564
|
+
const result = {};
|
|
565
|
+
for (const field of fields) {
|
|
566
|
+
result[field] = count === 0 ? 0 : values.reduce(
|
|
567
|
+
(sum, item) => sum + (item[field] ?? 0),
|
|
568
|
+
0
|
|
569
|
+
) / count;
|
|
570
|
+
}
|
|
571
|
+
const valueField = "value";
|
|
572
|
+
const hasValueField = fields.includes(valueField);
|
|
573
|
+
if (count === 0) {
|
|
574
|
+
if (hasValueField) {
|
|
575
|
+
result[valueField] = 0;
|
|
576
|
+
}
|
|
577
|
+
return {
|
|
578
|
+
...result,
|
|
579
|
+
stdDev: void 0,
|
|
580
|
+
count: 0
|
|
581
|
+
};
|
|
582
|
+
}
|
|
583
|
+
let stdDev;
|
|
584
|
+
if (hasValueField && count >= 2) {
|
|
585
|
+
const sum = values.reduce(
|
|
586
|
+
(s, v) => s + (v[valueField] ?? 0),
|
|
587
|
+
0
|
|
588
|
+
);
|
|
589
|
+
const sumSq = values.reduce(
|
|
590
|
+
(s, v) => {
|
|
591
|
+
const value = v[valueField] ?? 0;
|
|
592
|
+
return s + value * value;
|
|
593
|
+
},
|
|
594
|
+
0
|
|
595
|
+
);
|
|
596
|
+
const mean = sum / count;
|
|
597
|
+
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
598
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
599
|
+
}
|
|
600
|
+
return {
|
|
601
|
+
...values[0],
|
|
602
|
+
...result,
|
|
603
|
+
stdDev,
|
|
604
|
+
count
|
|
605
|
+
};
|
|
606
|
+
};
|
|
567
607
|
},
|
|
568
608
|
/** All runs must pass. Use for binary scores. */
|
|
569
609
|
all(values) {
|
|
@@ -597,6 +637,7 @@ var Score = {
|
|
|
597
637
|
id: config.id,
|
|
598
638
|
data,
|
|
599
639
|
...passed !== void 0 && { passed },
|
|
640
|
+
...options?.name !== void 0 && { name: options.name },
|
|
600
641
|
def
|
|
601
642
|
// Attach def so rendering/aggregation works without registry lookup
|
|
602
643
|
};
|
|
@@ -665,7 +706,7 @@ var percentScore = Score.of({
|
|
|
665
706
|
displayStrategy: "bar",
|
|
666
707
|
formatValue: (data) => data.value.toFixed(2),
|
|
667
708
|
formatAggregate: (data) => data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`,
|
|
668
|
-
aggregateValues: Score.aggregate.averageWithVariance
|
|
709
|
+
aggregateValues: Score.aggregate.averageWithVariance(["value"])
|
|
669
710
|
});
|
|
670
711
|
var deltaScore = Score.of({
|
|
671
712
|
id: "delta",
|
|
@@ -697,6 +738,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
697
738
|
function formatLogMessage(msg) {
|
|
698
739
|
if (typeof msg === "string")
|
|
699
740
|
return msg;
|
|
741
|
+
if (msg instanceof Error)
|
|
742
|
+
return msg.stack ?? msg.message;
|
|
700
743
|
try {
|
|
701
744
|
if (msg !== null && typeof msg === "object") {
|
|
702
745
|
return JSON.stringify(msg, null, 2);
|
|
@@ -1043,6 +1086,7 @@ function toNumericScore(value) {
|
|
|
1043
1086
|
}
|
|
1044
1087
|
|
|
1045
1088
|
// src/runner/execution.ts
|
|
1089
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
1046
1090
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1047
1091
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1048
1092
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1099,20 +1143,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1099
1143
|
if (!evaluateFn) {
|
|
1100
1144
|
continue;
|
|
1101
1145
|
}
|
|
1146
|
+
const logs = [];
|
|
1147
|
+
const logDiff = (expected, actual, options) => {
|
|
1148
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1149
|
+
};
|
|
1150
|
+
const log = (message, options) => {
|
|
1151
|
+
logs.push(createLogEntry(message, options));
|
|
1152
|
+
};
|
|
1153
|
+
const createError = (message, options) => {
|
|
1154
|
+
const entry = createLogEntry(message, options);
|
|
1155
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
1156
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
1157
|
+
return error;
|
|
1158
|
+
};
|
|
1102
1159
|
try {
|
|
1103
|
-
const logs = [];
|
|
1104
|
-
const logDiff = (expected, actual, options) => {
|
|
1105
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1106
|
-
};
|
|
1107
|
-
const log = (message, options) => {
|
|
1108
|
-
logs.push(createLogEntry(message, options));
|
|
1109
|
-
};
|
|
1110
1160
|
const ctx = yield* effect.Effect.promise(
|
|
1111
1161
|
() => Promise.resolve(evaluator.resolveContext())
|
|
1112
1162
|
);
|
|
1113
1163
|
const result = yield* effect.Effect.promise(
|
|
1114
|
-
() => Promise.resolve(
|
|
1115
|
-
evaluateFn({
|
|
1164
|
+
() => Promise.resolve().then(
|
|
1165
|
+
() => evaluateFn({
|
|
1116
1166
|
input: testCaseItem.testCase.getInput(),
|
|
1117
1167
|
ctx,
|
|
1118
1168
|
output,
|
|
@@ -1122,10 +1172,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1122
1172
|
datasetId: task.datasetId
|
|
1123
1173
|
},
|
|
1124
1174
|
logDiff,
|
|
1125
|
-
log
|
|
1175
|
+
log,
|
|
1176
|
+
createError
|
|
1126
1177
|
})
|
|
1127
1178
|
)
|
|
1128
1179
|
);
|
|
1180
|
+
if (result instanceof Error) {
|
|
1181
|
+
const evaluatorError = result;
|
|
1182
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
1183
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
1184
|
+
testCaseError = result.message;
|
|
1185
|
+
evaluatorScores.push({
|
|
1186
|
+
evaluatorId,
|
|
1187
|
+
scores: [],
|
|
1188
|
+
passed: false,
|
|
1189
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1190
|
+
});
|
|
1191
|
+
continue;
|
|
1192
|
+
}
|
|
1129
1193
|
const { scores, metrics } = normalizeResult(result);
|
|
1130
1194
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1131
1195
|
evaluatorScores.push({
|
|
@@ -1136,11 +1200,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
1136
1200
|
logs: logs.length > 0 ? logs : void 0
|
|
1137
1201
|
});
|
|
1138
1202
|
} catch (error) {
|
|
1203
|
+
if (error instanceof Error) {
|
|
1204
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
1205
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
1206
|
+
}
|
|
1139
1207
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1140
1208
|
evaluatorScores.push({
|
|
1141
1209
|
evaluatorId,
|
|
1142
1210
|
scores: [],
|
|
1143
|
-
passed: false
|
|
1211
|
+
passed: false,
|
|
1212
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1144
1213
|
});
|
|
1145
1214
|
}
|
|
1146
1215
|
}
|