@m4trix/evals 0.19.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +51 -13
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +51 -13
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +39 -11
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +39 -11
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +39 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +8 -1
- package/dist/index.js +39 -11
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -268,6 +268,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
268
268
|
function formatLogMessage(msg) {
|
|
269
269
|
if (typeof msg === "string")
|
|
270
270
|
return msg;
|
|
271
|
+
if (msg instanceof Error)
|
|
272
|
+
return msg.stack ?? msg.message;
|
|
271
273
|
try {
|
|
272
274
|
if (msg !== null && typeof msg === "object") {
|
|
273
275
|
return JSON.stringify(msg, null, 2);
|
|
@@ -607,6 +609,7 @@ function toNumericScore(value) {
|
|
|
607
609
|
}
|
|
608
610
|
|
|
609
611
|
// src/runner/execution.ts
|
|
612
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
610
613
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
611
614
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
612
615
|
if (scoresWithPassed.length > 0) {
|
|
@@ -663,20 +666,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
663
666
|
if (!evaluateFn) {
|
|
664
667
|
continue;
|
|
665
668
|
}
|
|
669
|
+
const logs = [];
|
|
670
|
+
const logDiff = (expected, actual, options) => {
|
|
671
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
672
|
+
};
|
|
673
|
+
const log = (message, options) => {
|
|
674
|
+
logs.push(createLogEntry(message, options));
|
|
675
|
+
};
|
|
676
|
+
const createError = (message, options) => {
|
|
677
|
+
const entry = createLogEntry(message, options);
|
|
678
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
679
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
680
|
+
return error;
|
|
681
|
+
};
|
|
666
682
|
try {
|
|
667
|
-
const logs = [];
|
|
668
|
-
const logDiff = (expected, actual, options) => {
|
|
669
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
670
|
-
};
|
|
671
|
-
const log = (message, options) => {
|
|
672
|
-
logs.push(createLogEntry(message, options));
|
|
673
|
-
};
|
|
674
683
|
const ctx = yield* Effect.promise(
|
|
675
684
|
() => Promise.resolve(evaluator.resolveContext())
|
|
676
685
|
);
|
|
677
686
|
const result = yield* Effect.promise(
|
|
678
|
-
() => Promise.resolve(
|
|
679
|
-
evaluateFn({
|
|
687
|
+
() => Promise.resolve().then(
|
|
688
|
+
() => evaluateFn({
|
|
680
689
|
input: testCaseItem.testCase.getInput(),
|
|
681
690
|
ctx,
|
|
682
691
|
output,
|
|
@@ -686,10 +695,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
686
695
|
datasetId: task.datasetId
|
|
687
696
|
},
|
|
688
697
|
logDiff,
|
|
689
|
-
log
|
|
698
|
+
log,
|
|
699
|
+
createError
|
|
690
700
|
})
|
|
691
701
|
)
|
|
692
702
|
);
|
|
703
|
+
if (result instanceof Error) {
|
|
704
|
+
const evaluatorError = result;
|
|
705
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
706
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
707
|
+
testCaseError = result.message;
|
|
708
|
+
evaluatorScores.push({
|
|
709
|
+
evaluatorId,
|
|
710
|
+
scores: [],
|
|
711
|
+
passed: false,
|
|
712
|
+
logs: logs.length > 0 ? logs : void 0
|
|
713
|
+
});
|
|
714
|
+
continue;
|
|
715
|
+
}
|
|
693
716
|
const { scores, metrics } = normalizeResult(result);
|
|
694
717
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
695
718
|
evaluatorScores.push({
|
|
@@ -700,11 +723,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
700
723
|
logs: logs.length > 0 ? logs : void 0
|
|
701
724
|
});
|
|
702
725
|
} catch (error) {
|
|
726
|
+
if (error instanceof Error) {
|
|
727
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
728
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
729
|
+
}
|
|
703
730
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
704
731
|
evaluatorScores.push({
|
|
705
732
|
evaluatorId,
|
|
706
733
|
scores: [],
|
|
707
|
-
passed: false
|
|
734
|
+
passed: false,
|
|
735
|
+
logs: logs.length > 0 ? logs : void 0
|
|
708
736
|
});
|
|
709
737
|
}
|
|
710
738
|
}
|
|
@@ -1688,6 +1716,7 @@ function RunView({
|
|
|
1688
1716
|
rerunTotal: event.rerunTotal,
|
|
1689
1717
|
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1690
1718
|
passed: events.every((e) => e.passed),
|
|
1719
|
+
errorMessage: event.errorMessage,
|
|
1691
1720
|
events,
|
|
1692
1721
|
aggregatedEvaluatorScores,
|
|
1693
1722
|
isAggregated
|
|
@@ -1798,8 +1827,13 @@ function RunView({
|
|
|
1798
1827
|
" (",
|
|
1799
1828
|
tc.durationMs,
|
|
1800
1829
|
"ms)"
|
|
1801
|
-
] })
|
|
1830
|
+
] }),
|
|
1831
|
+
tc.errorMessage ? /* @__PURE__ */ jsxs(Text, { color: "red", bold: true, children: [
|
|
1832
|
+
" ",
|
|
1833
|
+
"ERROR"
|
|
1834
|
+
] }) : null
|
|
1802
1835
|
] }),
|
|
1836
|
+
tc.errorMessage ? /* @__PURE__ */ jsx(Text, { color: "red", children: tc.errorMessage }) : null,
|
|
1803
1837
|
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
|
|
1804
1838
|
Box,
|
|
1805
1839
|
{
|
|
@@ -2337,9 +2371,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2337
2371
|
0
|
|
2338
2372
|
);
|
|
2339
2373
|
const lines = [];
|
|
2374
|
+
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2340
2375
|
lines.push(
|
|
2341
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
2376
|
+
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
|
|
2342
2377
|
);
|
|
2378
|
+
if (event.errorMessage) {
|
|
2379
|
+
lines.push(colorize(event.errorMessage, ansi2.red));
|
|
2380
|
+
}
|
|
2343
2381
|
for (const item of aggregatedScores) {
|
|
2344
2382
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2345
2383
|
lines.push(
|