@m4trix/evals 0.19.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/dist/cli-simple.cjs +135 -26
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +135 -23
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +56 -12
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +56 -12
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +56 -12
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +17 -1
- package/dist/index.js +56 -12
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -7,7 +7,8 @@ import * as jitiModule from 'jiti';
|
|
|
7
7
|
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
9
|
import { diffString } from 'json-diff';
|
|
10
|
-
import
|
|
10
|
+
import * as React2 from 'react';
|
|
11
|
+
import React2__default, { useState, useEffect, useCallback } from 'react';
|
|
11
12
|
import { render, Box, Text } from 'ink';
|
|
12
13
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
13
14
|
|
|
@@ -268,6 +269,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
268
269
|
function formatLogMessage(msg) {
|
|
269
270
|
if (typeof msg === "string")
|
|
270
271
|
return msg;
|
|
272
|
+
if (msg instanceof Error)
|
|
273
|
+
return msg.stack ?? msg.message;
|
|
271
274
|
try {
|
|
272
275
|
if (msg !== null && typeof msg === "object") {
|
|
273
276
|
return JSON.stringify(msg, null, 2);
|
|
@@ -607,6 +610,7 @@ function toNumericScore(value) {
|
|
|
607
610
|
}
|
|
608
611
|
|
|
609
612
|
// src/runner/execution.ts
|
|
613
|
+
var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
|
|
610
614
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
611
615
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
612
616
|
if (scoresWithPassed.length > 0) {
|
|
@@ -648,13 +652,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
648
652
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
649
653
|
);
|
|
650
654
|
}
|
|
651
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
655
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
652
656
|
return Effect.gen(function* () {
|
|
653
657
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
654
658
|
const rerunPassed = [];
|
|
655
659
|
for (let r = 0; r < reruns; r++) {
|
|
656
660
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
657
661
|
const started = Date.now();
|
|
662
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
663
|
+
n + 1,
|
|
664
|
+
n + 1
|
|
665
|
+
]);
|
|
666
|
+
yield* publishEvent({
|
|
667
|
+
type: "TestCaseStarted",
|
|
668
|
+
runId: task.runId,
|
|
669
|
+
testCaseId: testCaseItem.id,
|
|
670
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
671
|
+
startedTestCases: startedEvaluations,
|
|
672
|
+
totalTestCases: totalEvaluations,
|
|
673
|
+
rerunIndex: r + 1,
|
|
674
|
+
rerunTotal: reruns
|
|
675
|
+
});
|
|
658
676
|
const evaluatorScores = [];
|
|
659
677
|
let testCaseError;
|
|
660
678
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -663,20 +681,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
663
681
|
if (!evaluateFn) {
|
|
664
682
|
continue;
|
|
665
683
|
}
|
|
684
|
+
const logs = [];
|
|
685
|
+
const logDiff = (expected, actual, options) => {
|
|
686
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
687
|
+
};
|
|
688
|
+
const log = (message, options) => {
|
|
689
|
+
logs.push(createLogEntry(message, options));
|
|
690
|
+
};
|
|
691
|
+
const createError = (message, options) => {
|
|
692
|
+
const entry = createLogEntry(message, options);
|
|
693
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
694
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
695
|
+
return error;
|
|
696
|
+
};
|
|
666
697
|
try {
|
|
667
|
-
const logs = [];
|
|
668
|
-
const logDiff = (expected, actual, options) => {
|
|
669
|
-
logs.push(createDiffLogEntry(expected, actual, options));
|
|
670
|
-
};
|
|
671
|
-
const log = (message, options) => {
|
|
672
|
-
logs.push(createLogEntry(message, options));
|
|
673
|
-
};
|
|
674
698
|
const ctx = yield* Effect.promise(
|
|
675
699
|
() => Promise.resolve(evaluator.resolveContext())
|
|
676
700
|
);
|
|
677
701
|
const result = yield* Effect.promise(
|
|
678
|
-
() => Promise.resolve(
|
|
679
|
-
evaluateFn({
|
|
702
|
+
() => Promise.resolve().then(
|
|
703
|
+
() => evaluateFn({
|
|
680
704
|
input: testCaseItem.testCase.getInput(),
|
|
681
705
|
ctx,
|
|
682
706
|
output,
|
|
@@ -686,10 +710,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
686
710
|
datasetId: task.datasetId
|
|
687
711
|
},
|
|
688
712
|
logDiff,
|
|
689
|
-
log
|
|
713
|
+
log,
|
|
714
|
+
createError
|
|
690
715
|
})
|
|
691
716
|
)
|
|
692
717
|
);
|
|
718
|
+
if (result instanceof Error) {
|
|
719
|
+
const evaluatorError = result;
|
|
720
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
721
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
722
|
+
testCaseError = result.message;
|
|
723
|
+
evaluatorScores.push({
|
|
724
|
+
evaluatorId,
|
|
725
|
+
scores: [],
|
|
726
|
+
passed: false,
|
|
727
|
+
logs: logs.length > 0 ? logs : void 0
|
|
728
|
+
});
|
|
729
|
+
continue;
|
|
730
|
+
}
|
|
693
731
|
const { scores, metrics } = normalizeResult(result);
|
|
694
732
|
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
695
733
|
evaluatorScores.push({
|
|
@@ -700,11 +738,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
|
|
|
700
738
|
logs: logs.length > 0 ? logs : void 0
|
|
701
739
|
});
|
|
702
740
|
} catch (error) {
|
|
741
|
+
if (error instanceof Error) {
|
|
742
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
743
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
744
|
+
}
|
|
703
745
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
704
746
|
evaluatorScores.push({
|
|
705
747
|
evaluatorId,
|
|
706
748
|
scores: [],
|
|
707
|
-
passed: false
|
|
749
|
+
passed: false,
|
|
750
|
+
logs: logs.length > 0 ? logs : void 0
|
|
708
751
|
});
|
|
709
752
|
}
|
|
710
753
|
}
|
|
@@ -775,6 +818,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
775
818
|
);
|
|
776
819
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
777
820
|
const completedRef = yield* Ref.make(0);
|
|
821
|
+
const startedRef = yield* Ref.make(0);
|
|
778
822
|
const passedRef = yield* Ref.make(0);
|
|
779
823
|
const failedRef = yield* Ref.make(0);
|
|
780
824
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -784,6 +828,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
784
828
|
publishEvent,
|
|
785
829
|
persistenceQueue,
|
|
786
830
|
updateSnapshot,
|
|
831
|
+
startedRef,
|
|
787
832
|
completedRef,
|
|
788
833
|
passedRef,
|
|
789
834
|
failedRef
|
|
@@ -1435,7 +1480,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1435
1480
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
1436
1481
|
return new Promise((resolve5, reject) => {
|
|
1437
1482
|
const app = render(
|
|
1438
|
-
|
|
1483
|
+
React2__default.createElement(GenerateView, {
|
|
1439
1484
|
runner,
|
|
1440
1485
|
datasetName,
|
|
1441
1486
|
onComplete: (err) => {
|
|
@@ -1592,7 +1637,9 @@ function RunView({
|
|
|
1592
1637
|
);
|
|
1593
1638
|
const [runInfo, setRunInfo] = useState(null);
|
|
1594
1639
|
const [testCases, setTestCases] = useState([]);
|
|
1640
|
+
const [startedEvaluations, setStartedEvaluations] = useState(0);
|
|
1595
1641
|
const [completedEvaluations, setCompletedEvaluations] = useState(0);
|
|
1642
|
+
const [runningEvaluations, setRunningEvaluations] = useState([]);
|
|
1596
1643
|
const [summary, setSummary] = useState(null);
|
|
1597
1644
|
const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
|
|
1598
1645
|
const runEval = useCallback(async () => {
|
|
@@ -1629,6 +1676,25 @@ function RunView({
|
|
|
1629
1676
|
let overallScoreCount = 0;
|
|
1630
1677
|
const done = new Promise((resolve5) => {
|
|
1631
1678
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1679
|
+
if (event.type === "TestCaseStarted") {
|
|
1680
|
+
setStartedEvaluations(event.startedTestCases);
|
|
1681
|
+
setRunningEvaluations((prev) => {
|
|
1682
|
+
const withoutDuplicate = prev.filter(
|
|
1683
|
+
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1684
|
+
);
|
|
1685
|
+
return [
|
|
1686
|
+
...withoutDuplicate,
|
|
1687
|
+
{
|
|
1688
|
+
testCaseId: event.testCaseId,
|
|
1689
|
+
name: event.testCaseName,
|
|
1690
|
+
rerunIndex: event.rerunIndex,
|
|
1691
|
+
rerunTotal: event.rerunTotal,
|
|
1692
|
+
startedTestCases: event.startedTestCases,
|
|
1693
|
+
totalTestCases: event.totalTestCases
|
|
1694
|
+
}
|
|
1695
|
+
];
|
|
1696
|
+
});
|
|
1697
|
+
}
|
|
1632
1698
|
if (event.type === "TestCaseProgress") {
|
|
1633
1699
|
for (const item of event.evaluatorScores) {
|
|
1634
1700
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
@@ -1688,12 +1754,18 @@ function RunView({
|
|
|
1688
1754
|
rerunTotal: event.rerunTotal,
|
|
1689
1755
|
durationMs: events.reduce((s, e) => s + e.durationMs, 0),
|
|
1690
1756
|
passed: events.every((e) => e.passed),
|
|
1757
|
+
errorMessage: event.errorMessage,
|
|
1691
1758
|
events,
|
|
1692
1759
|
aggregatedEvaluatorScores,
|
|
1693
1760
|
isAggregated
|
|
1694
1761
|
};
|
|
1695
1762
|
byId.set(event.testCaseId, merged);
|
|
1696
1763
|
setCompletedEvaluations(event.completedTestCases);
|
|
1764
|
+
setRunningEvaluations(
|
|
1765
|
+
(running) => running.filter(
|
|
1766
|
+
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1767
|
+
)
|
|
1768
|
+
);
|
|
1697
1769
|
return Array.from(byId.values());
|
|
1698
1770
|
});
|
|
1699
1771
|
}
|
|
@@ -1769,12 +1841,30 @@ function RunView({
|
|
|
1769
1841
|
runInfo.totalTestCases
|
|
1770
1842
|
] })
|
|
1771
1843
|
] }),
|
|
1772
|
-
phase === "running" && /* @__PURE__ */
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
|
|
1776
|
-
|
|
1777
|
-
|
|
1844
|
+
phase === "running" && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1845
|
+
/* @__PURE__ */ jsx(
|
|
1846
|
+
Spinner,
|
|
1847
|
+
{
|
|
1848
|
+
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1849
|
+
}
|
|
1850
|
+
),
|
|
1851
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(Text, { color: "yellow", children: [
|
|
1852
|
+
"[running ",
|
|
1853
|
+
item.startedTestCases,
|
|
1854
|
+
"/",
|
|
1855
|
+
item.totalTestCases,
|
|
1856
|
+
"] ",
|
|
1857
|
+
item.name,
|
|
1858
|
+
" ",
|
|
1859
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1860
|
+
"(",
|
|
1861
|
+
item.rerunIndex,
|
|
1862
|
+
"/",
|
|
1863
|
+
item.rerunTotal,
|
|
1864
|
+
")"
|
|
1865
|
+
] })
|
|
1866
|
+
] }, `${item.testCaseId}:${item.rerunIndex}`)) })
|
|
1867
|
+
] }),
|
|
1778
1868
|
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1779
1869
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1780
1870
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
@@ -1798,8 +1888,13 @@ function RunView({
|
|
|
1798
1888
|
" (",
|
|
1799
1889
|
tc.durationMs,
|
|
1800
1890
|
"ms)"
|
|
1801
|
-
] })
|
|
1891
|
+
] }),
|
|
1892
|
+
tc.errorMessage ? /* @__PURE__ */ jsxs(Text, { color: "red", bold: true, children: [
|
|
1893
|
+
" ",
|
|
1894
|
+
"ERROR"
|
|
1895
|
+
] }) : null
|
|
1802
1896
|
] }),
|
|
1897
|
+
tc.errorMessage ? /* @__PURE__ */ jsx(Text, { color: "red", children: tc.errorMessage }) : null,
|
|
1803
1898
|
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
|
|
1804
1899
|
Box,
|
|
1805
1900
|
{
|
|
@@ -2244,9 +2339,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2244
2339
|
let overallScoreTotal = 0;
|
|
2245
2340
|
let overallScoreSumSq = 0;
|
|
2246
2341
|
let overallScoreCount = 0;
|
|
2342
|
+
let startedCount = 0;
|
|
2247
2343
|
let completedCount = 0;
|
|
2248
2344
|
let totalCount = 0;
|
|
2249
2345
|
let runFinished = false;
|
|
2346
|
+
const inFlightReruns = /* @__PURE__ */ new Set();
|
|
2250
2347
|
const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
2251
2348
|
let spinnerIndex = 0;
|
|
2252
2349
|
function clearLine() {
|
|
@@ -2270,7 +2367,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2270
2367
|
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
2271
2368
|
`${completedCount}/${totalCount}`,
|
|
2272
2369
|
ansi2.bold
|
|
2273
|
-
)} ${colorize(
|
|
2370
|
+
)} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
|
|
2274
2371
|
);
|
|
2275
2372
|
}
|
|
2276
2373
|
let lastPrintedTestCaseId = null;
|
|
@@ -2278,8 +2375,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2278
2375
|
let spinnerTimer;
|
|
2279
2376
|
const done = new Promise((resolve5) => {
|
|
2280
2377
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2378
|
+
if (event.type === "TestCaseStarted") {
|
|
2379
|
+
startedCount = event.startedTestCases;
|
|
2380
|
+
inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
|
|
2381
|
+
clearLine();
|
|
2382
|
+
process.stdout.write(
|
|
2383
|
+
`${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
|
|
2384
|
+
`
|
|
2385
|
+
);
|
|
2386
|
+
drawSpinner();
|
|
2387
|
+
}
|
|
2281
2388
|
if (event.type === "TestCaseProgress") {
|
|
2282
2389
|
completedCount = event.completedTestCases;
|
|
2390
|
+
inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
|
|
2283
2391
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
2284
2392
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
2285
2393
|
const testCaseId = event.testCaseId;
|
|
@@ -2337,9 +2445,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2337
2445
|
0
|
|
2338
2446
|
);
|
|
2339
2447
|
const lines = [];
|
|
2448
|
+
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2340
2449
|
lines.push(
|
|
2341
|
-
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
|
|
2450
|
+
`${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
|
|
2342
2451
|
);
|
|
2452
|
+
if (event.errorMessage) {
|
|
2453
|
+
lines.push(colorize(event.errorMessage, ansi2.red));
|
|
2454
|
+
}
|
|
2343
2455
|
for (const item of aggregatedScores) {
|
|
2344
2456
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2345
2457
|
lines.push(
|