@m4trix/evals 0.20.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +84 -13
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +84 -10
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +17 -1
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +17 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +17 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +9 -0
- package/dist/index.js +17 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -14,8 +14,6 @@ var ink = require('ink');
|
|
|
14
14
|
var jsxRuntime = require('react/jsx-runtime');
|
|
15
15
|
|
|
16
16
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
17
|
-
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
18
|
-
|
|
19
17
|
function _interopNamespace(e) {
|
|
20
18
|
if (e && e.__esModule) return e;
|
|
21
19
|
var n = Object.create(null);
|
|
@@ -35,7 +33,7 @@ function _interopNamespace(e) {
|
|
|
35
33
|
}
|
|
36
34
|
|
|
37
35
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
38
|
-
var
|
|
36
|
+
var React2__namespace = /*#__PURE__*/_interopNamespace(React2);
|
|
39
37
|
|
|
40
38
|
// src/runner/config.ts
|
|
41
39
|
var defaultRunnerConfig = {
|
|
@@ -677,13 +675,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
677
675
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
678
676
|
);
|
|
679
677
|
}
|
|
680
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
678
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
681
679
|
return effect.Effect.gen(function* () {
|
|
682
680
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
683
681
|
const rerunPassed = [];
|
|
684
682
|
for (let r = 0; r < reruns; r++) {
|
|
685
683
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
686
684
|
const started = Date.now();
|
|
685
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
686
|
+
n + 1,
|
|
687
|
+
n + 1
|
|
688
|
+
]);
|
|
689
|
+
yield* publishEvent({
|
|
690
|
+
type: "TestCaseStarted",
|
|
691
|
+
runId: task.runId,
|
|
692
|
+
testCaseId: testCaseItem.id,
|
|
693
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
694
|
+
startedTestCases: startedEvaluations,
|
|
695
|
+
totalTestCases: totalEvaluations,
|
|
696
|
+
rerunIndex: r + 1,
|
|
697
|
+
rerunTotal: reruns
|
|
698
|
+
});
|
|
687
699
|
const evaluatorScores = [];
|
|
688
700
|
let testCaseError;
|
|
689
701
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -829,6 +841,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
829
841
|
);
|
|
830
842
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
831
843
|
const completedRef = yield* effect.Ref.make(0);
|
|
844
|
+
const startedRef = yield* effect.Ref.make(0);
|
|
832
845
|
const passedRef = yield* effect.Ref.make(0);
|
|
833
846
|
const failedRef = yield* effect.Ref.make(0);
|
|
834
847
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -838,6 +851,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
838
851
|
publishEvent,
|
|
839
852
|
persistenceQueue,
|
|
840
853
|
updateSnapshot,
|
|
854
|
+
startedRef,
|
|
841
855
|
completedRef,
|
|
842
856
|
passedRef,
|
|
843
857
|
failedRef
|
|
@@ -1489,7 +1503,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1489
1503
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
1490
1504
|
return new Promise((resolve5, reject) => {
|
|
1491
1505
|
const app = ink.render(
|
|
1492
|
-
|
|
1506
|
+
React2__namespace.default.createElement(GenerateView, {
|
|
1493
1507
|
runner,
|
|
1494
1508
|
datasetName,
|
|
1495
1509
|
onComplete: (err) => {
|
|
@@ -1646,7 +1660,9 @@ function RunView({
|
|
|
1646
1660
|
);
|
|
1647
1661
|
const [runInfo, setRunInfo] = React2.useState(null);
|
|
1648
1662
|
const [testCases, setTestCases] = React2.useState([]);
|
|
1663
|
+
const [startedEvaluations, setStartedEvaluations] = React2.useState(0);
|
|
1649
1664
|
const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
|
|
1665
|
+
const [runningEvaluations, setRunningEvaluations] = React2.useState([]);
|
|
1650
1666
|
const [summary, setSummary] = React2.useState(null);
|
|
1651
1667
|
const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
|
|
1652
1668
|
const runEval = React2.useCallback(async () => {
|
|
@@ -1683,6 +1699,25 @@ function RunView({
|
|
|
1683
1699
|
let overallScoreCount = 0;
|
|
1684
1700
|
const done = new Promise((resolve5) => {
|
|
1685
1701
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1702
|
+
if (event.type === "TestCaseStarted") {
|
|
1703
|
+
setStartedEvaluations(event.startedTestCases);
|
|
1704
|
+
setRunningEvaluations((prev) => {
|
|
1705
|
+
const withoutDuplicate = prev.filter(
|
|
1706
|
+
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1707
|
+
);
|
|
1708
|
+
return [
|
|
1709
|
+
...withoutDuplicate,
|
|
1710
|
+
{
|
|
1711
|
+
testCaseId: event.testCaseId,
|
|
1712
|
+
name: event.testCaseName,
|
|
1713
|
+
rerunIndex: event.rerunIndex,
|
|
1714
|
+
rerunTotal: event.rerunTotal,
|
|
1715
|
+
startedTestCases: event.startedTestCases,
|
|
1716
|
+
totalTestCases: event.totalTestCases
|
|
1717
|
+
}
|
|
1718
|
+
];
|
|
1719
|
+
});
|
|
1720
|
+
}
|
|
1686
1721
|
if (event.type === "TestCaseProgress") {
|
|
1687
1722
|
for (const item of event.evaluatorScores) {
|
|
1688
1723
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
@@ -1749,6 +1784,11 @@ function RunView({
|
|
|
1749
1784
|
};
|
|
1750
1785
|
byId.set(event.testCaseId, merged);
|
|
1751
1786
|
setCompletedEvaluations(event.completedTestCases);
|
|
1787
|
+
setRunningEvaluations(
|
|
1788
|
+
(running) => running.filter(
|
|
1789
|
+
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1790
|
+
)
|
|
1791
|
+
);
|
|
1752
1792
|
return Array.from(byId.values());
|
|
1753
1793
|
});
|
|
1754
1794
|
}
|
|
@@ -1824,12 +1864,30 @@ function RunView({
|
|
|
1824
1864
|
runInfo.totalTestCases
|
|
1825
1865
|
] })
|
|
1826
1866
|
] }),
|
|
1827
|
-
phase === "running" && /* @__PURE__ */ jsxRuntime.
|
|
1828
|
-
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1867
|
+
phase === "running" && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1868
|
+
/* @__PURE__ */ jsxRuntime.jsx(
|
|
1869
|
+
Spinner,
|
|
1870
|
+
{
|
|
1871
|
+
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1872
|
+
}
|
|
1873
|
+
),
|
|
1874
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "yellow", children: [
|
|
1875
|
+
"[running ",
|
|
1876
|
+
item.startedTestCases,
|
|
1877
|
+
"/",
|
|
1878
|
+
item.totalTestCases,
|
|
1879
|
+
"] ",
|
|
1880
|
+
item.name,
|
|
1881
|
+
" ",
|
|
1882
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1883
|
+
"(",
|
|
1884
|
+
item.rerunIndex,
|
|
1885
|
+
"/",
|
|
1886
|
+
item.rerunTotal,
|
|
1887
|
+
")"
|
|
1888
|
+
] })
|
|
1889
|
+
] }, `${item.testCaseId}:${item.rerunIndex}`)) })
|
|
1890
|
+
] }),
|
|
1833
1891
|
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1834
1892
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1835
1893
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
@@ -2304,9 +2362,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2304
2362
|
let overallScoreTotal = 0;
|
|
2305
2363
|
let overallScoreSumSq = 0;
|
|
2306
2364
|
let overallScoreCount = 0;
|
|
2365
|
+
let startedCount = 0;
|
|
2307
2366
|
let completedCount = 0;
|
|
2308
2367
|
let totalCount = 0;
|
|
2309
2368
|
let runFinished = false;
|
|
2369
|
+
const inFlightReruns = /* @__PURE__ */ new Set();
|
|
2310
2370
|
const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
2311
2371
|
let spinnerIndex = 0;
|
|
2312
2372
|
function clearLine() {
|
|
@@ -2330,7 +2390,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2330
2390
|
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
2331
2391
|
`${completedCount}/${totalCount}`,
|
|
2332
2392
|
ansi2.bold
|
|
2333
|
-
)} ${colorize(
|
|
2393
|
+
)} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
|
|
2334
2394
|
);
|
|
2335
2395
|
}
|
|
2336
2396
|
let lastPrintedTestCaseId = null;
|
|
@@ -2338,8 +2398,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2338
2398
|
let spinnerTimer;
|
|
2339
2399
|
const done = new Promise((resolve5) => {
|
|
2340
2400
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2401
|
+
if (event.type === "TestCaseStarted") {
|
|
2402
|
+
startedCount = event.startedTestCases;
|
|
2403
|
+
inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
|
|
2404
|
+
clearLine();
|
|
2405
|
+
process.stdout.write(
|
|
2406
|
+
`${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
|
|
2407
|
+
`
|
|
2408
|
+
);
|
|
2409
|
+
drawSpinner();
|
|
2410
|
+
}
|
|
2341
2411
|
if (event.type === "TestCaseProgress") {
|
|
2342
2412
|
completedCount = event.completedTestCases;
|
|
2413
|
+
inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
|
|
2343
2414
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
2344
2415
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
2345
2416
|
const testCaseId = event.testCaseId;
|
|
@@ -2547,7 +2618,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2547
2618
|
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2548
2619
|
return new Promise((resolve5, reject) => {
|
|
2549
2620
|
const app = ink.render(
|
|
2550
|
-
|
|
2621
|
+
React2__namespace.createElement(RunView, {
|
|
2551
2622
|
runner,
|
|
2552
2623
|
datasetName,
|
|
2553
2624
|
evaluatorPattern,
|