@m4trix/evals 0.20.0 → 0.21.0
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +84 -13
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +84 -10
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +17 -1
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +17 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +17 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +9 -0
- package/dist/index.js +17 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -7,7 +7,8 @@ import * as jitiModule from 'jiti';
|
|
|
7
7
|
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
9
|
import { diffString } from 'json-diff';
|
|
10
|
-
import
|
|
10
|
+
import * as React2 from 'react';
|
|
11
|
+
import React2__default, { useState, useEffect, useCallback } from 'react';
|
|
11
12
|
import { render, Box, Text } from 'ink';
|
|
12
13
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
13
14
|
|
|
@@ -651,13 +652,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
651
652
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
652
653
|
);
|
|
653
654
|
}
|
|
654
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
655
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
655
656
|
return Effect.gen(function* () {
|
|
656
657
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
657
658
|
const rerunPassed = [];
|
|
658
659
|
for (let r = 0; r < reruns; r++) {
|
|
659
660
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
660
661
|
const started = Date.now();
|
|
662
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
663
|
+
n + 1,
|
|
664
|
+
n + 1
|
|
665
|
+
]);
|
|
666
|
+
yield* publishEvent({
|
|
667
|
+
type: "TestCaseStarted",
|
|
668
|
+
runId: task.runId,
|
|
669
|
+
testCaseId: testCaseItem.id,
|
|
670
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
671
|
+
startedTestCases: startedEvaluations,
|
|
672
|
+
totalTestCases: totalEvaluations,
|
|
673
|
+
rerunIndex: r + 1,
|
|
674
|
+
rerunTotal: reruns
|
|
675
|
+
});
|
|
661
676
|
const evaluatorScores = [];
|
|
662
677
|
let testCaseError;
|
|
663
678
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -803,6 +818,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
803
818
|
);
|
|
804
819
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
805
820
|
const completedRef = yield* Ref.make(0);
|
|
821
|
+
const startedRef = yield* Ref.make(0);
|
|
806
822
|
const passedRef = yield* Ref.make(0);
|
|
807
823
|
const failedRef = yield* Ref.make(0);
|
|
808
824
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -812,6 +828,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
812
828
|
publishEvent,
|
|
813
829
|
persistenceQueue,
|
|
814
830
|
updateSnapshot,
|
|
831
|
+
startedRef,
|
|
815
832
|
completedRef,
|
|
816
833
|
passedRef,
|
|
817
834
|
failedRef
|
|
@@ -1463,7 +1480,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1463
1480
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
1464
1481
|
return new Promise((resolve5, reject) => {
|
|
1465
1482
|
const app = render(
|
|
1466
|
-
|
|
1483
|
+
React2__default.createElement(GenerateView, {
|
|
1467
1484
|
runner,
|
|
1468
1485
|
datasetName,
|
|
1469
1486
|
onComplete: (err) => {
|
|
@@ -1620,7 +1637,9 @@ function RunView({
|
|
|
1620
1637
|
);
|
|
1621
1638
|
const [runInfo, setRunInfo] = useState(null);
|
|
1622
1639
|
const [testCases, setTestCases] = useState([]);
|
|
1640
|
+
const [startedEvaluations, setStartedEvaluations] = useState(0);
|
|
1623
1641
|
const [completedEvaluations, setCompletedEvaluations] = useState(0);
|
|
1642
|
+
const [runningEvaluations, setRunningEvaluations] = useState([]);
|
|
1624
1643
|
const [summary, setSummary] = useState(null);
|
|
1625
1644
|
const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
|
|
1626
1645
|
const runEval = useCallback(async () => {
|
|
@@ -1657,6 +1676,25 @@ function RunView({
|
|
|
1657
1676
|
let overallScoreCount = 0;
|
|
1658
1677
|
const done = new Promise((resolve5) => {
|
|
1659
1678
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1679
|
+
if (event.type === "TestCaseStarted") {
|
|
1680
|
+
setStartedEvaluations(event.startedTestCases);
|
|
1681
|
+
setRunningEvaluations((prev) => {
|
|
1682
|
+
const withoutDuplicate = prev.filter(
|
|
1683
|
+
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1684
|
+
);
|
|
1685
|
+
return [
|
|
1686
|
+
...withoutDuplicate,
|
|
1687
|
+
{
|
|
1688
|
+
testCaseId: event.testCaseId,
|
|
1689
|
+
name: event.testCaseName,
|
|
1690
|
+
rerunIndex: event.rerunIndex,
|
|
1691
|
+
rerunTotal: event.rerunTotal,
|
|
1692
|
+
startedTestCases: event.startedTestCases,
|
|
1693
|
+
totalTestCases: event.totalTestCases
|
|
1694
|
+
}
|
|
1695
|
+
];
|
|
1696
|
+
});
|
|
1697
|
+
}
|
|
1660
1698
|
if (event.type === "TestCaseProgress") {
|
|
1661
1699
|
for (const item of event.evaluatorScores) {
|
|
1662
1700
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
@@ -1723,6 +1761,11 @@ function RunView({
|
|
|
1723
1761
|
};
|
|
1724
1762
|
byId.set(event.testCaseId, merged);
|
|
1725
1763
|
setCompletedEvaluations(event.completedTestCases);
|
|
1764
|
+
setRunningEvaluations(
|
|
1765
|
+
(running) => running.filter(
|
|
1766
|
+
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1767
|
+
)
|
|
1768
|
+
);
|
|
1726
1769
|
return Array.from(byId.values());
|
|
1727
1770
|
});
|
|
1728
1771
|
}
|
|
@@ -1798,12 +1841,30 @@ function RunView({
|
|
|
1798
1841
|
runInfo.totalTestCases
|
|
1799
1842
|
] })
|
|
1800
1843
|
] }),
|
|
1801
|
-
phase === "running" && /* @__PURE__ */
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
|
|
1844
|
+
phase === "running" && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1845
|
+
/* @__PURE__ */ jsx(
|
|
1846
|
+
Spinner,
|
|
1847
|
+
{
|
|
1848
|
+
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1849
|
+
}
|
|
1850
|
+
),
|
|
1851
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(Text, { color: "yellow", children: [
|
|
1852
|
+
"[running ",
|
|
1853
|
+
item.startedTestCases,
|
|
1854
|
+
"/",
|
|
1855
|
+
item.totalTestCases,
|
|
1856
|
+
"] ",
|
|
1857
|
+
item.name,
|
|
1858
|
+
" ",
|
|
1859
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1860
|
+
"(",
|
|
1861
|
+
item.rerunIndex,
|
|
1862
|
+
"/",
|
|
1863
|
+
item.rerunTotal,
|
|
1864
|
+
")"
|
|
1865
|
+
] })
|
|
1866
|
+
] }, `${item.testCaseId}:${item.rerunIndex}`)) })
|
|
1867
|
+
] }),
|
|
1807
1868
|
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1808
1869
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1809
1870
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
@@ -2278,9 +2339,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2278
2339
|
let overallScoreTotal = 0;
|
|
2279
2340
|
let overallScoreSumSq = 0;
|
|
2280
2341
|
let overallScoreCount = 0;
|
|
2342
|
+
let startedCount = 0;
|
|
2281
2343
|
let completedCount = 0;
|
|
2282
2344
|
let totalCount = 0;
|
|
2283
2345
|
let runFinished = false;
|
|
2346
|
+
const inFlightReruns = /* @__PURE__ */ new Set();
|
|
2284
2347
|
const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
2285
2348
|
let spinnerIndex = 0;
|
|
2286
2349
|
function clearLine() {
|
|
@@ -2304,7 +2367,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2304
2367
|
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
2305
2368
|
`${completedCount}/${totalCount}`,
|
|
2306
2369
|
ansi2.bold
|
|
2307
|
-
)} ${colorize(
|
|
2370
|
+
)} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
|
|
2308
2371
|
);
|
|
2309
2372
|
}
|
|
2310
2373
|
let lastPrintedTestCaseId = null;
|
|
@@ -2312,8 +2375,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2312
2375
|
let spinnerTimer;
|
|
2313
2376
|
const done = new Promise((resolve5) => {
|
|
2314
2377
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2378
|
+
if (event.type === "TestCaseStarted") {
|
|
2379
|
+
startedCount = event.startedTestCases;
|
|
2380
|
+
inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
|
|
2381
|
+
clearLine();
|
|
2382
|
+
process.stdout.write(
|
|
2383
|
+
`${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
|
|
2384
|
+
`
|
|
2385
|
+
);
|
|
2386
|
+
drawSpinner();
|
|
2387
|
+
}
|
|
2315
2388
|
if (event.type === "TestCaseProgress") {
|
|
2316
2389
|
completedCount = event.completedTestCases;
|
|
2390
|
+
inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
|
|
2317
2391
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
2318
2392
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
2319
2393
|
const testCaseId = event.testCaseId;
|