@m4trix/evals 0.20.0 → 0.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +222 -28
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +221 -27
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +115 -5
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +114 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +117 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +10 -2
- package/dist/index.js +114 -5
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/cli-simple.js
CHANGED
|
@@ -6,8 +6,11 @@ import { resolve, relative, join, parse, dirname } from 'path';
|
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
7
7
|
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
|
-
import {
|
|
10
|
-
import
|
|
9
|
+
import { diffLines } from 'diff';
|
|
10
|
+
import stringify from 'fast-json-stable-stringify';
|
|
11
|
+
import { cpus } from 'os';
|
|
12
|
+
import * as React2 from 'react';
|
|
13
|
+
import React2__default, { useState, useEffect, useCallback } from 'react';
|
|
11
14
|
import { render, Box, Text } from 'ink';
|
|
12
15
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
13
16
|
|
|
@@ -260,10 +263,102 @@ async function collectTestCasesFromFiles(config) {
|
|
|
260
263
|
);
|
|
261
264
|
return found.flat();
|
|
262
265
|
}
|
|
266
|
+
/**
 * Recursively normalizes a value before it is rendered for diffing.
 *
 * Supported options (all optional):
 *  - `sort`: order array elements by their stable-stringified form.
 *  - `excludeKeys`: object keys to drop, as an array or a comma-separated string.
 *  - `precision`: number of decimals numbers are rounded to (via `toFixed`).
 *
 * Arrays are always walked, so `excludeKeys` and `precision` also apply to
 * values nested inside arrays even when `sort` is not requested.
 *
 * @param {*} value - Value to normalize; the input is never mutated.
 * @param {{sort?: boolean, excludeKeys?: string[]|string, precision?: number}} [options]
 * @returns {*} A normalized copy (primitives are returned as-is unless rounded).
 */
function preprocessForDiff(value, options) {
  if (Array.isArray(value)) {
    // Normalize each element once up front instead of inside every comparison.
    const items = Array.from(value, (item) => preprocessForDiff(item, options));
    if (options?.sort) {
      items.sort((a, b) => stringify(a).localeCompare(stringify(b)));
    }
    return items;
  }
  if (value !== null && typeof value === "object") {
    const excludeKeys = options?.excludeKeys;
    let excluded = [];
    if (excludeKeys) {
      excluded = Array.isArray(excludeKeys)
        ? excludeKeys
        : excludeKeys.split(",").map((k) => k.trim());
    }
    // Object.fromEntries defines own properties, so a data key such as
    // "__proto__" is kept as data instead of changing the result's prototype.
    return Object.fromEntries(
      Object.entries(value)
        .filter(([k]) => !excluded.includes(k))
        .map(([k, v]) => [k, preprocessForDiff(v, options)])
    );
  }
  if (typeof value === "number" && options?.precision !== void 0) {
    return Number(value.toFixed(options.precision));
  }
  return value;
}
|
|
296
|
+
/**
 * Renders a value as 2-space-indented JSON, using the stable stringifier so
 * the text is produced from a deterministic serialization.
 *
 * Always returns a string: the result is handed to a line-based text diff,
 * which cannot work on `undefined`.
 *
 * @param {*} value - Value to render.
 * @returns {string} Pretty-printed JSON, the compact form if it cannot be
 *   re-parsed, or `"undefined"` when the value has no JSON representation.
 */
function toPrettyJson(value) {
  const compact = stringify(value);
  // The stringifier yields no string for values JSON cannot represent
  // (e.g. `undefined`); fall back to a printable placeholder.
  if (typeof compact !== "string") {
    return String(compact);
  }
  try {
    return JSON.stringify(JSON.parse(compact), null, 2);
  } catch {
    // Compact output was not re-parseable; show it verbatim.
    return compact;
  }
}
|
|
305
|
+
/**
 * Flattens diff parts into printable text, one output line per input line.
 *
 * Lines of added parts are prefixed with "+ ", lines of removed parts with
 * "- ", and unchanged lines get no prefix. The empty string that follows a
 * part's trailing newline is not emitted.
 *
 * @param {Array<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The formatted lines joined with "\n".
 */
function formatDiffParts(parts) {
  return parts
    .flatMap((part) => {
      let marker = "";
      if (part.added) {
        marker = "+ ";
      } else if (part.removed) {
        marker = "- ";
      }
      const rows = part.value.split("\n");
      // Only the final segment is dropped when empty; interior blank lines stay.
      if (rows[rows.length - 1] === "") {
        rows.pop();
      }
      return rows.map((row) => marker + row);
    })
    .join("\n");
}
|
|
263
319
|
/**
 * Builds a printable line diff between an expected and an actual value.
 *
 * Both values are first normalized with `preprocessForDiff(…, diffOptions)`.
 *  - `diffOptions.keysOnly`: diff only the key structure of both values
 *    (see `extractKeys`).
 *  - otherwise: diff the pretty-printed JSON of both values; returns "" when
 *    the two renderings are identical.
 *  - `diffOptions.outputNewOnly`: keep only the added parts of that diff.
 *
 * @param {*} expected
 * @param {*} actual
 * @param {object} [diffOptions]
 * @returns {string} Formatted diff text.
 */
function createDiffString(expected, actual, diffOptions) {
  const normalizedExpected = preprocessForDiff(expected, diffOptions);
  const normalizedActual = preprocessForDiff(actual, diffOptions);

  if (diffOptions?.keysOnly) {
    const outline = (processed) => JSON.stringify(extractKeys(processed), null, 2);
    const expectedOutline = outline(normalizedExpected);
    const actualOutline = outline(normalizedActual);
    return formatDiffParts(diffLines(expectedOutline, actualOutline));
  }

  const expectedText = toPrettyJson(normalizedExpected);
  const actualText = toPrettyJson(normalizedActual);
  if (expectedText === actualText) {
    return "";
  }

  const allParts = diffLines(expectedText, actualText);
  const shownParts = diffOptions?.outputNewOnly
    ? allParts.filter((part) => part.added === true)
    : allParts;
  return formatDiffParts(shownParts);
}
|
|
350
|
+
/**
 * Reduces a value to its key structure: every non-object leaf (including
 * `null`) becomes the placeholder "·", arrays keep their length and order,
 * and objects keep their own enumerable keys.
 *
 * @param {*} value - Value to outline.
 * @returns {*} "·" for leaves, otherwise an array/object of the same shape.
 */
function extractKeys(value) {
  if (value === null || typeof value !== "object") {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((item) => extractKeys(item));
  }
  // Object.fromEntries defines own properties, so a data key such as
  // "__proto__" shows up in the outline instead of changing its prototype.
  return Object.fromEntries(
    Object.entries(value).map(([k, v]) => [k, extractKeys(v)])
  );
}
|
|
268
363
|
function formatLogMessage(msg) {
|
|
269
364
|
if (typeof msg === "string")
|
|
@@ -651,13 +746,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
651
746
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
652
747
|
);
|
|
653
748
|
}
|
|
654
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
749
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
655
750
|
return Effect.gen(function* () {
|
|
656
751
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
657
752
|
const rerunPassed = [];
|
|
658
753
|
for (let r = 0; r < reruns; r++) {
|
|
659
754
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
660
755
|
const started = Date.now();
|
|
756
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
757
|
+
n + 1,
|
|
758
|
+
n + 1
|
|
759
|
+
]);
|
|
760
|
+
yield* publishEvent({
|
|
761
|
+
type: "TestCaseStarted",
|
|
762
|
+
runId: task.runId,
|
|
763
|
+
testCaseId: testCaseItem.id,
|
|
764
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
765
|
+
startedTestCases: startedEvaluations,
|
|
766
|
+
totalTestCases: totalEvaluations,
|
|
767
|
+
rerunIndex: r + 1,
|
|
768
|
+
rerunTotal: reruns
|
|
769
|
+
});
|
|
661
770
|
const evaluatorScores = [];
|
|
662
771
|
let testCaseError;
|
|
663
772
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -803,6 +912,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
803
912
|
);
|
|
804
913
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
805
914
|
const completedRef = yield* Ref.make(0);
|
|
915
|
+
const startedRef = yield* Ref.make(0);
|
|
806
916
|
const passedRef = yield* Ref.make(0);
|
|
807
917
|
const failedRef = yield* Ref.make(0);
|
|
808
918
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -812,6 +922,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
812
922
|
publishEvent,
|
|
813
923
|
persistenceQueue,
|
|
814
924
|
updateSnapshot,
|
|
925
|
+
startedRef,
|
|
815
926
|
completedRef,
|
|
816
927
|
passedRef,
|
|
817
928
|
failedRef
|
|
@@ -1287,8 +1398,9 @@ var EffectRunner = class {
|
|
|
1287
1398
|
);
|
|
1288
1399
|
}
|
|
1289
1400
|
};
|
|
1290
|
-
|
|
1291
|
-
|
|
1401
|
+
/**
 * Default number of concurrently running test cases: the length of the list
 * reported by `os.cpus()`, but never less than 1.
 *
 * @returns {number} Concurrency to use when none is given on the command line.
 */
function getDefaultConcurrency() {
  const coreCount = cpus().length;
  // An empty CPU list must still allow one worker.
  return coreCount >= 1 ? coreCount : 1;
}
|
|
1292
1404
|
function parseSimpleCliArgs(argv) {
|
|
1293
1405
|
const args = {
|
|
1294
1406
|
help: false,
|
|
@@ -1315,6 +1427,14 @@ function parseSimpleCliArgs(argv) {
|
|
|
1315
1427
|
index += 1;
|
|
1316
1428
|
continue;
|
|
1317
1429
|
}
|
|
1430
|
+
if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
|
|
1431
|
+
const n = parseInt(argv[index + 1], 10);
|
|
1432
|
+
if (!Number.isNaN(n) && n >= 1) {
|
|
1433
|
+
args.concurrency = n;
|
|
1434
|
+
}
|
|
1435
|
+
index += 1;
|
|
1436
|
+
continue;
|
|
1437
|
+
}
|
|
1318
1438
|
args.unknownArgs.push(token);
|
|
1319
1439
|
}
|
|
1320
1440
|
return args;
|
|
@@ -1322,9 +1442,12 @@ function parseSimpleCliArgs(argv) {
|
|
|
1322
1442
|
function getSimpleCliUsage() {
|
|
1323
1443
|
return [
|
|
1324
1444
|
"Usage:",
|
|
1325
|
-
" eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>",
|
|
1445
|
+
" eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
|
|
1326
1446
|
" eval-agents-simple generate --dataset <datasetName>",
|
|
1327
1447
|
"",
|
|
1448
|
+
"Options:",
|
|
1449
|
+
" --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
|
|
1450
|
+
"",
|
|
1328
1451
|
"Pattern examples for --evaluator:",
|
|
1329
1452
|
" score-evaluator exact name (case-insensitive)",
|
|
1330
1453
|
' "*score*" wildcard pattern',
|
|
@@ -1463,7 +1586,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1463
1586
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
1464
1587
|
return new Promise((resolve5, reject) => {
|
|
1465
1588
|
const app = render(
|
|
1466
|
-
|
|
1589
|
+
React2__default.createElement(GenerateView, {
|
|
1467
1590
|
runner,
|
|
1468
1591
|
datasetName,
|
|
1469
1592
|
onComplete: (err) => {
|
|
@@ -1613,6 +1736,7 @@ function RunView({
|
|
|
1613
1736
|
runner,
|
|
1614
1737
|
datasetName,
|
|
1615
1738
|
evaluatorPattern,
|
|
1739
|
+
concurrency,
|
|
1616
1740
|
onComplete
|
|
1617
1741
|
}) {
|
|
1618
1742
|
const [phase, setPhase] = useState(
|
|
@@ -1620,7 +1744,9 @@ function RunView({
|
|
|
1620
1744
|
);
|
|
1621
1745
|
const [runInfo, setRunInfo] = useState(null);
|
|
1622
1746
|
const [testCases, setTestCases] = useState([]);
|
|
1747
|
+
const [startedEvaluations, setStartedEvaluations] = useState(0);
|
|
1623
1748
|
const [completedEvaluations, setCompletedEvaluations] = useState(0);
|
|
1749
|
+
const [runningEvaluations, setRunningEvaluations] = useState([]);
|
|
1624
1750
|
const [summary, setSummary] = useState(null);
|
|
1625
1751
|
const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
|
|
1626
1752
|
const runEval = useCallback(async () => {
|
|
@@ -1657,6 +1783,25 @@ function RunView({
|
|
|
1657
1783
|
let overallScoreCount = 0;
|
|
1658
1784
|
const done = new Promise((resolve5) => {
|
|
1659
1785
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1786
|
+
if (event.type === "TestCaseStarted") {
|
|
1787
|
+
setStartedEvaluations(event.startedTestCases);
|
|
1788
|
+
setRunningEvaluations((prev) => {
|
|
1789
|
+
const withoutDuplicate = prev.filter(
|
|
1790
|
+
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1791
|
+
);
|
|
1792
|
+
return [
|
|
1793
|
+
...withoutDuplicate,
|
|
1794
|
+
{
|
|
1795
|
+
testCaseId: event.testCaseId,
|
|
1796
|
+
name: event.testCaseName,
|
|
1797
|
+
rerunIndex: event.rerunIndex,
|
|
1798
|
+
rerunTotal: event.rerunTotal,
|
|
1799
|
+
startedTestCases: event.startedTestCases,
|
|
1800
|
+
totalTestCases: event.totalTestCases
|
|
1801
|
+
}
|
|
1802
|
+
];
|
|
1803
|
+
});
|
|
1804
|
+
}
|
|
1660
1805
|
if (event.type === "TestCaseProgress") {
|
|
1661
1806
|
for (const item of event.evaluatorScores) {
|
|
1662
1807
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
@@ -1723,6 +1868,11 @@ function RunView({
|
|
|
1723
1868
|
};
|
|
1724
1869
|
byId.set(event.testCaseId, merged);
|
|
1725
1870
|
setCompletedEvaluations(event.completedTestCases);
|
|
1871
|
+
setRunningEvaluations(
|
|
1872
|
+
(running) => running.filter(
|
|
1873
|
+
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1874
|
+
)
|
|
1875
|
+
);
|
|
1726
1876
|
return Array.from(byId.values());
|
|
1727
1877
|
});
|
|
1728
1878
|
}
|
|
@@ -1734,7 +1884,8 @@ function RunView({
|
|
|
1734
1884
|
});
|
|
1735
1885
|
const snapshot = await runner.runDatasetWith({
|
|
1736
1886
|
datasetId: dataset.id,
|
|
1737
|
-
evaluatorIds: evaluators.map((item) => item.id)
|
|
1887
|
+
evaluatorIds: evaluators.map((item) => item.id),
|
|
1888
|
+
concurrency
|
|
1738
1889
|
});
|
|
1739
1890
|
setRunInfo({
|
|
1740
1891
|
runId: snapshot.runId,
|
|
@@ -1762,7 +1913,7 @@ function RunView({
|
|
|
1762
1913
|
});
|
|
1763
1914
|
setPhase("completed");
|
|
1764
1915
|
setTimeout(() => onComplete(), 200);
|
|
1765
|
-
}, [runner, datasetName, evaluatorPattern, onComplete]);
|
|
1916
|
+
}, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
|
|
1766
1917
|
useEffect(() => {
|
|
1767
1918
|
void runEval();
|
|
1768
1919
|
}, [runEval]);
|
|
@@ -1798,12 +1949,38 @@ function RunView({
|
|
|
1798
1949
|
runInfo.totalTestCases
|
|
1799
1950
|
] })
|
|
1800
1951
|
] }),
|
|
1801
|
-
phase === "running" && /* @__PURE__ */
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
|
|
1952
|
+
phase === "running" && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1953
|
+
/* @__PURE__ */ jsx(
|
|
1954
|
+
Spinner,
|
|
1955
|
+
{
|
|
1956
|
+
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1957
|
+
}
|
|
1958
|
+
),
|
|
1959
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
|
|
1960
|
+
Text,
|
|
1961
|
+
{
|
|
1962
|
+
color: "yellow",
|
|
1963
|
+
children: [
|
|
1964
|
+
"[running ",
|
|
1965
|
+
item.startedTestCases,
|
|
1966
|
+
"/",
|
|
1967
|
+
item.totalTestCases,
|
|
1968
|
+
"]",
|
|
1969
|
+
" ",
|
|
1970
|
+
item.name,
|
|
1971
|
+
" ",
|
|
1972
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1973
|
+
"(",
|
|
1974
|
+
item.rerunIndex,
|
|
1975
|
+
"/",
|
|
1976
|
+
item.rerunTotal,
|
|
1977
|
+
")"
|
|
1978
|
+
] })
|
|
1979
|
+
]
|
|
1980
|
+
},
|
|
1981
|
+
`${item.testCaseId}:${item.rerunIndex}`
|
|
1982
|
+
)) })
|
|
1983
|
+
] }),
|
|
1807
1984
|
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1808
1985
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1809
1986
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
@@ -1884,7 +2061,7 @@ function RunView({
|
|
|
1884
2061
|
},
|
|
1885
2062
|
`${item.evaluatorId}-${s.id}-${idx}`
|
|
1886
2063
|
);
|
|
1887
|
-
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "
|
|
2064
|
+
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
|
|
1888
2065
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1889
2066
|
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1890
2067
|
({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
|
|
@@ -1942,9 +2119,9 @@ function RunView({
|
|
|
1942
2119
|
/* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
|
|
1943
2120
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
1944
2121
|
const agg = summary.aggregates.get(id);
|
|
1945
|
-
const scoreKeys = [
|
|
1946
|
-
(
|
|
1947
|
-
);
|
|
2122
|
+
const scoreKeys = [
|
|
2123
|
+
...summary.scoreItemsByEvaluatorScore?.keys() ?? []
|
|
2124
|
+
].filter((k) => k.startsWith(`${id}:`));
|
|
1948
2125
|
if (scoreKeys.length === 0) {
|
|
1949
2126
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1950
2127
|
"- ",
|
|
@@ -2252,7 +2429,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2252
2429
|
}
|
|
2253
2430
|
return lines;
|
|
2254
2431
|
}
|
|
2255
|
-
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
2432
|
+
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2256
2433
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
2257
2434
|
if (!dataset) {
|
|
2258
2435
|
const known = await runner.collectDatasets();
|
|
@@ -2278,9 +2455,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2278
2455
|
let overallScoreTotal = 0;
|
|
2279
2456
|
let overallScoreSumSq = 0;
|
|
2280
2457
|
let overallScoreCount = 0;
|
|
2458
|
+
let startedCount = 0;
|
|
2281
2459
|
let completedCount = 0;
|
|
2282
2460
|
let totalCount = 0;
|
|
2283
2461
|
let runFinished = false;
|
|
2462
|
+
const inFlightReruns = /* @__PURE__ */ new Set();
|
|
2284
2463
|
const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
2285
2464
|
let spinnerIndex = 0;
|
|
2286
2465
|
function clearLine() {
|
|
@@ -2304,7 +2483,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2304
2483
|
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
2305
2484
|
`${completedCount}/${totalCount}`,
|
|
2306
2485
|
ansi2.bold
|
|
2307
|
-
)} ${colorize(
|
|
2486
|
+
)} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
|
|
2308
2487
|
);
|
|
2309
2488
|
}
|
|
2310
2489
|
let lastPrintedTestCaseId = null;
|
|
@@ -2312,8 +2491,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2312
2491
|
let spinnerTimer;
|
|
2313
2492
|
const done = new Promise((resolve5) => {
|
|
2314
2493
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2494
|
+
if (event.type === "TestCaseStarted") {
|
|
2495
|
+
startedCount = event.startedTestCases;
|
|
2496
|
+
inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
|
|
2497
|
+
clearLine();
|
|
2498
|
+
process.stdout.write(
|
|
2499
|
+
`${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
|
|
2500
|
+
`
|
|
2501
|
+
);
|
|
2502
|
+
drawSpinner();
|
|
2503
|
+
}
|
|
2315
2504
|
if (event.type === "TestCaseProgress") {
|
|
2316
2505
|
completedCount = event.completedTestCases;
|
|
2506
|
+
inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
|
|
2317
2507
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
2318
2508
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
2319
2509
|
const testCaseId = event.testCaseId;
|
|
@@ -2429,7 +2619,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2429
2619
|
});
|
|
2430
2620
|
const snapshot = await runner.runDatasetWith({
|
|
2431
2621
|
datasetId: dataset.id,
|
|
2432
|
-
evaluatorIds: evaluators.map((item) => item.id)
|
|
2622
|
+
evaluatorIds: evaluators.map((item) => item.id),
|
|
2623
|
+
concurrency
|
|
2433
2624
|
});
|
|
2434
2625
|
totalCount = snapshot.totalTestCases;
|
|
2435
2626
|
console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
@@ -2518,13 +2709,14 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2518
2709
|
}
|
|
2519
2710
|
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2520
2711
|
}
|
|
2521
|
-
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2712
|
+
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2522
2713
|
return new Promise((resolve5, reject) => {
|
|
2523
2714
|
const app = render(
|
|
2524
2715
|
React2.createElement(RunView, {
|
|
2525
2716
|
runner,
|
|
2526
2717
|
datasetName,
|
|
2527
2718
|
evaluatorPattern,
|
|
2719
|
+
concurrency,
|
|
2528
2720
|
onComplete: (err) => {
|
|
2529
2721
|
app.unmount();
|
|
2530
2722
|
if (err) {
|
|
@@ -2571,10 +2763,12 @@ async function main() {
|
|
|
2571
2763
|
const runner = createRunner();
|
|
2572
2764
|
try {
|
|
2573
2765
|
if (args.command === "run") {
|
|
2766
|
+
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
2574
2767
|
await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
|
|
2575
2768
|
runner,
|
|
2576
2769
|
args.datasetName,
|
|
2577
|
-
args.evaluatorPattern
|
|
2770
|
+
args.evaluatorPattern,
|
|
2771
|
+
concurrency
|
|
2578
2772
|
);
|
|
2579
2773
|
return;
|
|
2580
2774
|
}
|