@m4trix/evals 0.20.0 → 0.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +222 -28
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +221 -27
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +115 -5
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +114 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +117 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +10 -2
- package/dist/index.js +114 -5
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/cli-simple.cjs
CHANGED
|
@@ -8,7 +8,9 @@ var path = require('path');
|
|
|
8
8
|
var jitiModule = require('jiti');
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
|
-
var
|
|
11
|
+
var diff = require('diff');
|
|
12
|
+
var stringify = require('fast-json-stable-stringify');
|
|
13
|
+
var os = require('os');
|
|
12
14
|
var React2 = require('react');
|
|
13
15
|
var ink = require('ink');
|
|
14
16
|
var jsxRuntime = require('react/jsx-runtime');
|
|
@@ -35,7 +37,8 @@ function _interopNamespace(e) {
|
|
|
35
37
|
}
|
|
36
38
|
|
|
37
39
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
38
|
-
var
|
|
40
|
+
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
41
|
+
var React2__namespace = /*#__PURE__*/_interopNamespace(React2);
|
|
39
42
|
|
|
40
43
|
// src/runner/config.ts
|
|
41
44
|
var defaultRunnerConfig = {
|
|
@@ -286,10 +289,102 @@ async function collectTestCasesFromFiles(config) {
|
|
|
286
289
|
);
|
|
287
290
|
return found.flat();
|
|
288
291
|
}
|
|
292
|
+
/**
 * Normalises a value before it is serialised and diffed.
 *
 * The value is walked recursively and a processed copy is returned:
 *  - `options.excludeKeys` (array, or comma-separated string) drops matching
 *    keys from every plain object encountered;
 *  - `options.precision` rounds every number via `toFixed(precision)`;
 *  - `options.sort` orders every array by the stable JSON serialisation of
 *    its (already processed) items.
 *
 * Arrays are always descended into, so `excludeKeys` and `precision` also
 * apply to objects and numbers nested inside arrays even when `sort` is off.
 *
 * @param {*} value - Any JSON-like value.
 * @param {{sort?: boolean, excludeKeys?: string[]|string, precision?: number}} [options]
 * @returns {*} A processed copy; primitives are returned as-is (numbers
 *   possibly rounded).
 */
function preprocessForDiff(value, options) {
  // Resolve the options once instead of on every recursive call.
  const rawExclude = options?.excludeKeys;
  let excludedList = [];
  if (rawExclude) {
    excludedList = Array.isArray(rawExclude)
      ? rawExclude
      : rawExclude.split(",").map((k) => k.trim());
  }
  const excluded = new Set(excludedList);
  const sortArrays = Boolean(options?.sort);
  const precision = options?.precision;

  const walk = (node) => {
    if (Array.isArray(node)) {
      const items = node.map((item) => walk(item));
      if (!sortArrays) {
        return items;
      }
      // Serialise each item once rather than inside the comparator.
      // A non-string serialisation (e.g. for `undefined`) sorts as "".
      return items
        .map((item) => ({ item, key: stringify__default.default(item) ?? "" }))
        .sort((a, b) => a.key.localeCompare(b.key))
        .map((entry) => entry.item);
    }
    if (node !== null && typeof node === "object") {
      const result = {};
      for (const [k, v] of Object.entries(node)) {
        if (!excluded.has(k)) {
          result[k] = walk(v);
        }
      }
      return result;
    }
    if (typeof node === "number" && precision !== void 0) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return walk(value);
}
|
|
322
|
+
/**
 * Serialises a value to two-space-indented JSON.
 *
 * The value is first run through the stable stringifier and then re-parsed
 * and re-serialised with indentation.
 *
 * @param {*} value - Value to serialise.
 * @returns {string} Always a string, so the result can be handed to a line
 *   differ safely.
 */
function toPrettyJson(value) {
  const str = stringify__default.default(value);
  // The serializer may yield a non-string (presumably `undefined` for an
  // `undefined` input, like JSON.stringify — confirm against the library).
  // Returning it unchanged would break string-based callers.
  if (typeof str !== "string") {
    return String(str);
  }
  try {
    return JSON.stringify(JSON.parse(str), null, 2);
  } catch {
    // Fall back to the compact form if it cannot be re-parsed.
    return str;
  }
}
|
|
331
|
+
/**
 * Renders diff parts as text, one output line per input line.
 *
 * Lines from added parts are prefixed with "+ ", lines from removed parts
 * with "- ", and all other lines get no prefix. A single trailing empty
 * segment of each part (the remainder after a final newline) is dropped.
 *
 * @param {Array<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The rendered lines joined with "\n".
 */
function formatDiffParts(parts) {
  const markerFor = (part) => {
    if (part.added) {
      return "+ ";
    }
    if (part.removed) {
      return "- ";
    }
    return "";
  };
  return parts
    .flatMap((part) => {
      const segments = part.value.split("\n");
      // split() always yields at least one segment; drop only a trailing "".
      if (segments[segments.length - 1] === "") {
        segments.pop();
      }
      const marker = markerFor(part);
      return segments.map((segment) => marker + segment);
    })
    .join("\n");
}
|
|
289
345
|
/**
 * Builds a line-based textual diff between an expected and an actual value.
 *
 * Both values are normalised with `preprocessForDiff` using `diffOptions`.
 *  - With `diffOptions.keysOnly`, only the key structure (as produced by
 *    `extractKeys`) is compared.
 *  - Otherwise the pretty-printed JSON of both values is compared; with
 *    `diffOptions.outputNewOnly` only added lines are kept.
 *
 * @param {*} expected - Reference value.
 * @param {*} actual - Value to compare against the reference.
 * @param {object} [diffOptions] - Options read here: `keysOnly`,
 *   `outputNewOnly`; the whole object is also passed to `preprocessForDiff`.
 * @returns {string} The formatted diff, or "" when the compared
 *   representations are identical.
 */
function createDiffString(expected, actual, diffOptions) {
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
  const actualProcessed = preprocessForDiff(actual, diffOptions);
  if (diffOptions?.keysOnly) {
    const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
    const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
    // Identical key structures mean "no difference", as in the value path.
    if (expectedKeys === actualKeys) {
      return "";
    }
    return formatDiffParts(diff.diffLines(expectedKeys, actualKeys));
  }
  const expectedStr = toPrettyJson(expectedProcessed);
  const actualStr = toPrettyJson(actualProcessed);
  if (expectedStr === actualStr) {
    return "";
  }
  const parts = diff.diffLines(expectedStr, actualStr);
  if (diffOptions?.outputNewOnly) {
    return formatDiffParts(parts.filter((p) => p.added === true));
  }
  return formatDiffParts(parts);
}
|
|
376
|
+
/**
 * Reduces a value to its key structure.
 *
 * Every non-object leaf (including `null`) becomes the placeholder "\xB7";
 * arrays and objects keep their shape, with each element or property value
 * reduced recursively.
 *
 * @param {*} value - Any JSON-like value.
 * @returns {*} The placeholder string, an array, or an object mirroring the
 *   input's structure.
 */
function extractKeys(value) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((element) => extractKeys(element));
  }
  const skeleton = {};
  Object.keys(value).forEach((key) => {
    skeleton[key] = extractKeys(value[key]);
  });
  return skeleton;
}
|
|
294
389
|
function formatLogMessage(msg) {
|
|
295
390
|
if (typeof msg === "string")
|
|
@@ -677,13 +772,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
677
772
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
678
773
|
);
|
|
679
774
|
}
|
|
680
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
775
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
681
776
|
return effect.Effect.gen(function* () {
|
|
682
777
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
683
778
|
const rerunPassed = [];
|
|
684
779
|
for (let r = 0; r < reruns; r++) {
|
|
685
780
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
686
781
|
const started = Date.now();
|
|
782
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
783
|
+
n + 1,
|
|
784
|
+
n + 1
|
|
785
|
+
]);
|
|
786
|
+
yield* publishEvent({
|
|
787
|
+
type: "TestCaseStarted",
|
|
788
|
+
runId: task.runId,
|
|
789
|
+
testCaseId: testCaseItem.id,
|
|
790
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
791
|
+
startedTestCases: startedEvaluations,
|
|
792
|
+
totalTestCases: totalEvaluations,
|
|
793
|
+
rerunIndex: r + 1,
|
|
794
|
+
rerunTotal: reruns
|
|
795
|
+
});
|
|
687
796
|
const evaluatorScores = [];
|
|
688
797
|
let testCaseError;
|
|
689
798
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -829,6 +938,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
829
938
|
);
|
|
830
939
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
831
940
|
const completedRef = yield* effect.Ref.make(0);
|
|
941
|
+
const startedRef = yield* effect.Ref.make(0);
|
|
832
942
|
const passedRef = yield* effect.Ref.make(0);
|
|
833
943
|
const failedRef = yield* effect.Ref.make(0);
|
|
834
944
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -838,6 +948,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
838
948
|
publishEvent,
|
|
839
949
|
persistenceQueue,
|
|
840
950
|
updateSnapshot,
|
|
951
|
+
startedRef,
|
|
841
952
|
completedRef,
|
|
842
953
|
passedRef,
|
|
843
954
|
failedRef
|
|
@@ -1313,8 +1424,9 @@ var EffectRunner = class {
|
|
|
1313
1424
|
);
|
|
1314
1425
|
}
|
|
1315
1426
|
};
|
|
1316
|
-
|
|
1317
|
-
|
|
1427
|
+
/**
 * Returns the default number of test cases to run concurrently.
 *
 * Uses `os.availableParallelism()` when the runtime provides it, otherwise
 * the number of entries reported by `os.cpus()`. The result is clamped to
 * at least 1 because `os.cpus()` may report an empty list.
 *
 * @returns {number} A concurrency level of at least 1.
 */
function getDefaultConcurrency() {
  const detected = typeof os.availableParallelism === "function"
    ? os.availableParallelism()
    : os.cpus().length;
  return Math.max(1, detected);
}
|
|
1318
1430
|
function parseSimpleCliArgs(argv) {
|
|
1319
1431
|
const args = {
|
|
1320
1432
|
help: false,
|
|
@@ -1341,6 +1453,14 @@ function parseSimpleCliArgs(argv) {
|
|
|
1341
1453
|
index += 1;
|
|
1342
1454
|
continue;
|
|
1343
1455
|
}
|
|
1456
|
+
if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
|
|
1457
|
+
const n = parseInt(argv[index + 1], 10);
|
|
1458
|
+
if (!Number.isNaN(n) && n >= 1) {
|
|
1459
|
+
args.concurrency = n;
|
|
1460
|
+
}
|
|
1461
|
+
index += 1;
|
|
1462
|
+
continue;
|
|
1463
|
+
}
|
|
1344
1464
|
args.unknownArgs.push(token);
|
|
1345
1465
|
}
|
|
1346
1466
|
return args;
|
|
@@ -1348,9 +1468,12 @@ function parseSimpleCliArgs(argv) {
|
|
|
1348
1468
|
function getSimpleCliUsage() {
|
|
1349
1469
|
return [
|
|
1350
1470
|
"Usage:",
|
|
1351
|
-
" eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>",
|
|
1471
|
+
" eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
|
|
1352
1472
|
" eval-agents-simple generate --dataset <datasetName>",
|
|
1353
1473
|
"",
|
|
1474
|
+
"Options:",
|
|
1475
|
+
" --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
|
|
1476
|
+
"",
|
|
1354
1477
|
"Pattern examples for --evaluator:",
|
|
1355
1478
|
" score-evaluator exact name (case-insensitive)",
|
|
1356
1479
|
' "*score*" wildcard pattern',
|
|
@@ -1489,7 +1612,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1489
1612
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
1490
1613
|
return new Promise((resolve5, reject) => {
|
|
1491
1614
|
const app = ink.render(
|
|
1492
|
-
|
|
1615
|
+
React2__namespace.default.createElement(GenerateView, {
|
|
1493
1616
|
runner,
|
|
1494
1617
|
datasetName,
|
|
1495
1618
|
onComplete: (err) => {
|
|
@@ -1639,6 +1762,7 @@ function RunView({
|
|
|
1639
1762
|
runner,
|
|
1640
1763
|
datasetName,
|
|
1641
1764
|
evaluatorPattern,
|
|
1765
|
+
concurrency,
|
|
1642
1766
|
onComplete
|
|
1643
1767
|
}) {
|
|
1644
1768
|
const [phase, setPhase] = React2.useState(
|
|
@@ -1646,7 +1770,9 @@ function RunView({
|
|
|
1646
1770
|
);
|
|
1647
1771
|
const [runInfo, setRunInfo] = React2.useState(null);
|
|
1648
1772
|
const [testCases, setTestCases] = React2.useState([]);
|
|
1773
|
+
const [startedEvaluations, setStartedEvaluations] = React2.useState(0);
|
|
1649
1774
|
const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
|
|
1775
|
+
const [runningEvaluations, setRunningEvaluations] = React2.useState([]);
|
|
1650
1776
|
const [summary, setSummary] = React2.useState(null);
|
|
1651
1777
|
const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
|
|
1652
1778
|
const runEval = React2.useCallback(async () => {
|
|
@@ -1683,6 +1809,25 @@ function RunView({
|
|
|
1683
1809
|
let overallScoreCount = 0;
|
|
1684
1810
|
const done = new Promise((resolve5) => {
|
|
1685
1811
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1812
|
+
if (event.type === "TestCaseStarted") {
|
|
1813
|
+
setStartedEvaluations(event.startedTestCases);
|
|
1814
|
+
setRunningEvaluations((prev) => {
|
|
1815
|
+
const withoutDuplicate = prev.filter(
|
|
1816
|
+
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1817
|
+
);
|
|
1818
|
+
return [
|
|
1819
|
+
...withoutDuplicate,
|
|
1820
|
+
{
|
|
1821
|
+
testCaseId: event.testCaseId,
|
|
1822
|
+
name: event.testCaseName,
|
|
1823
|
+
rerunIndex: event.rerunIndex,
|
|
1824
|
+
rerunTotal: event.rerunTotal,
|
|
1825
|
+
startedTestCases: event.startedTestCases,
|
|
1826
|
+
totalTestCases: event.totalTestCases
|
|
1827
|
+
}
|
|
1828
|
+
];
|
|
1829
|
+
});
|
|
1830
|
+
}
|
|
1686
1831
|
if (event.type === "TestCaseProgress") {
|
|
1687
1832
|
for (const item of event.evaluatorScores) {
|
|
1688
1833
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
@@ -1749,6 +1894,11 @@ function RunView({
|
|
|
1749
1894
|
};
|
|
1750
1895
|
byId.set(event.testCaseId, merged);
|
|
1751
1896
|
setCompletedEvaluations(event.completedTestCases);
|
|
1897
|
+
setRunningEvaluations(
|
|
1898
|
+
(running) => running.filter(
|
|
1899
|
+
(item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
|
|
1900
|
+
)
|
|
1901
|
+
);
|
|
1752
1902
|
return Array.from(byId.values());
|
|
1753
1903
|
});
|
|
1754
1904
|
}
|
|
@@ -1760,7 +1910,8 @@ function RunView({
|
|
|
1760
1910
|
});
|
|
1761
1911
|
const snapshot = await runner.runDatasetWith({
|
|
1762
1912
|
datasetId: dataset.id,
|
|
1763
|
-
evaluatorIds: evaluators.map((item) => item.id)
|
|
1913
|
+
evaluatorIds: evaluators.map((item) => item.id),
|
|
1914
|
+
concurrency
|
|
1764
1915
|
});
|
|
1765
1916
|
setRunInfo({
|
|
1766
1917
|
runId: snapshot.runId,
|
|
@@ -1788,7 +1939,7 @@ function RunView({
|
|
|
1788
1939
|
});
|
|
1789
1940
|
setPhase("completed");
|
|
1790
1941
|
setTimeout(() => onComplete(), 200);
|
|
1791
|
-
}, [runner, datasetName, evaluatorPattern, onComplete]);
|
|
1942
|
+
}, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
|
|
1792
1943
|
React2.useEffect(() => {
|
|
1793
1944
|
void runEval();
|
|
1794
1945
|
}, [runEval]);
|
|
@@ -1824,12 +1975,38 @@ function RunView({
|
|
|
1824
1975
|
runInfo.totalTestCases
|
|
1825
1976
|
] })
|
|
1826
1977
|
] }),
|
|
1827
|
-
phase === "running" && /* @__PURE__ */ jsxRuntime.
|
|
1828
|
-
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1978
|
+
phase === "running" && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
|
|
1979
|
+
/* @__PURE__ */ jsxRuntime.jsx(
|
|
1980
|
+
Spinner,
|
|
1981
|
+
{
|
|
1982
|
+
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1983
|
+
}
|
|
1984
|
+
),
|
|
1985
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1986
|
+
ink.Text,
|
|
1987
|
+
{
|
|
1988
|
+
color: "yellow",
|
|
1989
|
+
children: [
|
|
1990
|
+
"[running ",
|
|
1991
|
+
item.startedTestCases,
|
|
1992
|
+
"/",
|
|
1993
|
+
item.totalTestCases,
|
|
1994
|
+
"]",
|
|
1995
|
+
" ",
|
|
1996
|
+
item.name,
|
|
1997
|
+
" ",
|
|
1998
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1999
|
+
"(",
|
|
2000
|
+
item.rerunIndex,
|
|
2001
|
+
"/",
|
|
2002
|
+
item.rerunTotal,
|
|
2003
|
+
")"
|
|
2004
|
+
] })
|
|
2005
|
+
]
|
|
2006
|
+
},
|
|
2007
|
+
`${item.testCaseId}:${item.rerunIndex}`
|
|
2008
|
+
)) })
|
|
2009
|
+
] }),
|
|
1833
2010
|
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1834
2011
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1835
2012
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
@@ -1910,7 +2087,7 @@ function RunView({
|
|
|
1910
2087
|
},
|
|
1911
2088
|
`${item.evaluatorId}-${s.id}-${idx}`
|
|
1912
2089
|
);
|
|
1913
|
-
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "
|
|
2090
|
+
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
|
|
1914
2091
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1915
2092
|
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1916
2093
|
({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -1968,9 +2145,9 @@ function RunView({
|
|
|
1968
2145
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
|
|
1969
2146
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
1970
2147
|
const agg = summary.aggregates.get(id);
|
|
1971
|
-
const scoreKeys = [
|
|
1972
|
-
(
|
|
1973
|
-
);
|
|
2148
|
+
const scoreKeys = [
|
|
2149
|
+
...summary.scoreItemsByEvaluatorScore?.keys() ?? []
|
|
2150
|
+
].filter((k) => k.startsWith(`${id}:`));
|
|
1974
2151
|
if (scoreKeys.length === 0) {
|
|
1975
2152
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1976
2153
|
"- ",
|
|
@@ -2278,7 +2455,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2278
2455
|
}
|
|
2279
2456
|
return lines;
|
|
2280
2457
|
}
|
|
2281
|
-
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
2458
|
+
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2282
2459
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
2283
2460
|
if (!dataset) {
|
|
2284
2461
|
const known = await runner.collectDatasets();
|
|
@@ -2304,9 +2481,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2304
2481
|
let overallScoreTotal = 0;
|
|
2305
2482
|
let overallScoreSumSq = 0;
|
|
2306
2483
|
let overallScoreCount = 0;
|
|
2484
|
+
let startedCount = 0;
|
|
2307
2485
|
let completedCount = 0;
|
|
2308
2486
|
let totalCount = 0;
|
|
2309
2487
|
let runFinished = false;
|
|
2488
|
+
const inFlightReruns = /* @__PURE__ */ new Set();
|
|
2310
2489
|
const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
2311
2490
|
let spinnerIndex = 0;
|
|
2312
2491
|
function clearLine() {
|
|
@@ -2330,7 +2509,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2330
2509
|
`\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
|
|
2331
2510
|
`${completedCount}/${totalCount}`,
|
|
2332
2511
|
ansi2.bold
|
|
2333
|
-
)} ${colorize(
|
|
2512
|
+
)} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
|
|
2334
2513
|
);
|
|
2335
2514
|
}
|
|
2336
2515
|
let lastPrintedTestCaseId = null;
|
|
@@ -2338,8 +2517,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2338
2517
|
let spinnerTimer;
|
|
2339
2518
|
const done = new Promise((resolve5) => {
|
|
2340
2519
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
2520
|
+
if (event.type === "TestCaseStarted") {
|
|
2521
|
+
startedCount = event.startedTestCases;
|
|
2522
|
+
inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
|
|
2523
|
+
clearLine();
|
|
2524
|
+
process.stdout.write(
|
|
2525
|
+
`${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
|
|
2526
|
+
`
|
|
2527
|
+
);
|
|
2528
|
+
drawSpinner();
|
|
2529
|
+
}
|
|
2341
2530
|
if (event.type === "TestCaseProgress") {
|
|
2342
2531
|
completedCount = event.completedTestCases;
|
|
2532
|
+
inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
|
|
2343
2533
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
2344
2534
|
const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
|
|
2345
2535
|
const testCaseId = event.testCaseId;
|
|
@@ -2455,7 +2645,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2455
2645
|
});
|
|
2456
2646
|
const snapshot = await runner.runDatasetWith({
|
|
2457
2647
|
datasetId: dataset.id,
|
|
2458
|
-
evaluatorIds: evaluators.map((item) => item.id)
|
|
2648
|
+
evaluatorIds: evaluators.map((item) => item.id),
|
|
2649
|
+
concurrency
|
|
2459
2650
|
});
|
|
2460
2651
|
totalCount = snapshot.totalTestCases;
|
|
2461
2652
|
console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
@@ -2544,13 +2735,14 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2544
2735
|
}
|
|
2545
2736
|
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2546
2737
|
}
|
|
2547
|
-
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2738
|
+
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2548
2739
|
return new Promise((resolve5, reject) => {
|
|
2549
2740
|
const app = ink.render(
|
|
2550
|
-
|
|
2741
|
+
React2__namespace.createElement(RunView, {
|
|
2551
2742
|
runner,
|
|
2552
2743
|
datasetName,
|
|
2553
2744
|
evaluatorPattern,
|
|
2745
|
+
concurrency,
|
|
2554
2746
|
onComplete: (err) => {
|
|
2555
2747
|
app.unmount();
|
|
2556
2748
|
if (err) {
|
|
@@ -2597,10 +2789,12 @@ async function main() {
|
|
|
2597
2789
|
const runner = createRunner();
|
|
2598
2790
|
try {
|
|
2599
2791
|
if (args.command === "run") {
|
|
2792
|
+
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
2600
2793
|
await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
|
|
2601
2794
|
runner,
|
|
2602
2795
|
args.datasetName,
|
|
2603
|
-
args.evaluatorPattern
|
|
2796
|
+
args.evaluatorPattern,
|
|
2797
|
+
concurrency
|
|
2604
2798
|
);
|
|
2605
2799
|
return;
|
|
2606
2800
|
}
|