@m4trix/evals 0.21.0 → 0.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +156 -33
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +153 -33
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +98 -4
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +97 -4
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +100 -4
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +1 -2
- package/dist/index.js +97 -4
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/cli-simple.js
CHANGED
|
@@ -6,7 +6,9 @@ import { resolve, relative, join, parse, dirname } from 'path';
|
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
7
7
|
import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
|
-
import {
|
|
9
|
+
import { diffLines } from 'diff';
|
|
10
|
+
import stringify from 'fast-json-stable-stringify';
|
|
11
|
+
import { cpus } from 'os';
|
|
10
12
|
import * as React2 from 'react';
|
|
11
13
|
import React2__default, { useState, useEffect, useCallback } from 'react';
|
|
12
14
|
import { render, Box, Text } from 'ink';
|
|
@@ -261,10 +263,102 @@ async function collectTestCasesFromFiles(config) {
|
|
|
261
263
|
);
|
|
262
264
|
return found.flat();
|
|
263
265
|
}
|
|
266
|
+
/**
 * Normalises a value before it is serialised and diffed.
 *
 * The value is walked recursively and a new structure is returned; the input
 * is never mutated.
 *
 * @param {unknown} value - Value to normalise (any JSON-like structure).
 * @param {object} [options]
 * @param {boolean} [options.sort] - When truthy, every array is ordered by the
 *   stable-stringified form of its (already normalised) items.
 * @param {string[]|string} [options.excludeKeys] - Object keys to drop, either
 *   as an array or as a comma-separated string (entries are trimmed).
 * @param {number} [options.precision] - When defined, every number is rounded
 *   with `toFixed(precision)`.
 * @returns {unknown} The normalised copy of `value`.
 */
function preprocessForDiff(value, options) {
  const rawExcludeKeys = options?.excludeKeys;
  // Parse the exclusion list once instead of once per visited object.
  const excluded = rawExcludeKeys
    ? new Set(
        Array.isArray(rawExcludeKeys)
          ? rawExcludeKeys
          : rawExcludeKeys.split(",").map((k) => k.trim())
      )
    : null;
  const precision = options?.precision;
  const shouldSort = Boolean(options?.sort);

  // Items whose stringified form is not a string are kept after all others.
  const compareBySortKey = (a, b) => {
    if (typeof a.sortKey !== "string") {
      return typeof b.sortKey !== "string" ? 0 : 1;
    }
    if (typeof b.sortKey !== "string") {
      return -1;
    }
    return a.sortKey.localeCompare(b.sortKey);
  };

  const visit = (node) => {
    if (Array.isArray(node)) {
      // Always recurse into arrays so excludeKeys / precision also apply to
      // nested items when sorting is disabled.
      const items = node.map((item) => visit(item));
      if (!shouldSort) {
        return items;
      }
      // Compute each sort key once rather than on every comparison.
      return items
        .map((item) => ({ item, sortKey: stringify(item) }))
        .sort(compareBySortKey)
        .map((entry) => entry.item);
    }
    if (node !== null && typeof node === "object") {
      const result = {};
      for (const [k, v] of Object.entries(node)) {
        if (excluded !== null && excluded.has(k)) {
          continue;
        }
        result[k] = visit(v);
      }
      return result;
    }
    if (typeof node === "number" && precision !== void 0) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return visit(value);
}
|
|
296
|
+
/**
 * Renders a value as 2-space-indented JSON text for line-based diffing.
 *
 * The value is first serialised with the file's `stringify` import and then
 * re-parsed, so the pretty output follows the key order that serialiser
 * produced.
 *
 * @param {unknown} value - Value to render.
 * @returns {string} Always a string, so the result can be handed straight to
 *   a text differ.
 */
function toPrettyJson(value) {
  const canonical = stringify(value);
  if (typeof canonical !== "string") {
    // The serialiser produced no JSON text (e.g. for `undefined`); fall back
    // to a plain string instead of leaking a non-string to the caller.
    return String(value);
  }
  try {
    return JSON.stringify(JSON.parse(canonical), null, 2);
  } catch {
    // Best effort: keep the compact form if it cannot be re-parsed.
    return canonical;
  }
}
|
|
305
|
+
/**
 * Flattens diff parts into a single newline-joined string.
 *
 * Each line of an added part is prefixed with "+ ", each line of a removed
 * part with "- ", and lines of unchanged parts get no prefix. The empty
 * segment produced by a part's trailing newline is dropped.
 *
 * @param {Array<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The formatted diff text.
 */
function formatDiffParts(parts) {
  const rendered = parts.flatMap((part) => {
    let marker = "";
    if (part.added) {
      marker = "+ ";
    } else if (part.removed) {
      marker = "- ";
    }
    const segments = part.value.split("\n");
    // Only the final segment is discarded when empty; inner blank lines stay.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    return segments.map((segment) => marker + segment);
  });
  return rendered.join("\n");
}
|
|
264
319
|
/**
 * Builds a line diff between an expected and an actual value.
 *
 * Both values are normalised with `preprocessForDiff` first. With
 * `diffOptions.keysOnly` the diff is computed over the key skeletons returned
 * by `extractKeys`; otherwise it is computed over the pretty-printed JSON of
 * the values, returning "" when both renderings are identical. With
 * `diffOptions.outputNewOnly` (non-keysOnly mode) only added parts are kept.
 *
 * @param {unknown} expected - Reference value.
 * @param {unknown} actual - Value to compare against the reference.
 * @param {object} [diffOptions] - Forwarded to `preprocessForDiff`; also read
 *   for `keysOnly` and `outputNewOnly`.
 * @returns {string} The formatted diff text.
 */
function createDiffString(expected, actual, diffOptions) {
  const normalizedExpected = preprocessForDiff(expected, diffOptions);
  const normalizedActual = preprocessForDiff(actual, diffOptions);

  if (diffOptions?.keysOnly) {
    const renderKeys = (input) => JSON.stringify(extractKeys(input), null, 2);
    const expectedSkeleton = renderKeys(normalizedExpected);
    const actualSkeleton = renderKeys(normalizedActual);
    return formatDiffParts(diffLines(expectedSkeleton, actualSkeleton));
  }

  const expectedJson = toPrettyJson(normalizedExpected);
  const actualJson = toPrettyJson(normalizedActual);
  if (expectedJson === actualJson) {
    return "";
  }

  const changes = diffLines(expectedJson, actualJson);
  const visibleChanges = diffOptions?.outputNewOnly
    ? changes.filter((change) => change.added === true)
    : changes;
  return formatDiffParts(visibleChanges);
}
|
|
350
|
+
/**
 * Reduces a value to its key skeleton.
 *
 * Arrays and objects keep their shape, while every non-object leaf
 * (including null) is replaced by the placeholder "\xB7".
 *
 * @param {unknown} value - Value to reduce.
 * @returns {unknown} A structure with the same keys and array lengths whose
 *   leaves are all the placeholder string.
 */
function extractKeys(value) {
  if (Array.isArray(value)) {
    return value.map((element) => extractKeys(element));
  }
  if (value !== null && typeof value === "object") {
    const skeleton = {};
    for (const key of Object.keys(value)) {
      skeleton[key] = extractKeys(value[key]);
    }
    return skeleton;
  }
  return "\xB7";
}
|
|
269
363
|
function formatLogMessage(msg) {
|
|
270
364
|
if (typeof msg === "string")
|
|
@@ -1304,8 +1398,9 @@ var EffectRunner = class {
|
|
|
1304
1398
|
);
|
|
1305
1399
|
}
|
|
1306
1400
|
};
|
|
1307
|
-
|
|
1308
|
-
|
|
1401
|
+
/**
 * Returns the default number of concurrently running test cases: the length
 * of the list returned by `cpus()`, but never less than 1.
 *
 * @returns {number} Default concurrency, at least 1.
 */
function getDefaultConcurrency() {
  const coreCount = cpus().length;
  return coreCount >= 1 ? coreCount : 1;
}
|
|
1309
1404
|
function parseSimpleCliArgs(argv) {
|
|
1310
1405
|
const args = {
|
|
1311
1406
|
help: false,
|
|
@@ -1332,6 +1427,14 @@ function parseSimpleCliArgs(argv) {
|
|
|
1332
1427
|
index += 1;
|
|
1333
1428
|
continue;
|
|
1334
1429
|
}
|
|
1430
|
+
if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
|
|
1431
|
+
const n = parseInt(argv[index + 1], 10);
|
|
1432
|
+
if (!Number.isNaN(n) && n >= 1) {
|
|
1433
|
+
args.concurrency = n;
|
|
1434
|
+
}
|
|
1435
|
+
index += 1;
|
|
1436
|
+
continue;
|
|
1437
|
+
}
|
|
1335
1438
|
args.unknownArgs.push(token);
|
|
1336
1439
|
}
|
|
1337
1440
|
return args;
|
|
@@ -1339,9 +1442,12 @@ function parseSimpleCliArgs(argv) {
|
|
|
1339
1442
|
function getSimpleCliUsage() {
|
|
1340
1443
|
return [
|
|
1341
1444
|
"Usage:",
|
|
1342
|
-
" eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>",
|
|
1445
|
+
" eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
|
|
1343
1446
|
" eval-agents-simple generate --dataset <datasetName>",
|
|
1344
1447
|
"",
|
|
1448
|
+
"Options:",
|
|
1449
|
+
" --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
|
|
1450
|
+
"",
|
|
1345
1451
|
"Pattern examples for --evaluator:",
|
|
1346
1452
|
" score-evaluator exact name (case-insensitive)",
|
|
1347
1453
|
' "*score*" wildcard pattern',
|
|
@@ -1630,6 +1736,7 @@ function RunView({
|
|
|
1630
1736
|
runner,
|
|
1631
1737
|
datasetName,
|
|
1632
1738
|
evaluatorPattern,
|
|
1739
|
+
concurrency,
|
|
1633
1740
|
onComplete
|
|
1634
1741
|
}) {
|
|
1635
1742
|
const [phase, setPhase] = useState(
|
|
@@ -1777,7 +1884,8 @@ function RunView({
|
|
|
1777
1884
|
});
|
|
1778
1885
|
const snapshot = await runner.runDatasetWith({
|
|
1779
1886
|
datasetId: dataset.id,
|
|
1780
|
-
evaluatorIds: evaluators.map((item) => item.id)
|
|
1887
|
+
evaluatorIds: evaluators.map((item) => item.id),
|
|
1888
|
+
concurrency
|
|
1781
1889
|
});
|
|
1782
1890
|
setRunInfo({
|
|
1783
1891
|
runId: snapshot.runId,
|
|
@@ -1805,7 +1913,7 @@ function RunView({
|
|
|
1805
1913
|
});
|
|
1806
1914
|
setPhase("completed");
|
|
1807
1915
|
setTimeout(() => onComplete(), 200);
|
|
1808
|
-
}, [runner, datasetName, evaluatorPattern, onComplete]);
|
|
1916
|
+
}, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
|
|
1809
1917
|
useEffect(() => {
|
|
1810
1918
|
void runEval();
|
|
1811
1919
|
}, [runEval]);
|
|
@@ -1848,22 +1956,30 @@ function RunView({
|
|
|
1848
1956
|
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1849
1957
|
}
|
|
1850
1958
|
),
|
|
1851
|
-
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
1959
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
|
|
1960
|
+
Text,
|
|
1961
|
+
{
|
|
1962
|
+
color: "yellow",
|
|
1963
|
+
children: [
|
|
1964
|
+
"[running ",
|
|
1965
|
+
item.startedTestCases,
|
|
1966
|
+
"/",
|
|
1967
|
+
item.totalTestCases,
|
|
1968
|
+
"]",
|
|
1969
|
+
" ",
|
|
1970
|
+
item.name,
|
|
1971
|
+
" ",
|
|
1972
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1973
|
+
"(",
|
|
1974
|
+
item.rerunIndex,
|
|
1975
|
+
"/",
|
|
1976
|
+
item.rerunTotal,
|
|
1977
|
+
")"
|
|
1978
|
+
] })
|
|
1979
|
+
]
|
|
1980
|
+
},
|
|
1981
|
+
`${item.testCaseId}:${item.rerunIndex}`
|
|
1982
|
+
)) })
|
|
1867
1983
|
] }),
|
|
1868
1984
|
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1869
1985
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
@@ -1945,7 +2061,7 @@ function RunView({
|
|
|
1945
2061
|
},
|
|
1946
2062
|
`${item.evaluatorId}-${s.id}-${idx}`
|
|
1947
2063
|
);
|
|
1948
|
-
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: "
|
|
2064
|
+
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
|
|
1949
2065
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1950
2066
|
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1951
2067
|
({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
|
|
@@ -2003,9 +2119,9 @@ function RunView({
|
|
|
2003
2119
|
/* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
|
|
2004
2120
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
2005
2121
|
const agg = summary.aggregates.get(id);
|
|
2006
|
-
const scoreKeys = [
|
|
2007
|
-
(
|
|
2008
|
-
);
|
|
2122
|
+
const scoreKeys = [
|
|
2123
|
+
...summary.scoreItemsByEvaluatorScore?.keys() ?? []
|
|
2124
|
+
].filter((k) => k.startsWith(`${id}:`));
|
|
2009
2125
|
if (scoreKeys.length === 0) {
|
|
2010
2126
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2011
2127
|
"- ",
|
|
@@ -2313,7 +2429,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2313
2429
|
}
|
|
2314
2430
|
return lines;
|
|
2315
2431
|
}
|
|
2316
|
-
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
2432
|
+
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2317
2433
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
2318
2434
|
if (!dataset) {
|
|
2319
2435
|
const known = await runner.collectDatasets();
|
|
@@ -2503,7 +2619,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2503
2619
|
});
|
|
2504
2620
|
const snapshot = await runner.runDatasetWith({
|
|
2505
2621
|
datasetId: dataset.id,
|
|
2506
|
-
evaluatorIds: evaluators.map((item) => item.id)
|
|
2622
|
+
evaluatorIds: evaluators.map((item) => item.id),
|
|
2623
|
+
concurrency
|
|
2507
2624
|
});
|
|
2508
2625
|
totalCount = snapshot.totalTestCases;
|
|
2509
2626
|
console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
@@ -2592,13 +2709,14 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2592
2709
|
}
|
|
2593
2710
|
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2594
2711
|
}
|
|
2595
|
-
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2712
|
+
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2596
2713
|
return new Promise((resolve5, reject) => {
|
|
2597
2714
|
const app = render(
|
|
2598
2715
|
React2.createElement(RunView, {
|
|
2599
2716
|
runner,
|
|
2600
2717
|
datasetName,
|
|
2601
2718
|
evaluatorPattern,
|
|
2719
|
+
concurrency,
|
|
2602
2720
|
onComplete: (err) => {
|
|
2603
2721
|
app.unmount();
|
|
2604
2722
|
if (err) {
|
|
@@ -2645,10 +2763,12 @@ async function main() {
|
|
|
2645
2763
|
const runner = createRunner();
|
|
2646
2764
|
try {
|
|
2647
2765
|
if (args.command === "run") {
|
|
2766
|
+
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
2648
2767
|
await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
|
|
2649
2768
|
runner,
|
|
2650
2769
|
args.datasetName,
|
|
2651
|
-
args.evaluatorPattern
|
|
2770
|
+
args.evaluatorPattern,
|
|
2771
|
+
concurrency
|
|
2652
2772
|
);
|
|
2653
2773
|
return;
|
|
2654
2774
|
}
|