@m4trix/evals 0.21.0 → 0.21.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +156 -33
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +153 -33
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +98 -4
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +97 -4
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +100 -4
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +1 -2
- package/dist/index.js +97 -4
- package/dist/index.js.map +1 -1
- package/package.json +3 -3
package/dist/cli-simple.cjs
CHANGED
|
@@ -8,12 +8,16 @@ var path = require('path');
|
|
|
8
8
|
var jitiModule = require('jiti');
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
|
-
var
|
|
11
|
+
var diff = require('diff');
|
|
12
|
+
var stringify = require('fast-json-stable-stringify');
|
|
13
|
+
var os = require('os');
|
|
12
14
|
var React2 = require('react');
|
|
13
15
|
var ink = require('ink');
|
|
14
16
|
var jsxRuntime = require('react/jsx-runtime');
|
|
15
17
|
|
|
16
18
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
19
|
+
/**
 * Bundler interop helper: hands back a module that is already flagged as an
 * ES module unchanged, and wraps anything else as `{ default: value }` so that
 * callers can always read `.default`.
 */
function _interopDefault(e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
|
|
20
|
+
|
|
17
21
|
function _interopNamespace(e) {
|
|
18
22
|
if (e && e.__esModule) return e;
|
|
19
23
|
var n = Object.create(null);
|
|
@@ -33,6 +37,7 @@ function _interopNamespace(e) {
|
|
|
33
37
|
}
|
|
34
38
|
|
|
35
39
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
40
|
+
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
36
41
|
var React2__namespace = /*#__PURE__*/_interopNamespace(React2);
|
|
37
42
|
|
|
38
43
|
// src/runner/config.ts
|
|
@@ -284,10 +289,102 @@ async function collectTestCasesFromFiles(config) {
|
|
|
284
289
|
);
|
|
285
290
|
return found.flat();
|
|
286
291
|
}
|
|
292
|
+
/**
 * Normalizes a value before it is serialized and diffed.
 *
 * Recursively walks arrays and plain objects and applies the options below.
 * Arrays are always traversed, so `excludeKeys` and `precision` also take
 * effect on objects and numbers nested inside arrays, whether or not `sort`
 * is enabled.
 *
 * @param {*} value - Value to normalize; it is not mutated.
 * @param {object} [options]
 * @param {boolean} [options.sort] - When truthy, array items are ordered by
 *   the stable-stringified form of their normalized value.
 * @param {string[]|string} [options.excludeKeys] - Object keys to drop at
 *   every depth; either an array or a comma-separated string.
 * @param {number} [options.precision] - When defined, numbers are rounded via
 *   `toFixed(precision)`.
 * @returns {*} A normalized copy of arrays/objects; other values as given.
 */
function preprocessForDiff(value, options) {
  const rawExclude = options?.excludeKeys;
  // Parse the exclusion list once instead of at every recursion level.
  const excluded = rawExclude
    ? new Set(
        Array.isArray(rawExclude)
          ? rawExclude
          : rawExclude.split(",").map((k) => k.trim())
      )
    : null;
  const precision = options?.precision;
  const shouldSort = Boolean(options?.sort);

  // Items the stringifier cannot serialize (key === undefined) go last.
  const compareBySortKey = (a, b) => {
    if (a.key === undefined) {
      return b.key === undefined ? 0 : 1;
    }
    if (b.key === undefined) {
      return -1;
    }
    return a.key.localeCompare(b.key);
  };

  const visit = (node) => {
    if (Array.isArray(node)) {
      const items = Array.from(node, (item) => visit(item));
      if (!shouldSort) {
        return items;
      }
      // Normalize and stringify each item once, then sort on the cached key,
      // rather than redoing both inside every comparator call.
      return items
        .map((item) => ({ item, key: stringify__default.default(item) }))
        .sort(compareBySortKey)
        .map((entry) => entry.item);
    }
    if (node !== null && typeof node === "object") {
      // NOTE(review): every non-array object is rebuilt from its own
      // enumerable entries, so instances such as Date collapse to {} —
      // confirm whether such values can reach this function.
      const result = {};
      for (const [k, v] of Object.entries(node)) {
        if (excluded !== null && excluded.has(k)) {
          continue;
        }
        result[k] = visit(v);
      }
      return result;
    }
    if (typeof node === "number" && precision !== void 0) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return visit(value);
}
|
|
322
|
+
/**
 * Serializes a value to 2-space-indented JSON using the stable stringifier
 * for the first pass.
 *
 * Always returns a string: when the stringifier produces no output (it
 * returns `undefined`, e.g. for an `undefined` input), the literal text
 * "undefined" is returned so that callers can diff the result line by line.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} Pretty-printed JSON, or a plain fallback string.
 */
function toPrettyJson(value) {
  const str = stringify__default.default(value);
  if (str === void 0) {
    // Nothing serializable; previously this leaked `undefined` to callers.
    return "undefined";
  }
  try {
    // Round-trip through JSON.parse to re-emit the compact output indented.
    return JSON.stringify(JSON.parse(str), null, 2);
  } catch {
    // Best effort: fall back to the compact form if it cannot be re-parsed.
    return str;
  }
}
|
|
331
|
+
/**
 * Renders diff parts as newline-joined text.
 *
 * Each line of an added part is prefixed with "+ ", each line of a removed
 * part with "- ", and lines of other parts get no prefix. The empty segment
 * left after a part's trailing newline is dropped.
 *
 * @param {Array<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The formatted diff text.
 */
function formatDiffParts(parts) {
  const rendered = parts.flatMap((part) => {
    let marker = "";
    if (part.added) {
      marker = "+ ";
    } else if (part.removed) {
      marker = "- ";
    }
    const segments = part.value.split("\n");
    // Only the final segment is dropped, and only when it is empty.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    return segments.map((text) => marker + text);
  });
  return rendered.join("\n");
}
|
|
287
345
|
/**
 * Builds a line-based textual diff between an expected and an actual value.
 *
 * Both values are first normalized with `preprocessForDiff`. With `keysOnly`,
 * only their key structure (from `extractKeys`) is compared. Both modes
 * serialize through `toPrettyJson`, so differences in object key order never
 * produce a diff, and both modes return "" when the two sides are identical.
 *
 * @param {*} expected - Reference value.
 * @param {*} actual - Value to compare against the reference.
 * @param {object} [diffOptions] - Passed to `preprocessForDiff`; also read here:
 * @param {boolean} [diffOptions.keysOnly] - Compare key structure only.
 * @param {boolean} [diffOptions.outputNewOnly] - Keep only added parts
 *   (not applied in `keysOnly` mode, as before).
 * @returns {string} Formatted diff, or "" when there is no difference.
 */
function createDiffString(expected, actual, diffOptions) {
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
  const actualProcessed = preprocessForDiff(actual, diffOptions);
  const keysOnly = Boolean(diffOptions?.keysOnly);

  // Guard against a serializer that yields no string, so diffLines always
  // receives text.
  const expectedStr = toPrettyJson(keysOnly ? extractKeys(expectedProcessed) : expectedProcessed) ?? "undefined";
  const actualStr = toPrettyJson(keysOnly ? extractKeys(actualProcessed) : actualProcessed) ?? "undefined";
  if (expectedStr === actualStr) {
    return "";
  }

  const parts = diff.diffLines(expectedStr, actualStr);
  if (!keysOnly && diffOptions?.outputNewOnly) {
    return formatDiffParts(parts.filter((p) => p.added === true));
  }
  return formatDiffParts(parts);
}
|
|
376
|
+
/**
 * Reduces a value to its key structure: every non-object leaf (including
 * null) becomes the placeholder "·", arrays are mapped element by element,
 * and objects keep their keys with each value reduced recursively.
 *
 * @param {*} value - Value to reduce.
 * @returns {*} The placeholder string, an array, or an object of the same shape.
 */
function extractKeys(value) {
  const isObjectLike = value !== null && typeof value === "object";
  if (!isObjectLike) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((element) => extractKeys(element));
  }
  const shape = {};
  for (const entry of Object.entries(value)) {
    shape[entry[0]] = extractKeys(entry[1]);
  }
  return shape;
}
|
|
292
389
|
function formatLogMessage(msg) {
|
|
293
390
|
if (typeof msg === "string")
|
|
@@ -1327,8 +1424,9 @@ var EffectRunner = class {
|
|
|
1327
1424
|
);
|
|
1328
1425
|
}
|
|
1329
1426
|
};
|
|
1330
|
-
|
|
1331
|
-
|
|
1427
|
+
/**
 * Returns the default number of test cases to run concurrently.
 *
 * Uses `os.availableParallelism()` when the running Node.js version provides
 * it, which is what the Node.js documentation recommends for sizing
 * parallelism, and falls back to the logical CPU count otherwise.
 *
 * @returns {number} A value of at least 1.
 */
function getDefaultConcurrency() {
  const parallelism = typeof os.availableParallelism === "function"
    ? os.availableParallelism()
    : os.cpus().length;
  // The CPU list can be empty on some platforms; never report less than 1.
  return Math.max(1, parallelism);
}
|
|
1332
1430
|
function parseSimpleCliArgs(argv) {
|
|
1333
1431
|
const args = {
|
|
1334
1432
|
help: false,
|
|
@@ -1355,6 +1453,14 @@ function parseSimpleCliArgs(argv) {
|
|
|
1355
1453
|
index += 1;
|
|
1356
1454
|
continue;
|
|
1357
1455
|
}
|
|
1456
|
+
if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
|
|
1457
|
+
const n = parseInt(argv[index + 1], 10);
|
|
1458
|
+
if (!Number.isNaN(n) && n >= 1) {
|
|
1459
|
+
args.concurrency = n;
|
|
1460
|
+
}
|
|
1461
|
+
index += 1;
|
|
1462
|
+
continue;
|
|
1463
|
+
}
|
|
1358
1464
|
args.unknownArgs.push(token);
|
|
1359
1465
|
}
|
|
1360
1466
|
return args;
|
|
@@ -1362,9 +1468,12 @@ function parseSimpleCliArgs(argv) {
|
|
|
1362
1468
|
function getSimpleCliUsage() {
|
|
1363
1469
|
return [
|
|
1364
1470
|
"Usage:",
|
|
1365
|
-
" eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>",
|
|
1471
|
+
" eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
|
|
1366
1472
|
" eval-agents-simple generate --dataset <datasetName>",
|
|
1367
1473
|
"",
|
|
1474
|
+
"Options:",
|
|
1475
|
+
" --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
|
|
1476
|
+
"",
|
|
1368
1477
|
"Pattern examples for --evaluator:",
|
|
1369
1478
|
" score-evaluator exact name (case-insensitive)",
|
|
1370
1479
|
' "*score*" wildcard pattern',
|
|
@@ -1653,6 +1762,7 @@ function RunView({
|
|
|
1653
1762
|
runner,
|
|
1654
1763
|
datasetName,
|
|
1655
1764
|
evaluatorPattern,
|
|
1765
|
+
concurrency,
|
|
1656
1766
|
onComplete
|
|
1657
1767
|
}) {
|
|
1658
1768
|
const [phase, setPhase] = React2.useState(
|
|
@@ -1800,7 +1910,8 @@ function RunView({
|
|
|
1800
1910
|
});
|
|
1801
1911
|
const snapshot = await runner.runDatasetWith({
|
|
1802
1912
|
datasetId: dataset.id,
|
|
1803
|
-
evaluatorIds: evaluators.map((item) => item.id)
|
|
1913
|
+
evaluatorIds: evaluators.map((item) => item.id),
|
|
1914
|
+
concurrency
|
|
1804
1915
|
});
|
|
1805
1916
|
setRunInfo({
|
|
1806
1917
|
runId: snapshot.runId,
|
|
@@ -1828,7 +1939,7 @@ function RunView({
|
|
|
1828
1939
|
});
|
|
1829
1940
|
setPhase("completed");
|
|
1830
1941
|
setTimeout(() => onComplete(), 200);
|
|
1831
|
-
}, [runner, datasetName, evaluatorPattern, onComplete]);
|
|
1942
|
+
}, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
|
|
1832
1943
|
React2.useEffect(() => {
|
|
1833
1944
|
void runEval();
|
|
1834
1945
|
}, [runEval]);
|
|
@@ -1871,22 +1982,30 @@ function RunView({
|
|
|
1871
1982
|
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
1872
1983
|
}
|
|
1873
1984
|
),
|
|
1874
|
-
runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
|
|
1879
|
-
|
|
1880
|
-
|
|
1881
|
-
|
|
1882
|
-
|
|
1883
|
-
|
|
1884
|
-
|
|
1885
|
-
|
|
1886
|
-
|
|
1887
|
-
|
|
1888
|
-
|
|
1889
|
-
|
|
1985
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
1986
|
+
ink.Text,
|
|
1987
|
+
{
|
|
1988
|
+
color: "yellow",
|
|
1989
|
+
children: [
|
|
1990
|
+
"[running ",
|
|
1991
|
+
item.startedTestCases,
|
|
1992
|
+
"/",
|
|
1993
|
+
item.totalTestCases,
|
|
1994
|
+
"]",
|
|
1995
|
+
" ",
|
|
1996
|
+
item.name,
|
|
1997
|
+
" ",
|
|
1998
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1999
|
+
"(",
|
|
2000
|
+
item.rerunIndex,
|
|
2001
|
+
"/",
|
|
2002
|
+
item.rerunTotal,
|
|
2003
|
+
")"
|
|
2004
|
+
] })
|
|
2005
|
+
]
|
|
2006
|
+
},
|
|
2007
|
+
`${item.testCaseId}:${item.rerunIndex}`
|
|
2008
|
+
)) })
|
|
1890
2009
|
] }),
|
|
1891
2010
|
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
1892
2011
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
@@ -1968,7 +2087,7 @@ function RunView({
|
|
|
1968
2087
|
},
|
|
1969
2088
|
`${item.evaluatorId}-${s.id}-${idx}`
|
|
1970
2089
|
);
|
|
1971
|
-
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "
|
|
2090
|
+
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
|
|
1972
2091
|
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1973
2092
|
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
1974
2093
|
({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -2026,9 +2145,9 @@ function RunView({
|
|
|
2026
2145
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
|
|
2027
2146
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
2028
2147
|
const agg = summary.aggregates.get(id);
|
|
2029
|
-
const scoreKeys = [
|
|
2030
|
-
(
|
|
2031
|
-
);
|
|
2148
|
+
const scoreKeys = [
|
|
2149
|
+
...summary.scoreItemsByEvaluatorScore?.keys() ?? []
|
|
2150
|
+
].filter((k) => k.startsWith(`${id}:`));
|
|
2032
2151
|
if (scoreKeys.length === 0) {
|
|
2033
2152
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2034
2153
|
"- ",
|
|
@@ -2336,7 +2455,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2336
2455
|
}
|
|
2337
2456
|
return lines;
|
|
2338
2457
|
}
|
|
2339
|
-
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
|
|
2458
|
+
async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2340
2459
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
2341
2460
|
if (!dataset) {
|
|
2342
2461
|
const known = await runner.collectDatasets();
|
|
@@ -2526,7 +2645,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2526
2645
|
});
|
|
2527
2646
|
const snapshot = await runner.runDatasetWith({
|
|
2528
2647
|
datasetId: dataset.id,
|
|
2529
|
-
evaluatorIds: evaluators.map((item) => item.id)
|
|
2648
|
+
evaluatorIds: evaluators.map((item) => item.id),
|
|
2649
|
+
concurrency
|
|
2530
2650
|
});
|
|
2531
2651
|
totalCount = snapshot.totalTestCases;
|
|
2532
2652
|
console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
@@ -2615,13 +2735,14 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
2615
2735
|
}
|
|
2616
2736
|
console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
|
|
2617
2737
|
}
|
|
2618
|
-
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
2738
|
+
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2619
2739
|
return new Promise((resolve5, reject) => {
|
|
2620
2740
|
const app = ink.render(
|
|
2621
2741
|
React2__namespace.createElement(RunView, {
|
|
2622
2742
|
runner,
|
|
2623
2743
|
datasetName,
|
|
2624
2744
|
evaluatorPattern,
|
|
2745
|
+
concurrency,
|
|
2625
2746
|
onComplete: (err) => {
|
|
2626
2747
|
app.unmount();
|
|
2627
2748
|
if (err) {
|
|
@@ -2668,10 +2789,12 @@ async function main() {
|
|
|
2668
2789
|
const runner = createRunner();
|
|
2669
2790
|
try {
|
|
2670
2791
|
if (args.command === "run") {
|
|
2792
|
+
const concurrency = args.concurrency ?? getDefaultConcurrency();
|
|
2671
2793
|
await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
|
|
2672
2794
|
runner,
|
|
2673
2795
|
args.datasetName,
|
|
2674
|
-
args.evaluatorPattern
|
|
2796
|
+
args.evaluatorPattern,
|
|
2797
|
+
concurrency
|
|
2675
2798
|
);
|
|
2676
2799
|
return;
|
|
2677
2800
|
}
|