@m4trix/evals 0.20.0 → 0.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,9 @@ var path = require('path');
8
8
  var jitiModule = require('jiti');
9
9
  var promises = require('fs/promises');
10
10
  var url = require('url');
11
- var jsonDiff = require('json-diff');
11
+ var diff = require('diff');
12
+ var stringify = require('fast-json-stable-stringify');
13
+ var os = require('os');
12
14
  var React2 = require('react');
13
15
  var ink = require('ink');
14
16
  var jsxRuntime = require('react/jsx-runtime');
@@ -35,7 +37,8 @@ function _interopNamespace(e) {
35
37
  }
36
38
 
37
39
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
38
- var React2__default = /*#__PURE__*/_interopDefault(React2);
40
+ var stringify__default = /*#__PURE__*/_interopDefault(stringify);
41
+ var React2__namespace = /*#__PURE__*/_interopNamespace(React2);
39
42
 
40
43
  // src/runner/config.ts
41
44
  var defaultRunnerConfig = {
@@ -286,10 +289,102 @@ async function collectTestCasesFromFiles(config) {
286
289
  );
287
290
  return found.flat();
288
291
  }
292
/**
 * Produces a normalized deep copy of `value` so that two values can be
 * compared as text.
 *
 * Supported options (all optional):
 *  - `sort`: when truthy, every array is ordered by the stable JSON
 *    serialization of its (already normalized) elements.
 *  - `excludeKeys`: array of key names, or a comma-separated string of key
 *    names, dropped from every object.
 *  - `precision`: number of fraction digits that numbers are rounded to.
 *
 * Arrays are always traversed, so `excludeKeys` and `precision` also apply to
 * values nested inside arrays even when `sort` is not requested.
 *
 * @param {*} value - Value to normalize; it is never mutated.
 * @param {{sort?: boolean, excludeKeys?: string[]|string, precision?: number}} [options]
 * @returns {*} The normalized copy (primitives are returned as-is).
 */
function preprocessForDiff(value, options) {
  const rawExcludeKeys = options?.excludeKeys;
  // Parse the exclusion list once instead of at every recursion level.
  const excluded = rawExcludeKeys
    ? new Set(
        Array.isArray(rawExcludeKeys)
          ? rawExcludeKeys
          : rawExcludeKeys.split(",").map((k) => k.trim())
      )
    : null;
  const precision = options?.precision;
  const shouldSort = Boolean(options?.sort);

  const compareSortKeys = (a, b) => {
    const byLocale = a.localeCompare(b);
    if (byLocale !== 0) {
      return byLocale;
    }
    // localeCompare may report 0 for strings that are not identical; fall
    // back to code-unit order so the result never depends on input order.
    if (a === b) {
      return 0;
    }
    return a < b ? -1 : 1;
  };

  const normalize = (node) => {
    if (Array.isArray(node)) {
      const items = node.map((item) => normalize(item));
      if (!shouldSort) {
        return items;
      }
      // Serialize each element once up front rather than inside the
      // comparator, which runs O(n log n) times.
      const sortKeys = new Map();
      for (const item of items) {
        if (!sortKeys.has(item)) {
          // The stringifier yields undefined for non-serializable values
          // (e.g. functions); treat those as an empty sort key.
          sortKeys.set(item, stringify__default.default(item) ?? "");
        }
      }
      return items.sort(
        (a, b) => compareSortKeys(sortKeys.get(a), sortKeys.get(b))
      );
    }
    if (node !== null && typeof node === "object") {
      const result = {};
      for (const [k, v] of Object.entries(node)) {
        if (!excluded?.has(k)) {
          result[k] = normalize(v);
        }
      }
      return result;
    }
    if (typeof node === "number" && precision !== void 0) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return normalize(value);
}
322
/**
 * Renders `value` as 2-space indented JSON, using the stable stringifier for
 * the first pass and re-indenting its output.
 *
 * Always returns a string: when the stringifier produces no string (for
 * example for a top-level `undefined`, function or symbol) the value's plain
 * string form is returned instead, so callers can safely feed the result to a
 * text diff.
 *
 * @param {*} value - Value to render.
 * @returns {string} Pretty-printed JSON, or a best-effort string fallback.
 */
function toPrettyJson(value) {
  const compact = stringify__default.default(value);
  if (typeof compact !== "string") {
    // Nothing serializable came back; fall back to a printable placeholder.
    return String(value);
  }
  try {
    return JSON.stringify(JSON.parse(compact), null, 2);
  } catch {
    // The compact form could not be re-parsed; return it unchanged.
    return compact;
  }
}
331
/**
 * Flattens diff parts into one newline-joined string.
 *
 * Each line of a part is prefixed with "+ " when the part is flagged `added`,
 * "- " when it is flagged `removed`, and nothing otherwise. The empty segment
 * produced by a trailing newline at the end of a part is dropped.
 *
 * @param {Iterable<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The formatted diff text.
 */
function formatDiffParts(parts) {
  const rendered = Array.from(parts).flatMap((part) => {
    let marker = "";
    if (part.added) {
      marker = "+ ";
    } else if (part.removed) {
      marker = "- ";
    }
    const segments = part.value.split("\n");
    // Only a trailing empty segment is discarded; interior blank lines stay.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    return segments.map((segment) => marker + segment);
  });
  return rendered.join("\n");
}
289
345
  function createDiffString(expected, actual, diffOptions) {
290
- const opts = { ...diffOptions, color: false };
291
- const result = jsonDiff.diffString(expected, actual, opts);
292
- return typeof result === "string" ? result : "";
346
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
347
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
348
+ if (diffOptions?.keysOnly) {
349
+ const expectedKeys = JSON.stringify(
350
+ extractKeys(expectedProcessed),
351
+ null,
352
+ 2
353
+ );
354
+ const actualKeys = JSON.stringify(
355
+ extractKeys(actualProcessed),
356
+ null,
357
+ 2
358
+ );
359
+ const parts2 = diff.diffLines(expectedKeys, actualKeys);
360
+ return formatDiffParts(parts2);
361
+ }
362
+ const expectedStr = toPrettyJson(expectedProcessed);
363
+ const actualStr = toPrettyJson(actualProcessed);
364
+ if (expectedStr === actualStr) {
365
+ return "";
366
+ }
367
+ const parts = diff.diffLines(expectedStr, actualStr);
368
+ if (diffOptions?.outputNewOnly) {
369
+ const filtered = parts.filter(
370
+ (p) => p.added === true
371
+ );
372
+ return formatDiffParts(filtered);
373
+ }
374
+ return formatDiffParts(parts);
375
+ }
376
/**
 * Reduces a value to its key structure: every leaf (null or any non-object
 * value) becomes the placeholder "·", arrays are mapped element by element,
 * and objects keep their own enumerable keys with reduced values.
 *
 * @param {*} value - Value whose shape should be extracted.
 * @returns {string|Array|Object} The placeholder, or a structure of the same
 *   shape containing placeholders.
 */
function extractKeys(value) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((element) => extractKeys(element));
  }
  const shape = {};
  for (const key of Object.keys(value)) {
    shape[key] = extractKeys(value[key]);
  }
  return shape;
}
294
389
  function formatLogMessage(msg) {
295
390
  if (typeof msg === "string")
@@ -677,13 +772,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
677
772
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
678
773
  );
679
774
  }
680
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
775
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
681
776
  return effect.Effect.gen(function* () {
682
777
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
683
778
  const rerunPassed = [];
684
779
  for (let r = 0; r < reruns; r++) {
685
780
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
686
781
  const started = Date.now();
782
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
783
+ n + 1,
784
+ n + 1
785
+ ]);
786
+ yield* publishEvent({
787
+ type: "TestCaseStarted",
788
+ runId: task.runId,
789
+ testCaseId: testCaseItem.id,
790
+ testCaseName: testCaseItem.testCase.getName(),
791
+ startedTestCases: startedEvaluations,
792
+ totalTestCases: totalEvaluations,
793
+ rerunIndex: r + 1,
794
+ rerunTotal: reruns
795
+ });
687
796
  const evaluatorScores = [];
688
797
  let testCaseError;
689
798
  const output = readOutput(testCaseItem.testCase);
@@ -829,6 +938,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
829
938
  );
830
939
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
831
940
  const completedRef = yield* effect.Ref.make(0);
941
+ const startedRef = yield* effect.Ref.make(0);
832
942
  const passedRef = yield* effect.Ref.make(0);
833
943
  const failedRef = yield* effect.Ref.make(0);
834
944
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -838,6 +948,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
838
948
  publishEvent,
839
949
  persistenceQueue,
840
950
  updateSnapshot,
951
+ startedRef,
841
952
  completedRef,
842
953
  passedRef,
843
954
  failedRef
@@ -1313,8 +1424,9 @@ var EffectRunner = class {
1313
1424
  );
1314
1425
  }
1315
1426
  };
1316
-
1317
- // src/cli-simple/args.ts
1427
/**
 * Default number of test cases to run concurrently: the number of CPU
 * entries reported by `os.cpus()`, never less than 1.
 *
 * @returns {number} A concurrency value of at least 1.
 */
function getDefaultConcurrency() {
  const cpuCount = os.cpus().length;
  // os.cpus() can report an empty list; still allow one worker.
  return cpuCount < 1 ? 1 : cpuCount;
}
1318
1430
  function parseSimpleCliArgs(argv) {
1319
1431
  const args = {
1320
1432
  help: false,
@@ -1341,6 +1453,14 @@ function parseSimpleCliArgs(argv) {
1341
1453
  index += 1;
1342
1454
  continue;
1343
1455
  }
1456
+ if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
1457
+ const n = parseInt(argv[index + 1], 10);
1458
+ if (!Number.isNaN(n) && n >= 1) {
1459
+ args.concurrency = n;
1460
+ }
1461
+ index += 1;
1462
+ continue;
1463
+ }
1344
1464
  args.unknownArgs.push(token);
1345
1465
  }
1346
1466
  return args;
@@ -1348,9 +1468,12 @@ function parseSimpleCliArgs(argv) {
1348
1468
  function getSimpleCliUsage() {
1349
1469
  return [
1350
1470
  "Usage:",
1351
- " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>",
1471
+ " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
1352
1472
  " eval-agents-simple generate --dataset <datasetName>",
1353
1473
  "",
1474
+ "Options:",
1475
+ " --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
1476
+ "",
1354
1477
  "Pattern examples for --evaluator:",
1355
1478
  " score-evaluator exact name (case-insensitive)",
1356
1479
  ' "*score*" wildcard pattern',
@@ -1489,7 +1612,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1489
1612
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1490
1613
  return new Promise((resolve5, reject) => {
1491
1614
  const app = ink.render(
1492
- React2__default.default.createElement(GenerateView, {
1615
+ React2__namespace.default.createElement(GenerateView, {
1493
1616
  runner,
1494
1617
  datasetName,
1495
1618
  onComplete: (err) => {
@@ -1639,6 +1762,7 @@ function RunView({
1639
1762
  runner,
1640
1763
  datasetName,
1641
1764
  evaluatorPattern,
1765
+ concurrency,
1642
1766
  onComplete
1643
1767
  }) {
1644
1768
  const [phase, setPhase] = React2.useState(
@@ -1646,7 +1770,9 @@ function RunView({
1646
1770
  );
1647
1771
  const [runInfo, setRunInfo] = React2.useState(null);
1648
1772
  const [testCases, setTestCases] = React2.useState([]);
1773
+ const [startedEvaluations, setStartedEvaluations] = React2.useState(0);
1649
1774
  const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
1775
+ const [runningEvaluations, setRunningEvaluations] = React2.useState([]);
1650
1776
  const [summary, setSummary] = React2.useState(null);
1651
1777
  const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
1652
1778
  const runEval = React2.useCallback(async () => {
@@ -1683,6 +1809,25 @@ function RunView({
1683
1809
  let overallScoreCount = 0;
1684
1810
  const done = new Promise((resolve5) => {
1685
1811
  const unsubscribe = runner.subscribeRunEvents((event) => {
1812
+ if (event.type === "TestCaseStarted") {
1813
+ setStartedEvaluations(event.startedTestCases);
1814
+ setRunningEvaluations((prev) => {
1815
+ const withoutDuplicate = prev.filter(
1816
+ (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1817
+ );
1818
+ return [
1819
+ ...withoutDuplicate,
1820
+ {
1821
+ testCaseId: event.testCaseId,
1822
+ name: event.testCaseName,
1823
+ rerunIndex: event.rerunIndex,
1824
+ rerunTotal: event.rerunTotal,
1825
+ startedTestCases: event.startedTestCases,
1826
+ totalTestCases: event.totalTestCases
1827
+ }
1828
+ ];
1829
+ });
1830
+ }
1686
1831
  if (event.type === "TestCaseProgress") {
1687
1832
  for (const item of event.evaluatorScores) {
1688
1833
  const numeric = toNumericScoreFromScores(item.scores);
@@ -1749,6 +1894,11 @@ function RunView({
1749
1894
  };
1750
1895
  byId.set(event.testCaseId, merged);
1751
1896
  setCompletedEvaluations(event.completedTestCases);
1897
+ setRunningEvaluations(
1898
+ (running) => running.filter(
1899
+ (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1900
+ )
1901
+ );
1752
1902
  return Array.from(byId.values());
1753
1903
  });
1754
1904
  }
@@ -1760,7 +1910,8 @@ function RunView({
1760
1910
  });
1761
1911
  const snapshot = await runner.runDatasetWith({
1762
1912
  datasetId: dataset.id,
1763
- evaluatorIds: evaluators.map((item) => item.id)
1913
+ evaluatorIds: evaluators.map((item) => item.id),
1914
+ concurrency
1764
1915
  });
1765
1916
  setRunInfo({
1766
1917
  runId: snapshot.runId,
@@ -1788,7 +1939,7 @@ function RunView({
1788
1939
  });
1789
1940
  setPhase("completed");
1790
1941
  setTimeout(() => onComplete(), 200);
1791
- }, [runner, datasetName, evaluatorPattern, onComplete]);
1942
+ }, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
1792
1943
  React2.useEffect(() => {
1793
1944
  void runEval();
1794
1945
  }, [runEval]);
@@ -1824,12 +1975,38 @@ function RunView({
1824
1975
  runInfo.totalTestCases
1825
1976
  ] })
1826
1977
  ] }),
1827
- phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
1828
- Spinner,
1829
- {
1830
- label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
1831
- }
1832
- ) }),
1978
+ phase === "running" && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
1979
+ /* @__PURE__ */ jsxRuntime.jsx(
1980
+ Spinner,
1981
+ {
1982
+ label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
1983
+ }
1984
+ ),
1985
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
1986
+ ink.Text,
1987
+ {
1988
+ color: "yellow",
1989
+ children: [
1990
+ "[running ",
1991
+ item.startedTestCases,
1992
+ "/",
1993
+ item.totalTestCases,
1994
+ "]",
1995
+ " ",
1996
+ item.name,
1997
+ " ",
1998
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1999
+ "(",
2000
+ item.rerunIndex,
2001
+ "/",
2002
+ item.rerunTotal,
2003
+ ")"
2004
+ ] })
2005
+ ]
2006
+ },
2007
+ `${item.testCaseId}:${item.rerunIndex}`
2008
+ )) })
2009
+ ] }),
1833
2010
  testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
1834
2011
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1835
2012
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
@@ -1910,7 +2087,7 @@ function RunView({
1910
2087
  },
1911
2088
  `${item.evaluatorId}-${s.id}-${idx}`
1912
2089
  );
1913
- }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
2090
+ }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
1914
2091
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1915
2092
  (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
1916
2093
  ({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
@@ -1968,9 +2145,9 @@ function RunView({
1968
2145
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
1969
2146
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
1970
2147
  const agg = summary.aggregates.get(id);
1971
- const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
1972
- (k) => k.startsWith(`${id}:`)
1973
- );
2148
+ const scoreKeys = [
2149
+ ...summary.scoreItemsByEvaluatorScore?.keys() ?? []
2150
+ ].filter((k) => k.startsWith(`${id}:`));
1974
2151
  if (scoreKeys.length === 0) {
1975
2152
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1976
2153
  "- ",
@@ -2278,7 +2455,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2278
2455
  }
2279
2456
  return lines;
2280
2457
  }
2281
- async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
2458
+ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
2282
2459
  const dataset = await runner.resolveDatasetByName(datasetName);
2283
2460
  if (!dataset) {
2284
2461
  const known = await runner.collectDatasets();
@@ -2304,9 +2481,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2304
2481
  let overallScoreTotal = 0;
2305
2482
  let overallScoreSumSq = 0;
2306
2483
  let overallScoreCount = 0;
2484
+ let startedCount = 0;
2307
2485
  let completedCount = 0;
2308
2486
  let totalCount = 0;
2309
2487
  let runFinished = false;
2488
+ const inFlightReruns = /* @__PURE__ */ new Set();
2310
2489
  const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
2311
2490
  let spinnerIndex = 0;
2312
2491
  function clearLine() {
@@ -2330,7 +2509,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2330
2509
  `\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
2331
2510
  `${completedCount}/${totalCount}`,
2332
2511
  ansi2.bold
2333
- )} ${colorize("(live)", ansi2.dim)}`
2512
+ )} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
2334
2513
  );
2335
2514
  }
2336
2515
  let lastPrintedTestCaseId = null;
@@ -2338,8 +2517,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2338
2517
  let spinnerTimer;
2339
2518
  const done = new Promise((resolve5) => {
2340
2519
  const unsubscribe = runner.subscribeRunEvents((event) => {
2520
+ if (event.type === "TestCaseStarted") {
2521
+ startedCount = event.startedTestCases;
2522
+ inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
2523
+ clearLine();
2524
+ process.stdout.write(
2525
+ `${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
2526
+ `
2527
+ );
2528
+ drawSpinner();
2529
+ }
2341
2530
  if (event.type === "TestCaseProgress") {
2342
2531
  completedCount = event.completedTestCases;
2532
+ inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
2343
2533
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
2344
2534
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
2345
2535
  const testCaseId = event.testCaseId;
@@ -2455,7 +2645,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2455
2645
  });
2456
2646
  const snapshot = await runner.runDatasetWith({
2457
2647
  datasetId: dataset.id,
2458
- evaluatorIds: evaluators.map((item) => item.id)
2648
+ evaluatorIds: evaluators.map((item) => item.id),
2649
+ concurrency
2459
2650
  });
2460
2651
  totalCount = snapshot.totalTestCases;
2461
2652
  console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
@@ -2544,13 +2735,14 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2544
2735
  }
2545
2736
  console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2546
2737
  }
2547
- async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2738
+ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
2548
2739
  return new Promise((resolve5, reject) => {
2549
2740
  const app = ink.render(
2550
- React2__default.default.createElement(RunView, {
2741
+ React2__namespace.createElement(RunView, {
2551
2742
  runner,
2552
2743
  datasetName,
2553
2744
  evaluatorPattern,
2745
+ concurrency,
2554
2746
  onComplete: (err) => {
2555
2747
  app.unmount();
2556
2748
  if (err) {
@@ -2597,10 +2789,12 @@ async function main() {
2597
2789
  const runner = createRunner();
2598
2790
  try {
2599
2791
  if (args.command === "run") {
2792
+ const concurrency = args.concurrency ?? getDefaultConcurrency();
2600
2793
  await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
2601
2794
  runner,
2602
2795
  args.datasetName,
2603
- args.evaluatorPattern
2796
+ args.evaluatorPattern,
2797
+ concurrency
2604
2798
  );
2605
2799
  return;
2606
2800
  }