@m4trix/evals 0.20.0 → 0.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,8 +6,11 @@ import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
7
7
  import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
- import { diffString } from 'json-diff';
10
- import React2, { useState, useEffect, useCallback } from 'react';
9
+ import { diffLines } from 'diff';
10
+ import stringify from 'fast-json-stable-stringify';
11
+ import { cpus } from 'os';
12
+ import * as React2 from 'react';
13
+ import React2__default, { useState, useEffect, useCallback } from 'react';
11
14
  import { render, Box, Text } from 'ink';
12
15
  import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
13
16
 
@@ -260,10 +263,102 @@ async function collectTestCasesFromFiles(config) {
260
263
  );
261
264
  return found.flat();
262
265
  }
266
/**
 * Normalizes a JSON-like value before it is serialized and diffed.
 *
 * Recognized options (all optional):
 *  - `sort`: order array elements by the stable-stringified form of the
 *    already preprocessed element.
 *  - `excludeKeys`: array, or comma-separated string, of object keys to drop.
 *  - `precision`: number of decimals every number is rounded to.
 *
 * The input is never mutated; arrays and objects are rebuilt recursively.
 *
 * @param {*} value - Value to normalize.
 * @param {{sort?: boolean, excludeKeys?: string[]|string, precision?: number}} [options]
 * @returns {*} A normalized copy of `value`.
 */
function preprocessForDiff(value, options) {
  // Normalize the options once instead of on every recursive step.
  const rawExclude = options?.excludeKeys;
  const excluded = new Set(
    !rawExclude ? [] : Array.isArray(rawExclude) ? rawExclude : rawExclude.split(",").map((k) => k.trim())
  );
  const sortArrays = Boolean(options?.sort);
  const precision = options?.precision;

  const visit = (node) => {
    if (Array.isArray(node)) {
      // Always descend into arrays so excludeKeys/precision also reach
      // elements of arrays that are not being sorted.
      const items = node.map(visit);
      if (!sortArrays) {
        return items;
      }
      // Serialize each element once rather than once per comparison.
      return items
        .map((item) => ({ item, key: stringify(item) ?? "" }))
        .sort((a, b) => a.key.localeCompare(b.key))
        .map((entry) => entry.item);
    }
    if (node !== null && typeof node === "object") {
      const result = {};
      for (const [k, v] of Object.entries(node)) {
        if (!excluded.has(k)) {
          result[k] = visit(v);
        }
      }
      return result;
    }
    if (typeof node === "number" && precision !== void 0) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return visit(value);
}
296
/**
 * Serializes a value to 2-space indented JSON for line-based diffing.
 *
 * The value is first passed through the stable `stringify`, then re-parsed
 * and pretty-printed.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} Always a string, so the result can be diffed safely.
 */
function toPrettyJson(value) {
  const compact = stringify(value);
  if (typeof compact !== "string") {
    // stringify yields undefined for values with no JSON form (e.g. a
    // top-level undefined); fall back to a textual form instead of leaking
    // a non-string to the line differ.
    return String(value);
  }
  try {
    return JSON.stringify(JSON.parse(compact), null, 2);
  } catch {
    // Defensive: keep the compact form if it cannot be re-parsed.
    return compact;
  }
}
305
/**
 * Renders diff parts as text, one output line per input line.
 *
 * Lines from parts flagged `added` are prefixed with "+ ", lines from parts
 * flagged `removed` with "- ", and all other lines are emitted unprefixed.
 * The empty segment produced by a part's trailing newline is dropped.
 *
 * @param {Array<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The rendered lines joined with "\n".
 */
function formatDiffParts(parts) {
  const rendered = [];
  for (const { added, removed, value } of parts) {
    let marker = "";
    if (added) {
      marker = "+ ";
    } else if (removed) {
      marker = "- ";
    }
    const segments = value.split("\n");
    // Only the final segment is dropped when empty; inner blank lines stay.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    for (const segment of segments) {
      rendered.push(marker + segment);
    }
  }
  return rendered.join("\n");
}
263
319
  function createDiffString(expected, actual, diffOptions) {
264
- const opts = { ...diffOptions, color: false };
265
- const result = diffString(expected, actual, opts);
266
- return typeof result === "string" ? result : "";
320
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
321
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
322
+ if (diffOptions?.keysOnly) {
323
+ const expectedKeys = JSON.stringify(
324
+ extractKeys(expectedProcessed),
325
+ null,
326
+ 2
327
+ );
328
+ const actualKeys = JSON.stringify(
329
+ extractKeys(actualProcessed),
330
+ null,
331
+ 2
332
+ );
333
+ const parts2 = diffLines(expectedKeys, actualKeys);
334
+ return formatDiffParts(parts2);
335
+ }
336
+ const expectedStr = toPrettyJson(expectedProcessed);
337
+ const actualStr = toPrettyJson(actualProcessed);
338
+ if (expectedStr === actualStr) {
339
+ return "";
340
+ }
341
+ const parts = diffLines(expectedStr, actualStr);
342
+ if (diffOptions?.outputNewOnly) {
343
+ const filtered = parts.filter(
344
+ (p) => p.added === true
345
+ );
346
+ return formatDiffParts(filtered);
347
+ }
348
+ return formatDiffParts(parts);
349
+ }
350
/**
 * Reduces a value to its key structure.
 *
 * Objects keep their keys, arrays keep their positions, and every leaf
 * (anything that is null or not an object) is replaced by the "\xB7"
 * placeholder.
 *
 * @param {*} value - Value to reduce.
 * @returns {*} The same structure with all leaves replaced.
 */
function extractKeys(value) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map(extractKeys);
  }
  const shape = {};
  Object.entries(value).forEach(([key, child]) => {
    shape[key] = extractKeys(child);
  });
  return shape;
}
268
363
  function formatLogMessage(msg) {
269
364
  if (typeof msg === "string")
@@ -651,13 +746,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
651
746
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
652
747
  );
653
748
  }
654
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
749
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
655
750
  return Effect.gen(function* () {
656
751
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
657
752
  const rerunPassed = [];
658
753
  for (let r = 0; r < reruns; r++) {
659
754
  const evaluatorRunId = `run-${randomUUID()}`;
660
755
  const started = Date.now();
756
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
757
+ n + 1,
758
+ n + 1
759
+ ]);
760
+ yield* publishEvent({
761
+ type: "TestCaseStarted",
762
+ runId: task.runId,
763
+ testCaseId: testCaseItem.id,
764
+ testCaseName: testCaseItem.testCase.getName(),
765
+ startedTestCases: startedEvaluations,
766
+ totalTestCases: totalEvaluations,
767
+ rerunIndex: r + 1,
768
+ rerunTotal: reruns
769
+ });
661
770
  const evaluatorScores = [];
662
771
  let testCaseError;
663
772
  const output = readOutput(testCaseItem.testCase);
@@ -803,6 +912,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
803
912
  );
804
913
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
805
914
  const completedRef = yield* Ref.make(0);
915
+ const startedRef = yield* Ref.make(0);
806
916
  const passedRef = yield* Ref.make(0);
807
917
  const failedRef = yield* Ref.make(0);
808
918
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -812,6 +922,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
812
922
  publishEvent,
813
923
  persistenceQueue,
814
924
  updateSnapshot,
925
+ startedRef,
815
926
  completedRef,
816
927
  passedRef,
817
928
  failedRef
@@ -1287,8 +1398,9 @@ var EffectRunner = class {
1287
1398
  );
1288
1399
  }
1289
1400
  };
1290
-
1291
- // src/cli-simple/args.ts
1401
/**
 * Default number of concurrently processed test cases.
 *
 * @returns {number} The length of the list reported by `cpus()`, but never
 *   less than 1.
 */
function getDefaultConcurrency() {
  const coreCount = cpus().length;
  // Guard against an empty CPU list.
  return coreCount >= 1 ? coreCount : 1;
}
1292
1404
  function parseSimpleCliArgs(argv) {
1293
1405
  const args = {
1294
1406
  help: false,
@@ -1315,6 +1427,14 @@ function parseSimpleCliArgs(argv) {
1315
1427
  index += 1;
1316
1428
  continue;
1317
1429
  }
1430
+ if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
1431
+ const n = parseInt(argv[index + 1], 10);
1432
+ if (!Number.isNaN(n) && n >= 1) {
1433
+ args.concurrency = n;
1434
+ }
1435
+ index += 1;
1436
+ continue;
1437
+ }
1318
1438
  args.unknownArgs.push(token);
1319
1439
  }
1320
1440
  return args;
@@ -1322,9 +1442,12 @@ function parseSimpleCliArgs(argv) {
1322
1442
  function getSimpleCliUsage() {
1323
1443
  return [
1324
1444
  "Usage:",
1325
- " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>",
1445
+ " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
1326
1446
  " eval-agents-simple generate --dataset <datasetName>",
1327
1447
  "",
1448
+ "Options:",
1449
+ " --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
1450
+ "",
1328
1451
  "Pattern examples for --evaluator:",
1329
1452
  " score-evaluator exact name (case-insensitive)",
1330
1453
  ' "*score*" wildcard pattern',
@@ -1463,7 +1586,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1463
1586
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1464
1587
  return new Promise((resolve5, reject) => {
1465
1588
  const app = render(
1466
- React2.createElement(GenerateView, {
1589
+ React2__default.createElement(GenerateView, {
1467
1590
  runner,
1468
1591
  datasetName,
1469
1592
  onComplete: (err) => {
@@ -1613,6 +1736,7 @@ function RunView({
1613
1736
  runner,
1614
1737
  datasetName,
1615
1738
  evaluatorPattern,
1739
+ concurrency,
1616
1740
  onComplete
1617
1741
  }) {
1618
1742
  const [phase, setPhase] = useState(
@@ -1620,7 +1744,9 @@ function RunView({
1620
1744
  );
1621
1745
  const [runInfo, setRunInfo] = useState(null);
1622
1746
  const [testCases, setTestCases] = useState([]);
1747
+ const [startedEvaluations, setStartedEvaluations] = useState(0);
1623
1748
  const [completedEvaluations, setCompletedEvaluations] = useState(0);
1749
+ const [runningEvaluations, setRunningEvaluations] = useState([]);
1624
1750
  const [summary, setSummary] = useState(null);
1625
1751
  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
1626
1752
  const runEval = useCallback(async () => {
@@ -1657,6 +1783,25 @@ function RunView({
1657
1783
  let overallScoreCount = 0;
1658
1784
  const done = new Promise((resolve5) => {
1659
1785
  const unsubscribe = runner.subscribeRunEvents((event) => {
1786
+ if (event.type === "TestCaseStarted") {
1787
+ setStartedEvaluations(event.startedTestCases);
1788
+ setRunningEvaluations((prev) => {
1789
+ const withoutDuplicate = prev.filter(
1790
+ (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1791
+ );
1792
+ return [
1793
+ ...withoutDuplicate,
1794
+ {
1795
+ testCaseId: event.testCaseId,
1796
+ name: event.testCaseName,
1797
+ rerunIndex: event.rerunIndex,
1798
+ rerunTotal: event.rerunTotal,
1799
+ startedTestCases: event.startedTestCases,
1800
+ totalTestCases: event.totalTestCases
1801
+ }
1802
+ ];
1803
+ });
1804
+ }
1660
1805
  if (event.type === "TestCaseProgress") {
1661
1806
  for (const item of event.evaluatorScores) {
1662
1807
  const numeric = toNumericScoreFromScores(item.scores);
@@ -1723,6 +1868,11 @@ function RunView({
1723
1868
  };
1724
1869
  byId.set(event.testCaseId, merged);
1725
1870
  setCompletedEvaluations(event.completedTestCases);
1871
+ setRunningEvaluations(
1872
+ (running) => running.filter(
1873
+ (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1874
+ )
1875
+ );
1726
1876
  return Array.from(byId.values());
1727
1877
  });
1728
1878
  }
@@ -1734,7 +1884,8 @@ function RunView({
1734
1884
  });
1735
1885
  const snapshot = await runner.runDatasetWith({
1736
1886
  datasetId: dataset.id,
1737
- evaluatorIds: evaluators.map((item) => item.id)
1887
+ evaluatorIds: evaluators.map((item) => item.id),
1888
+ concurrency
1738
1889
  });
1739
1890
  setRunInfo({
1740
1891
  runId: snapshot.runId,
@@ -1762,7 +1913,7 @@ function RunView({
1762
1913
  });
1763
1914
  setPhase("completed");
1764
1915
  setTimeout(() => onComplete(), 200);
1765
- }, [runner, datasetName, evaluatorPattern, onComplete]);
1916
+ }, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
1766
1917
  useEffect(() => {
1767
1918
  void runEval();
1768
1919
  }, [runEval]);
@@ -1798,12 +1949,38 @@ function RunView({
1798
1949
  runInfo.totalTestCases
1799
1950
  ] })
1800
1951
  ] }),
1801
- phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
1802
- Spinner,
1803
- {
1804
- label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
1805
- }
1806
- ) }),
1952
+ phase === "running" && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
1953
+ /* @__PURE__ */ jsx(
1954
+ Spinner,
1955
+ {
1956
+ label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
1957
+ }
1958
+ ),
1959
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
1960
+ Text,
1961
+ {
1962
+ color: "yellow",
1963
+ children: [
1964
+ "[running ",
1965
+ item.startedTestCases,
1966
+ "/",
1967
+ item.totalTestCases,
1968
+ "]",
1969
+ " ",
1970
+ item.name,
1971
+ " ",
1972
+ /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1973
+ "(",
1974
+ item.rerunIndex,
1975
+ "/",
1976
+ item.rerunTotal,
1977
+ ")"
1978
+ ] })
1979
+ ]
1980
+ },
1981
+ `${item.testCaseId}:${item.rerunIndex}`
1982
+ )) })
1983
+ ] }),
1807
1984
  testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
1808
1985
  /* @__PURE__ */ jsxs(Text, { children: [
1809
1986
  /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
@@ -1884,7 +2061,7 @@ function RunView({
1884
2061
  },
1885
2062
  `${item.evaluatorId}-${s.id}-${idx}`
1886
2063
  );
1887
- }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
2064
+ }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
1888
2065
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1889
2066
  (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
1890
2067
  ({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
@@ -1942,9 +2119,9 @@ function RunView({
1942
2119
  /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
1943
2120
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
1944
2121
  const agg = summary.aggregates.get(id);
1945
- const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
1946
- (k) => k.startsWith(`${id}:`)
1947
- );
2122
+ const scoreKeys = [
2123
+ ...summary.scoreItemsByEvaluatorScore?.keys() ?? []
2124
+ ].filter((k) => k.startsWith(`${id}:`));
1948
2125
  if (scoreKeys.length === 0) {
1949
2126
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1950
2127
  "- ",
@@ -2252,7 +2429,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2252
2429
  }
2253
2430
  return lines;
2254
2431
  }
2255
- async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
2432
+ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
2256
2433
  const dataset = await runner.resolveDatasetByName(datasetName);
2257
2434
  if (!dataset) {
2258
2435
  const known = await runner.collectDatasets();
@@ -2278,9 +2455,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2278
2455
  let overallScoreTotal = 0;
2279
2456
  let overallScoreSumSq = 0;
2280
2457
  let overallScoreCount = 0;
2458
+ let startedCount = 0;
2281
2459
  let completedCount = 0;
2282
2460
  let totalCount = 0;
2283
2461
  let runFinished = false;
2462
+ const inFlightReruns = /* @__PURE__ */ new Set();
2284
2463
  const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
2285
2464
  let spinnerIndex = 0;
2286
2465
  function clearLine() {
@@ -2304,7 +2483,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2304
2483
  `\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
2305
2484
  `${completedCount}/${totalCount}`,
2306
2485
  ansi2.bold
2307
- )} ${colorize("(live)", ansi2.dim)}`
2486
+ )} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
2308
2487
  );
2309
2488
  }
2310
2489
  let lastPrintedTestCaseId = null;
@@ -2312,8 +2491,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2312
2491
  let spinnerTimer;
2313
2492
  const done = new Promise((resolve5) => {
2314
2493
  const unsubscribe = runner.subscribeRunEvents((event) => {
2494
+ if (event.type === "TestCaseStarted") {
2495
+ startedCount = event.startedTestCases;
2496
+ inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
2497
+ clearLine();
2498
+ process.stdout.write(
2499
+ `${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
2500
+ `
2501
+ );
2502
+ drawSpinner();
2503
+ }
2315
2504
  if (event.type === "TestCaseProgress") {
2316
2505
  completedCount = event.completedTestCases;
2506
+ inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
2317
2507
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
2318
2508
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
2319
2509
  const testCaseId = event.testCaseId;
@@ -2429,7 +2619,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2429
2619
  });
2430
2620
  const snapshot = await runner.runDatasetWith({
2431
2621
  datasetId: dataset.id,
2432
- evaluatorIds: evaluators.map((item) => item.id)
2622
+ evaluatorIds: evaluators.map((item) => item.id),
2623
+ concurrency
2433
2624
  });
2434
2625
  totalCount = snapshot.totalTestCases;
2435
2626
  console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
@@ -2518,13 +2709,14 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2518
2709
  }
2519
2710
  console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2520
2711
  }
2521
- async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2712
+ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
2522
2713
  return new Promise((resolve5, reject) => {
2523
2714
  const app = render(
2524
2715
  React2.createElement(RunView, {
2525
2716
  runner,
2526
2717
  datasetName,
2527
2718
  evaluatorPattern,
2719
+ concurrency,
2528
2720
  onComplete: (err) => {
2529
2721
  app.unmount();
2530
2722
  if (err) {
@@ -2571,10 +2763,12 @@ async function main() {
2571
2763
  const runner = createRunner();
2572
2764
  try {
2573
2765
  if (args.command === "run") {
2766
+ const concurrency = args.concurrency ?? getDefaultConcurrency();
2574
2767
  await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
2575
2768
  runner,
2576
2769
  args.datasetName,
2577
- args.evaluatorPattern
2770
+ args.evaluatorPattern,
2771
+ concurrency
2578
2772
  );
2579
2773
  return;
2580
2774
  }