@m4trix/evals 0.21.0 → 0.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,9 @@ import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
7
7
  import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
- import { diffString } from 'json-diff';
9
+ import { diffLines } from 'diff';
10
+ import stringify from 'fast-json-stable-stringify';
11
+ import { cpus } from 'os';
10
12
  import * as React2 from 'react';
11
13
  import React2__default, { useState, useEffect, useCallback } from 'react';
12
14
  import { render, Box, Text } from 'ink';
@@ -261,10 +263,102 @@ async function collectTestCasesFromFiles(config) {
261
263
  );
262
264
  return found.flat();
263
265
  }
266
/**
 * Normalizes a value before it is serialized and diffed.
 *
 * Recognized `options` fields (all optional):
 *  - `sort`: arrays are reordered by the stringified form of their already
 *    normalized elements, so element order alone does not produce a diff.
 *  - `excludeKeys`: an array of key names, or a comma-separated string of key
 *    names, dropped from every object at any depth.
 *  - `precision`: numbers are rounded to this many decimal places.
 *
 * The input is never mutated; arrays and objects are copied.
 *
 * @param {*} value - Value to normalize.
 * @param {object} [options] - Diff options as described above.
 * @returns {*} Normalized copy of `value`.
 */
function preprocessForDiff(value, options) {
  const rawExcluded = options?.excludeKeys;
  let excluded = null;
  if (rawExcluded) {
    // Parse the key list once here rather than on every recursive call.
    excluded = new Set(
      Array.isArray(rawExcluded) ? rawExcluded : rawExcluded.split(",").map((k) => k.trim())
    );
  }
  return preprocessNodeForDiff(value, {
    sort: Boolean(options?.sort),
    precision: options?.precision,
    excluded
  });
}

/**
 * Recursive worker for `preprocessForDiff`, operating on pre-parsed settings.
 *
 * @param {*} value - Current node.
 * @param {{sort: boolean, precision: (number|undefined), excluded: (Set|null)}} settings
 * @returns {*} Normalized copy of the node.
 */
function preprocessNodeForDiff(value, settings) {
  if (typeof value === "number") {
    return settings.precision !== undefined ? Number(value.toFixed(settings.precision)) : value;
  }
  if (value === null || typeof value !== "object") {
    return value;
  }
  if (Array.isArray(value)) {
    // Always descend into arrays so excludeKeys/precision also reach nested
    // elements when `sort` is off.
    const items = Array.from(value, (item) => preprocessNodeForDiff(item, settings));
    if (!settings.sort) {
      return items;
    }
    // Stringify each element once instead of once per comparison.
    return items
      .map((item) => ({ item, key: stringify(item) }))
      .sort((a, b) => {
        // Elements without a JSON form (undefined, functions) go last,
        // matching where Array.prototype.sort puts undefined.
        if (a.key === undefined) {
          return b.key === undefined ? 0 : 1;
        }
        if (b.key === undefined) {
          return -1;
        }
        return a.key.localeCompare(b.key);
      })
      .map((entry) => entry.item);
  }
  if (typeof value.toJSON === "function") {
    // Objects such as Date have no own enumerable entries; use their JSON
    // form so they do not collapse to `{}` and compare equal.
    const json = value.toJSON();
    if (json !== value) {
      return preprocessNodeForDiff(json, settings);
    }
  }
  const result = {};
  for (const [k, v] of Object.entries(value)) {
    if (!settings.excluded?.has(k)) {
      result[k] = preprocessNodeForDiff(v, settings);
    }
  }
  return result;
}
296
/**
 * Renders a value as two-space-indented JSON for line-based diffing.
 *
 * The value is first serialized with `stringify` and then re-parsed and
 * re-printed with indentation.
 *
 * @param {*} value - Value to render.
 * @returns {string} Always a string, including for values with no JSON
 *   representation.
 */
function toPrettyJson(value) {
  const str = stringify(value);
  if (typeof str !== "string") {
    // No JSON form exists (e.g. `undefined`). Return a printable placeholder
    // so callers that feed the result to a text diff never get a non-string.
    return String(str);
  }
  try {
    return JSON.stringify(JSON.parse(str), null, 2);
  } catch {
    // Best effort: fall back to the compact form if it cannot be re-parsed.
    return str;
  }
}
305
/**
 * Flattens diff parts into a single newline-joined string.
 *
 * Every line of an added part is prefixed with "+ ", every line of a removed
 * part with "- ", and lines of unchanged parts get no prefix. The empty
 * segment produced by a part's trailing newline is dropped.
 *
 * @param {Iterable<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The formatted diff text.
 */
function formatDiffParts(parts) {
  const output = [];
  for (const { added, removed, value } of parts) {
    let marker = "";
    if (added) {
      marker = "+ ";
    } else if (removed) {
      marker = "- ";
    }
    const segments = value.split("\n");
    // Only a trailing empty segment is discarded; interior blank lines stay.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    for (const segment of segments) {
      output.push(marker + segment);
    }
  }
  return output.join("\n");
}
264
319
  function createDiffString(expected, actual, diffOptions) {
265
- const opts = { ...diffOptions, color: false };
266
- const result = diffString(expected, actual, opts);
267
- return typeof result === "string" ? result : "";
320
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
321
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
322
+ if (diffOptions?.keysOnly) {
323
+ const expectedKeys = JSON.stringify(
324
+ extractKeys(expectedProcessed),
325
+ null,
326
+ 2
327
+ );
328
+ const actualKeys = JSON.stringify(
329
+ extractKeys(actualProcessed),
330
+ null,
331
+ 2
332
+ );
333
+ const parts2 = diffLines(expectedKeys, actualKeys);
334
+ return formatDiffParts(parts2);
335
+ }
336
+ const expectedStr = toPrettyJson(expectedProcessed);
337
+ const actualStr = toPrettyJson(actualProcessed);
338
+ if (expectedStr === actualStr) {
339
+ return "";
340
+ }
341
+ const parts = diffLines(expectedStr, actualStr);
342
+ if (diffOptions?.outputNewOnly) {
343
+ const filtered = parts.filter(
344
+ (p) => p.added === true
345
+ );
346
+ return formatDiffParts(filtered);
347
+ }
348
+ return formatDiffParts(parts);
349
+ }
350
/**
 * Produces a structure-only skeleton of a value: every non-object leaf
 * (including null) becomes the placeholder "\xB7", while arrays and objects
 * keep their shape and keys.
 *
 * @param {*} value - Value to reduce to its key structure.
 * @returns {*} Placeholder string, array of skeletons, or object of skeletons.
 */
function extractKeys(value) {
  const isContainer = typeof value === "object" && value !== null;
  if (!isContainer) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((element) => extractKeys(element));
  }
  const skeleton = {};
  Object.entries(value).forEach(([key, child]) => {
    skeleton[key] = extractKeys(child);
  });
  return skeleton;
}
269
363
  function formatLogMessage(msg) {
270
364
  if (typeof msg === "string")
@@ -1304,8 +1398,9 @@ var EffectRunner = class {
1304
1398
  );
1305
1399
  }
1306
1400
  };
1307
-
1308
- // src/cli-simple/args.ts
1401
/**
 * Default concurrency: the number of entries returned by `os.cpus()`, but
 * never less than 1.
 *
 * @returns {number} Default concurrency, at least 1.
 */
function getDefaultConcurrency() {
  const cpuCount = cpus().length;
  return cpuCount >= 1 ? cpuCount : 1;
}
1309
1404
  function parseSimpleCliArgs(argv) {
1310
1405
  const args = {
1311
1406
  help: false,
@@ -1332,6 +1427,14 @@ function parseSimpleCliArgs(argv) {
1332
1427
  index += 1;
1333
1428
  continue;
1334
1429
  }
1430
+ if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
1431
+ const n = parseInt(argv[index + 1], 10);
1432
+ if (!Number.isNaN(n) && n >= 1) {
1433
+ args.concurrency = n;
1434
+ }
1435
+ index += 1;
1436
+ continue;
1437
+ }
1335
1438
  args.unknownArgs.push(token);
1336
1439
  }
1337
1440
  return args;
@@ -1339,9 +1442,12 @@ function parseSimpleCliArgs(argv) {
1339
1442
  function getSimpleCliUsage() {
1340
1443
  return [
1341
1444
  "Usage:",
1342
- " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>",
1445
+ " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
1343
1446
  " eval-agents-simple generate --dataset <datasetName>",
1344
1447
  "",
1448
+ "Options:",
1449
+ " --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
1450
+ "",
1345
1451
  "Pattern examples for --evaluator:",
1346
1452
  " score-evaluator exact name (case-insensitive)",
1347
1453
  ' "*score*" wildcard pattern',
@@ -1630,6 +1736,7 @@ function RunView({
1630
1736
  runner,
1631
1737
  datasetName,
1632
1738
  evaluatorPattern,
1739
+ concurrency,
1633
1740
  onComplete
1634
1741
  }) {
1635
1742
  const [phase, setPhase] = useState(
@@ -1777,7 +1884,8 @@ function RunView({
1777
1884
  });
1778
1885
  const snapshot = await runner.runDatasetWith({
1779
1886
  datasetId: dataset.id,
1780
- evaluatorIds: evaluators.map((item) => item.id)
1887
+ evaluatorIds: evaluators.map((item) => item.id),
1888
+ concurrency
1781
1889
  });
1782
1890
  setRunInfo({
1783
1891
  runId: snapshot.runId,
@@ -1805,7 +1913,7 @@ function RunView({
1805
1913
  });
1806
1914
  setPhase("completed");
1807
1915
  setTimeout(() => onComplete(), 200);
1808
- }, [runner, datasetName, evaluatorPattern, onComplete]);
1916
+ }, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
1809
1917
  useEffect(() => {
1810
1918
  void runEval();
1811
1919
  }, [runEval]);
@@ -1848,22 +1956,30 @@ function RunView({
1848
1956
  label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
1849
1957
  }
1850
1958
  ),
1851
- runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(Text, { color: "yellow", children: [
1852
- "[running ",
1853
- item.startedTestCases,
1854
- "/",
1855
- item.totalTestCases,
1856
- "] ",
1857
- item.name,
1858
- " ",
1859
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1860
- "(",
1861
- item.rerunIndex,
1862
- "/",
1863
- item.rerunTotal,
1864
- ")"
1865
- ] })
1866
- ] }, `${item.testCaseId}:${item.rerunIndex}`)) })
1959
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
1960
+ Text,
1961
+ {
1962
+ color: "yellow",
1963
+ children: [
1964
+ "[running ",
1965
+ item.startedTestCases,
1966
+ "/",
1967
+ item.totalTestCases,
1968
+ "]",
1969
+ " ",
1970
+ item.name,
1971
+ " ",
1972
+ /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1973
+ "(",
1974
+ item.rerunIndex,
1975
+ "/",
1976
+ item.rerunTotal,
1977
+ ")"
1978
+ ] })
1979
+ ]
1980
+ },
1981
+ `${item.testCaseId}:${item.rerunIndex}`
1982
+ )) })
1867
1983
  ] }),
1868
1984
  testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
1869
1985
  /* @__PURE__ */ jsxs(Text, { children: [
@@ -1945,7 +2061,7 @@ function RunView({
1945
2061
  },
1946
2062
  `${item.evaluatorId}-${s.id}-${idx}`
1947
2063
  );
1948
- }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
2064
+ }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
1949
2065
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1950
2066
  (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
1951
2067
  ({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
@@ -2003,9 +2119,9 @@ function RunView({
2003
2119
  /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
2004
2120
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
2005
2121
  const agg = summary.aggregates.get(id);
2006
- const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
2007
- (k) => k.startsWith(`${id}:`)
2008
- );
2122
+ const scoreKeys = [
2123
+ ...summary.scoreItemsByEvaluatorScore?.keys() ?? []
2124
+ ].filter((k) => k.startsWith(`${id}:`));
2009
2125
  if (scoreKeys.length === 0) {
2010
2126
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2011
2127
  "- ",
@@ -2313,7 +2429,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2313
2429
  }
2314
2430
  return lines;
2315
2431
  }
2316
- async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
2432
+ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
2317
2433
  const dataset = await runner.resolveDatasetByName(datasetName);
2318
2434
  if (!dataset) {
2319
2435
  const known = await runner.collectDatasets();
@@ -2503,7 +2619,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2503
2619
  });
2504
2620
  const snapshot = await runner.runDatasetWith({
2505
2621
  datasetId: dataset.id,
2506
- evaluatorIds: evaluators.map((item) => item.id)
2622
+ evaluatorIds: evaluators.map((item) => item.id),
2623
+ concurrency
2507
2624
  });
2508
2625
  totalCount = snapshot.totalTestCases;
2509
2626
  console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
@@ -2592,13 +2709,14 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2592
2709
  }
2593
2710
  console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2594
2711
  }
2595
- async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2712
+ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
2596
2713
  return new Promise((resolve5, reject) => {
2597
2714
  const app = render(
2598
2715
  React2.createElement(RunView, {
2599
2716
  runner,
2600
2717
  datasetName,
2601
2718
  evaluatorPattern,
2719
+ concurrency,
2602
2720
  onComplete: (err) => {
2603
2721
  app.unmount();
2604
2722
  if (err) {
@@ -2645,10 +2763,12 @@ async function main() {
2645
2763
  const runner = createRunner();
2646
2764
  try {
2647
2765
  if (args.command === "run") {
2766
+ const concurrency = args.concurrency ?? getDefaultConcurrency();
2648
2767
  await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
2649
2768
  runner,
2650
2769
  args.datasetName,
2651
- args.evaluatorPattern
2770
+ args.evaluatorPattern,
2771
+ concurrency
2652
2772
  );
2653
2773
  return;
2654
2774
  }