@m4trix/evals 0.21.0 → 0.21.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,12 +8,16 @@ var path = require('path');
8
8
  var jitiModule = require('jiti');
9
9
  var promises = require('fs/promises');
10
10
  var url = require('url');
11
- var jsonDiff = require('json-diff');
11
+ var diff = require('diff');
12
+ var stringify = require('fast-json-stable-stringify');
13
+ var os = require('os');
12
14
  var React2 = require('react');
13
15
  var ink = require('ink');
14
16
  var jsxRuntime = require('react/jsx-runtime');
15
17
 
16
18
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
19
/**
 * Makes a required module's default export reachable through `.default`.
 *
 * A truthy value flagged with `__esModule` is returned untouched; any other
 * value (including falsy ones) is wrapped as `{ default: value }`.
 */
function _interopDefault (e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
20
+
17
21
  function _interopNamespace(e) {
18
22
  if (e && e.__esModule) return e;
19
23
  var n = Object.create(null);
@@ -33,6 +37,7 @@ function _interopNamespace(e) {
33
37
  }
34
38
 
35
39
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
40
+ var stringify__default = /*#__PURE__*/_interopDefault(stringify);
36
41
  var React2__namespace = /*#__PURE__*/_interopNamespace(React2);
37
42
 
38
43
  // src/runner/config.ts
@@ -284,10 +289,102 @@ async function collectTestCasesFromFiles(config) {
284
289
  );
285
290
  return found.flat();
286
291
  }
292
/**
 * Recursively normalizes a value before it is serialized for diffing.
 *
 * Supported options (all optional):
 *  - `sort`: when truthy, array elements are ordered by the stable-stringified
 *    form of their normalized value.
 *  - `excludeKeys`: array of key names, or a comma-separated string of names,
 *    removed from every plain (non-array) object at any depth.
 *  - `precision`: numbers are rounded via `toFixed(precision)`.
 *
 * Arrays are always traversed, so `excludeKeys` and `precision` reach values
 * nested inside arrays whether or not `sort` is set.
 *
 * @param {*} value - Value to normalize; never mutated.
 * @param {object} [options] - Diff options as described above.
 * @returns {*} A normalized copy for arrays/objects, otherwise the (possibly rounded) value.
 */
function preprocessForDiff(value, options) {
  if (typeof value === "number") {
    return options?.precision !== void 0 ? Number(value.toFixed(options.precision)) : value;
  }
  if (value === null || typeof value !== "object") {
    return value;
  }
  if (Array.isArray(value)) {
    const items = Array.from(value, (item) => preprocessForDiff(item, options));
    if (!options?.sort) {
      return items;
    }
    // Compute each sort key once instead of re-normalizing inside the comparator.
    // The serializer yields undefined for undefined/function elements; fall back
    // to "" so the comparison never dereferences undefined.
    return items
      .map((item) => ({ item, key: stringify__default.default(item) ?? "" }))
      .sort((left, right) => left.key.localeCompare(right.key))
      .map((entry) => entry.item);
  }
  const rawExcluded = options?.excludeKeys;
  let excluded = [];
  if (rawExcluded) {
    excluded = Array.isArray(rawExcluded) ? rawExcluded : rawExcluded.split(",").map((k) => k.trim());
  }
  // Object.fromEntries defines own properties, so a literal "__proto__" key
  // survives instead of being swallowed by the prototype setter.
  return Object.fromEntries(
    Object.entries(value)
      .filter(([k]) => !excluded.includes(k))
      .map(([k, v]) => [k, preprocessForDiff(v, options)])
  );
}
322
/**
 * Serializes a value to 2-space-indented JSON text.
 *
 * The value is first passed through the stable stringifier and then
 * re-parsed and pretty-printed with `JSON.stringify(..., null, 2)`.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} Always a string: pretty JSON, the compact form if
 *   re-parsing fails, or the literal text "undefined" when the serializer
 *   produces no string at all.
 */
function toPrettyJson(value) {
  const compact = stringify__default.default(value);
  if (typeof compact !== "string") {
    // Without this guard the non-string result would be returned from the
    // catch branch below and handed to the line differ as if it were text.
    return "undefined";
  }
  try {
    return JSON.stringify(JSON.parse(compact), null, 2);
  } catch {
    return compact;
  }
}
331
/**
 * Renders diff parts as newline-joined text.
 *
 * Every line of an added part is prefixed with "+ ", every line of a removed
 * part with "- ", and lines of other parts are emitted without a prefix.
 * The empty segment produced by a part's trailing newline is dropped.
 *
 * @param {Iterable<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The formatted diff text ("" when there are no lines).
 */
function formatDiffParts(parts) {
  const rendered = [];
  for (const { added, removed, value } of parts) {
    let marker = "";
    if (added) {
      marker = "+ ";
    } else if (removed) {
      marker = "- ";
    }
    const segments = value.split("\n");
    // Only the final segment is discarded when empty; interior blank lines stay.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    for (const segment of segments) {
      rendered.push(marker + segment);
    }
  }
  return rendered.join("\n");
}
287
345
  function createDiffString(expected, actual, diffOptions) {
288
- const opts = { ...diffOptions, color: false };
289
- const result = jsonDiff.diffString(expected, actual, opts);
290
- return typeof result === "string" ? result : "";
346
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
347
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
348
+ if (diffOptions?.keysOnly) {
349
+ const expectedKeys = JSON.stringify(
350
+ extractKeys(expectedProcessed),
351
+ null,
352
+ 2
353
+ );
354
+ const actualKeys = JSON.stringify(
355
+ extractKeys(actualProcessed),
356
+ null,
357
+ 2
358
+ );
359
+ const parts2 = diff.diffLines(expectedKeys, actualKeys);
360
+ return formatDiffParts(parts2);
361
+ }
362
+ const expectedStr = toPrettyJson(expectedProcessed);
363
+ const actualStr = toPrettyJson(actualProcessed);
364
+ if (expectedStr === actualStr) {
365
+ return "";
366
+ }
367
+ const parts = diff.diffLines(expectedStr, actualStr);
368
+ if (diffOptions?.outputNewOnly) {
369
+ const filtered = parts.filter(
370
+ (p) => p.added === true
371
+ );
372
+ return formatDiffParts(filtered);
373
+ }
374
+ return formatDiffParts(parts);
375
+ }
376
/**
 * Reduces a value to its key structure.
 *
 * Every leaf (null or any non-object) becomes the placeholder "\xB7"; arrays
 * and objects keep their shape with each element/property reduced recursively.
 *
 * @param {*} value - Value to reduce; never mutated.
 * @returns {*} The placeholder string, or an array/object of reduced children.
 */
function extractKeys(value) {
  if (value === null || typeof value !== "object") {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((item) => extractKeys(item));
  }
  // Object.fromEntries defines own properties, so a literal "__proto__" key is
  // kept as a key instead of being routed through the prototype setter.
  return Object.fromEntries(
    Object.entries(value).map(([k, v]) => [k, extractKeys(v)])
  );
}
292
389
  function formatLogMessage(msg) {
293
390
  if (typeof msg === "string")
@@ -1327,8 +1424,9 @@ var EffectRunner = class {
1327
1424
  );
1328
1425
  }
1329
1426
  };
1330
-
1331
- // src/cli-simple/args.ts
1427
/**
 * Picks the default number of test cases to run concurrently.
 *
 * Uses `os.availableParallelism()` when the running Node.js version provides
 * it, and falls back to the logical CPU count otherwise.
 *
 * @returns {number} An integer of at least 1.
 */
function getDefaultConcurrency() {
  const detected = typeof os.availableParallelism === "function" ? os.availableParallelism() : os.cpus().length;
  // Clamp so a zero-length CPU list can never produce a concurrency of 0.
  return Math.max(1, detected);
}
1332
1430
  function parseSimpleCliArgs(argv) {
1333
1431
  const args = {
1334
1432
  help: false,
@@ -1355,6 +1453,14 @@ function parseSimpleCliArgs(argv) {
1355
1453
  index += 1;
1356
1454
  continue;
1357
1455
  }
1456
+ if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
1457
+ const n = parseInt(argv[index + 1], 10);
1458
+ if (!Number.isNaN(n) && n >= 1) {
1459
+ args.concurrency = n;
1460
+ }
1461
+ index += 1;
1462
+ continue;
1463
+ }
1358
1464
  args.unknownArgs.push(token);
1359
1465
  }
1360
1466
  return args;
@@ -1362,9 +1468,12 @@ function parseSimpleCliArgs(argv) {
1362
1468
  function getSimpleCliUsage() {
1363
1469
  return [
1364
1470
  "Usage:",
1365
- " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>",
1471
+ " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
1366
1472
  " eval-agents-simple generate --dataset <datasetName>",
1367
1473
  "",
1474
+ "Options:",
1475
+ " --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
1476
+ "",
1368
1477
  "Pattern examples for --evaluator:",
1369
1478
  " score-evaluator exact name (case-insensitive)",
1370
1479
  ' "*score*" wildcard pattern',
@@ -1653,6 +1762,7 @@ function RunView({
1653
1762
  runner,
1654
1763
  datasetName,
1655
1764
  evaluatorPattern,
1765
+ concurrency,
1656
1766
  onComplete
1657
1767
  }) {
1658
1768
  const [phase, setPhase] = React2.useState(
@@ -1800,7 +1910,8 @@ function RunView({
1800
1910
  });
1801
1911
  const snapshot = await runner.runDatasetWith({
1802
1912
  datasetId: dataset.id,
1803
- evaluatorIds: evaluators.map((item) => item.id)
1913
+ evaluatorIds: evaluators.map((item) => item.id),
1914
+ concurrency
1804
1915
  });
1805
1916
  setRunInfo({
1806
1917
  runId: snapshot.runId,
@@ -1828,7 +1939,7 @@ function RunView({
1828
1939
  });
1829
1940
  setPhase("completed");
1830
1941
  setTimeout(() => onComplete(), 200);
1831
- }, [runner, datasetName, evaluatorPattern, onComplete]);
1942
+ }, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
1832
1943
  React2.useEffect(() => {
1833
1944
  void runEval();
1834
1945
  }, [runEval]);
@@ -1871,22 +1982,30 @@ function RunView({
1871
1982
  label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
1872
1983
  }
1873
1984
  ),
1874
- runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "yellow", children: [
1875
- "[running ",
1876
- item.startedTestCases,
1877
- "/",
1878
- item.totalTestCases,
1879
- "] ",
1880
- item.name,
1881
- " ",
1882
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1883
- "(",
1884
- item.rerunIndex,
1885
- "/",
1886
- item.rerunTotal,
1887
- ")"
1888
- ] })
1889
- ] }, `${item.testCaseId}:${item.rerunIndex}`)) })
1985
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
1986
+ ink.Text,
1987
+ {
1988
+ color: "yellow",
1989
+ children: [
1990
+ "[running ",
1991
+ item.startedTestCases,
1992
+ "/",
1993
+ item.totalTestCases,
1994
+ "]",
1995
+ " ",
1996
+ item.name,
1997
+ " ",
1998
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1999
+ "(",
2000
+ item.rerunIndex,
2001
+ "/",
2002
+ item.rerunTotal,
2003
+ ")"
2004
+ ] })
2005
+ ]
2006
+ },
2007
+ `${item.testCaseId}:${item.rerunIndex}`
2008
+ )) })
1890
2009
  ] }),
1891
2010
  testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
1892
2011
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
@@ -1968,7 +2087,7 @@ function RunView({
1968
2087
  },
1969
2088
  `${item.evaluatorId}-${s.id}-${idx}`
1970
2089
  );
1971
- }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
2090
+ }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
1972
2091
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1973
2092
  (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
1974
2093
  ({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
@@ -2026,9 +2145,9 @@ function RunView({
2026
2145
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
2027
2146
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
2028
2147
  const agg = summary.aggregates.get(id);
2029
- const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
2030
- (k) => k.startsWith(`${id}:`)
2031
- );
2148
+ const scoreKeys = [
2149
+ ...summary.scoreItemsByEvaluatorScore?.keys() ?? []
2150
+ ].filter((k) => k.startsWith(`${id}:`));
2032
2151
  if (scoreKeys.length === 0) {
2033
2152
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2034
2153
  "- ",
@@ -2336,7 +2455,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2336
2455
  }
2337
2456
  return lines;
2338
2457
  }
2339
- async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
2458
+ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
2340
2459
  const dataset = await runner.resolveDatasetByName(datasetName);
2341
2460
  if (!dataset) {
2342
2461
  const known = await runner.collectDatasets();
@@ -2526,7 +2645,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2526
2645
  });
2527
2646
  const snapshot = await runner.runDatasetWith({
2528
2647
  datasetId: dataset.id,
2529
- evaluatorIds: evaluators.map((item) => item.id)
2648
+ evaluatorIds: evaluators.map((item) => item.id),
2649
+ concurrency
2530
2650
  });
2531
2651
  totalCount = snapshot.totalTestCases;
2532
2652
  console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
@@ -2615,13 +2735,14 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2615
2735
  }
2616
2736
  console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2617
2737
  }
2618
- async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2738
+ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
2619
2739
  return new Promise((resolve5, reject) => {
2620
2740
  const app = ink.render(
2621
2741
  React2__namespace.createElement(RunView, {
2622
2742
  runner,
2623
2743
  datasetName,
2624
2744
  evaluatorPattern,
2745
+ concurrency,
2625
2746
  onComplete: (err) => {
2626
2747
  app.unmount();
2627
2748
  if (err) {
@@ -2668,10 +2789,12 @@ async function main() {
2668
2789
  const runner = createRunner();
2669
2790
  try {
2670
2791
  if (args.command === "run") {
2792
+ const concurrency = args.concurrency ?? getDefaultConcurrency();
2671
2793
  await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
2672
2794
  runner,
2673
2795
  args.datasetName,
2674
- args.evaluatorPattern
2796
+ args.evaluatorPattern,
2797
+ concurrency
2675
2798
  );
2676
2799
  return;
2677
2800
  }