@m4trix/evals 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,12 +8,16 @@ var path = require('path');
8
8
  var jitiModule = require('jiti');
9
9
  var promises = require('fs/promises');
10
10
  var url = require('url');
11
- var jsonDiff = require('json-diff');
11
+ var diff = require('diff');
12
+ var stringify = require('fast-json-stable-stringify');
13
+ var os = require('os');
12
14
  var React2 = require('react');
13
15
  var ink = require('ink');
14
16
  var jsxRuntime = require('react/jsx-runtime');
15
17
 
16
18
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
19
/**
 * Wraps a required module so that it can always be read through `.default`.
 *
 * @param {*} e - The value returned by `require`.
 * @returns {*} `e` itself when it is truthy and flagged `__esModule`,
 *   otherwise a new `{ default: e }` wrapper.
 */
function _interopDefault(e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
20
+
17
21
  function _interopNamespace(e) {
18
22
  if (e && e.__esModule) return e;
19
23
  var n = Object.create(null);
@@ -33,6 +37,7 @@ function _interopNamespace(e) {
33
37
  }
34
38
 
35
39
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
40
+ var stringify__default = /*#__PURE__*/_interopDefault(stringify);
36
41
  var React2__namespace = /*#__PURE__*/_interopNamespace(React2);
37
42
 
38
43
  // src/runner/config.ts
@@ -284,10 +289,102 @@ async function collectTestCasesFromFiles(config) {
284
289
  );
285
290
  return found.flat();
286
291
  }
292
/**
 * Produces a normalized deep copy of `value` that is ready to be diffed.
 *
 * Supported options (all optional):
 * - `sort`: when truthy, every array is ordered by the stable JSON
 *   representation of its (already normalized) elements.
 * - `excludeKeys`: an array of key names, or a comma-separated string of key
 *   names, that are dropped from every object.
 * - `precision`: numbers are rounded with `toFixed(precision)`.
 *
 * Arrays are always traversed, so `excludeKeys` and `precision` also apply to
 * values nested inside arrays when `sort` is not set.
 *
 * @param {*} value - The value to normalize; it is not mutated.
 * @param {{sort?: boolean, excludeKeys?: string[]|string, precision?: number}} [options]
 * @returns {*} The normalized copy (primitives are returned as-is, except
 *   numbers when `precision` is given).
 */
function preprocessForDiff(value, options) {
  // Normalize the options once instead of on every recursive call.
  const rawExcluded = options?.excludeKeys;
  const excluded = rawExcluded
    ? new Set(
        Array.isArray(rawExcluded)
          ? rawExcluded
          : rawExcluded.split(",").map((k) => k.trim())
      )
    : null;
  const precision = options?.precision;
  const sortArrays = Boolean(options?.sort);

  const walk = (node) => {
    if (Array.isArray(node)) {
      const items = node.map(walk);
      if (!sortArrays) {
        return items;
      }
      // Compute each sort key once rather than inside the comparator.
      // The `?? ""` guards against a non-string result for elements that
      // cannot be serialized (e.g. `undefined`).
      return items
        .map((item) => ({ item, key: stringify__default.default(item) ?? "" }))
        .sort((a, b) => a.key.localeCompare(b.key))
        .map((entry) => entry.item);
    }
    if (node !== null && typeof node === "object") {
      const result = {};
      for (const [k, v] of Object.entries(node)) {
        if (excluded?.has(k)) {
          continue;
        }
        result[k] = walk(v);
      }
      return result;
    }
    if (typeof node === "number" && precision !== void 0) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return walk(value);
}
322
/**
 * Serializes `value` with the stable stringifier and re-formats the result
 * with two-space indentation.
 *
 * @param {*} value - The value to serialize.
 * @returns {*} The indented JSON text, or the raw stringifier output when
 *   that output cannot be parsed back as JSON.
 */
function toPrettyJson(value) {
  const stableJson = stringify__default.default(value);
  let pretty;
  try {
    pretty = JSON.stringify(JSON.parse(stableJson), null, 2);
  } catch {
    // Fall back to whatever the stringifier produced.
    pretty = stableJson;
  }
  return pretty;
}
331
/**
 * Renders a list of diff parts as text, one output line per input line.
 *
 * Lines from parts flagged `added` are prefixed with "+ ", lines from parts
 * flagged `removed` with "- ", and all other lines get no prefix. The empty
 * segment that follows a part's trailing newline is dropped.
 *
 * @param {Iterable<{value: string, added?: boolean, removed?: boolean}>} parts
 * @returns {string} The rendered lines joined with "\n".
 */
function formatDiffParts(parts) {
  const rendered = [];
  for (const part of parts) {
    let marker = "";
    if (part.added) {
      marker = "+ ";
    } else if (part.removed) {
      marker = "- ";
    }
    const segments = part.value.split("\n");
    // Only the final segment is discarded when empty; interior blank lines stay.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    for (const segment of segments) {
      rendered.push(marker + segment);
    }
  }
  return rendered.join("\n");
}
287
345
  function createDiffString(expected, actual, diffOptions) {
288
- const opts = { ...diffOptions, color: false };
289
- const result = jsonDiff.diffString(expected, actual, opts);
290
- return typeof result === "string" ? result : "";
346
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
347
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
348
+ if (diffOptions?.keysOnly) {
349
+ const expectedKeys = JSON.stringify(
350
+ extractKeys(expectedProcessed),
351
+ null,
352
+ 2
353
+ );
354
+ const actualKeys = JSON.stringify(
355
+ extractKeys(actualProcessed),
356
+ null,
357
+ 2
358
+ );
359
+ const parts2 = diff.diffLines(expectedKeys, actualKeys);
360
+ return formatDiffParts(parts2);
361
+ }
362
+ const expectedStr = toPrettyJson(expectedProcessed);
363
+ const actualStr = toPrettyJson(actualProcessed);
364
+ if (expectedStr === actualStr) {
365
+ return "";
366
+ }
367
+ const parts = diff.diffLines(expectedStr, actualStr);
368
+ if (diffOptions?.outputNewOnly) {
369
+ const filtered = parts.filter(
370
+ (p) => p.added === true
371
+ );
372
+ return formatDiffParts(filtered);
373
+ }
374
+ return formatDiffParts(parts);
375
+ }
376
/**
 * Reduces a value to its key skeleton: every non-object leaf (including
 * `null`) becomes the placeholder "\xB7", arrays are mapped element by
 * element, and objects keep their keys with recursively reduced values.
 *
 * @param {*} value - The value to reduce.
 * @returns {*} The placeholder string, an array, or a plain object.
 */
function extractKeys(value) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) {
    return "\xB7";
  }
  if (Array.isArray(value)) {
    return value.map((element) => extractKeys(element));
  }
  return Object.entries(value).reduce((skeleton, [key, child]) => {
    skeleton[key] = extractKeys(child);
    return skeleton;
  }, {});
}
292
389
  function formatLogMessage(msg) {
293
390
  if (typeof msg === "string")
@@ -666,6 +763,20 @@ function readOutput(testCase) {
666
763
  }
667
764
  return candidate.getOutput();
668
765
  }
766
/**
 * Expands test cases into one evaluation unit per rerun.
 *
 * The rerun count of an item is `testCase.getReruns()` when that method
 * exists and 1 otherwise. Each unit carries the originating item, its
 * 1-based `rerunIndex` and the item's `rerunTotal`.
 *
 * @param {Iterable<{testCase: object}>} testCases - Items to expand.
 * @returns {Array<{testCaseItem: object, rerunIndex: number, rerunTotal: number}>}
 *   Units in input order, reruns of one item kept adjacent.
 */
function buildEvaluationUnits(testCases) {
  const units = [];
  for (const testCaseItem of testCases) {
    const { testCase } = testCaseItem;
    let rerunTotal = 1;
    if (typeof testCase.getReruns === "function") {
      rerunTotal = testCase.getReruns();
    }
    let rerunIndex = 1;
    while (rerunIndex - 1 < rerunTotal) {
      units.push({ testCaseItem, rerunIndex, rerunTotal });
      rerunIndex += 1;
    }
  }
  return units;
}
669
780
  function nowIsoForFile() {
670
781
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
671
782
  }
@@ -675,157 +786,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
675
786
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
676
787
  );
677
788
  }
678
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
789
+ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
790
+ const { testCaseItem, rerunIndex, rerunTotal } = unit;
679
791
  return effect.Effect.gen(function* () {
680
- const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
681
- const rerunPassed = [];
682
- for (let r = 0; r < reruns; r++) {
683
- const evaluatorRunId = `run-${crypto.randomUUID()}`;
684
- const started = Date.now();
685
- const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
686
- n + 1,
687
- n + 1
688
- ]);
689
- yield* publishEvent({
690
- type: "TestCaseStarted",
691
- runId: task.runId,
692
- testCaseId: testCaseItem.id,
693
- testCaseName: testCaseItem.testCase.getName(),
694
- startedTestCases: startedEvaluations,
695
- totalTestCases: totalEvaluations,
696
- rerunIndex: r + 1,
697
- rerunTotal: reruns
698
- });
699
- const evaluatorScores = [];
700
- let testCaseError;
701
- const output = readOutput(testCaseItem.testCase);
702
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
703
- const evaluateFn = evaluator.getEvaluateFn();
704
- if (!evaluateFn) {
705
- continue;
706
- }
707
- const logs = [];
708
- const logDiff = (expected, actual, options) => {
709
- logs.push(createDiffLogEntry(expected, actual, options));
710
- };
711
- const log = (message, options) => {
712
- logs.push(createLogEntry(message, options));
713
- };
714
- const createError = (message, options) => {
715
- const entry = createLogEntry(message, options);
716
- const error = message instanceof Error ? message : new Error(entry.message);
717
- error[evaluatorErrorLogEntryKey] = entry;
718
- return error;
719
- };
720
- try {
721
- const ctx = yield* effect.Effect.promise(
722
- () => Promise.resolve(evaluator.resolveContext())
723
- );
724
- const result = yield* effect.Effect.promise(
725
- () => Promise.resolve().then(
726
- () => evaluateFn({
727
- input: testCaseItem.testCase.getInput(),
728
- ctx,
729
- output,
730
- meta: {
731
- triggerId: task.triggerId,
732
- runId: evaluatorRunId,
733
- datasetId: task.datasetId
734
- },
735
- logDiff,
736
- log,
737
- createError
738
- })
739
- )
740
- );
741
- if (result instanceof Error) {
742
- const evaluatorError = result;
743
- const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
744
- logs.push(taggedEntry ?? createLogEntry(result));
745
- testCaseError = result.message;
746
- evaluatorScores.push({
747
- evaluatorId,
748
- scores: [],
749
- passed: false,
750
- logs: logs.length > 0 ? logs : void 0
751
- });
752
- continue;
753
- }
754
- const { scores, metrics } = normalizeResult(result);
755
- const passed2 = computeEvaluatorPassed(evaluator, result, scores);
756
- evaluatorScores.push({
757
- evaluatorId,
758
- scores,
759
- passed: passed2,
760
- metrics,
761
- logs: logs.length > 0 ? logs : void 0
762
- });
763
- } catch (error) {
764
- if (error instanceof Error) {
765
- const taggedEntry = error[evaluatorErrorLogEntryKey];
766
- logs.push(taggedEntry ?? createLogEntry(error));
767
- }
768
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
792
+ const evaluatorRunId = `run-${crypto.randomUUID()}`;
793
+ const started = Date.now();
794
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
795
+ n + 1,
796
+ n + 1
797
+ ]);
798
+ yield* publishEvent({
799
+ type: "TestCaseStarted",
800
+ runId: task.runId,
801
+ testCaseId: testCaseItem.id,
802
+ testCaseName: testCaseItem.testCase.getName(),
803
+ startedTestCases: startedEvaluations,
804
+ totalTestCases: totalEvaluations,
805
+ rerunIndex,
806
+ rerunTotal
807
+ });
808
+ const evaluatorScores = [];
809
+ let testCaseError;
810
+ const output = readOutput(testCaseItem.testCase);
811
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
812
+ const evaluateFn = evaluator.getEvaluateFn();
813
+ if (!evaluateFn) {
814
+ continue;
815
+ }
816
+ const logs = [];
817
+ const logDiff = (expected, actual, options) => {
818
+ logs.push(createDiffLogEntry(expected, actual, options));
819
+ };
820
+ const log = (message, options) => {
821
+ logs.push(createLogEntry(message, options));
822
+ };
823
+ const createError = (message, options) => {
824
+ const entry = createLogEntry(message, options);
825
+ const error = message instanceof Error ? message : new Error(entry.message);
826
+ error[evaluatorErrorLogEntryKey] = entry;
827
+ return error;
828
+ };
829
+ try {
830
+ const ctx = yield* effect.Effect.promise(
831
+ () => Promise.resolve(evaluator.resolveContext())
832
+ );
833
+ const result = yield* effect.Effect.promise(
834
+ () => Promise.resolve().then(
835
+ () => evaluateFn({
836
+ input: testCaseItem.testCase.getInput(),
837
+ ctx,
838
+ output,
839
+ meta: {
840
+ triggerId: task.triggerId,
841
+ runId: evaluatorRunId,
842
+ datasetId: task.datasetId
843
+ },
844
+ logDiff,
845
+ log,
846
+ createError
847
+ })
848
+ )
849
+ );
850
+ if (result instanceof Error) {
851
+ const evaluatorError = result;
852
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
853
+ logs.push(taggedEntry ?? createLogEntry(result));
854
+ testCaseError = result.message;
769
855
  evaluatorScores.push({
770
856
  evaluatorId,
771
857
  scores: [],
772
858
  passed: false,
773
859
  logs: logs.length > 0 ? logs : void 0
774
860
  });
861
+ continue;
862
+ }
863
+ const { scores, metrics } = normalizeResult(result);
864
+ const passed = computeEvaluatorPassed(evaluator, result, scores);
865
+ evaluatorScores.push({
866
+ evaluatorId,
867
+ scores,
868
+ passed,
869
+ metrics,
870
+ logs: logs.length > 0 ? logs : void 0
871
+ });
872
+ } catch (error) {
873
+ if (error instanceof Error) {
874
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
875
+ logs.push(taggedEntry ?? createLogEntry(error));
775
876
  }
877
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
878
+ evaluatorScores.push({
879
+ evaluatorId,
880
+ scores: [],
881
+ passed: false,
882
+ logs: logs.length > 0 ? logs : void 0
883
+ });
776
884
  }
777
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
778
- rerunPassed.push(rerunPassedThis);
779
- const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
780
- n + 1,
781
- n + 1
782
- ]);
783
- const progressEvent = {
784
- type: "TestCaseProgress",
785
- runId: task.runId,
786
- testCaseId: testCaseItem.id,
787
- testCaseName: testCaseItem.testCase.getName(),
788
- completedTestCases: completedEvaluations,
789
- totalTestCases: totalEvaluations,
790
- rerunIndex: r + 1,
791
- rerunTotal: reruns,
792
- passed: rerunPassedThis,
793
- durationMs: Date.now() - started,
794
- evaluatorScores,
795
- output,
796
- errorMessage: testCaseError
797
- };
798
- updateSnapshot(task.runId, (snapshot) => ({
799
- ...snapshot,
800
- completedTestCases: completedEvaluations
801
- }));
802
- yield* publishEvent(progressEvent);
803
- yield* effect.Queue.offer(persistenceQueue, {
804
- runId: task.runId,
805
- artifactPath: task.snapshot.artifactPath,
806
- payload: progressEvent
807
- });
808
885
  }
809
- const testCasePassed = rerunPassed.every(Boolean);
810
- if (testCasePassed) {
811
- yield* effect.Ref.update(passedRef, (n) => n + 1);
812
- } else {
813
- yield* effect.Ref.update(failedRef, (n) => n + 1);
814
- }
815
- const [passed, failed] = yield* effect.Effect.all([
816
- effect.Ref.get(passedRef),
817
- effect.Ref.get(failedRef)
886
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
887
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
888
+ n + 1,
889
+ n + 1
818
890
  ]);
819
- updateSnapshot(task.runId, (snapshot) => ({
891
+ const progressEvent = {
892
+ type: "TestCaseProgress",
893
+ runId: task.runId,
894
+ testCaseId: testCaseItem.id,
895
+ testCaseName: testCaseItem.testCase.getName(),
896
+ completedTestCases: completedEvaluations,
897
+ totalTestCases: totalEvaluations,
898
+ rerunIndex,
899
+ rerunTotal,
900
+ passed: rerunPassedThis,
901
+ durationMs: Date.now() - started,
902
+ evaluatorScores,
903
+ output,
904
+ errorMessage: testCaseError
905
+ };
906
+ yield* updateSnapshot(task.runId, (snapshot) => ({
820
907
  ...snapshot,
821
- passedTestCases: passed,
822
- failedTestCases: failed
908
+ completedTestCases: completedEvaluations
823
909
  }));
910
+ yield* publishEvent(progressEvent);
911
+ yield* effect.Queue.offer(persistenceQueue, {
912
+ runId: task.runId,
913
+ artifactPath: task.snapshot.artifactPath,
914
+ payload: progressEvent
915
+ });
916
+ const testCaseCompleted = yield* effect.Ref.modify(
917
+ testCaseResultsRef,
918
+ (map) => {
919
+ const key = testCaseItem.id;
920
+ const existing = map.get(key) ?? { completedCount: 0, results: [] };
921
+ const newResults = [...existing.results, rerunPassedThis];
922
+ const newCompletedCount = existing.completedCount + 1;
923
+ const isLast = newCompletedCount === rerunTotal;
924
+ const newMap = new Map(map);
925
+ newMap.set(key, {
926
+ completedCount: newCompletedCount,
927
+ results: newResults
928
+ });
929
+ const outcome = isLast ? newResults.every(Boolean) : null;
930
+ return [outcome, newMap];
931
+ }
932
+ );
933
+ if (testCaseCompleted !== null) {
934
+ if (testCaseCompleted) {
935
+ yield* effect.Ref.update(passedRef, (n) => n + 1);
936
+ } else {
937
+ yield* effect.Ref.update(failedRef, (n) => n + 1);
938
+ }
939
+ const [passed, failed] = yield* effect.Effect.all([
940
+ effect.Ref.get(passedRef),
941
+ effect.Ref.get(failedRef)
942
+ ]);
943
+ yield* updateSnapshot(task.runId, (snapshot) => ({
944
+ ...snapshot,
945
+ passedTestCases: passed,
946
+ failedTestCases: failed
947
+ }));
948
+ }
824
949
  });
825
950
  }
826
951
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
827
952
  const startedAt = Date.now();
828
- updateSnapshot(task.runId, (snapshot) => ({
953
+ yield* updateSnapshot(task.runId, (snapshot) => ({
829
954
  ...snapshot,
830
955
  status: "running",
831
956
  startedAt
@@ -844,9 +969,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
844
969
  const startedRef = yield* effect.Ref.make(0);
845
970
  const passedRef = yield* effect.Ref.make(0);
846
971
  const failedRef = yield* effect.Ref.make(0);
847
- const processTestCase = (testCaseItem) => processOneTestCase(
972
+ const testCaseResultsRef = yield* effect.Ref.make(
973
+ /* @__PURE__ */ new Map()
974
+ );
975
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
976
+ const processEvaluation = (unit) => processOneEvaluation(
848
977
  task,
849
- testCaseItem,
978
+ unit,
850
979
  totalEvaluations,
851
980
  publishEvent,
852
981
  persistenceQueue,
@@ -854,11 +983,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
854
983
  startedRef,
855
984
  completedRef,
856
985
  passedRef,
857
- failedRef
986
+ failedRef,
987
+ testCaseResultsRef
858
988
  );
859
989
  yield* effect.Effect.forEach(
860
- task.testCases,
861
- processTestCase,
990
+ evaluationUnits,
991
+ processEvaluation,
862
992
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
863
993
  );
864
994
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
@@ -876,7 +1006,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
876
1006
  totalTestCases: task.testCases.length,
877
1007
  artifactPath: task.snapshot.artifactPath
878
1008
  };
879
- updateSnapshot(task.runId, (snapshot) => ({
1009
+ yield* updateSnapshot(task.runId, (snapshot) => ({
880
1010
  ...snapshot,
881
1011
  status: "completed",
882
1012
  completedTestCases: completedEvaluations,
@@ -1129,7 +1259,9 @@ var EffectRunner = class {
1129
1259
  this.persistenceQueue = effect.Effect.runSync(
1130
1260
  effect.Queue.unbounded()
1131
1261
  );
1132
- this.snapshots = /* @__PURE__ */ new Map();
1262
+ this.snapshotsRef = effect.Effect.runSync(
1263
+ effect.Ref.make(/* @__PURE__ */ new Map())
1264
+ );
1133
1265
  this.listeners = /* @__PURE__ */ new Set();
1134
1266
  this.datasetsById = /* @__PURE__ */ new Map();
1135
1267
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -1232,7 +1364,13 @@ var EffectRunner = class {
1232
1364
  status: "queued",
1233
1365
  artifactPath
1234
1366
  };
1235
- this.snapshots.set(runId, snapshot);
1367
+ await effect.Effect.runPromise(
1368
+ effect.Ref.update(this.snapshotsRef, (map) => {
1369
+ const next = new Map(map);
1370
+ next.set(runId, snapshot);
1371
+ return next;
1372
+ })
1373
+ );
1236
1374
  const queuedEvent = {
1237
1375
  type: "RunQueued",
1238
1376
  runId,
@@ -1273,12 +1411,12 @@ var EffectRunner = class {
1273
1411
  };
1274
1412
  }
1275
1413
  getRunSnapshot(runId) {
1276
- return this.snapshots.get(runId);
1414
+ return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
1277
1415
  }
1278
1416
  getAllRunSnapshots() {
1279
- return Array.from(this.snapshots.values()).sort(
1280
- (a, b) => b.queuedAt - a.queuedAt
1281
- );
1417
+ return Array.from(
1418
+ effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
1419
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
1282
1420
  }
1283
1421
  async loadRunSnapshotsFromArtifacts() {
1284
1422
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1307,11 +1445,15 @@ var EffectRunner = class {
1307
1445
  );
1308
1446
  }
1309
1447
  updateSnapshot(runId, updater) {
1310
- const existing = this.snapshots.get(runId);
1311
- if (!existing) {
1312
- return;
1313
- }
1314
- this.snapshots.set(runId, updater(existing));
1448
+ return effect.Ref.modify(this.snapshotsRef, (map) => {
1449
+ const existing = map.get(runId);
1450
+ if (!existing) {
1451
+ return [void 0, map];
1452
+ }
1453
+ const next = new Map(map);
1454
+ next.set(runId, updater(existing));
1455
+ return [void 0, next];
1456
+ }).pipe(effect.Effect.asVoid);
1315
1457
  }
1316
1458
  publishEvent(event) {
1317
1459
  return effect.Effect.sync(() => {
@@ -1327,8 +1469,9 @@ var EffectRunner = class {
1327
1469
  );
1328
1470
  }
1329
1471
  };
1330
-
1331
- // src/cli-simple/args.ts
1472
/**
 * Default concurrency for a run: the number of entries reported by
 * `os.cpus()`, but never less than 1.
 *
 * @returns {number} A positive integer.
 */
function getDefaultConcurrency() {
  const cpuCount = os.cpus().length;
  return cpuCount < 1 ? 1 : cpuCount;
}
1332
1475
  function parseSimpleCliArgs(argv) {
1333
1476
  const args = {
1334
1477
  help: false,
@@ -1355,6 +1498,14 @@ function parseSimpleCliArgs(argv) {
1355
1498
  index += 1;
1356
1499
  continue;
1357
1500
  }
1501
+ if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
1502
+ const n = parseInt(argv[index + 1], 10);
1503
+ if (!Number.isNaN(n) && n >= 1) {
1504
+ args.concurrency = n;
1505
+ }
1506
+ index += 1;
1507
+ continue;
1508
+ }
1358
1509
  args.unknownArgs.push(token);
1359
1510
  }
1360
1511
  return args;
@@ -1362,9 +1513,12 @@ function parseSimpleCliArgs(argv) {
1362
1513
  function getSimpleCliUsage() {
1363
1514
  return [
1364
1515
  "Usage:",
1365
- " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>",
1516
+ " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
1366
1517
  " eval-agents-simple generate --dataset <datasetName>",
1367
1518
  "",
1519
+ "Options:",
1520
+ " --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
1521
+ "",
1368
1522
  "Pattern examples for --evaluator:",
1369
1523
  " score-evaluator exact name (case-insensitive)",
1370
1524
  ' "*score*" wildcard pattern',
@@ -1653,6 +1807,7 @@ function RunView({
1653
1807
  runner,
1654
1808
  datasetName,
1655
1809
  evaluatorPattern,
1810
+ concurrency,
1656
1811
  onComplete
1657
1812
  }) {
1658
1813
  const [phase, setPhase] = React2.useState(
@@ -1800,7 +1955,8 @@ function RunView({
1800
1955
  });
1801
1956
  const snapshot = await runner.runDatasetWith({
1802
1957
  datasetId: dataset.id,
1803
- evaluatorIds: evaluators.map((item) => item.id)
1958
+ evaluatorIds: evaluators.map((item) => item.id),
1959
+ concurrency
1804
1960
  });
1805
1961
  setRunInfo({
1806
1962
  runId: snapshot.runId,
@@ -1828,7 +1984,7 @@ function RunView({
1828
1984
  });
1829
1985
  setPhase("completed");
1830
1986
  setTimeout(() => onComplete(), 200);
1831
- }, [runner, datasetName, evaluatorPattern, onComplete]);
1987
+ }, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
1832
1988
  React2.useEffect(() => {
1833
1989
  void runEval();
1834
1990
  }, [runEval]);
@@ -1871,22 +2027,30 @@ function RunView({
1871
2027
  label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
1872
2028
  }
1873
2029
  ),
1874
- runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "yellow", children: [
1875
- "[running ",
1876
- item.startedTestCases,
1877
- "/",
1878
- item.totalTestCases,
1879
- "] ",
1880
- item.name,
1881
- " ",
1882
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1883
- "(",
1884
- item.rerunIndex,
1885
- "/",
1886
- item.rerunTotal,
1887
- ")"
1888
- ] })
1889
- ] }, `${item.testCaseId}:${item.rerunIndex}`)) })
2030
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
2031
+ ink.Text,
2032
+ {
2033
+ color: "yellow",
2034
+ children: [
2035
+ "[running ",
2036
+ item.startedTestCases,
2037
+ "/",
2038
+ item.totalTestCases,
2039
+ "]",
2040
+ " ",
2041
+ item.name,
2042
+ " ",
2043
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2044
+ "(",
2045
+ item.rerunIndex,
2046
+ "/",
2047
+ item.rerunTotal,
2048
+ ")"
2049
+ ] })
2050
+ ]
2051
+ },
2052
+ `${item.testCaseId}:${item.rerunIndex}`
2053
+ )) })
1890
2054
  ] }),
1891
2055
  testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
1892
2056
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
@@ -1968,7 +2132,7 @@ function RunView({
1968
2132
  },
1969
2133
  `${item.evaluatorId}-${s.id}-${idx}`
1970
2134
  );
1971
- }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
2135
+ }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
1972
2136
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1973
2137
  (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
1974
2138
  ({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
@@ -2026,9 +2190,9 @@ function RunView({
2026
2190
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
2027
2191
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
2028
2192
  const agg = summary.aggregates.get(id);
2029
- const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
2030
- (k) => k.startsWith(`${id}:`)
2031
- );
2193
+ const scoreKeys = [
2194
+ ...summary.scoreItemsByEvaluatorScore?.keys() ?? []
2195
+ ].filter((k) => k.startsWith(`${id}:`));
2032
2196
  if (scoreKeys.length === 0) {
2033
2197
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2034
2198
  "- ",
@@ -2336,7 +2500,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2336
2500
  }
2337
2501
  return lines;
2338
2502
  }
2339
- async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
2503
+ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
2340
2504
  const dataset = await runner.resolveDatasetByName(datasetName);
2341
2505
  if (!dataset) {
2342
2506
  const known = await runner.collectDatasets();
@@ -2526,7 +2690,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2526
2690
  });
2527
2691
  const snapshot = await runner.runDatasetWith({
2528
2692
  datasetId: dataset.id,
2529
- evaluatorIds: evaluators.map((item) => item.id)
2693
+ evaluatorIds: evaluators.map((item) => item.id),
2694
+ concurrency
2530
2695
  });
2531
2696
  totalCount = snapshot.totalTestCases;
2532
2697
  console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
@@ -2615,13 +2780,14 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2615
2780
  }
2616
2781
  console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2617
2782
  }
2618
- async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2783
+ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
2619
2784
  return new Promise((resolve5, reject) => {
2620
2785
  const app = ink.render(
2621
2786
  React2__namespace.createElement(RunView, {
2622
2787
  runner,
2623
2788
  datasetName,
2624
2789
  evaluatorPattern,
2790
+ concurrency,
2625
2791
  onComplete: (err) => {
2626
2792
  app.unmount();
2627
2793
  if (err) {
@@ -2668,10 +2834,12 @@ async function main() {
2668
2834
  const runner = createRunner();
2669
2835
  try {
2670
2836
  if (args.command === "run") {
2837
+ const concurrency = args.concurrency ?? getDefaultConcurrency();
2671
2838
  await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
2672
2839
  runner,
2673
2840
  args.datasetName,
2674
- args.evaluatorPattern
2841
+ args.evaluatorPattern,
2842
+ concurrency
2675
2843
  );
2676
2844
  return;
2677
2845
  }