@m4trix/evals 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,14 @@
1
1
  #!/usr/bin/env node
2
2
  import { randomUUID } from 'crypto';
3
- import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
3
+ import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
4
4
  import { existsSync } from 'fs';
5
5
  import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
7
7
  import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
- import { diffString } from 'json-diff';
9
+ import { diffLines } from 'diff';
10
+ import stringify from 'fast-json-stable-stringify';
11
+ import { cpus } from 'os';
10
12
  import * as React2 from 'react';
11
13
  import React2__default, { useState, useEffect, useCallback } from 'react';
12
14
  import { render, Box, Text } from 'ink';
@@ -261,10 +263,102 @@ async function collectTestCasesFromFiles(config) {
261
263
  );
262
264
  return found.flat();
263
265
  }
/**
 * Normalizes a value before it is serialized and diffed.
 *
 * Walks `value` recursively and returns a new structure (the input is never
 * mutated) with the supported diff options applied:
 *  - `excludeKeys`: an array of key names, or a comma-separated string of
 *    them; matching keys are dropped from every object that is visited.
 *  - `precision`: every number is rounded with `toFixed(precision)`.
 *  - `sort`: array elements are ordered by their stable-stringified form.
 *
 * Arrays are always traversed, whether or not `sort` is set, so that
 * `excludeKeys` and `precision` also reach objects and numbers nested inside
 * arrays.
 *
 * @param {*} value - Value to normalize.
 * @param {{ sort?: boolean, excludeKeys?: string[] | string, precision?: number }} [options]
 * @returns {*} The normalized copy of `value`.
 */
function preprocessForDiff(value, options) {
  // Resolve the options once instead of re-parsing them at every depth.
  const rawExcluded = options?.excludeKeys;
  const excluded = rawExcluded
    ? new Set(
        Array.isArray(rawExcluded)
          ? rawExcluded
          : rawExcluded.split(",").map((k) => k.trim())
      )
    : null;
  const precision = options?.precision;
  const sortArrays = Boolean(options?.sort);

  const walk = (node) => {
    if (Array.isArray(node)) {
      // Array.from treats holes as undefined, matching a spread copy.
      const items = Array.from(node, (item) => walk(item));
      if (!sortArrays) {
        return items;
      }
      // Stringify each element once; the comparator only looks keys up.
      const sortKeys = new Map();
      const sortKeyOf = (item) => {
        if (!sortKeys.has(item)) {
          sortKeys.set(item, stringify(item));
        }
        return sortKeys.get(item);
      };
      // `items` is a fresh array, so sorting it in place is safe.
      return items.sort((a, b) => sortKeyOf(a).localeCompare(sortKeyOf(b)));
    }
    if (node !== null && typeof node === "object") {
      const result = {};
      for (const [k, v] of Object.entries(node)) {
        if (excluded !== null && excluded.has(k)) {
          continue;
        }
        result[k] = walk(v);
      }
      return result;
    }
    if (typeof node === "number" && precision !== undefined) {
      return Number(node.toFixed(precision));
    }
    return node;
  };

  return walk(value);
}
/**
 * Serializes a value as 2-space-indented JSON for line-based diffing.
 *
 * The value is first passed through the stable stringifier and the result is
 * then re-parsed and pretty-printed.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} Pretty-printed JSON. When the stringifier produces no
 *   string (for example for `undefined`), `String(value)` is returned so that
 *   callers always receive a string they can diff.
 */
function toPrettyJson(value) {
  const compact = stringify(value);
  if (typeof compact !== "string") {
    // Nothing to parse; returning a non-string here would break line diffing.
    return String(value);
  }
  try {
    return JSON.stringify(JSON.parse(compact), null, 2);
  } catch {
    // Unparseable stringifier output: fall back to the compact form.
    return compact;
  }
}
/**
 * Renders a list of diff parts as a single newline-joined string.
 *
 * Each line of an added part is prefixed with "+ ", each line of a removed
 * part with "- ", and lines of unchanged parts get no prefix. The empty
 * segment produced by a part's trailing newline is dropped.
 *
 * @param {Array<{ value: string, added?: boolean, removed?: boolean }>} parts
 * @returns {string} The formatted diff text.
 */
function formatDiffParts(parts) {
  const rendered = [];
  for (const part of parts) {
    let marker = "";
    if (part.added) {
      marker = "+ ";
    } else if (part.removed) {
      marker = "- ";
    }
    const segments = part.value.split("\n");
    // A trailing "\n" leaves one empty segment at the end; skip only that one.
    if (segments[segments.length - 1] === "") {
      segments.pop();
    }
    for (const segment of segments) {
      rendered.push(marker + segment);
    }
  }
  return rendered.join("\n");
}
264
319
  function createDiffString(expected, actual, diffOptions) {
265
- const opts = { ...diffOptions, color: false };
266
- const result = diffString(expected, actual, opts);
267
- return typeof result === "string" ? result : "";
320
+ const expectedProcessed = preprocessForDiff(expected, diffOptions);
321
+ const actualProcessed = preprocessForDiff(actual, diffOptions);
322
+ if (diffOptions?.keysOnly) {
323
+ const expectedKeys = JSON.stringify(
324
+ extractKeys(expectedProcessed),
325
+ null,
326
+ 2
327
+ );
328
+ const actualKeys = JSON.stringify(
329
+ extractKeys(actualProcessed),
330
+ null,
331
+ 2
332
+ );
333
+ const parts2 = diffLines(expectedKeys, actualKeys);
334
+ return formatDiffParts(parts2);
335
+ }
336
+ const expectedStr = toPrettyJson(expectedProcessed);
337
+ const actualStr = toPrettyJson(actualProcessed);
338
+ if (expectedStr === actualStr) {
339
+ return "";
340
+ }
341
+ const parts = diffLines(expectedStr, actualStr);
342
+ if (diffOptions?.outputNewOnly) {
343
+ const filtered = parts.filter(
344
+ (p) => p.added === true
345
+ );
346
+ return formatDiffParts(filtered);
347
+ }
348
+ return formatDiffParts(parts);
349
+ }
/**
 * Reduces a value to its key structure.
 *
 * Arrays and objects keep their shape; every leaf (any non-object value,
 * including `null`) is replaced by the placeholder "\xB7".
 *
 * @param {*} value - Value whose structure should be extracted.
 * @returns {*} A structure mirroring `value` with placeholder leaves.
 */
function extractKeys(value) {
  if (Array.isArray(value)) {
    return value.map((item) => extractKeys(item));
  }
  if (value !== null && typeof value === "object") {
    const shape = {};
    for (const key of Object.keys(value)) {
      shape[key] = extractKeys(value[key]);
    }
    return shape;
  }
  return "\xB7";
}
269
363
  function formatLogMessage(msg) {
270
364
  if (typeof msg === "string")
@@ -643,6 +737,20 @@ function readOutput(testCase) {
643
737
  }
644
738
  return candidate.getOutput();
645
739
  }
/**
 * Expands test cases into one evaluation unit per rerun.
 *
 * A test case whose `testCase.getReruns` is a function contributes that many
 * units; any other test case contributes exactly one.
 *
 * @param {Iterable<{ testCase: { getReruns?: () => number } }>} testCases
 * @returns {Array<{ testCaseItem: object, rerunIndex: number, rerunTotal: number }>}
 *   Units in test-case order, with `rerunIndex` counting from 1.
 */
function buildEvaluationUnits(testCases) {
  const expanded = [];
  for (const testCaseItem of testCases) {
    let rerunTotal = 1;
    if (typeof testCaseItem.testCase.getReruns === "function") {
      rerunTotal = testCaseItem.testCase.getReruns();
    }
    let rerunIndex = 1;
    while (rerunIndex - 1 < rerunTotal) {
      expanded.push({ testCaseItem, rerunIndex, rerunTotal });
      rerunIndex += 1;
    }
  }
  return expanded;
}
646
754
  function nowIsoForFile() {
647
755
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
648
756
  }
@@ -652,157 +760,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
652
760
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
653
761
  );
654
762
  }
655
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
763
+ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
764
+ const { testCaseItem, rerunIndex, rerunTotal } = unit;
656
765
  return Effect.gen(function* () {
657
- const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
658
- const rerunPassed = [];
659
- for (let r = 0; r < reruns; r++) {
660
- const evaluatorRunId = `run-${randomUUID()}`;
661
- const started = Date.now();
662
- const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
663
- n + 1,
664
- n + 1
665
- ]);
666
- yield* publishEvent({
667
- type: "TestCaseStarted",
668
- runId: task.runId,
669
- testCaseId: testCaseItem.id,
670
- testCaseName: testCaseItem.testCase.getName(),
671
- startedTestCases: startedEvaluations,
672
- totalTestCases: totalEvaluations,
673
- rerunIndex: r + 1,
674
- rerunTotal: reruns
675
- });
676
- const evaluatorScores = [];
677
- let testCaseError;
678
- const output = readOutput(testCaseItem.testCase);
679
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
680
- const evaluateFn = evaluator.getEvaluateFn();
681
- if (!evaluateFn) {
682
- continue;
683
- }
684
- const logs = [];
685
- const logDiff = (expected, actual, options) => {
686
- logs.push(createDiffLogEntry(expected, actual, options));
687
- };
688
- const log = (message, options) => {
689
- logs.push(createLogEntry(message, options));
690
- };
691
- const createError = (message, options) => {
692
- const entry = createLogEntry(message, options);
693
- const error = message instanceof Error ? message : new Error(entry.message);
694
- error[evaluatorErrorLogEntryKey] = entry;
695
- return error;
696
- };
697
- try {
698
- const ctx = yield* Effect.promise(
699
- () => Promise.resolve(evaluator.resolveContext())
700
- );
701
- const result = yield* Effect.promise(
702
- () => Promise.resolve().then(
703
- () => evaluateFn({
704
- input: testCaseItem.testCase.getInput(),
705
- ctx,
706
- output,
707
- meta: {
708
- triggerId: task.triggerId,
709
- runId: evaluatorRunId,
710
- datasetId: task.datasetId
711
- },
712
- logDiff,
713
- log,
714
- createError
715
- })
716
- )
717
- );
718
- if (result instanceof Error) {
719
- const evaluatorError = result;
720
- const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
721
- logs.push(taggedEntry ?? createLogEntry(result));
722
- testCaseError = result.message;
723
- evaluatorScores.push({
724
- evaluatorId,
725
- scores: [],
726
- passed: false,
727
- logs: logs.length > 0 ? logs : void 0
728
- });
729
- continue;
730
- }
731
- const { scores, metrics } = normalizeResult(result);
732
- const passed2 = computeEvaluatorPassed(evaluator, result, scores);
733
- evaluatorScores.push({
734
- evaluatorId,
735
- scores,
736
- passed: passed2,
737
- metrics,
738
- logs: logs.length > 0 ? logs : void 0
739
- });
740
- } catch (error) {
741
- if (error instanceof Error) {
742
- const taggedEntry = error[evaluatorErrorLogEntryKey];
743
- logs.push(taggedEntry ?? createLogEntry(error));
744
- }
745
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
766
+ const evaluatorRunId = `run-${randomUUID()}`;
767
+ const started = Date.now();
768
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
769
+ n + 1,
770
+ n + 1
771
+ ]);
772
+ yield* publishEvent({
773
+ type: "TestCaseStarted",
774
+ runId: task.runId,
775
+ testCaseId: testCaseItem.id,
776
+ testCaseName: testCaseItem.testCase.getName(),
777
+ startedTestCases: startedEvaluations,
778
+ totalTestCases: totalEvaluations,
779
+ rerunIndex,
780
+ rerunTotal
781
+ });
782
+ const evaluatorScores = [];
783
+ let testCaseError;
784
+ const output = readOutput(testCaseItem.testCase);
785
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
786
+ const evaluateFn = evaluator.getEvaluateFn();
787
+ if (!evaluateFn) {
788
+ continue;
789
+ }
790
+ const logs = [];
791
+ const logDiff = (expected, actual, options) => {
792
+ logs.push(createDiffLogEntry(expected, actual, options));
793
+ };
794
+ const log = (message, options) => {
795
+ logs.push(createLogEntry(message, options));
796
+ };
797
+ const createError = (message, options) => {
798
+ const entry = createLogEntry(message, options);
799
+ const error = message instanceof Error ? message : new Error(entry.message);
800
+ error[evaluatorErrorLogEntryKey] = entry;
801
+ return error;
802
+ };
803
+ try {
804
+ const ctx = yield* Effect.promise(
805
+ () => Promise.resolve(evaluator.resolveContext())
806
+ );
807
+ const result = yield* Effect.promise(
808
+ () => Promise.resolve().then(
809
+ () => evaluateFn({
810
+ input: testCaseItem.testCase.getInput(),
811
+ ctx,
812
+ output,
813
+ meta: {
814
+ triggerId: task.triggerId,
815
+ runId: evaluatorRunId,
816
+ datasetId: task.datasetId
817
+ },
818
+ logDiff,
819
+ log,
820
+ createError
821
+ })
822
+ )
823
+ );
824
+ if (result instanceof Error) {
825
+ const evaluatorError = result;
826
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
827
+ logs.push(taggedEntry ?? createLogEntry(result));
828
+ testCaseError = result.message;
746
829
  evaluatorScores.push({
747
830
  evaluatorId,
748
831
  scores: [],
749
832
  passed: false,
750
833
  logs: logs.length > 0 ? logs : void 0
751
834
  });
835
+ continue;
836
+ }
837
+ const { scores, metrics } = normalizeResult(result);
838
+ const passed = computeEvaluatorPassed(evaluator, result, scores);
839
+ evaluatorScores.push({
840
+ evaluatorId,
841
+ scores,
842
+ passed,
843
+ metrics,
844
+ logs: logs.length > 0 ? logs : void 0
845
+ });
846
+ } catch (error) {
847
+ if (error instanceof Error) {
848
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
849
+ logs.push(taggedEntry ?? createLogEntry(error));
752
850
  }
851
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
852
+ evaluatorScores.push({
853
+ evaluatorId,
854
+ scores: [],
855
+ passed: false,
856
+ logs: logs.length > 0 ? logs : void 0
857
+ });
753
858
  }
754
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
755
- rerunPassed.push(rerunPassedThis);
756
- const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
757
- n + 1,
758
- n + 1
759
- ]);
760
- const progressEvent = {
761
- type: "TestCaseProgress",
762
- runId: task.runId,
763
- testCaseId: testCaseItem.id,
764
- testCaseName: testCaseItem.testCase.getName(),
765
- completedTestCases: completedEvaluations,
766
- totalTestCases: totalEvaluations,
767
- rerunIndex: r + 1,
768
- rerunTotal: reruns,
769
- passed: rerunPassedThis,
770
- durationMs: Date.now() - started,
771
- evaluatorScores,
772
- output,
773
- errorMessage: testCaseError
774
- };
775
- updateSnapshot(task.runId, (snapshot) => ({
776
- ...snapshot,
777
- completedTestCases: completedEvaluations
778
- }));
779
- yield* publishEvent(progressEvent);
780
- yield* Queue.offer(persistenceQueue, {
781
- runId: task.runId,
782
- artifactPath: task.snapshot.artifactPath,
783
- payload: progressEvent
784
- });
785
859
  }
786
- const testCasePassed = rerunPassed.every(Boolean);
787
- if (testCasePassed) {
788
- yield* Ref.update(passedRef, (n) => n + 1);
789
- } else {
790
- yield* Ref.update(failedRef, (n) => n + 1);
791
- }
792
- const [passed, failed] = yield* Effect.all([
793
- Ref.get(passedRef),
794
- Ref.get(failedRef)
860
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
861
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
862
+ n + 1,
863
+ n + 1
795
864
  ]);
796
- updateSnapshot(task.runId, (snapshot) => ({
865
+ const progressEvent = {
866
+ type: "TestCaseProgress",
867
+ runId: task.runId,
868
+ testCaseId: testCaseItem.id,
869
+ testCaseName: testCaseItem.testCase.getName(),
870
+ completedTestCases: completedEvaluations,
871
+ totalTestCases: totalEvaluations,
872
+ rerunIndex,
873
+ rerunTotal,
874
+ passed: rerunPassedThis,
875
+ durationMs: Date.now() - started,
876
+ evaluatorScores,
877
+ output,
878
+ errorMessage: testCaseError
879
+ };
880
+ yield* updateSnapshot(task.runId, (snapshot) => ({
797
881
  ...snapshot,
798
- passedTestCases: passed,
799
- failedTestCases: failed
882
+ completedTestCases: completedEvaluations
800
883
  }));
884
+ yield* publishEvent(progressEvent);
885
+ yield* Queue.offer(persistenceQueue, {
886
+ runId: task.runId,
887
+ artifactPath: task.snapshot.artifactPath,
888
+ payload: progressEvent
889
+ });
890
+ const testCaseCompleted = yield* Ref.modify(
891
+ testCaseResultsRef,
892
+ (map) => {
893
+ const key = testCaseItem.id;
894
+ const existing = map.get(key) ?? { completedCount: 0, results: [] };
895
+ const newResults = [...existing.results, rerunPassedThis];
896
+ const newCompletedCount = existing.completedCount + 1;
897
+ const isLast = newCompletedCount === rerunTotal;
898
+ const newMap = new Map(map);
899
+ newMap.set(key, {
900
+ completedCount: newCompletedCount,
901
+ results: newResults
902
+ });
903
+ const outcome = isLast ? newResults.every(Boolean) : null;
904
+ return [outcome, newMap];
905
+ }
906
+ );
907
+ if (testCaseCompleted !== null) {
908
+ if (testCaseCompleted) {
909
+ yield* Ref.update(passedRef, (n) => n + 1);
910
+ } else {
911
+ yield* Ref.update(failedRef, (n) => n + 1);
912
+ }
913
+ const [passed, failed] = yield* Effect.all([
914
+ Ref.get(passedRef),
915
+ Ref.get(failedRef)
916
+ ]);
917
+ yield* updateSnapshot(task.runId, (snapshot) => ({
918
+ ...snapshot,
919
+ passedTestCases: passed,
920
+ failedTestCases: failed
921
+ }));
922
+ }
801
923
  });
802
924
  }
803
925
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
804
926
  const startedAt = Date.now();
805
- updateSnapshot(task.runId, (snapshot) => ({
927
+ yield* updateSnapshot(task.runId, (snapshot) => ({
806
928
  ...snapshot,
807
929
  status: "running",
808
930
  startedAt
@@ -821,9 +943,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
821
943
  const startedRef = yield* Ref.make(0);
822
944
  const passedRef = yield* Ref.make(0);
823
945
  const failedRef = yield* Ref.make(0);
824
- const processTestCase = (testCaseItem) => processOneTestCase(
946
+ const testCaseResultsRef = yield* Ref.make(
947
+ /* @__PURE__ */ new Map()
948
+ );
949
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
950
+ const processEvaluation = (unit) => processOneEvaluation(
825
951
  task,
826
- testCaseItem,
952
+ unit,
827
953
  totalEvaluations,
828
954
  publishEvent,
829
955
  persistenceQueue,
@@ -831,11 +957,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
831
957
  startedRef,
832
958
  completedRef,
833
959
  passedRef,
834
- failedRef
960
+ failedRef,
961
+ testCaseResultsRef
835
962
  );
836
963
  yield* Effect.forEach(
837
- task.testCases,
838
- processTestCase,
964
+ evaluationUnits,
965
+ processEvaluation,
839
966
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
840
967
  );
841
968
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
@@ -853,7 +980,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
853
980
  totalTestCases: task.testCases.length,
854
981
  artifactPath: task.snapshot.artifactPath
855
982
  };
856
- updateSnapshot(task.runId, (snapshot) => ({
983
+ yield* updateSnapshot(task.runId, (snapshot) => ({
857
984
  ...snapshot,
858
985
  status: "completed",
859
986
  completedTestCases: completedEvaluations,
@@ -1106,7 +1233,9 @@ var EffectRunner = class {
1106
1233
  this.persistenceQueue = Effect.runSync(
1107
1234
  Queue.unbounded()
1108
1235
  );
1109
- this.snapshots = /* @__PURE__ */ new Map();
1236
+ this.snapshotsRef = Effect.runSync(
1237
+ Ref.make(/* @__PURE__ */ new Map())
1238
+ );
1110
1239
  this.listeners = /* @__PURE__ */ new Set();
1111
1240
  this.datasetsById = /* @__PURE__ */ new Map();
1112
1241
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -1209,7 +1338,13 @@ var EffectRunner = class {
1209
1338
  status: "queued",
1210
1339
  artifactPath
1211
1340
  };
1212
- this.snapshots.set(runId, snapshot);
1341
+ await Effect.runPromise(
1342
+ Ref.update(this.snapshotsRef, (map) => {
1343
+ const next = new Map(map);
1344
+ next.set(runId, snapshot);
1345
+ return next;
1346
+ })
1347
+ );
1213
1348
  const queuedEvent = {
1214
1349
  type: "RunQueued",
1215
1350
  runId,
@@ -1250,12 +1385,12 @@ var EffectRunner = class {
1250
1385
  };
1251
1386
  }
1252
1387
  getRunSnapshot(runId) {
1253
- return this.snapshots.get(runId);
1388
+ return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
1254
1389
  }
1255
1390
  getAllRunSnapshots() {
1256
- return Array.from(this.snapshots.values()).sort(
1257
- (a, b) => b.queuedAt - a.queuedAt
1258
- );
1391
+ return Array.from(
1392
+ Effect.runSync(Ref.get(this.snapshotsRef)).values()
1393
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
1259
1394
  }
1260
1395
  async loadRunSnapshotsFromArtifacts() {
1261
1396
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1284,11 +1419,15 @@ var EffectRunner = class {
1284
1419
  );
1285
1420
  }
1286
1421
  updateSnapshot(runId, updater) {
1287
- const existing = this.snapshots.get(runId);
1288
- if (!existing) {
1289
- return;
1290
- }
1291
- this.snapshots.set(runId, updater(existing));
1422
+ return Ref.modify(this.snapshotsRef, (map) => {
1423
+ const existing = map.get(runId);
1424
+ if (!existing) {
1425
+ return [void 0, map];
1426
+ }
1427
+ const next = new Map(map);
1428
+ next.set(runId, updater(existing));
1429
+ return [void 0, next];
1430
+ }).pipe(Effect.asVoid);
1292
1431
  }
1293
1432
  publishEvent(event) {
1294
1433
  return Effect.sync(() => {
@@ -1304,8 +1443,9 @@ var EffectRunner = class {
1304
1443
  );
1305
1444
  }
1306
1445
  };
1307
-
1308
- // src/cli-simple/args.ts
/**
 * Default number of concurrent evaluations: the number of entries reported
 * by `cpus()`, but never less than 1.
 *
 * @returns {number} The default concurrency (>= 1).
 */
function getDefaultConcurrency() {
  const coreCount = cpus().length;
  // Guard against an empty CPU list so the result is always usable.
  return coreCount < 1 ? 1 : coreCount;
}
+ }
1309
1449
  function parseSimpleCliArgs(argv) {
1310
1450
  const args = {
1311
1451
  help: false,
@@ -1332,6 +1472,14 @@ function parseSimpleCliArgs(argv) {
1332
1472
  index += 1;
1333
1473
  continue;
1334
1474
  }
1475
+ if ((token === "--concurrency" || token === "-c") && argv[index + 1]) {
1476
+ const n = parseInt(argv[index + 1], 10);
1477
+ if (!Number.isNaN(n) && n >= 1) {
1478
+ args.concurrency = n;
1479
+ }
1480
+ index += 1;
1481
+ continue;
1482
+ }
1335
1483
  args.unknownArgs.push(token);
1336
1484
  }
1337
1485
  return args;
@@ -1339,9 +1487,12 @@ function parseSimpleCliArgs(argv) {
1339
1487
  function getSimpleCliUsage() {
1340
1488
  return [
1341
1489
  "Usage:",
1342
- " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern>",
1490
+ " eval-agents-simple run --dataset <datasetName> --evaluator <name-or-pattern> [--concurrency N]",
1343
1491
  " eval-agents-simple generate --dataset <datasetName>",
1344
1492
  "",
1493
+ "Options:",
1494
+ " --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
1495
+ "",
1345
1496
  "Pattern examples for --evaluator:",
1346
1497
  " score-evaluator exact name (case-insensitive)",
1347
1498
  ' "*score*" wildcard pattern',
@@ -1630,6 +1781,7 @@ function RunView({
1630
1781
  runner,
1631
1782
  datasetName,
1632
1783
  evaluatorPattern,
1784
+ concurrency,
1633
1785
  onComplete
1634
1786
  }) {
1635
1787
  const [phase, setPhase] = useState(
@@ -1777,7 +1929,8 @@ function RunView({
1777
1929
  });
1778
1930
  const snapshot = await runner.runDatasetWith({
1779
1931
  datasetId: dataset.id,
1780
- evaluatorIds: evaluators.map((item) => item.id)
1932
+ evaluatorIds: evaluators.map((item) => item.id),
1933
+ concurrency
1781
1934
  });
1782
1935
  setRunInfo({
1783
1936
  runId: snapshot.runId,
@@ -1805,7 +1958,7 @@ function RunView({
1805
1958
  });
1806
1959
  setPhase("completed");
1807
1960
  setTimeout(() => onComplete(), 200);
1808
- }, [runner, datasetName, evaluatorPattern, onComplete]);
1961
+ }, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
1809
1962
  useEffect(() => {
1810
1963
  void runEval();
1811
1964
  }, [runEval]);
@@ -1848,22 +2001,30 @@ function RunView({
1848
2001
  label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
1849
2002
  }
1850
2003
  ),
1851
- runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(Text, { color: "yellow", children: [
1852
- "[running ",
1853
- item.startedTestCases,
1854
- "/",
1855
- item.totalTestCases,
1856
- "] ",
1857
- item.name,
1858
- " ",
1859
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1860
- "(",
1861
- item.rerunIndex,
1862
- "/",
1863
- item.rerunTotal,
1864
- ")"
1865
- ] })
1866
- ] }, `${item.testCaseId}:${item.rerunIndex}`)) })
2004
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
2005
+ Text,
2006
+ {
2007
+ color: "yellow",
2008
+ children: [
2009
+ "[running ",
2010
+ item.startedTestCases,
2011
+ "/",
2012
+ item.totalTestCases,
2013
+ "]",
2014
+ " ",
2015
+ item.name,
2016
+ " ",
2017
+ /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2018
+ "(",
2019
+ item.rerunIndex,
2020
+ "/",
2021
+ item.rerunTotal,
2022
+ ")"
2023
+ ] })
2024
+ ]
2025
+ },
2026
+ `${item.testCaseId}:${item.rerunIndex}`
2027
+ )) })
1867
2028
  ] }),
1868
2029
  testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
1869
2030
  /* @__PURE__ */ jsxs(Text, { children: [
@@ -1945,7 +2106,7 @@ function RunView({
1945
2106
  },
1946
2107
  `${item.evaluatorId}-${s.id}-${idx}`
1947
2108
  );
1948
- }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
2109
+ }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
1949
2110
  !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
1950
2111
  (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
1951
2112
  ({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
@@ -2003,9 +2164,9 @@ function RunView({
2003
2164
  /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
2004
2165
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
2005
2166
  const agg = summary.aggregates.get(id);
2006
- const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
2007
- (k) => k.startsWith(`${id}:`)
2008
- );
2167
+ const scoreKeys = [
2168
+ ...summary.scoreItemsByEvaluatorScore?.keys() ?? []
2169
+ ].filter((k) => k.startsWith(`${id}:`));
2009
2170
  if (scoreKeys.length === 0) {
2010
2171
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2011
2172
  "- ",
@@ -2313,7 +2474,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2313
2474
  }
2314
2475
  return lines;
2315
2476
  }
2316
- async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern) {
2477
+ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern, concurrency) {
2317
2478
  const dataset = await runner.resolveDatasetByName(datasetName);
2318
2479
  if (!dataset) {
2319
2480
  const known = await runner.collectDatasets();
@@ -2503,7 +2664,8 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2503
2664
  });
2504
2665
  const snapshot = await runner.runDatasetWith({
2505
2666
  datasetId: dataset.id,
2506
- evaluatorIds: evaluators.map((item) => item.id)
2667
+ evaluatorIds: evaluators.map((item) => item.id),
2668
+ concurrency
2507
2669
  });
2508
2670
  totalCount = snapshot.totalTestCases;
2509
2671
  console.log(colorize("=== Eval Run Started ===", `${ansi2.bold}${ansi2.cyan}`));
@@ -2592,13 +2754,14 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2592
2754
  }
2593
2755
  console.log(`- artifact: ${colorize(completed.artifactPath, ansi2.dim)}`);
2594
2756
  }
2595
- async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2757
+ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
2596
2758
  return new Promise((resolve5, reject) => {
2597
2759
  const app = render(
2598
2760
  React2.createElement(RunView, {
2599
2761
  runner,
2600
2762
  datasetName,
2601
2763
  evaluatorPattern,
2764
+ concurrency,
2602
2765
  onComplete: (err) => {
2603
2766
  app.unmount();
2604
2767
  if (err) {
@@ -2645,10 +2808,12 @@ async function main() {
2645
2808
  const runner = createRunner();
2646
2809
  try {
2647
2810
  if (args.command === "run") {
2811
+ const concurrency = args.concurrency ?? getDefaultConcurrency();
2648
2812
  await (useInk ? runSimpleEvalCommandInk : runSimpleEvalCommandPlain)(
2649
2813
  runner,
2650
2814
  args.datasetName,
2651
- args.evaluatorPattern
2815
+ args.evaluatorPattern,
2816
+ concurrency
2652
2817
  );
2653
2818
  return;
2654
2819
  }