@m4trix/evals 0.21.1 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,7 +10,6 @@ var promises = require('fs/promises');
10
10
  var url = require('url');
11
11
  var diff = require('diff');
12
12
  var stringify = require('fast-json-stable-stringify');
13
- var os = require('os');
14
13
  var React2 = require('react');
15
14
  var ink = require('ink');
16
15
  var jsxRuntime = require('react/jsx-runtime');
@@ -763,6 +762,20 @@ function readOutput(testCase) {
763
762
  }
764
763
  return candidate.getOutput();
765
764
  }
765
+ function buildEvaluationUnits(testCases) {
766
+ const units = [];
767
+ for (const testCaseItem of testCases) {
768
+ const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
769
+ for (let r = 0; r < rerunTotal; r++) {
770
+ units.push({
771
+ testCaseItem,
772
+ rerunIndex: r + 1,
773
+ rerunTotal
774
+ });
775
+ }
776
+ }
777
+ return units;
778
+ }
766
779
  function nowIsoForFile() {
767
780
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
768
781
  }
@@ -772,157 +785,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
772
785
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
773
786
  );
774
787
  }
775
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
788
+ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
789
+ const { testCaseItem, rerunIndex, rerunTotal } = unit;
776
790
  return effect.Effect.gen(function* () {
777
- const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
778
- const rerunPassed = [];
779
- for (let r = 0; r < reruns; r++) {
780
- const evaluatorRunId = `run-${crypto.randomUUID()}`;
781
- const started = Date.now();
782
- const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
783
- n + 1,
784
- n + 1
785
- ]);
786
- yield* publishEvent({
787
- type: "TestCaseStarted",
788
- runId: task.runId,
789
- testCaseId: testCaseItem.id,
790
- testCaseName: testCaseItem.testCase.getName(),
791
- startedTestCases: startedEvaluations,
792
- totalTestCases: totalEvaluations,
793
- rerunIndex: r + 1,
794
- rerunTotal: reruns
795
- });
796
- const evaluatorScores = [];
797
- let testCaseError;
798
- const output = readOutput(testCaseItem.testCase);
799
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
800
- const evaluateFn = evaluator.getEvaluateFn();
801
- if (!evaluateFn) {
802
- continue;
803
- }
804
- const logs = [];
805
- const logDiff = (expected, actual, options) => {
806
- logs.push(createDiffLogEntry(expected, actual, options));
807
- };
808
- const log = (message, options) => {
809
- logs.push(createLogEntry(message, options));
810
- };
811
- const createError = (message, options) => {
812
- const entry = createLogEntry(message, options);
813
- const error = message instanceof Error ? message : new Error(entry.message);
814
- error[evaluatorErrorLogEntryKey] = entry;
815
- return error;
816
- };
817
- try {
818
- const ctx = yield* effect.Effect.promise(
819
- () => Promise.resolve(evaluator.resolveContext())
820
- );
821
- const result = yield* effect.Effect.promise(
822
- () => Promise.resolve().then(
823
- () => evaluateFn({
824
- input: testCaseItem.testCase.getInput(),
825
- ctx,
826
- output,
827
- meta: {
828
- triggerId: task.triggerId,
829
- runId: evaluatorRunId,
830
- datasetId: task.datasetId
831
- },
832
- logDiff,
833
- log,
834
- createError
835
- })
836
- )
837
- );
838
- if (result instanceof Error) {
839
- const evaluatorError = result;
840
- const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
841
- logs.push(taggedEntry ?? createLogEntry(result));
842
- testCaseError = result.message;
843
- evaluatorScores.push({
844
- evaluatorId,
845
- scores: [],
846
- passed: false,
847
- logs: logs.length > 0 ? logs : void 0
848
- });
849
- continue;
850
- }
851
- const { scores, metrics } = normalizeResult(result);
852
- const passed2 = computeEvaluatorPassed(evaluator, result, scores);
853
- evaluatorScores.push({
854
- evaluatorId,
855
- scores,
856
- passed: passed2,
857
- metrics,
858
- logs: logs.length > 0 ? logs : void 0
859
- });
860
- } catch (error) {
861
- if (error instanceof Error) {
862
- const taggedEntry = error[evaluatorErrorLogEntryKey];
863
- logs.push(taggedEntry ?? createLogEntry(error));
864
- }
865
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
791
+ const evaluatorRunId = `run-${crypto.randomUUID()}`;
792
+ const started = Date.now();
793
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
794
+ n + 1,
795
+ n + 1
796
+ ]);
797
+ yield* publishEvent({
798
+ type: "TestCaseStarted",
799
+ runId: task.runId,
800
+ testCaseId: testCaseItem.id,
801
+ testCaseName: testCaseItem.testCase.getName(),
802
+ startedTestCases: startedEvaluations,
803
+ totalTestCases: totalEvaluations,
804
+ rerunIndex,
805
+ rerunTotal
806
+ });
807
+ const evaluatorScores = [];
808
+ let testCaseError;
809
+ const output = readOutput(testCaseItem.testCase);
810
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
811
+ const evaluateFn = evaluator.getEvaluateFn();
812
+ if (!evaluateFn) {
813
+ continue;
814
+ }
815
+ const logs = [];
816
+ const logDiff = (expected, actual, options) => {
817
+ logs.push(createDiffLogEntry(expected, actual, options));
818
+ };
819
+ const log = (message, options) => {
820
+ logs.push(createLogEntry(message, options));
821
+ };
822
+ const createError = (message, options) => {
823
+ const entry = createLogEntry(message, options);
824
+ const error = message instanceof Error ? message : new Error(entry.message);
825
+ error[evaluatorErrorLogEntryKey] = entry;
826
+ return error;
827
+ };
828
+ try {
829
+ const ctx = yield* effect.Effect.promise(
830
+ () => Promise.resolve(evaluator.resolveContext())
831
+ );
832
+ const result = yield* effect.Effect.promise(
833
+ () => Promise.resolve().then(
834
+ () => evaluateFn({
835
+ input: testCaseItem.testCase.getInput(),
836
+ ctx,
837
+ output,
838
+ meta: {
839
+ triggerId: task.triggerId,
840
+ runId: evaluatorRunId,
841
+ datasetId: task.datasetId
842
+ },
843
+ logDiff,
844
+ log,
845
+ createError
846
+ })
847
+ )
848
+ );
849
+ if (result instanceof Error) {
850
+ const evaluatorError = result;
851
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
852
+ logs.push(taggedEntry ?? createLogEntry(result));
853
+ testCaseError = result.message;
866
854
  evaluatorScores.push({
867
855
  evaluatorId,
868
856
  scores: [],
869
857
  passed: false,
870
858
  logs: logs.length > 0 ? logs : void 0
871
859
  });
860
+ continue;
872
861
  }
862
+ const { scores, metrics } = normalizeResult(result);
863
+ const passed = computeEvaluatorPassed(evaluator, result, scores);
864
+ evaluatorScores.push({
865
+ evaluatorId,
866
+ scores,
867
+ passed,
868
+ metrics,
869
+ logs: logs.length > 0 ? logs : void 0
870
+ });
871
+ } catch (error) {
872
+ if (error instanceof Error) {
873
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
874
+ logs.push(taggedEntry ?? createLogEntry(error));
875
+ }
876
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
877
+ evaluatorScores.push({
878
+ evaluatorId,
879
+ scores: [],
880
+ passed: false,
881
+ logs: logs.length > 0 ? logs : void 0
882
+ });
873
883
  }
874
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
875
- rerunPassed.push(rerunPassedThis);
876
- const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
877
- n + 1,
878
- n + 1
879
- ]);
880
- const progressEvent = {
881
- type: "TestCaseProgress",
882
- runId: task.runId,
883
- testCaseId: testCaseItem.id,
884
- testCaseName: testCaseItem.testCase.getName(),
885
- completedTestCases: completedEvaluations,
886
- totalTestCases: totalEvaluations,
887
- rerunIndex: r + 1,
888
- rerunTotal: reruns,
889
- passed: rerunPassedThis,
890
- durationMs: Date.now() - started,
891
- evaluatorScores,
892
- output,
893
- errorMessage: testCaseError
894
- };
895
- updateSnapshot(task.runId, (snapshot) => ({
896
- ...snapshot,
897
- completedTestCases: completedEvaluations
898
- }));
899
- yield* publishEvent(progressEvent);
900
- yield* effect.Queue.offer(persistenceQueue, {
901
- runId: task.runId,
902
- artifactPath: task.snapshot.artifactPath,
903
- payload: progressEvent
904
- });
905
- }
906
- const testCasePassed = rerunPassed.every(Boolean);
907
- if (testCasePassed) {
908
- yield* effect.Ref.update(passedRef, (n) => n + 1);
909
- } else {
910
- yield* effect.Ref.update(failedRef, (n) => n + 1);
911
884
  }
912
- const [passed, failed] = yield* effect.Effect.all([
913
- effect.Ref.get(passedRef),
914
- effect.Ref.get(failedRef)
885
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
886
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
887
+ n + 1,
888
+ n + 1
915
889
  ]);
916
- updateSnapshot(task.runId, (snapshot) => ({
890
+ const progressEvent = {
891
+ type: "TestCaseProgress",
892
+ runId: task.runId,
893
+ testCaseId: testCaseItem.id,
894
+ testCaseName: testCaseItem.testCase.getName(),
895
+ completedTestCases: completedEvaluations,
896
+ totalTestCases: totalEvaluations,
897
+ rerunIndex,
898
+ rerunTotal,
899
+ passed: rerunPassedThis,
900
+ durationMs: Date.now() - started,
901
+ evaluatorScores,
902
+ output,
903
+ errorMessage: testCaseError
904
+ };
905
+ yield* updateSnapshot(task.runId, (snapshot) => ({
917
906
  ...snapshot,
918
- passedTestCases: passed,
919
- failedTestCases: failed
907
+ completedTestCases: completedEvaluations
920
908
  }));
909
+ yield* publishEvent(progressEvent);
910
+ yield* effect.Queue.offer(persistenceQueue, {
911
+ runId: task.runId,
912
+ artifactPath: task.snapshot.artifactPath,
913
+ payload: progressEvent
914
+ });
915
+ const testCaseCompleted = yield* effect.Ref.modify(
916
+ testCaseResultsRef,
917
+ (map) => {
918
+ const key = testCaseItem.id;
919
+ const existing = map.get(key) ?? { completedCount: 0, results: [] };
920
+ const newResults = [...existing.results, rerunPassedThis];
921
+ const newCompletedCount = existing.completedCount + 1;
922
+ const isLast = newCompletedCount === rerunTotal;
923
+ const newMap = new Map(map);
924
+ newMap.set(key, {
925
+ completedCount: newCompletedCount,
926
+ results: newResults
927
+ });
928
+ const outcome = isLast ? newResults.every(Boolean) : null;
929
+ return [outcome, newMap];
930
+ }
931
+ );
932
+ if (testCaseCompleted !== null) {
933
+ if (testCaseCompleted) {
934
+ yield* effect.Ref.update(passedRef, (n) => n + 1);
935
+ } else {
936
+ yield* effect.Ref.update(failedRef, (n) => n + 1);
937
+ }
938
+ const [passed, failed] = yield* effect.Effect.all([
939
+ effect.Ref.get(passedRef),
940
+ effect.Ref.get(failedRef)
941
+ ]);
942
+ yield* updateSnapshot(task.runId, (snapshot) => ({
943
+ ...snapshot,
944
+ passedTestCases: passed,
945
+ failedTestCases: failed
946
+ }));
947
+ }
921
948
  });
922
949
  }
923
950
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
924
951
  const startedAt = Date.now();
925
- updateSnapshot(task.runId, (snapshot) => ({
952
+ yield* updateSnapshot(task.runId, (snapshot) => ({
926
953
  ...snapshot,
927
954
  status: "running",
928
955
  startedAt
@@ -941,9 +968,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
941
968
  const startedRef = yield* effect.Ref.make(0);
942
969
  const passedRef = yield* effect.Ref.make(0);
943
970
  const failedRef = yield* effect.Ref.make(0);
944
- const processTestCase = (testCaseItem) => processOneTestCase(
971
+ const testCaseResultsRef = yield* effect.Ref.make(
972
+ /* @__PURE__ */ new Map()
973
+ );
974
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
975
+ const processEvaluation = (unit) => processOneEvaluation(
945
976
  task,
946
- testCaseItem,
977
+ unit,
947
978
  totalEvaluations,
948
979
  publishEvent,
949
980
  persistenceQueue,
@@ -951,11 +982,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
951
982
  startedRef,
952
983
  completedRef,
953
984
  passedRef,
954
- failedRef
985
+ failedRef,
986
+ testCaseResultsRef
955
987
  );
956
988
  yield* effect.Effect.forEach(
957
- task.testCases,
958
- processTestCase,
989
+ evaluationUnits,
990
+ processEvaluation,
959
991
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
960
992
  );
961
993
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
@@ -973,7 +1005,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
973
1005
  totalTestCases: task.testCases.length,
974
1006
  artifactPath: task.snapshot.artifactPath
975
1007
  };
976
- updateSnapshot(task.runId, (snapshot) => ({
1008
+ yield* updateSnapshot(task.runId, (snapshot) => ({
977
1009
  ...snapshot,
978
1010
  status: "completed",
979
1011
  completedTestCases: completedEvaluations,
@@ -1226,7 +1258,9 @@ var EffectRunner = class {
1226
1258
  this.persistenceQueue = effect.Effect.runSync(
1227
1259
  effect.Queue.unbounded()
1228
1260
  );
1229
- this.snapshots = /* @__PURE__ */ new Map();
1261
+ this.snapshotsRef = effect.Effect.runSync(
1262
+ effect.Ref.make(/* @__PURE__ */ new Map())
1263
+ );
1230
1264
  this.listeners = /* @__PURE__ */ new Set();
1231
1265
  this.datasetsById = /* @__PURE__ */ new Map();
1232
1266
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -1329,7 +1363,13 @@ var EffectRunner = class {
1329
1363
  status: "queued",
1330
1364
  artifactPath
1331
1365
  };
1332
- this.snapshots.set(runId, snapshot);
1366
+ await effect.Effect.runPromise(
1367
+ effect.Ref.update(this.snapshotsRef, (map) => {
1368
+ const next = new Map(map);
1369
+ next.set(runId, snapshot);
1370
+ return next;
1371
+ })
1372
+ );
1333
1373
  const queuedEvent = {
1334
1374
  type: "RunQueued",
1335
1375
  runId,
@@ -1370,12 +1410,12 @@ var EffectRunner = class {
1370
1410
  };
1371
1411
  }
1372
1412
  getRunSnapshot(runId) {
1373
- return this.snapshots.get(runId);
1413
+ return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
1374
1414
  }
1375
1415
  getAllRunSnapshots() {
1376
- return Array.from(this.snapshots.values()).sort(
1377
- (a, b) => b.queuedAt - a.queuedAt
1378
- );
1416
+ return Array.from(
1417
+ effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
1418
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
1379
1419
  }
1380
1420
  async loadRunSnapshotsFromArtifacts() {
1381
1421
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1404,11 +1444,15 @@ var EffectRunner = class {
1404
1444
  );
1405
1445
  }
1406
1446
  updateSnapshot(runId, updater) {
1407
- const existing = this.snapshots.get(runId);
1408
- if (!existing) {
1409
- return;
1410
- }
1411
- this.snapshots.set(runId, updater(existing));
1447
+ return effect.Ref.modify(this.snapshotsRef, (map) => {
1448
+ const existing = map.get(runId);
1449
+ if (!existing) {
1450
+ return [void 0, map];
1451
+ }
1452
+ const next = new Map(map);
1453
+ next.set(runId, updater(existing));
1454
+ return [void 0, next];
1455
+ }).pipe(effect.Effect.asVoid);
1412
1456
  }
1413
1457
  publishEvent(event) {
1414
1458
  return effect.Effect.sync(() => {
@@ -1424,8 +1468,10 @@ var EffectRunner = class {
1424
1468
  );
1425
1469
  }
1426
1470
  };
1471
+
1472
+ // src/cli-simple/args.ts
1427
1473
  function getDefaultConcurrency() {
1428
- return Math.max(1, os.cpus().length);
1474
+ return 4;
1429
1475
  }
1430
1476
  function parseSimpleCliArgs(argv) {
1431
1477
  const args = {
@@ -1472,7 +1518,7 @@ function getSimpleCliUsage() {
1472
1518
  " eval-agents-simple generate --dataset <datasetName>",
1473
1519
  "",
1474
1520
  "Options:",
1475
- " --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
1521
+ " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
1476
1522
  "",
1477
1523
  "Pattern examples for --evaluator:",
1478
1524
  " score-evaluator exact name (case-insensitive)",