@m4trix/evals 0.21.1 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -763,6 +763,20 @@ function readOutput(testCase) {
763
763
  }
764
764
  return candidate.getOutput();
765
765
  }
766
+ function buildEvaluationUnits(testCases) {
767
+ const units = [];
768
+ for (const testCaseItem of testCases) {
769
+ const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
770
+ for (let r = 0; r < rerunTotal; r++) {
771
+ units.push({
772
+ testCaseItem,
773
+ rerunIndex: r + 1,
774
+ rerunTotal
775
+ });
776
+ }
777
+ }
778
+ return units;
779
+ }
766
780
  function nowIsoForFile() {
767
781
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
768
782
  }
@@ -772,157 +786,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
772
786
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
773
787
  );
774
788
  }
775
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
789
+ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
790
+ const { testCaseItem, rerunIndex, rerunTotal } = unit;
776
791
  return effect.Effect.gen(function* () {
777
- const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
778
- const rerunPassed = [];
779
- for (let r = 0; r < reruns; r++) {
780
- const evaluatorRunId = `run-${crypto.randomUUID()}`;
781
- const started = Date.now();
782
- const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
783
- n + 1,
784
- n + 1
785
- ]);
786
- yield* publishEvent({
787
- type: "TestCaseStarted",
788
- runId: task.runId,
789
- testCaseId: testCaseItem.id,
790
- testCaseName: testCaseItem.testCase.getName(),
791
- startedTestCases: startedEvaluations,
792
- totalTestCases: totalEvaluations,
793
- rerunIndex: r + 1,
794
- rerunTotal: reruns
795
- });
796
- const evaluatorScores = [];
797
- let testCaseError;
798
- const output = readOutput(testCaseItem.testCase);
799
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
800
- const evaluateFn = evaluator.getEvaluateFn();
801
- if (!evaluateFn) {
802
- continue;
803
- }
804
- const logs = [];
805
- const logDiff = (expected, actual, options) => {
806
- logs.push(createDiffLogEntry(expected, actual, options));
807
- };
808
- const log = (message, options) => {
809
- logs.push(createLogEntry(message, options));
810
- };
811
- const createError = (message, options) => {
812
- const entry = createLogEntry(message, options);
813
- const error = message instanceof Error ? message : new Error(entry.message);
814
- error[evaluatorErrorLogEntryKey] = entry;
815
- return error;
816
- };
817
- try {
818
- const ctx = yield* effect.Effect.promise(
819
- () => Promise.resolve(evaluator.resolveContext())
820
- );
821
- const result = yield* effect.Effect.promise(
822
- () => Promise.resolve().then(
823
- () => evaluateFn({
824
- input: testCaseItem.testCase.getInput(),
825
- ctx,
826
- output,
827
- meta: {
828
- triggerId: task.triggerId,
829
- runId: evaluatorRunId,
830
- datasetId: task.datasetId
831
- },
832
- logDiff,
833
- log,
834
- createError
835
- })
836
- )
837
- );
838
- if (result instanceof Error) {
839
- const evaluatorError = result;
840
- const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
841
- logs.push(taggedEntry ?? createLogEntry(result));
842
- testCaseError = result.message;
843
- evaluatorScores.push({
844
- evaluatorId,
845
- scores: [],
846
- passed: false,
847
- logs: logs.length > 0 ? logs : void 0
848
- });
849
- continue;
850
- }
851
- const { scores, metrics } = normalizeResult(result);
852
- const passed2 = computeEvaluatorPassed(evaluator, result, scores);
853
- evaluatorScores.push({
854
- evaluatorId,
855
- scores,
856
- passed: passed2,
857
- metrics,
858
- logs: logs.length > 0 ? logs : void 0
859
- });
860
- } catch (error) {
861
- if (error instanceof Error) {
862
- const taggedEntry = error[evaluatorErrorLogEntryKey];
863
- logs.push(taggedEntry ?? createLogEntry(error));
864
- }
865
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
792
+ const evaluatorRunId = `run-${crypto.randomUUID()}`;
793
+ const started = Date.now();
794
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
795
+ n + 1,
796
+ n + 1
797
+ ]);
798
+ yield* publishEvent({
799
+ type: "TestCaseStarted",
800
+ runId: task.runId,
801
+ testCaseId: testCaseItem.id,
802
+ testCaseName: testCaseItem.testCase.getName(),
803
+ startedTestCases: startedEvaluations,
804
+ totalTestCases: totalEvaluations,
805
+ rerunIndex,
806
+ rerunTotal
807
+ });
808
+ const evaluatorScores = [];
809
+ let testCaseError;
810
+ const output = readOutput(testCaseItem.testCase);
811
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
812
+ const evaluateFn = evaluator.getEvaluateFn();
813
+ if (!evaluateFn) {
814
+ continue;
815
+ }
816
+ const logs = [];
817
+ const logDiff = (expected, actual, options) => {
818
+ logs.push(createDiffLogEntry(expected, actual, options));
819
+ };
820
+ const log = (message, options) => {
821
+ logs.push(createLogEntry(message, options));
822
+ };
823
+ const createError = (message, options) => {
824
+ const entry = createLogEntry(message, options);
825
+ const error = message instanceof Error ? message : new Error(entry.message);
826
+ error[evaluatorErrorLogEntryKey] = entry;
827
+ return error;
828
+ };
829
+ try {
830
+ const ctx = yield* effect.Effect.promise(
831
+ () => Promise.resolve(evaluator.resolveContext())
832
+ );
833
+ const result = yield* effect.Effect.promise(
834
+ () => Promise.resolve().then(
835
+ () => evaluateFn({
836
+ input: testCaseItem.testCase.getInput(),
837
+ ctx,
838
+ output,
839
+ meta: {
840
+ triggerId: task.triggerId,
841
+ runId: evaluatorRunId,
842
+ datasetId: task.datasetId
843
+ },
844
+ logDiff,
845
+ log,
846
+ createError
847
+ })
848
+ )
849
+ );
850
+ if (result instanceof Error) {
851
+ const evaluatorError = result;
852
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
853
+ logs.push(taggedEntry ?? createLogEntry(result));
854
+ testCaseError = result.message;
866
855
  evaluatorScores.push({
867
856
  evaluatorId,
868
857
  scores: [],
869
858
  passed: false,
870
859
  logs: logs.length > 0 ? logs : void 0
871
860
  });
861
+ continue;
862
+ }
863
+ const { scores, metrics } = normalizeResult(result);
864
+ const passed = computeEvaluatorPassed(evaluator, result, scores);
865
+ evaluatorScores.push({
866
+ evaluatorId,
867
+ scores,
868
+ passed,
869
+ metrics,
870
+ logs: logs.length > 0 ? logs : void 0
871
+ });
872
+ } catch (error) {
873
+ if (error instanceof Error) {
874
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
875
+ logs.push(taggedEntry ?? createLogEntry(error));
872
876
  }
877
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
878
+ evaluatorScores.push({
879
+ evaluatorId,
880
+ scores: [],
881
+ passed: false,
882
+ logs: logs.length > 0 ? logs : void 0
883
+ });
873
884
  }
874
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
875
- rerunPassed.push(rerunPassedThis);
876
- const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
877
- n + 1,
878
- n + 1
879
- ]);
880
- const progressEvent = {
881
- type: "TestCaseProgress",
882
- runId: task.runId,
883
- testCaseId: testCaseItem.id,
884
- testCaseName: testCaseItem.testCase.getName(),
885
- completedTestCases: completedEvaluations,
886
- totalTestCases: totalEvaluations,
887
- rerunIndex: r + 1,
888
- rerunTotal: reruns,
889
- passed: rerunPassedThis,
890
- durationMs: Date.now() - started,
891
- evaluatorScores,
892
- output,
893
- errorMessage: testCaseError
894
- };
895
- updateSnapshot(task.runId, (snapshot) => ({
896
- ...snapshot,
897
- completedTestCases: completedEvaluations
898
- }));
899
- yield* publishEvent(progressEvent);
900
- yield* effect.Queue.offer(persistenceQueue, {
901
- runId: task.runId,
902
- artifactPath: task.snapshot.artifactPath,
903
- payload: progressEvent
904
- });
905
- }
906
- const testCasePassed = rerunPassed.every(Boolean);
907
- if (testCasePassed) {
908
- yield* effect.Ref.update(passedRef, (n) => n + 1);
909
- } else {
910
- yield* effect.Ref.update(failedRef, (n) => n + 1);
911
885
  }
912
- const [passed, failed] = yield* effect.Effect.all([
913
- effect.Ref.get(passedRef),
914
- effect.Ref.get(failedRef)
886
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
887
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
888
+ n + 1,
889
+ n + 1
915
890
  ]);
916
- updateSnapshot(task.runId, (snapshot) => ({
891
+ const progressEvent = {
892
+ type: "TestCaseProgress",
893
+ runId: task.runId,
894
+ testCaseId: testCaseItem.id,
895
+ testCaseName: testCaseItem.testCase.getName(),
896
+ completedTestCases: completedEvaluations,
897
+ totalTestCases: totalEvaluations,
898
+ rerunIndex,
899
+ rerunTotal,
900
+ passed: rerunPassedThis,
901
+ durationMs: Date.now() - started,
902
+ evaluatorScores,
903
+ output,
904
+ errorMessage: testCaseError
905
+ };
906
+ yield* updateSnapshot(task.runId, (snapshot) => ({
917
907
  ...snapshot,
918
- passedTestCases: passed,
919
- failedTestCases: failed
908
+ completedTestCases: completedEvaluations
920
909
  }));
910
+ yield* publishEvent(progressEvent);
911
+ yield* effect.Queue.offer(persistenceQueue, {
912
+ runId: task.runId,
913
+ artifactPath: task.snapshot.artifactPath,
914
+ payload: progressEvent
915
+ });
916
+ const testCaseCompleted = yield* effect.Ref.modify(
917
+ testCaseResultsRef,
918
+ (map) => {
919
+ const key = testCaseItem.id;
920
+ const existing = map.get(key) ?? { completedCount: 0, results: [] };
921
+ const newResults = [...existing.results, rerunPassedThis];
922
+ const newCompletedCount = existing.completedCount + 1;
923
+ const isLast = newCompletedCount === rerunTotal;
924
+ const newMap = new Map(map);
925
+ newMap.set(key, {
926
+ completedCount: newCompletedCount,
927
+ results: newResults
928
+ });
929
+ const outcome = isLast ? newResults.every(Boolean) : null;
930
+ return [outcome, newMap];
931
+ }
932
+ );
933
+ if (testCaseCompleted !== null) {
934
+ if (testCaseCompleted) {
935
+ yield* effect.Ref.update(passedRef, (n) => n + 1);
936
+ } else {
937
+ yield* effect.Ref.update(failedRef, (n) => n + 1);
938
+ }
939
+ const [passed, failed] = yield* effect.Effect.all([
940
+ effect.Ref.get(passedRef),
941
+ effect.Ref.get(failedRef)
942
+ ]);
943
+ yield* updateSnapshot(task.runId, (snapshot) => ({
944
+ ...snapshot,
945
+ passedTestCases: passed,
946
+ failedTestCases: failed
947
+ }));
948
+ }
921
949
  });
922
950
  }
923
951
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
924
952
  const startedAt = Date.now();
925
- updateSnapshot(task.runId, (snapshot) => ({
953
+ yield* updateSnapshot(task.runId, (snapshot) => ({
926
954
  ...snapshot,
927
955
  status: "running",
928
956
  startedAt
@@ -941,9 +969,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
941
969
  const startedRef = yield* effect.Ref.make(0);
942
970
  const passedRef = yield* effect.Ref.make(0);
943
971
  const failedRef = yield* effect.Ref.make(0);
944
- const processTestCase = (testCaseItem) => processOneTestCase(
972
+ const testCaseResultsRef = yield* effect.Ref.make(
973
+ /* @__PURE__ */ new Map()
974
+ );
975
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
976
+ const processEvaluation = (unit) => processOneEvaluation(
945
977
  task,
946
- testCaseItem,
978
+ unit,
947
979
  totalEvaluations,
948
980
  publishEvent,
949
981
  persistenceQueue,
@@ -951,11 +983,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
951
983
  startedRef,
952
984
  completedRef,
953
985
  passedRef,
954
- failedRef
986
+ failedRef,
987
+ testCaseResultsRef
955
988
  );
956
989
  yield* effect.Effect.forEach(
957
- task.testCases,
958
- processTestCase,
990
+ evaluationUnits,
991
+ processEvaluation,
959
992
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
960
993
  );
961
994
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
@@ -973,7 +1006,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
973
1006
  totalTestCases: task.testCases.length,
974
1007
  artifactPath: task.snapshot.artifactPath
975
1008
  };
976
- updateSnapshot(task.runId, (snapshot) => ({
1009
+ yield* updateSnapshot(task.runId, (snapshot) => ({
977
1010
  ...snapshot,
978
1011
  status: "completed",
979
1012
  completedTestCases: completedEvaluations,
@@ -1226,7 +1259,9 @@ var EffectRunner = class {
1226
1259
  this.persistenceQueue = effect.Effect.runSync(
1227
1260
  effect.Queue.unbounded()
1228
1261
  );
1229
- this.snapshots = /* @__PURE__ */ new Map();
1262
+ this.snapshotsRef = effect.Effect.runSync(
1263
+ effect.Ref.make(/* @__PURE__ */ new Map())
1264
+ );
1230
1265
  this.listeners = /* @__PURE__ */ new Set();
1231
1266
  this.datasetsById = /* @__PURE__ */ new Map();
1232
1267
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -1329,7 +1364,13 @@ var EffectRunner = class {
1329
1364
  status: "queued",
1330
1365
  artifactPath
1331
1366
  };
1332
- this.snapshots.set(runId, snapshot);
1367
+ await effect.Effect.runPromise(
1368
+ effect.Ref.update(this.snapshotsRef, (map) => {
1369
+ const next = new Map(map);
1370
+ next.set(runId, snapshot);
1371
+ return next;
1372
+ })
1373
+ );
1333
1374
  const queuedEvent = {
1334
1375
  type: "RunQueued",
1335
1376
  runId,
@@ -1370,12 +1411,12 @@ var EffectRunner = class {
1370
1411
  };
1371
1412
  }
1372
1413
  getRunSnapshot(runId) {
1373
- return this.snapshots.get(runId);
1414
+ return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
1374
1415
  }
1375
1416
  getAllRunSnapshots() {
1376
- return Array.from(this.snapshots.values()).sort(
1377
- (a, b) => b.queuedAt - a.queuedAt
1378
- );
1417
+ return Array.from(
1418
+ effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
1419
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
1379
1420
  }
1380
1421
  async loadRunSnapshotsFromArtifacts() {
1381
1422
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1404,11 +1445,15 @@ var EffectRunner = class {
1404
1445
  );
1405
1446
  }
1406
1447
  updateSnapshot(runId, updater) {
1407
- const existing = this.snapshots.get(runId);
1408
- if (!existing) {
1409
- return;
1410
- }
1411
- this.snapshots.set(runId, updater(existing));
1448
+ return effect.Ref.modify(this.snapshotsRef, (map) => {
1449
+ const existing = map.get(runId);
1450
+ if (!existing) {
1451
+ return [void 0, map];
1452
+ }
1453
+ const next = new Map(map);
1454
+ next.set(runId, updater(existing));
1455
+ return [void 0, next];
1456
+ }).pipe(effect.Effect.asVoid);
1412
1457
  }
1413
1458
  publishEvent(event) {
1414
1459
  return effect.Effect.sync(() => {