@m4trix/evals 0.21.1 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env node
2
2
  import { randomUUID } from 'crypto';
3
- import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
3
+ import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
4
4
  import { existsSync } from 'fs';
5
5
  import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
@@ -737,6 +737,20 @@ function readOutput(testCase) {
737
737
  }
738
738
  return candidate.getOutput();
739
739
  }
740
+ function buildEvaluationUnits(testCases) {
741
+ const units = [];
742
+ for (const testCaseItem of testCases) {
743
+ const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
744
+ for (let r = 0; r < rerunTotal; r++) {
745
+ units.push({
746
+ testCaseItem,
747
+ rerunIndex: r + 1,
748
+ rerunTotal
749
+ });
750
+ }
751
+ }
752
+ return units;
753
+ }
740
754
  function nowIsoForFile() {
741
755
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
742
756
  }
@@ -746,157 +760,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
746
760
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
747
761
  );
748
762
  }
749
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
763
+ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
764
+ const { testCaseItem, rerunIndex, rerunTotal } = unit;
750
765
  return Effect.gen(function* () {
751
- const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
752
- const rerunPassed = [];
753
- for (let r = 0; r < reruns; r++) {
754
- const evaluatorRunId = `run-${randomUUID()}`;
755
- const started = Date.now();
756
- const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
757
- n + 1,
758
- n + 1
759
- ]);
760
- yield* publishEvent({
761
- type: "TestCaseStarted",
762
- runId: task.runId,
763
- testCaseId: testCaseItem.id,
764
- testCaseName: testCaseItem.testCase.getName(),
765
- startedTestCases: startedEvaluations,
766
- totalTestCases: totalEvaluations,
767
- rerunIndex: r + 1,
768
- rerunTotal: reruns
769
- });
770
- const evaluatorScores = [];
771
- let testCaseError;
772
- const output = readOutput(testCaseItem.testCase);
773
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
774
- const evaluateFn = evaluator.getEvaluateFn();
775
- if (!evaluateFn) {
776
- continue;
777
- }
778
- const logs = [];
779
- const logDiff = (expected, actual, options) => {
780
- logs.push(createDiffLogEntry(expected, actual, options));
781
- };
782
- const log = (message, options) => {
783
- logs.push(createLogEntry(message, options));
784
- };
785
- const createError = (message, options) => {
786
- const entry = createLogEntry(message, options);
787
- const error = message instanceof Error ? message : new Error(entry.message);
788
- error[evaluatorErrorLogEntryKey] = entry;
789
- return error;
790
- };
791
- try {
792
- const ctx = yield* Effect.promise(
793
- () => Promise.resolve(evaluator.resolveContext())
794
- );
795
- const result = yield* Effect.promise(
796
- () => Promise.resolve().then(
797
- () => evaluateFn({
798
- input: testCaseItem.testCase.getInput(),
799
- ctx,
800
- output,
801
- meta: {
802
- triggerId: task.triggerId,
803
- runId: evaluatorRunId,
804
- datasetId: task.datasetId
805
- },
806
- logDiff,
807
- log,
808
- createError
809
- })
810
- )
811
- );
812
- if (result instanceof Error) {
813
- const evaluatorError = result;
814
- const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
815
- logs.push(taggedEntry ?? createLogEntry(result));
816
- testCaseError = result.message;
817
- evaluatorScores.push({
818
- evaluatorId,
819
- scores: [],
820
- passed: false,
821
- logs: logs.length > 0 ? logs : void 0
822
- });
823
- continue;
824
- }
825
- const { scores, metrics } = normalizeResult(result);
826
- const passed2 = computeEvaluatorPassed(evaluator, result, scores);
827
- evaluatorScores.push({
828
- evaluatorId,
829
- scores,
830
- passed: passed2,
831
- metrics,
832
- logs: logs.length > 0 ? logs : void 0
833
- });
834
- } catch (error) {
835
- if (error instanceof Error) {
836
- const taggedEntry = error[evaluatorErrorLogEntryKey];
837
- logs.push(taggedEntry ?? createLogEntry(error));
838
- }
839
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
766
+ const evaluatorRunId = `run-${randomUUID()}`;
767
+ const started = Date.now();
768
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
769
+ n + 1,
770
+ n + 1
771
+ ]);
772
+ yield* publishEvent({
773
+ type: "TestCaseStarted",
774
+ runId: task.runId,
775
+ testCaseId: testCaseItem.id,
776
+ testCaseName: testCaseItem.testCase.getName(),
777
+ startedTestCases: startedEvaluations,
778
+ totalTestCases: totalEvaluations,
779
+ rerunIndex,
780
+ rerunTotal
781
+ });
782
+ const evaluatorScores = [];
783
+ let testCaseError;
784
+ const output = readOutput(testCaseItem.testCase);
785
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
786
+ const evaluateFn = evaluator.getEvaluateFn();
787
+ if (!evaluateFn) {
788
+ continue;
789
+ }
790
+ const logs = [];
791
+ const logDiff = (expected, actual, options) => {
792
+ logs.push(createDiffLogEntry(expected, actual, options));
793
+ };
794
+ const log = (message, options) => {
795
+ logs.push(createLogEntry(message, options));
796
+ };
797
+ const createError = (message, options) => {
798
+ const entry = createLogEntry(message, options);
799
+ const error = message instanceof Error ? message : new Error(entry.message);
800
+ error[evaluatorErrorLogEntryKey] = entry;
801
+ return error;
802
+ };
803
+ try {
804
+ const ctx = yield* Effect.promise(
805
+ () => Promise.resolve(evaluator.resolveContext())
806
+ );
807
+ const result = yield* Effect.promise(
808
+ () => Promise.resolve().then(
809
+ () => evaluateFn({
810
+ input: testCaseItem.testCase.getInput(),
811
+ ctx,
812
+ output,
813
+ meta: {
814
+ triggerId: task.triggerId,
815
+ runId: evaluatorRunId,
816
+ datasetId: task.datasetId
817
+ },
818
+ logDiff,
819
+ log,
820
+ createError
821
+ })
822
+ )
823
+ );
824
+ if (result instanceof Error) {
825
+ const evaluatorError = result;
826
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
827
+ logs.push(taggedEntry ?? createLogEntry(result));
828
+ testCaseError = result.message;
840
829
  evaluatorScores.push({
841
830
  evaluatorId,
842
831
  scores: [],
843
832
  passed: false,
844
833
  logs: logs.length > 0 ? logs : void 0
845
834
  });
835
+ continue;
836
+ }
837
+ const { scores, metrics } = normalizeResult(result);
838
+ const passed = computeEvaluatorPassed(evaluator, result, scores);
839
+ evaluatorScores.push({
840
+ evaluatorId,
841
+ scores,
842
+ passed,
843
+ metrics,
844
+ logs: logs.length > 0 ? logs : void 0
845
+ });
846
+ } catch (error) {
847
+ if (error instanceof Error) {
848
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
849
+ logs.push(taggedEntry ?? createLogEntry(error));
846
850
  }
851
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
852
+ evaluatorScores.push({
853
+ evaluatorId,
854
+ scores: [],
855
+ passed: false,
856
+ logs: logs.length > 0 ? logs : void 0
857
+ });
847
858
  }
848
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
849
- rerunPassed.push(rerunPassedThis);
850
- const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
851
- n + 1,
852
- n + 1
853
- ]);
854
- const progressEvent = {
855
- type: "TestCaseProgress",
856
- runId: task.runId,
857
- testCaseId: testCaseItem.id,
858
- testCaseName: testCaseItem.testCase.getName(),
859
- completedTestCases: completedEvaluations,
860
- totalTestCases: totalEvaluations,
861
- rerunIndex: r + 1,
862
- rerunTotal: reruns,
863
- passed: rerunPassedThis,
864
- durationMs: Date.now() - started,
865
- evaluatorScores,
866
- output,
867
- errorMessage: testCaseError
868
- };
869
- updateSnapshot(task.runId, (snapshot) => ({
870
- ...snapshot,
871
- completedTestCases: completedEvaluations
872
- }));
873
- yield* publishEvent(progressEvent);
874
- yield* Queue.offer(persistenceQueue, {
875
- runId: task.runId,
876
- artifactPath: task.snapshot.artifactPath,
877
- payload: progressEvent
878
- });
879
- }
880
- const testCasePassed = rerunPassed.every(Boolean);
881
- if (testCasePassed) {
882
- yield* Ref.update(passedRef, (n) => n + 1);
883
- } else {
884
- yield* Ref.update(failedRef, (n) => n + 1);
885
859
  }
886
- const [passed, failed] = yield* Effect.all([
887
- Ref.get(passedRef),
888
- Ref.get(failedRef)
860
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
861
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
862
+ n + 1,
863
+ n + 1
889
864
  ]);
890
- updateSnapshot(task.runId, (snapshot) => ({
865
+ const progressEvent = {
866
+ type: "TestCaseProgress",
867
+ runId: task.runId,
868
+ testCaseId: testCaseItem.id,
869
+ testCaseName: testCaseItem.testCase.getName(),
870
+ completedTestCases: completedEvaluations,
871
+ totalTestCases: totalEvaluations,
872
+ rerunIndex,
873
+ rerunTotal,
874
+ passed: rerunPassedThis,
875
+ durationMs: Date.now() - started,
876
+ evaluatorScores,
877
+ output,
878
+ errorMessage: testCaseError
879
+ };
880
+ yield* updateSnapshot(task.runId, (snapshot) => ({
891
881
  ...snapshot,
892
- passedTestCases: passed,
893
- failedTestCases: failed
882
+ completedTestCases: completedEvaluations
894
883
  }));
884
+ yield* publishEvent(progressEvent);
885
+ yield* Queue.offer(persistenceQueue, {
886
+ runId: task.runId,
887
+ artifactPath: task.snapshot.artifactPath,
888
+ payload: progressEvent
889
+ });
890
+ const testCaseCompleted = yield* Ref.modify(
891
+ testCaseResultsRef,
892
+ (map) => {
893
+ const key = testCaseItem.id;
894
+ const existing = map.get(key) ?? { completedCount: 0, results: [] };
895
+ const newResults = [...existing.results, rerunPassedThis];
896
+ const newCompletedCount = existing.completedCount + 1;
897
+ const isLast = newCompletedCount === rerunTotal;
898
+ const newMap = new Map(map);
899
+ newMap.set(key, {
900
+ completedCount: newCompletedCount,
901
+ results: newResults
902
+ });
903
+ const outcome = isLast ? newResults.every(Boolean) : null;
904
+ return [outcome, newMap];
905
+ }
906
+ );
907
+ if (testCaseCompleted !== null) {
908
+ if (testCaseCompleted) {
909
+ yield* Ref.update(passedRef, (n) => n + 1);
910
+ } else {
911
+ yield* Ref.update(failedRef, (n) => n + 1);
912
+ }
913
+ const [passed, failed] = yield* Effect.all([
914
+ Ref.get(passedRef),
915
+ Ref.get(failedRef)
916
+ ]);
917
+ yield* updateSnapshot(task.runId, (snapshot) => ({
918
+ ...snapshot,
919
+ passedTestCases: passed,
920
+ failedTestCases: failed
921
+ }));
922
+ }
895
923
  });
896
924
  }
897
925
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
898
926
  const startedAt = Date.now();
899
- updateSnapshot(task.runId, (snapshot) => ({
927
+ yield* updateSnapshot(task.runId, (snapshot) => ({
900
928
  ...snapshot,
901
929
  status: "running",
902
930
  startedAt
@@ -915,9 +943,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
915
943
  const startedRef = yield* Ref.make(0);
916
944
  const passedRef = yield* Ref.make(0);
917
945
  const failedRef = yield* Ref.make(0);
918
- const processTestCase = (testCaseItem) => processOneTestCase(
946
+ const testCaseResultsRef = yield* Ref.make(
947
+ /* @__PURE__ */ new Map()
948
+ );
949
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
950
+ const processEvaluation = (unit) => processOneEvaluation(
919
951
  task,
920
- testCaseItem,
952
+ unit,
921
953
  totalEvaluations,
922
954
  publishEvent,
923
955
  persistenceQueue,
@@ -925,11 +957,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
925
957
  startedRef,
926
958
  completedRef,
927
959
  passedRef,
928
- failedRef
960
+ failedRef,
961
+ testCaseResultsRef
929
962
  );
930
963
  yield* Effect.forEach(
931
- task.testCases,
932
- processTestCase,
964
+ evaluationUnits,
965
+ processEvaluation,
933
966
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
934
967
  );
935
968
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
@@ -947,7 +980,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
947
980
  totalTestCases: task.testCases.length,
948
981
  artifactPath: task.snapshot.artifactPath
949
982
  };
950
- updateSnapshot(task.runId, (snapshot) => ({
983
+ yield* updateSnapshot(task.runId, (snapshot) => ({
951
984
  ...snapshot,
952
985
  status: "completed",
953
986
  completedTestCases: completedEvaluations,
@@ -1200,7 +1233,9 @@ var EffectRunner = class {
1200
1233
  this.persistenceQueue = Effect.runSync(
1201
1234
  Queue.unbounded()
1202
1235
  );
1203
- this.snapshots = /* @__PURE__ */ new Map();
1236
+ this.snapshotsRef = Effect.runSync(
1237
+ Ref.make(/* @__PURE__ */ new Map())
1238
+ );
1204
1239
  this.listeners = /* @__PURE__ */ new Set();
1205
1240
  this.datasetsById = /* @__PURE__ */ new Map();
1206
1241
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -1303,7 +1338,13 @@ var EffectRunner = class {
1303
1338
  status: "queued",
1304
1339
  artifactPath
1305
1340
  };
1306
- this.snapshots.set(runId, snapshot);
1341
+ await Effect.runPromise(
1342
+ Ref.update(this.snapshotsRef, (map) => {
1343
+ const next = new Map(map);
1344
+ next.set(runId, snapshot);
1345
+ return next;
1346
+ })
1347
+ );
1307
1348
  const queuedEvent = {
1308
1349
  type: "RunQueued",
1309
1350
  runId,
@@ -1344,12 +1385,12 @@ var EffectRunner = class {
1344
1385
  };
1345
1386
  }
1346
1387
  getRunSnapshot(runId) {
1347
- return this.snapshots.get(runId);
1388
+ return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
1348
1389
  }
1349
1390
  getAllRunSnapshots() {
1350
- return Array.from(this.snapshots.values()).sort(
1351
- (a, b) => b.queuedAt - a.queuedAt
1352
- );
1391
+ return Array.from(
1392
+ Effect.runSync(Ref.get(this.snapshotsRef)).values()
1393
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
1353
1394
  }
1354
1395
  async loadRunSnapshotsFromArtifacts() {
1355
1396
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1378,11 +1419,15 @@ var EffectRunner = class {
1378
1419
  );
1379
1420
  }
1380
1421
  updateSnapshot(runId, updater) {
1381
- const existing = this.snapshots.get(runId);
1382
- if (!existing) {
1383
- return;
1384
- }
1385
- this.snapshots.set(runId, updater(existing));
1422
+ return Ref.modify(this.snapshotsRef, (map) => {
1423
+ const existing = map.get(runId);
1424
+ if (!existing) {
1425
+ return [void 0, map];
1426
+ }
1427
+ const next = new Map(map);
1428
+ next.set(runId, updater(existing));
1429
+ return [void 0, next];
1430
+ }).pipe(Effect.asVoid);
1386
1431
  }
1387
1432
  publishEvent(event) {
1388
1433
  return Effect.sync(() => {