@m4trix/evals 0.21.1 → 0.23.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env node
2
2
  import { randomUUID } from 'crypto';
3
- import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
3
+ import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
4
4
  import { existsSync } from 'fs';
5
5
  import { resolve, relative, join, parse, dirname } from 'path';
6
6
  import * as jitiModule from 'jiti';
@@ -8,7 +8,6 @@ import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
9
  import { diffLines } from 'diff';
10
10
  import stringify from 'fast-json-stable-stringify';
11
- import { cpus } from 'os';
12
11
  import * as React2 from 'react';
13
12
  import React2__default, { useState, useEffect, useCallback } from 'react';
14
13
  import { render, Box, Text } from 'ink';
@@ -737,6 +736,20 @@ function readOutput(testCase) {
737
736
  }
738
737
  return candidate.getOutput();
739
738
  }
739
+ function buildEvaluationUnits(testCases) {
740
+ const units = [];
741
+ for (const testCaseItem of testCases) {
742
+ const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
743
+ for (let r = 0; r < rerunTotal; r++) {
744
+ units.push({
745
+ testCaseItem,
746
+ rerunIndex: r + 1,
747
+ rerunTotal
748
+ });
749
+ }
750
+ }
751
+ return units;
752
+ }
740
753
  function nowIsoForFile() {
741
754
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
742
755
  }
@@ -746,157 +759,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
746
759
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
747
760
  );
748
761
  }
749
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
762
+ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
763
+ const { testCaseItem, rerunIndex, rerunTotal } = unit;
750
764
  return Effect.gen(function* () {
751
- const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
752
- const rerunPassed = [];
753
- for (let r = 0; r < reruns; r++) {
754
- const evaluatorRunId = `run-${randomUUID()}`;
755
- const started = Date.now();
756
- const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
757
- n + 1,
758
- n + 1
759
- ]);
760
- yield* publishEvent({
761
- type: "TestCaseStarted",
762
- runId: task.runId,
763
- testCaseId: testCaseItem.id,
764
- testCaseName: testCaseItem.testCase.getName(),
765
- startedTestCases: startedEvaluations,
766
- totalTestCases: totalEvaluations,
767
- rerunIndex: r + 1,
768
- rerunTotal: reruns
769
- });
770
- const evaluatorScores = [];
771
- let testCaseError;
772
- const output = readOutput(testCaseItem.testCase);
773
- for (const { id: evaluatorId, evaluator } of task.evaluators) {
774
- const evaluateFn = evaluator.getEvaluateFn();
775
- if (!evaluateFn) {
776
- continue;
777
- }
778
- const logs = [];
779
- const logDiff = (expected, actual, options) => {
780
- logs.push(createDiffLogEntry(expected, actual, options));
781
- };
782
- const log = (message, options) => {
783
- logs.push(createLogEntry(message, options));
784
- };
785
- const createError = (message, options) => {
786
- const entry = createLogEntry(message, options);
787
- const error = message instanceof Error ? message : new Error(entry.message);
788
- error[evaluatorErrorLogEntryKey] = entry;
789
- return error;
790
- };
791
- try {
792
- const ctx = yield* Effect.promise(
793
- () => Promise.resolve(evaluator.resolveContext())
794
- );
795
- const result = yield* Effect.promise(
796
- () => Promise.resolve().then(
797
- () => evaluateFn({
798
- input: testCaseItem.testCase.getInput(),
799
- ctx,
800
- output,
801
- meta: {
802
- triggerId: task.triggerId,
803
- runId: evaluatorRunId,
804
- datasetId: task.datasetId
805
- },
806
- logDiff,
807
- log,
808
- createError
809
- })
810
- )
811
- );
812
- if (result instanceof Error) {
813
- const evaluatorError = result;
814
- const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
815
- logs.push(taggedEntry ?? createLogEntry(result));
816
- testCaseError = result.message;
817
- evaluatorScores.push({
818
- evaluatorId,
819
- scores: [],
820
- passed: false,
821
- logs: logs.length > 0 ? logs : void 0
822
- });
823
- continue;
824
- }
825
- const { scores, metrics } = normalizeResult(result);
826
- const passed2 = computeEvaluatorPassed(evaluator, result, scores);
827
- evaluatorScores.push({
828
- evaluatorId,
829
- scores,
830
- passed: passed2,
831
- metrics,
832
- logs: logs.length > 0 ? logs : void 0
833
- });
834
- } catch (error) {
835
- if (error instanceof Error) {
836
- const taggedEntry = error[evaluatorErrorLogEntryKey];
837
- logs.push(taggedEntry ?? createLogEntry(error));
838
- }
839
- testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
765
+ const evaluatorRunId = `run-${randomUUID()}`;
766
+ const started = Date.now();
767
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
768
+ n + 1,
769
+ n + 1
770
+ ]);
771
+ yield* publishEvent({
772
+ type: "TestCaseStarted",
773
+ runId: task.runId,
774
+ testCaseId: testCaseItem.id,
775
+ testCaseName: testCaseItem.testCase.getName(),
776
+ startedTestCases: startedEvaluations,
777
+ totalTestCases: totalEvaluations,
778
+ rerunIndex,
779
+ rerunTotal
780
+ });
781
+ const evaluatorScores = [];
782
+ let testCaseError;
783
+ const output = readOutput(testCaseItem.testCase);
784
+ for (const { id: evaluatorId, evaluator } of task.evaluators) {
785
+ const evaluateFn = evaluator.getEvaluateFn();
786
+ if (!evaluateFn) {
787
+ continue;
788
+ }
789
+ const logs = [];
790
+ const logDiff = (expected, actual, options) => {
791
+ logs.push(createDiffLogEntry(expected, actual, options));
792
+ };
793
+ const log = (message, options) => {
794
+ logs.push(createLogEntry(message, options));
795
+ };
796
+ const createError = (message, options) => {
797
+ const entry = createLogEntry(message, options);
798
+ const error = message instanceof Error ? message : new Error(entry.message);
799
+ error[evaluatorErrorLogEntryKey] = entry;
800
+ return error;
801
+ };
802
+ try {
803
+ const ctx = yield* Effect.promise(
804
+ () => Promise.resolve(evaluator.resolveContext())
805
+ );
806
+ const result = yield* Effect.promise(
807
+ () => Promise.resolve().then(
808
+ () => evaluateFn({
809
+ input: testCaseItem.testCase.getInput(),
810
+ ctx,
811
+ output,
812
+ meta: {
813
+ triggerId: task.triggerId,
814
+ runId: evaluatorRunId,
815
+ datasetId: task.datasetId
816
+ },
817
+ logDiff,
818
+ log,
819
+ createError
820
+ })
821
+ )
822
+ );
823
+ if (result instanceof Error) {
824
+ const evaluatorError = result;
825
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
826
+ logs.push(taggedEntry ?? createLogEntry(result));
827
+ testCaseError = result.message;
840
828
  evaluatorScores.push({
841
829
  evaluatorId,
842
830
  scores: [],
843
831
  passed: false,
844
832
  logs: logs.length > 0 ? logs : void 0
845
833
  });
834
+ continue;
846
835
  }
836
+ const { scores, metrics } = normalizeResult(result);
837
+ const passed = computeEvaluatorPassed(evaluator, result, scores);
838
+ evaluatorScores.push({
839
+ evaluatorId,
840
+ scores,
841
+ passed,
842
+ metrics,
843
+ logs: logs.length > 0 ? logs : void 0
844
+ });
845
+ } catch (error) {
846
+ if (error instanceof Error) {
847
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
848
+ logs.push(taggedEntry ?? createLogEntry(error));
849
+ }
850
+ testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
851
+ evaluatorScores.push({
852
+ evaluatorId,
853
+ scores: [],
854
+ passed: false,
855
+ logs: logs.length > 0 ? logs : void 0
856
+ });
847
857
  }
848
- const rerunPassedThis = evaluatorScores.every((s) => s.passed);
849
- rerunPassed.push(rerunPassedThis);
850
- const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
851
- n + 1,
852
- n + 1
853
- ]);
854
- const progressEvent = {
855
- type: "TestCaseProgress",
856
- runId: task.runId,
857
- testCaseId: testCaseItem.id,
858
- testCaseName: testCaseItem.testCase.getName(),
859
- completedTestCases: completedEvaluations,
860
- totalTestCases: totalEvaluations,
861
- rerunIndex: r + 1,
862
- rerunTotal: reruns,
863
- passed: rerunPassedThis,
864
- durationMs: Date.now() - started,
865
- evaluatorScores,
866
- output,
867
- errorMessage: testCaseError
868
- };
869
- updateSnapshot(task.runId, (snapshot) => ({
870
- ...snapshot,
871
- completedTestCases: completedEvaluations
872
- }));
873
- yield* publishEvent(progressEvent);
874
- yield* Queue.offer(persistenceQueue, {
875
- runId: task.runId,
876
- artifactPath: task.snapshot.artifactPath,
877
- payload: progressEvent
878
- });
879
- }
880
- const testCasePassed = rerunPassed.every(Boolean);
881
- if (testCasePassed) {
882
- yield* Ref.update(passedRef, (n) => n + 1);
883
- } else {
884
- yield* Ref.update(failedRef, (n) => n + 1);
885
858
  }
886
- const [passed, failed] = yield* Effect.all([
887
- Ref.get(passedRef),
888
- Ref.get(failedRef)
859
+ const rerunPassedThis = evaluatorScores.every((s) => s.passed);
860
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
861
+ n + 1,
862
+ n + 1
889
863
  ]);
890
- updateSnapshot(task.runId, (snapshot) => ({
864
+ const progressEvent = {
865
+ type: "TestCaseProgress",
866
+ runId: task.runId,
867
+ testCaseId: testCaseItem.id,
868
+ testCaseName: testCaseItem.testCase.getName(),
869
+ completedTestCases: completedEvaluations,
870
+ totalTestCases: totalEvaluations,
871
+ rerunIndex,
872
+ rerunTotal,
873
+ passed: rerunPassedThis,
874
+ durationMs: Date.now() - started,
875
+ evaluatorScores,
876
+ output,
877
+ errorMessage: testCaseError
878
+ };
879
+ yield* updateSnapshot(task.runId, (snapshot) => ({
891
880
  ...snapshot,
892
- passedTestCases: passed,
893
- failedTestCases: failed
881
+ completedTestCases: completedEvaluations
894
882
  }));
883
+ yield* publishEvent(progressEvent);
884
+ yield* Queue.offer(persistenceQueue, {
885
+ runId: task.runId,
886
+ artifactPath: task.snapshot.artifactPath,
887
+ payload: progressEvent
888
+ });
889
+ const testCaseCompleted = yield* Ref.modify(
890
+ testCaseResultsRef,
891
+ (map) => {
892
+ const key = testCaseItem.id;
893
+ const existing = map.get(key) ?? { completedCount: 0, results: [] };
894
+ const newResults = [...existing.results, rerunPassedThis];
895
+ const newCompletedCount = existing.completedCount + 1;
896
+ const isLast = newCompletedCount === rerunTotal;
897
+ const newMap = new Map(map);
898
+ newMap.set(key, {
899
+ completedCount: newCompletedCount,
900
+ results: newResults
901
+ });
902
+ const outcome = isLast ? newResults.every(Boolean) : null;
903
+ return [outcome, newMap];
904
+ }
905
+ );
906
+ if (testCaseCompleted !== null) {
907
+ if (testCaseCompleted) {
908
+ yield* Ref.update(passedRef, (n) => n + 1);
909
+ } else {
910
+ yield* Ref.update(failedRef, (n) => n + 1);
911
+ }
912
+ const [passed, failed] = yield* Effect.all([
913
+ Ref.get(passedRef),
914
+ Ref.get(failedRef)
915
+ ]);
916
+ yield* updateSnapshot(task.runId, (snapshot) => ({
917
+ ...snapshot,
918
+ passedTestCases: passed,
919
+ failedTestCases: failed
920
+ }));
921
+ }
895
922
  });
896
923
  }
897
924
  var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
898
925
  const startedAt = Date.now();
899
- updateSnapshot(task.runId, (snapshot) => ({
926
+ yield* updateSnapshot(task.runId, (snapshot) => ({
900
927
  ...snapshot,
901
928
  status: "running",
902
929
  startedAt
@@ -915,9 +942,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
915
942
  const startedRef = yield* Ref.make(0);
916
943
  const passedRef = yield* Ref.make(0);
917
944
  const failedRef = yield* Ref.make(0);
918
- const processTestCase = (testCaseItem) => processOneTestCase(
945
+ const testCaseResultsRef = yield* Ref.make(
946
+ /* @__PURE__ */ new Map()
947
+ );
948
+ const evaluationUnits = buildEvaluationUnits(task.testCases);
949
+ const processEvaluation = (unit) => processOneEvaluation(
919
950
  task,
920
- testCaseItem,
951
+ unit,
921
952
  totalEvaluations,
922
953
  publishEvent,
923
954
  persistenceQueue,
@@ -925,11 +956,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
925
956
  startedRef,
926
957
  completedRef,
927
958
  passedRef,
928
- failedRef
959
+ failedRef,
960
+ testCaseResultsRef
929
961
  );
930
962
  yield* Effect.forEach(
931
- task.testCases,
932
- processTestCase,
963
+ evaluationUnits,
964
+ processEvaluation,
933
965
  maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
934
966
  );
935
967
  const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
@@ -947,7 +979,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
947
979
  totalTestCases: task.testCases.length,
948
980
  artifactPath: task.snapshot.artifactPath
949
981
  };
950
- updateSnapshot(task.runId, (snapshot) => ({
982
+ yield* updateSnapshot(task.runId, (snapshot) => ({
951
983
  ...snapshot,
952
984
  status: "completed",
953
985
  completedTestCases: completedEvaluations,
@@ -1200,7 +1232,9 @@ var EffectRunner = class {
1200
1232
  this.persistenceQueue = Effect.runSync(
1201
1233
  Queue.unbounded()
1202
1234
  );
1203
- this.snapshots = /* @__PURE__ */ new Map();
1235
+ this.snapshotsRef = Effect.runSync(
1236
+ Ref.make(/* @__PURE__ */ new Map())
1237
+ );
1204
1238
  this.listeners = /* @__PURE__ */ new Set();
1205
1239
  this.datasetsById = /* @__PURE__ */ new Map();
1206
1240
  this.evaluatorsById = /* @__PURE__ */ new Map();
@@ -1303,7 +1337,13 @@ var EffectRunner = class {
1303
1337
  status: "queued",
1304
1338
  artifactPath
1305
1339
  };
1306
- this.snapshots.set(runId, snapshot);
1340
+ await Effect.runPromise(
1341
+ Ref.update(this.snapshotsRef, (map) => {
1342
+ const next = new Map(map);
1343
+ next.set(runId, snapshot);
1344
+ return next;
1345
+ })
1346
+ );
1307
1347
  const queuedEvent = {
1308
1348
  type: "RunQueued",
1309
1349
  runId,
@@ -1344,12 +1384,12 @@ var EffectRunner = class {
1344
1384
  };
1345
1385
  }
1346
1386
  getRunSnapshot(runId) {
1347
- return this.snapshots.get(runId);
1387
+ return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
1348
1388
  }
1349
1389
  getAllRunSnapshots() {
1350
- return Array.from(this.snapshots.values()).sort(
1351
- (a, b) => b.queuedAt - a.queuedAt
1352
- );
1390
+ return Array.from(
1391
+ Effect.runSync(Ref.get(this.snapshotsRef)).values()
1392
+ ).sort((a, b) => b.queuedAt - a.queuedAt);
1353
1393
  }
1354
1394
  async loadRunSnapshotsFromArtifacts() {
1355
1395
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1378,11 +1418,15 @@ var EffectRunner = class {
1378
1418
  );
1379
1419
  }
1380
1420
  updateSnapshot(runId, updater) {
1381
- const existing = this.snapshots.get(runId);
1382
- if (!existing) {
1383
- return;
1384
- }
1385
- this.snapshots.set(runId, updater(existing));
1421
+ return Ref.modify(this.snapshotsRef, (map) => {
1422
+ const existing = map.get(runId);
1423
+ if (!existing) {
1424
+ return [void 0, map];
1425
+ }
1426
+ const next = new Map(map);
1427
+ next.set(runId, updater(existing));
1428
+ return [void 0, next];
1429
+ }).pipe(Effect.asVoid);
1386
1430
  }
1387
1431
  publishEvent(event) {
1388
1432
  return Effect.sync(() => {
@@ -1398,8 +1442,10 @@ var EffectRunner = class {
1398
1442
  );
1399
1443
  }
1400
1444
  };
1445
+
1446
+ // src/cli-simple/args.ts
1401
1447
  function getDefaultConcurrency() {
1402
- return Math.max(1, cpus().length);
1448
+ return 4;
1403
1449
  }
1404
1450
  function parseSimpleCliArgs(argv) {
1405
1451
  const args = {
@@ -1446,7 +1492,7 @@ function getSimpleCliUsage() {
1446
1492
  " eval-agents-simple generate --dataset <datasetName>",
1447
1493
  "",
1448
1494
  "Options:",
1449
- " --concurrency, -c N Max concurrent test cases (default: CPU count). Use 1 for sequential.",
1495
+ " --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
1450
1496
  "",
1451
1497
  "Pattern examples for --evaluator:",
1452
1498
  " score-evaluator exact name (case-insensitive)",