@m4trix/evals 0.21.1 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +200 -154
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +201 -155
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +196 -151
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +197 -152
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +196 -151
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +197 -152
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { randomUUID } from 'crypto';
|
|
3
|
-
import { Effect, PubSub, Queue,
|
|
3
|
+
import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
|
|
4
4
|
import { existsSync } from 'fs';
|
|
5
5
|
import { resolve, relative, join, parse, dirname } from 'path';
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
@@ -8,7 +8,6 @@ import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
9
|
import { diffLines } from 'diff';
|
|
10
10
|
import stringify from 'fast-json-stable-stringify';
|
|
11
|
-
import { cpus } from 'os';
|
|
12
11
|
import * as React2 from 'react';
|
|
13
12
|
import React2__default, { useState, useEffect, useCallback } from 'react';
|
|
14
13
|
import { render, Box, Text } from 'ink';
|
|
@@ -737,6 +736,20 @@ function readOutput(testCase) {
|
|
|
737
736
|
}
|
|
738
737
|
return candidate.getOutput();
|
|
739
738
|
}
|
|
739
|
+
function buildEvaluationUnits(testCases) {
|
|
740
|
+
const units = [];
|
|
741
|
+
for (const testCaseItem of testCases) {
|
|
742
|
+
const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
743
|
+
for (let r = 0; r < rerunTotal; r++) {
|
|
744
|
+
units.push({
|
|
745
|
+
testCaseItem,
|
|
746
|
+
rerunIndex: r + 1,
|
|
747
|
+
rerunTotal
|
|
748
|
+
});
|
|
749
|
+
}
|
|
750
|
+
}
|
|
751
|
+
return units;
|
|
752
|
+
}
|
|
740
753
|
function nowIsoForFile() {
|
|
741
754
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
742
755
|
}
|
|
@@ -746,157 +759,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
746
759
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
747
760
|
);
|
|
748
761
|
}
|
|
749
|
-
function
|
|
762
|
+
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
763
|
+
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
750
764
|
return Effect.gen(function* () {
|
|
751
|
-
const
|
|
752
|
-
const
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
815
|
-
logs.push(taggedEntry ?? createLogEntry(result));
|
|
816
|
-
testCaseError = result.message;
|
|
817
|
-
evaluatorScores.push({
|
|
818
|
-
evaluatorId,
|
|
819
|
-
scores: [],
|
|
820
|
-
passed: false,
|
|
821
|
-
logs: logs.length > 0 ? logs : void 0
|
|
822
|
-
});
|
|
823
|
-
continue;
|
|
824
|
-
}
|
|
825
|
-
const { scores, metrics } = normalizeResult(result);
|
|
826
|
-
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
827
|
-
evaluatorScores.push({
|
|
828
|
-
evaluatorId,
|
|
829
|
-
scores,
|
|
830
|
-
passed: passed2,
|
|
831
|
-
metrics,
|
|
832
|
-
logs: logs.length > 0 ? logs : void 0
|
|
833
|
-
});
|
|
834
|
-
} catch (error) {
|
|
835
|
-
if (error instanceof Error) {
|
|
836
|
-
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
837
|
-
logs.push(taggedEntry ?? createLogEntry(error));
|
|
838
|
-
}
|
|
839
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
765
|
+
const evaluatorRunId = `run-${randomUUID()}`;
|
|
766
|
+
const started = Date.now();
|
|
767
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
768
|
+
n + 1,
|
|
769
|
+
n + 1
|
|
770
|
+
]);
|
|
771
|
+
yield* publishEvent({
|
|
772
|
+
type: "TestCaseStarted",
|
|
773
|
+
runId: task.runId,
|
|
774
|
+
testCaseId: testCaseItem.id,
|
|
775
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
776
|
+
startedTestCases: startedEvaluations,
|
|
777
|
+
totalTestCases: totalEvaluations,
|
|
778
|
+
rerunIndex,
|
|
779
|
+
rerunTotal
|
|
780
|
+
});
|
|
781
|
+
const evaluatorScores = [];
|
|
782
|
+
let testCaseError;
|
|
783
|
+
const output = readOutput(testCaseItem.testCase);
|
|
784
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
785
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
786
|
+
if (!evaluateFn) {
|
|
787
|
+
continue;
|
|
788
|
+
}
|
|
789
|
+
const logs = [];
|
|
790
|
+
const logDiff = (expected, actual, options) => {
|
|
791
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
792
|
+
};
|
|
793
|
+
const log = (message, options) => {
|
|
794
|
+
logs.push(createLogEntry(message, options));
|
|
795
|
+
};
|
|
796
|
+
const createError = (message, options) => {
|
|
797
|
+
const entry = createLogEntry(message, options);
|
|
798
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
799
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
800
|
+
return error;
|
|
801
|
+
};
|
|
802
|
+
try {
|
|
803
|
+
const ctx = yield* Effect.promise(
|
|
804
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
805
|
+
);
|
|
806
|
+
const result = yield* Effect.promise(
|
|
807
|
+
() => Promise.resolve().then(
|
|
808
|
+
() => evaluateFn({
|
|
809
|
+
input: testCaseItem.testCase.getInput(),
|
|
810
|
+
ctx,
|
|
811
|
+
output,
|
|
812
|
+
meta: {
|
|
813
|
+
triggerId: task.triggerId,
|
|
814
|
+
runId: evaluatorRunId,
|
|
815
|
+
datasetId: task.datasetId
|
|
816
|
+
},
|
|
817
|
+
logDiff,
|
|
818
|
+
log,
|
|
819
|
+
createError
|
|
820
|
+
})
|
|
821
|
+
)
|
|
822
|
+
);
|
|
823
|
+
if (result instanceof Error) {
|
|
824
|
+
const evaluatorError = result;
|
|
825
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
826
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
827
|
+
testCaseError = result.message;
|
|
840
828
|
evaluatorScores.push({
|
|
841
829
|
evaluatorId,
|
|
842
830
|
scores: [],
|
|
843
831
|
passed: false,
|
|
844
832
|
logs: logs.length > 0 ? logs : void 0
|
|
845
833
|
});
|
|
834
|
+
continue;
|
|
846
835
|
}
|
|
836
|
+
const { scores, metrics } = normalizeResult(result);
|
|
837
|
+
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
838
|
+
evaluatorScores.push({
|
|
839
|
+
evaluatorId,
|
|
840
|
+
scores,
|
|
841
|
+
passed,
|
|
842
|
+
metrics,
|
|
843
|
+
logs: logs.length > 0 ? logs : void 0
|
|
844
|
+
});
|
|
845
|
+
} catch (error) {
|
|
846
|
+
if (error instanceof Error) {
|
|
847
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
848
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
849
|
+
}
|
|
850
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
851
|
+
evaluatorScores.push({
|
|
852
|
+
evaluatorId,
|
|
853
|
+
scores: [],
|
|
854
|
+
passed: false,
|
|
855
|
+
logs: logs.length > 0 ? logs : void 0
|
|
856
|
+
});
|
|
847
857
|
}
|
|
848
|
-
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
849
|
-
rerunPassed.push(rerunPassedThis);
|
|
850
|
-
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
851
|
-
n + 1,
|
|
852
|
-
n + 1
|
|
853
|
-
]);
|
|
854
|
-
const progressEvent = {
|
|
855
|
-
type: "TestCaseProgress",
|
|
856
|
-
runId: task.runId,
|
|
857
|
-
testCaseId: testCaseItem.id,
|
|
858
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
859
|
-
completedTestCases: completedEvaluations,
|
|
860
|
-
totalTestCases: totalEvaluations,
|
|
861
|
-
rerunIndex: r + 1,
|
|
862
|
-
rerunTotal: reruns,
|
|
863
|
-
passed: rerunPassedThis,
|
|
864
|
-
durationMs: Date.now() - started,
|
|
865
|
-
evaluatorScores,
|
|
866
|
-
output,
|
|
867
|
-
errorMessage: testCaseError
|
|
868
|
-
};
|
|
869
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
870
|
-
...snapshot,
|
|
871
|
-
completedTestCases: completedEvaluations
|
|
872
|
-
}));
|
|
873
|
-
yield* publishEvent(progressEvent);
|
|
874
|
-
yield* Queue.offer(persistenceQueue, {
|
|
875
|
-
runId: task.runId,
|
|
876
|
-
artifactPath: task.snapshot.artifactPath,
|
|
877
|
-
payload: progressEvent
|
|
878
|
-
});
|
|
879
|
-
}
|
|
880
|
-
const testCasePassed = rerunPassed.every(Boolean);
|
|
881
|
-
if (testCasePassed) {
|
|
882
|
-
yield* Ref.update(passedRef, (n) => n + 1);
|
|
883
|
-
} else {
|
|
884
|
-
yield* Ref.update(failedRef, (n) => n + 1);
|
|
885
858
|
}
|
|
886
|
-
const
|
|
887
|
-
|
|
888
|
-
|
|
859
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
860
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
861
|
+
n + 1,
|
|
862
|
+
n + 1
|
|
889
863
|
]);
|
|
890
|
-
|
|
864
|
+
const progressEvent = {
|
|
865
|
+
type: "TestCaseProgress",
|
|
866
|
+
runId: task.runId,
|
|
867
|
+
testCaseId: testCaseItem.id,
|
|
868
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
869
|
+
completedTestCases: completedEvaluations,
|
|
870
|
+
totalTestCases: totalEvaluations,
|
|
871
|
+
rerunIndex,
|
|
872
|
+
rerunTotal,
|
|
873
|
+
passed: rerunPassedThis,
|
|
874
|
+
durationMs: Date.now() - started,
|
|
875
|
+
evaluatorScores,
|
|
876
|
+
output,
|
|
877
|
+
errorMessage: testCaseError
|
|
878
|
+
};
|
|
879
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
891
880
|
...snapshot,
|
|
892
|
-
|
|
893
|
-
failedTestCases: failed
|
|
881
|
+
completedTestCases: completedEvaluations
|
|
894
882
|
}));
|
|
883
|
+
yield* publishEvent(progressEvent);
|
|
884
|
+
yield* Queue.offer(persistenceQueue, {
|
|
885
|
+
runId: task.runId,
|
|
886
|
+
artifactPath: task.snapshot.artifactPath,
|
|
887
|
+
payload: progressEvent
|
|
888
|
+
});
|
|
889
|
+
const testCaseCompleted = yield* Ref.modify(
|
|
890
|
+
testCaseResultsRef,
|
|
891
|
+
(map) => {
|
|
892
|
+
const key = testCaseItem.id;
|
|
893
|
+
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
894
|
+
const newResults = [...existing.results, rerunPassedThis];
|
|
895
|
+
const newCompletedCount = existing.completedCount + 1;
|
|
896
|
+
const isLast = newCompletedCount === rerunTotal;
|
|
897
|
+
const newMap = new Map(map);
|
|
898
|
+
newMap.set(key, {
|
|
899
|
+
completedCount: newCompletedCount,
|
|
900
|
+
results: newResults
|
|
901
|
+
});
|
|
902
|
+
const outcome = isLast ? newResults.every(Boolean) : null;
|
|
903
|
+
return [outcome, newMap];
|
|
904
|
+
}
|
|
905
|
+
);
|
|
906
|
+
if (testCaseCompleted !== null) {
|
|
907
|
+
if (testCaseCompleted) {
|
|
908
|
+
yield* Ref.update(passedRef, (n) => n + 1);
|
|
909
|
+
} else {
|
|
910
|
+
yield* Ref.update(failedRef, (n) => n + 1);
|
|
911
|
+
}
|
|
912
|
+
const [passed, failed] = yield* Effect.all([
|
|
913
|
+
Ref.get(passedRef),
|
|
914
|
+
Ref.get(failedRef)
|
|
915
|
+
]);
|
|
916
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
917
|
+
...snapshot,
|
|
918
|
+
passedTestCases: passed,
|
|
919
|
+
failedTestCases: failed
|
|
920
|
+
}));
|
|
921
|
+
}
|
|
895
922
|
});
|
|
896
923
|
}
|
|
897
924
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
|
|
898
925
|
const startedAt = Date.now();
|
|
899
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
926
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
900
927
|
...snapshot,
|
|
901
928
|
status: "running",
|
|
902
929
|
startedAt
|
|
@@ -915,9 +942,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
915
942
|
const startedRef = yield* Ref.make(0);
|
|
916
943
|
const passedRef = yield* Ref.make(0);
|
|
917
944
|
const failedRef = yield* Ref.make(0);
|
|
918
|
-
const
|
|
945
|
+
const testCaseResultsRef = yield* Ref.make(
|
|
946
|
+
/* @__PURE__ */ new Map()
|
|
947
|
+
);
|
|
948
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
949
|
+
const processEvaluation = (unit) => processOneEvaluation(
|
|
919
950
|
task,
|
|
920
|
-
|
|
951
|
+
unit,
|
|
921
952
|
totalEvaluations,
|
|
922
953
|
publishEvent,
|
|
923
954
|
persistenceQueue,
|
|
@@ -925,11 +956,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
925
956
|
startedRef,
|
|
926
957
|
completedRef,
|
|
927
958
|
passedRef,
|
|
928
|
-
failedRef
|
|
959
|
+
failedRef,
|
|
960
|
+
testCaseResultsRef
|
|
929
961
|
);
|
|
930
962
|
yield* Effect.forEach(
|
|
931
|
-
|
|
932
|
-
|
|
963
|
+
evaluationUnits,
|
|
964
|
+
processEvaluation,
|
|
933
965
|
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
934
966
|
);
|
|
935
967
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
@@ -947,7 +979,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
947
979
|
totalTestCases: task.testCases.length,
|
|
948
980
|
artifactPath: task.snapshot.artifactPath
|
|
949
981
|
};
|
|
950
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
982
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
951
983
|
...snapshot,
|
|
952
984
|
status: "completed",
|
|
953
985
|
completedTestCases: completedEvaluations,
|
|
@@ -1200,7 +1232,9 @@ var EffectRunner = class {
|
|
|
1200
1232
|
this.persistenceQueue = Effect.runSync(
|
|
1201
1233
|
Queue.unbounded()
|
|
1202
1234
|
);
|
|
1203
|
-
this.
|
|
1235
|
+
this.snapshotsRef = Effect.runSync(
|
|
1236
|
+
Ref.make(/* @__PURE__ */ new Map())
|
|
1237
|
+
);
|
|
1204
1238
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1205
1239
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1206
1240
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
@@ -1303,7 +1337,13 @@ var EffectRunner = class {
|
|
|
1303
1337
|
status: "queued",
|
|
1304
1338
|
artifactPath
|
|
1305
1339
|
};
|
|
1306
|
-
|
|
1340
|
+
await Effect.runPromise(
|
|
1341
|
+
Ref.update(this.snapshotsRef, (map) => {
|
|
1342
|
+
const next = new Map(map);
|
|
1343
|
+
next.set(runId, snapshot);
|
|
1344
|
+
return next;
|
|
1345
|
+
})
|
|
1346
|
+
);
|
|
1307
1347
|
const queuedEvent = {
|
|
1308
1348
|
type: "RunQueued",
|
|
1309
1349
|
runId,
|
|
@@ -1344,12 +1384,12 @@ var EffectRunner = class {
|
|
|
1344
1384
|
};
|
|
1345
1385
|
}
|
|
1346
1386
|
getRunSnapshot(runId) {
|
|
1347
|
-
return this.
|
|
1387
|
+
return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
|
|
1348
1388
|
}
|
|
1349
1389
|
getAllRunSnapshots() {
|
|
1350
|
-
return Array.from(
|
|
1351
|
-
(
|
|
1352
|
-
);
|
|
1390
|
+
return Array.from(
|
|
1391
|
+
Effect.runSync(Ref.get(this.snapshotsRef)).values()
|
|
1392
|
+
).sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1353
1393
|
}
|
|
1354
1394
|
async loadRunSnapshotsFromArtifacts() {
|
|
1355
1395
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1378,11 +1418,15 @@ var EffectRunner = class {
|
|
|
1378
1418
|
);
|
|
1379
1419
|
}
|
|
1380
1420
|
updateSnapshot(runId, updater) {
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1421
|
+
return Ref.modify(this.snapshotsRef, (map) => {
|
|
1422
|
+
const existing = map.get(runId);
|
|
1423
|
+
if (!existing) {
|
|
1424
|
+
return [void 0, map];
|
|
1425
|
+
}
|
|
1426
|
+
const next = new Map(map);
|
|
1427
|
+
next.set(runId, updater(existing));
|
|
1428
|
+
return [void 0, next];
|
|
1429
|
+
}).pipe(Effect.asVoid);
|
|
1386
1430
|
}
|
|
1387
1431
|
publishEvent(event) {
|
|
1388
1432
|
return Effect.sync(() => {
|
|
@@ -1398,8 +1442,10 @@ var EffectRunner = class {
|
|
|
1398
1442
|
);
|
|
1399
1443
|
}
|
|
1400
1444
|
};
|
|
1445
|
+
|
|
1446
|
+
// src/cli-simple/args.ts
|
|
1401
1447
|
function getDefaultConcurrency() {
|
|
1402
|
-
return
|
|
1448
|
+
return 4;
|
|
1403
1449
|
}
|
|
1404
1450
|
function parseSimpleCliArgs(argv) {
|
|
1405
1451
|
const args = {
|
|
@@ -1446,7 +1492,7 @@ function getSimpleCliUsage() {
|
|
|
1446
1492
|
" eval-agents-simple generate --dataset <datasetName>",
|
|
1447
1493
|
"",
|
|
1448
1494
|
"Options:",
|
|
1449
|
-
" --concurrency, -c N Max concurrent
|
|
1495
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
|
|
1450
1496
|
"",
|
|
1451
1497
|
"Pattern examples for --evaluator:",
|
|
1452
1498
|
" score-evaluator exact name (case-insensitive)",
|