@m4trix/evals 0.21.1 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +200 -154
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +201 -155
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +196 -151
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +197 -152
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +196 -151
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +197 -152
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -10,7 +10,6 @@ var promises = require('fs/promises');
|
|
|
10
10
|
var url = require('url');
|
|
11
11
|
var diff = require('diff');
|
|
12
12
|
var stringify = require('fast-json-stable-stringify');
|
|
13
|
-
var os = require('os');
|
|
14
13
|
var React2 = require('react');
|
|
15
14
|
var ink = require('ink');
|
|
16
15
|
var jsxRuntime = require('react/jsx-runtime');
|
|
@@ -763,6 +762,20 @@ function readOutput(testCase) {
|
|
|
763
762
|
}
|
|
764
763
|
return candidate.getOutput();
|
|
765
764
|
}
|
|
765
|
+
function buildEvaluationUnits(testCases) {
|
|
766
|
+
const units = [];
|
|
767
|
+
for (const testCaseItem of testCases) {
|
|
768
|
+
const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
769
|
+
for (let r = 0; r < rerunTotal; r++) {
|
|
770
|
+
units.push({
|
|
771
|
+
testCaseItem,
|
|
772
|
+
rerunIndex: r + 1,
|
|
773
|
+
rerunTotal
|
|
774
|
+
});
|
|
775
|
+
}
|
|
776
|
+
}
|
|
777
|
+
return units;
|
|
778
|
+
}
|
|
766
779
|
function nowIsoForFile() {
|
|
767
780
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
768
781
|
}
|
|
@@ -772,157 +785,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
772
785
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
773
786
|
);
|
|
774
787
|
}
|
|
775
|
-
function
|
|
788
|
+
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
789
|
+
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
776
790
|
return effect.Effect.gen(function* () {
|
|
777
|
-
const
|
|
778
|
-
const
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
841
|
-
logs.push(taggedEntry ?? createLogEntry(result));
|
|
842
|
-
testCaseError = result.message;
|
|
843
|
-
evaluatorScores.push({
|
|
844
|
-
evaluatorId,
|
|
845
|
-
scores: [],
|
|
846
|
-
passed: false,
|
|
847
|
-
logs: logs.length > 0 ? logs : void 0
|
|
848
|
-
});
|
|
849
|
-
continue;
|
|
850
|
-
}
|
|
851
|
-
const { scores, metrics } = normalizeResult(result);
|
|
852
|
-
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
853
|
-
evaluatorScores.push({
|
|
854
|
-
evaluatorId,
|
|
855
|
-
scores,
|
|
856
|
-
passed: passed2,
|
|
857
|
-
metrics,
|
|
858
|
-
logs: logs.length > 0 ? logs : void 0
|
|
859
|
-
});
|
|
860
|
-
} catch (error) {
|
|
861
|
-
if (error instanceof Error) {
|
|
862
|
-
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
863
|
-
logs.push(taggedEntry ?? createLogEntry(error));
|
|
864
|
-
}
|
|
865
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
791
|
+
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
792
|
+
const started = Date.now();
|
|
793
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
794
|
+
n + 1,
|
|
795
|
+
n + 1
|
|
796
|
+
]);
|
|
797
|
+
yield* publishEvent({
|
|
798
|
+
type: "TestCaseStarted",
|
|
799
|
+
runId: task.runId,
|
|
800
|
+
testCaseId: testCaseItem.id,
|
|
801
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
802
|
+
startedTestCases: startedEvaluations,
|
|
803
|
+
totalTestCases: totalEvaluations,
|
|
804
|
+
rerunIndex,
|
|
805
|
+
rerunTotal
|
|
806
|
+
});
|
|
807
|
+
const evaluatorScores = [];
|
|
808
|
+
let testCaseError;
|
|
809
|
+
const output = readOutput(testCaseItem.testCase);
|
|
810
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
811
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
812
|
+
if (!evaluateFn) {
|
|
813
|
+
continue;
|
|
814
|
+
}
|
|
815
|
+
const logs = [];
|
|
816
|
+
const logDiff = (expected, actual, options) => {
|
|
817
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
818
|
+
};
|
|
819
|
+
const log = (message, options) => {
|
|
820
|
+
logs.push(createLogEntry(message, options));
|
|
821
|
+
};
|
|
822
|
+
const createError = (message, options) => {
|
|
823
|
+
const entry = createLogEntry(message, options);
|
|
824
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
825
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
826
|
+
return error;
|
|
827
|
+
};
|
|
828
|
+
try {
|
|
829
|
+
const ctx = yield* effect.Effect.promise(
|
|
830
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
831
|
+
);
|
|
832
|
+
const result = yield* effect.Effect.promise(
|
|
833
|
+
() => Promise.resolve().then(
|
|
834
|
+
() => evaluateFn({
|
|
835
|
+
input: testCaseItem.testCase.getInput(),
|
|
836
|
+
ctx,
|
|
837
|
+
output,
|
|
838
|
+
meta: {
|
|
839
|
+
triggerId: task.triggerId,
|
|
840
|
+
runId: evaluatorRunId,
|
|
841
|
+
datasetId: task.datasetId
|
|
842
|
+
},
|
|
843
|
+
logDiff,
|
|
844
|
+
log,
|
|
845
|
+
createError
|
|
846
|
+
})
|
|
847
|
+
)
|
|
848
|
+
);
|
|
849
|
+
if (result instanceof Error) {
|
|
850
|
+
const evaluatorError = result;
|
|
851
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
852
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
853
|
+
testCaseError = result.message;
|
|
866
854
|
evaluatorScores.push({
|
|
867
855
|
evaluatorId,
|
|
868
856
|
scores: [],
|
|
869
857
|
passed: false,
|
|
870
858
|
logs: logs.length > 0 ? logs : void 0
|
|
871
859
|
});
|
|
860
|
+
continue;
|
|
872
861
|
}
|
|
862
|
+
const { scores, metrics } = normalizeResult(result);
|
|
863
|
+
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
864
|
+
evaluatorScores.push({
|
|
865
|
+
evaluatorId,
|
|
866
|
+
scores,
|
|
867
|
+
passed,
|
|
868
|
+
metrics,
|
|
869
|
+
logs: logs.length > 0 ? logs : void 0
|
|
870
|
+
});
|
|
871
|
+
} catch (error) {
|
|
872
|
+
if (error instanceof Error) {
|
|
873
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
874
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
875
|
+
}
|
|
876
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
877
|
+
evaluatorScores.push({
|
|
878
|
+
evaluatorId,
|
|
879
|
+
scores: [],
|
|
880
|
+
passed: false,
|
|
881
|
+
logs: logs.length > 0 ? logs : void 0
|
|
882
|
+
});
|
|
873
883
|
}
|
|
874
|
-
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
875
|
-
rerunPassed.push(rerunPassedThis);
|
|
876
|
-
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
877
|
-
n + 1,
|
|
878
|
-
n + 1
|
|
879
|
-
]);
|
|
880
|
-
const progressEvent = {
|
|
881
|
-
type: "TestCaseProgress",
|
|
882
|
-
runId: task.runId,
|
|
883
|
-
testCaseId: testCaseItem.id,
|
|
884
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
885
|
-
completedTestCases: completedEvaluations,
|
|
886
|
-
totalTestCases: totalEvaluations,
|
|
887
|
-
rerunIndex: r + 1,
|
|
888
|
-
rerunTotal: reruns,
|
|
889
|
-
passed: rerunPassedThis,
|
|
890
|
-
durationMs: Date.now() - started,
|
|
891
|
-
evaluatorScores,
|
|
892
|
-
output,
|
|
893
|
-
errorMessage: testCaseError
|
|
894
|
-
};
|
|
895
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
896
|
-
...snapshot,
|
|
897
|
-
completedTestCases: completedEvaluations
|
|
898
|
-
}));
|
|
899
|
-
yield* publishEvent(progressEvent);
|
|
900
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
901
|
-
runId: task.runId,
|
|
902
|
-
artifactPath: task.snapshot.artifactPath,
|
|
903
|
-
payload: progressEvent
|
|
904
|
-
});
|
|
905
|
-
}
|
|
906
|
-
const testCasePassed = rerunPassed.every(Boolean);
|
|
907
|
-
if (testCasePassed) {
|
|
908
|
-
yield* effect.Ref.update(passedRef, (n) => n + 1);
|
|
909
|
-
} else {
|
|
910
|
-
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
911
884
|
}
|
|
912
|
-
const
|
|
913
|
-
|
|
914
|
-
|
|
885
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
886
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
887
|
+
n + 1,
|
|
888
|
+
n + 1
|
|
915
889
|
]);
|
|
916
|
-
|
|
890
|
+
const progressEvent = {
|
|
891
|
+
type: "TestCaseProgress",
|
|
892
|
+
runId: task.runId,
|
|
893
|
+
testCaseId: testCaseItem.id,
|
|
894
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
895
|
+
completedTestCases: completedEvaluations,
|
|
896
|
+
totalTestCases: totalEvaluations,
|
|
897
|
+
rerunIndex,
|
|
898
|
+
rerunTotal,
|
|
899
|
+
passed: rerunPassedThis,
|
|
900
|
+
durationMs: Date.now() - started,
|
|
901
|
+
evaluatorScores,
|
|
902
|
+
output,
|
|
903
|
+
errorMessage: testCaseError
|
|
904
|
+
};
|
|
905
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
917
906
|
...snapshot,
|
|
918
|
-
|
|
919
|
-
failedTestCases: failed
|
|
907
|
+
completedTestCases: completedEvaluations
|
|
920
908
|
}));
|
|
909
|
+
yield* publishEvent(progressEvent);
|
|
910
|
+
yield* effect.Queue.offer(persistenceQueue, {
|
|
911
|
+
runId: task.runId,
|
|
912
|
+
artifactPath: task.snapshot.artifactPath,
|
|
913
|
+
payload: progressEvent
|
|
914
|
+
});
|
|
915
|
+
const testCaseCompleted = yield* effect.Ref.modify(
|
|
916
|
+
testCaseResultsRef,
|
|
917
|
+
(map) => {
|
|
918
|
+
const key = testCaseItem.id;
|
|
919
|
+
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
920
|
+
const newResults = [...existing.results, rerunPassedThis];
|
|
921
|
+
const newCompletedCount = existing.completedCount + 1;
|
|
922
|
+
const isLast = newCompletedCount === rerunTotal;
|
|
923
|
+
const newMap = new Map(map);
|
|
924
|
+
newMap.set(key, {
|
|
925
|
+
completedCount: newCompletedCount,
|
|
926
|
+
results: newResults
|
|
927
|
+
});
|
|
928
|
+
const outcome = isLast ? newResults.every(Boolean) : null;
|
|
929
|
+
return [outcome, newMap];
|
|
930
|
+
}
|
|
931
|
+
);
|
|
932
|
+
if (testCaseCompleted !== null) {
|
|
933
|
+
if (testCaseCompleted) {
|
|
934
|
+
yield* effect.Ref.update(passedRef, (n) => n + 1);
|
|
935
|
+
} else {
|
|
936
|
+
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
937
|
+
}
|
|
938
|
+
const [passed, failed] = yield* effect.Effect.all([
|
|
939
|
+
effect.Ref.get(passedRef),
|
|
940
|
+
effect.Ref.get(failedRef)
|
|
941
|
+
]);
|
|
942
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
943
|
+
...snapshot,
|
|
944
|
+
passedTestCases: passed,
|
|
945
|
+
failedTestCases: failed
|
|
946
|
+
}));
|
|
947
|
+
}
|
|
921
948
|
});
|
|
922
949
|
}
|
|
923
950
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
924
951
|
const startedAt = Date.now();
|
|
925
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
952
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
926
953
|
...snapshot,
|
|
927
954
|
status: "running",
|
|
928
955
|
startedAt
|
|
@@ -941,9 +968,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
941
968
|
const startedRef = yield* effect.Ref.make(0);
|
|
942
969
|
const passedRef = yield* effect.Ref.make(0);
|
|
943
970
|
const failedRef = yield* effect.Ref.make(0);
|
|
944
|
-
const
|
|
971
|
+
const testCaseResultsRef = yield* effect.Ref.make(
|
|
972
|
+
/* @__PURE__ */ new Map()
|
|
973
|
+
);
|
|
974
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
975
|
+
const processEvaluation = (unit) => processOneEvaluation(
|
|
945
976
|
task,
|
|
946
|
-
|
|
977
|
+
unit,
|
|
947
978
|
totalEvaluations,
|
|
948
979
|
publishEvent,
|
|
949
980
|
persistenceQueue,
|
|
@@ -951,11 +982,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
951
982
|
startedRef,
|
|
952
983
|
completedRef,
|
|
953
984
|
passedRef,
|
|
954
|
-
failedRef
|
|
985
|
+
failedRef,
|
|
986
|
+
testCaseResultsRef
|
|
955
987
|
);
|
|
956
988
|
yield* effect.Effect.forEach(
|
|
957
|
-
|
|
958
|
-
|
|
989
|
+
evaluationUnits,
|
|
990
|
+
processEvaluation,
|
|
959
991
|
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
960
992
|
);
|
|
961
993
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
@@ -973,7 +1005,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
973
1005
|
totalTestCases: task.testCases.length,
|
|
974
1006
|
artifactPath: task.snapshot.artifactPath
|
|
975
1007
|
};
|
|
976
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1008
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
977
1009
|
...snapshot,
|
|
978
1010
|
status: "completed",
|
|
979
1011
|
completedTestCases: completedEvaluations,
|
|
@@ -1226,7 +1258,9 @@ var EffectRunner = class {
|
|
|
1226
1258
|
this.persistenceQueue = effect.Effect.runSync(
|
|
1227
1259
|
effect.Queue.unbounded()
|
|
1228
1260
|
);
|
|
1229
|
-
this.
|
|
1261
|
+
this.snapshotsRef = effect.Effect.runSync(
|
|
1262
|
+
effect.Ref.make(/* @__PURE__ */ new Map())
|
|
1263
|
+
);
|
|
1230
1264
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1231
1265
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1232
1266
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
@@ -1329,7 +1363,13 @@ var EffectRunner = class {
|
|
|
1329
1363
|
status: "queued",
|
|
1330
1364
|
artifactPath
|
|
1331
1365
|
};
|
|
1332
|
-
|
|
1366
|
+
await effect.Effect.runPromise(
|
|
1367
|
+
effect.Ref.update(this.snapshotsRef, (map) => {
|
|
1368
|
+
const next = new Map(map);
|
|
1369
|
+
next.set(runId, snapshot);
|
|
1370
|
+
return next;
|
|
1371
|
+
})
|
|
1372
|
+
);
|
|
1333
1373
|
const queuedEvent = {
|
|
1334
1374
|
type: "RunQueued",
|
|
1335
1375
|
runId,
|
|
@@ -1370,12 +1410,12 @@ var EffectRunner = class {
|
|
|
1370
1410
|
};
|
|
1371
1411
|
}
|
|
1372
1412
|
getRunSnapshot(runId) {
|
|
1373
|
-
return this.
|
|
1413
|
+
return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
|
|
1374
1414
|
}
|
|
1375
1415
|
getAllRunSnapshots() {
|
|
1376
|
-
return Array.from(
|
|
1377
|
-
(
|
|
1378
|
-
);
|
|
1416
|
+
return Array.from(
|
|
1417
|
+
effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
|
|
1418
|
+
).sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1379
1419
|
}
|
|
1380
1420
|
async loadRunSnapshotsFromArtifacts() {
|
|
1381
1421
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1404,11 +1444,15 @@ var EffectRunner = class {
|
|
|
1404
1444
|
);
|
|
1405
1445
|
}
|
|
1406
1446
|
updateSnapshot(runId, updater) {
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1447
|
+
return effect.Ref.modify(this.snapshotsRef, (map) => {
|
|
1448
|
+
const existing = map.get(runId);
|
|
1449
|
+
if (!existing) {
|
|
1450
|
+
return [void 0, map];
|
|
1451
|
+
}
|
|
1452
|
+
const next = new Map(map);
|
|
1453
|
+
next.set(runId, updater(existing));
|
|
1454
|
+
return [void 0, next];
|
|
1455
|
+
}).pipe(effect.Effect.asVoid);
|
|
1412
1456
|
}
|
|
1413
1457
|
publishEvent(event) {
|
|
1414
1458
|
return effect.Effect.sync(() => {
|
|
@@ -1424,8 +1468,10 @@ var EffectRunner = class {
|
|
|
1424
1468
|
);
|
|
1425
1469
|
}
|
|
1426
1470
|
};
|
|
1471
|
+
|
|
1472
|
+
// src/cli-simple/args.ts
|
|
1427
1473
|
function getDefaultConcurrency() {
|
|
1428
|
-
return
|
|
1474
|
+
return 4;
|
|
1429
1475
|
}
|
|
1430
1476
|
function parseSimpleCliArgs(argv) {
|
|
1431
1477
|
const args = {
|
|
@@ -1472,7 +1518,7 @@ function getSimpleCliUsage() {
|
|
|
1472
1518
|
" eval-agents-simple generate --dataset <datasetName>",
|
|
1473
1519
|
"",
|
|
1474
1520
|
"Options:",
|
|
1475
|
-
" --concurrency, -c N Max concurrent
|
|
1521
|
+
" --concurrency, -c N Max concurrent evaluations (default: 4). Use 1 for sequential.",
|
|
1476
1522
|
"",
|
|
1477
1523
|
"Pattern examples for --evaluator:",
|
|
1478
1524
|
" score-evaluator exact name (case-insensitive)",
|