@m4trix/evals 0.21.1 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +196 -151
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +197 -152
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +196 -151
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +197 -152
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +196 -151
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +197 -152
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -763,6 +763,20 @@ function readOutput(testCase) {
|
|
|
763
763
|
}
|
|
764
764
|
return candidate.getOutput();
|
|
765
765
|
}
|
|
766
|
+
function buildEvaluationUnits(testCases) {
|
|
767
|
+
const units = [];
|
|
768
|
+
for (const testCaseItem of testCases) {
|
|
769
|
+
const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
770
|
+
for (let r = 0; r < rerunTotal; r++) {
|
|
771
|
+
units.push({
|
|
772
|
+
testCaseItem,
|
|
773
|
+
rerunIndex: r + 1,
|
|
774
|
+
rerunTotal
|
|
775
|
+
});
|
|
776
|
+
}
|
|
777
|
+
}
|
|
778
|
+
return units;
|
|
779
|
+
}
|
|
766
780
|
function nowIsoForFile() {
|
|
767
781
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
768
782
|
}
|
|
@@ -772,157 +786,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
772
786
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
773
787
|
);
|
|
774
788
|
}
|
|
775
|
-
function
|
|
789
|
+
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
790
|
+
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
776
791
|
return effect.Effect.gen(function* () {
|
|
777
|
-
const
|
|
778
|
-
const
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
841
|
-
logs.push(taggedEntry ?? createLogEntry(result));
|
|
842
|
-
testCaseError = result.message;
|
|
843
|
-
evaluatorScores.push({
|
|
844
|
-
evaluatorId,
|
|
845
|
-
scores: [],
|
|
846
|
-
passed: false,
|
|
847
|
-
logs: logs.length > 0 ? logs : void 0
|
|
848
|
-
});
|
|
849
|
-
continue;
|
|
850
|
-
}
|
|
851
|
-
const { scores, metrics } = normalizeResult(result);
|
|
852
|
-
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
853
|
-
evaluatorScores.push({
|
|
854
|
-
evaluatorId,
|
|
855
|
-
scores,
|
|
856
|
-
passed: passed2,
|
|
857
|
-
metrics,
|
|
858
|
-
logs: logs.length > 0 ? logs : void 0
|
|
859
|
-
});
|
|
860
|
-
} catch (error) {
|
|
861
|
-
if (error instanceof Error) {
|
|
862
|
-
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
863
|
-
logs.push(taggedEntry ?? createLogEntry(error));
|
|
864
|
-
}
|
|
865
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
792
|
+
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
793
|
+
const started = Date.now();
|
|
794
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
795
|
+
n + 1,
|
|
796
|
+
n + 1
|
|
797
|
+
]);
|
|
798
|
+
yield* publishEvent({
|
|
799
|
+
type: "TestCaseStarted",
|
|
800
|
+
runId: task.runId,
|
|
801
|
+
testCaseId: testCaseItem.id,
|
|
802
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
803
|
+
startedTestCases: startedEvaluations,
|
|
804
|
+
totalTestCases: totalEvaluations,
|
|
805
|
+
rerunIndex,
|
|
806
|
+
rerunTotal
|
|
807
|
+
});
|
|
808
|
+
const evaluatorScores = [];
|
|
809
|
+
let testCaseError;
|
|
810
|
+
const output = readOutput(testCaseItem.testCase);
|
|
811
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
812
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
813
|
+
if (!evaluateFn) {
|
|
814
|
+
continue;
|
|
815
|
+
}
|
|
816
|
+
const logs = [];
|
|
817
|
+
const logDiff = (expected, actual, options) => {
|
|
818
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
819
|
+
};
|
|
820
|
+
const log = (message, options) => {
|
|
821
|
+
logs.push(createLogEntry(message, options));
|
|
822
|
+
};
|
|
823
|
+
const createError = (message, options) => {
|
|
824
|
+
const entry = createLogEntry(message, options);
|
|
825
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
826
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
827
|
+
return error;
|
|
828
|
+
};
|
|
829
|
+
try {
|
|
830
|
+
const ctx = yield* effect.Effect.promise(
|
|
831
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
832
|
+
);
|
|
833
|
+
const result = yield* effect.Effect.promise(
|
|
834
|
+
() => Promise.resolve().then(
|
|
835
|
+
() => evaluateFn({
|
|
836
|
+
input: testCaseItem.testCase.getInput(),
|
|
837
|
+
ctx,
|
|
838
|
+
output,
|
|
839
|
+
meta: {
|
|
840
|
+
triggerId: task.triggerId,
|
|
841
|
+
runId: evaluatorRunId,
|
|
842
|
+
datasetId: task.datasetId
|
|
843
|
+
},
|
|
844
|
+
logDiff,
|
|
845
|
+
log,
|
|
846
|
+
createError
|
|
847
|
+
})
|
|
848
|
+
)
|
|
849
|
+
);
|
|
850
|
+
if (result instanceof Error) {
|
|
851
|
+
const evaluatorError = result;
|
|
852
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
853
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
854
|
+
testCaseError = result.message;
|
|
866
855
|
evaluatorScores.push({
|
|
867
856
|
evaluatorId,
|
|
868
857
|
scores: [],
|
|
869
858
|
passed: false,
|
|
870
859
|
logs: logs.length > 0 ? logs : void 0
|
|
871
860
|
});
|
|
861
|
+
continue;
|
|
862
|
+
}
|
|
863
|
+
const { scores, metrics } = normalizeResult(result);
|
|
864
|
+
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
865
|
+
evaluatorScores.push({
|
|
866
|
+
evaluatorId,
|
|
867
|
+
scores,
|
|
868
|
+
passed,
|
|
869
|
+
metrics,
|
|
870
|
+
logs: logs.length > 0 ? logs : void 0
|
|
871
|
+
});
|
|
872
|
+
} catch (error) {
|
|
873
|
+
if (error instanceof Error) {
|
|
874
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
875
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
872
876
|
}
|
|
877
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
878
|
+
evaluatorScores.push({
|
|
879
|
+
evaluatorId,
|
|
880
|
+
scores: [],
|
|
881
|
+
passed: false,
|
|
882
|
+
logs: logs.length > 0 ? logs : void 0
|
|
883
|
+
});
|
|
873
884
|
}
|
|
874
|
-
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
875
|
-
rerunPassed.push(rerunPassedThis);
|
|
876
|
-
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
877
|
-
n + 1,
|
|
878
|
-
n + 1
|
|
879
|
-
]);
|
|
880
|
-
const progressEvent = {
|
|
881
|
-
type: "TestCaseProgress",
|
|
882
|
-
runId: task.runId,
|
|
883
|
-
testCaseId: testCaseItem.id,
|
|
884
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
885
|
-
completedTestCases: completedEvaluations,
|
|
886
|
-
totalTestCases: totalEvaluations,
|
|
887
|
-
rerunIndex: r + 1,
|
|
888
|
-
rerunTotal: reruns,
|
|
889
|
-
passed: rerunPassedThis,
|
|
890
|
-
durationMs: Date.now() - started,
|
|
891
|
-
evaluatorScores,
|
|
892
|
-
output,
|
|
893
|
-
errorMessage: testCaseError
|
|
894
|
-
};
|
|
895
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
896
|
-
...snapshot,
|
|
897
|
-
completedTestCases: completedEvaluations
|
|
898
|
-
}));
|
|
899
|
-
yield* publishEvent(progressEvent);
|
|
900
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
901
|
-
runId: task.runId,
|
|
902
|
-
artifactPath: task.snapshot.artifactPath,
|
|
903
|
-
payload: progressEvent
|
|
904
|
-
});
|
|
905
|
-
}
|
|
906
|
-
const testCasePassed = rerunPassed.every(Boolean);
|
|
907
|
-
if (testCasePassed) {
|
|
908
|
-
yield* effect.Ref.update(passedRef, (n) => n + 1);
|
|
909
|
-
} else {
|
|
910
|
-
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
911
885
|
}
|
|
912
|
-
const
|
|
913
|
-
|
|
914
|
-
|
|
886
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
887
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
888
|
+
n + 1,
|
|
889
|
+
n + 1
|
|
915
890
|
]);
|
|
916
|
-
|
|
891
|
+
const progressEvent = {
|
|
892
|
+
type: "TestCaseProgress",
|
|
893
|
+
runId: task.runId,
|
|
894
|
+
testCaseId: testCaseItem.id,
|
|
895
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
896
|
+
completedTestCases: completedEvaluations,
|
|
897
|
+
totalTestCases: totalEvaluations,
|
|
898
|
+
rerunIndex,
|
|
899
|
+
rerunTotal,
|
|
900
|
+
passed: rerunPassedThis,
|
|
901
|
+
durationMs: Date.now() - started,
|
|
902
|
+
evaluatorScores,
|
|
903
|
+
output,
|
|
904
|
+
errorMessage: testCaseError
|
|
905
|
+
};
|
|
906
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
917
907
|
...snapshot,
|
|
918
|
-
|
|
919
|
-
failedTestCases: failed
|
|
908
|
+
completedTestCases: completedEvaluations
|
|
920
909
|
}));
|
|
910
|
+
yield* publishEvent(progressEvent);
|
|
911
|
+
yield* effect.Queue.offer(persistenceQueue, {
|
|
912
|
+
runId: task.runId,
|
|
913
|
+
artifactPath: task.snapshot.artifactPath,
|
|
914
|
+
payload: progressEvent
|
|
915
|
+
});
|
|
916
|
+
const testCaseCompleted = yield* effect.Ref.modify(
|
|
917
|
+
testCaseResultsRef,
|
|
918
|
+
(map) => {
|
|
919
|
+
const key = testCaseItem.id;
|
|
920
|
+
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
921
|
+
const newResults = [...existing.results, rerunPassedThis];
|
|
922
|
+
const newCompletedCount = existing.completedCount + 1;
|
|
923
|
+
const isLast = newCompletedCount === rerunTotal;
|
|
924
|
+
const newMap = new Map(map);
|
|
925
|
+
newMap.set(key, {
|
|
926
|
+
completedCount: newCompletedCount,
|
|
927
|
+
results: newResults
|
|
928
|
+
});
|
|
929
|
+
const outcome = isLast ? newResults.every(Boolean) : null;
|
|
930
|
+
return [outcome, newMap];
|
|
931
|
+
}
|
|
932
|
+
);
|
|
933
|
+
if (testCaseCompleted !== null) {
|
|
934
|
+
if (testCaseCompleted) {
|
|
935
|
+
yield* effect.Ref.update(passedRef, (n) => n + 1);
|
|
936
|
+
} else {
|
|
937
|
+
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
938
|
+
}
|
|
939
|
+
const [passed, failed] = yield* effect.Effect.all([
|
|
940
|
+
effect.Ref.get(passedRef),
|
|
941
|
+
effect.Ref.get(failedRef)
|
|
942
|
+
]);
|
|
943
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
944
|
+
...snapshot,
|
|
945
|
+
passedTestCases: passed,
|
|
946
|
+
failedTestCases: failed
|
|
947
|
+
}));
|
|
948
|
+
}
|
|
921
949
|
});
|
|
922
950
|
}
|
|
923
951
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
924
952
|
const startedAt = Date.now();
|
|
925
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
953
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
926
954
|
...snapshot,
|
|
927
955
|
status: "running",
|
|
928
956
|
startedAt
|
|
@@ -941,9 +969,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
941
969
|
const startedRef = yield* effect.Ref.make(0);
|
|
942
970
|
const passedRef = yield* effect.Ref.make(0);
|
|
943
971
|
const failedRef = yield* effect.Ref.make(0);
|
|
944
|
-
const
|
|
972
|
+
const testCaseResultsRef = yield* effect.Ref.make(
|
|
973
|
+
/* @__PURE__ */ new Map()
|
|
974
|
+
);
|
|
975
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
976
|
+
const processEvaluation = (unit) => processOneEvaluation(
|
|
945
977
|
task,
|
|
946
|
-
|
|
978
|
+
unit,
|
|
947
979
|
totalEvaluations,
|
|
948
980
|
publishEvent,
|
|
949
981
|
persistenceQueue,
|
|
@@ -951,11 +983,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
951
983
|
startedRef,
|
|
952
984
|
completedRef,
|
|
953
985
|
passedRef,
|
|
954
|
-
failedRef
|
|
986
|
+
failedRef,
|
|
987
|
+
testCaseResultsRef
|
|
955
988
|
);
|
|
956
989
|
yield* effect.Effect.forEach(
|
|
957
|
-
|
|
958
|
-
|
|
990
|
+
evaluationUnits,
|
|
991
|
+
processEvaluation,
|
|
959
992
|
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
960
993
|
);
|
|
961
994
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
@@ -973,7 +1006,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
973
1006
|
totalTestCases: task.testCases.length,
|
|
974
1007
|
artifactPath: task.snapshot.artifactPath
|
|
975
1008
|
};
|
|
976
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1009
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
977
1010
|
...snapshot,
|
|
978
1011
|
status: "completed",
|
|
979
1012
|
completedTestCases: completedEvaluations,
|
|
@@ -1226,7 +1259,9 @@ var EffectRunner = class {
|
|
|
1226
1259
|
this.persistenceQueue = effect.Effect.runSync(
|
|
1227
1260
|
effect.Queue.unbounded()
|
|
1228
1261
|
);
|
|
1229
|
-
this.
|
|
1262
|
+
this.snapshotsRef = effect.Effect.runSync(
|
|
1263
|
+
effect.Ref.make(/* @__PURE__ */ new Map())
|
|
1264
|
+
);
|
|
1230
1265
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1231
1266
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1232
1267
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
@@ -1329,7 +1364,13 @@ var EffectRunner = class {
|
|
|
1329
1364
|
status: "queued",
|
|
1330
1365
|
artifactPath
|
|
1331
1366
|
};
|
|
1332
|
-
|
|
1367
|
+
await effect.Effect.runPromise(
|
|
1368
|
+
effect.Ref.update(this.snapshotsRef, (map) => {
|
|
1369
|
+
const next = new Map(map);
|
|
1370
|
+
next.set(runId, snapshot);
|
|
1371
|
+
return next;
|
|
1372
|
+
})
|
|
1373
|
+
);
|
|
1333
1374
|
const queuedEvent = {
|
|
1334
1375
|
type: "RunQueued",
|
|
1335
1376
|
runId,
|
|
@@ -1370,12 +1411,12 @@ var EffectRunner = class {
|
|
|
1370
1411
|
};
|
|
1371
1412
|
}
|
|
1372
1413
|
getRunSnapshot(runId) {
|
|
1373
|
-
return this.
|
|
1414
|
+
return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
|
|
1374
1415
|
}
|
|
1375
1416
|
getAllRunSnapshots() {
|
|
1376
|
-
return Array.from(
|
|
1377
|
-
(
|
|
1378
|
-
);
|
|
1417
|
+
return Array.from(
|
|
1418
|
+
effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
|
|
1419
|
+
).sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1379
1420
|
}
|
|
1380
1421
|
async loadRunSnapshotsFromArtifacts() {
|
|
1381
1422
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1404,11 +1445,15 @@ var EffectRunner = class {
|
|
|
1404
1445
|
);
|
|
1405
1446
|
}
|
|
1406
1447
|
updateSnapshot(runId, updater) {
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1448
|
+
return effect.Ref.modify(this.snapshotsRef, (map) => {
|
|
1449
|
+
const existing = map.get(runId);
|
|
1450
|
+
if (!existing) {
|
|
1451
|
+
return [void 0, map];
|
|
1452
|
+
}
|
|
1453
|
+
const next = new Map(map);
|
|
1454
|
+
next.set(runId, updater(existing));
|
|
1455
|
+
return [void 0, next];
|
|
1456
|
+
}).pipe(effect.Effect.asVoid);
|
|
1412
1457
|
}
|
|
1413
1458
|
publishEvent(event) {
|
|
1414
1459
|
return effect.Effect.sync(() => {
|