@m4trix/evals 0.21.1 → 0.22.0
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +196 -151
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +197 -152
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +196 -151
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +197 -152
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +196 -151
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +197 -152
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { randomUUID } from 'crypto';
|
|
3
|
-
import { Effect, PubSub, Queue,
|
|
3
|
+
import { Effect, PubSub, Queue, Ref, Fiber } from 'effect';
|
|
4
4
|
import { existsSync } from 'fs';
|
|
5
5
|
import { resolve, relative, join, parse, dirname } from 'path';
|
|
6
6
|
import * as jitiModule from 'jiti';
|
|
@@ -737,6 +737,20 @@ function readOutput(testCase) {
|
|
|
737
737
|
}
|
|
738
738
|
return candidate.getOutput();
|
|
739
739
|
}
|
|
740
|
+
function buildEvaluationUnits(testCases) {
|
|
741
|
+
const units = [];
|
|
742
|
+
for (const testCaseItem of testCases) {
|
|
743
|
+
const rerunTotal = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
744
|
+
for (let r = 0; r < rerunTotal; r++) {
|
|
745
|
+
units.push({
|
|
746
|
+
testCaseItem,
|
|
747
|
+
rerunIndex: r + 1,
|
|
748
|
+
rerunTotal
|
|
749
|
+
});
|
|
750
|
+
}
|
|
751
|
+
}
|
|
752
|
+
return units;
|
|
753
|
+
}
|
|
740
754
|
function nowIsoForFile() {
|
|
741
755
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
742
756
|
}
|
|
@@ -746,157 +760,171 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
746
760
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
747
761
|
);
|
|
748
762
|
}
|
|
749
|
-
function
|
|
763
|
+
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
764
|
+
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
750
765
|
return Effect.gen(function* () {
|
|
751
|
-
const
|
|
752
|
-
const
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
815
|
-
logs.push(taggedEntry ?? createLogEntry(result));
|
|
816
|
-
testCaseError = result.message;
|
|
817
|
-
evaluatorScores.push({
|
|
818
|
-
evaluatorId,
|
|
819
|
-
scores: [],
|
|
820
|
-
passed: false,
|
|
821
|
-
logs: logs.length > 0 ? logs : void 0
|
|
822
|
-
});
|
|
823
|
-
continue;
|
|
824
|
-
}
|
|
825
|
-
const { scores, metrics } = normalizeResult(result);
|
|
826
|
-
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
827
|
-
evaluatorScores.push({
|
|
828
|
-
evaluatorId,
|
|
829
|
-
scores,
|
|
830
|
-
passed: passed2,
|
|
831
|
-
metrics,
|
|
832
|
-
logs: logs.length > 0 ? logs : void 0
|
|
833
|
-
});
|
|
834
|
-
} catch (error) {
|
|
835
|
-
if (error instanceof Error) {
|
|
836
|
-
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
837
|
-
logs.push(taggedEntry ?? createLogEntry(error));
|
|
838
|
-
}
|
|
839
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
766
|
+
const evaluatorRunId = `run-${randomUUID()}`;
|
|
767
|
+
const started = Date.now();
|
|
768
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
769
|
+
n + 1,
|
|
770
|
+
n + 1
|
|
771
|
+
]);
|
|
772
|
+
yield* publishEvent({
|
|
773
|
+
type: "TestCaseStarted",
|
|
774
|
+
runId: task.runId,
|
|
775
|
+
testCaseId: testCaseItem.id,
|
|
776
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
777
|
+
startedTestCases: startedEvaluations,
|
|
778
|
+
totalTestCases: totalEvaluations,
|
|
779
|
+
rerunIndex,
|
|
780
|
+
rerunTotal
|
|
781
|
+
});
|
|
782
|
+
const evaluatorScores = [];
|
|
783
|
+
let testCaseError;
|
|
784
|
+
const output = readOutput(testCaseItem.testCase);
|
|
785
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
786
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
787
|
+
if (!evaluateFn) {
|
|
788
|
+
continue;
|
|
789
|
+
}
|
|
790
|
+
const logs = [];
|
|
791
|
+
const logDiff = (expected, actual, options) => {
|
|
792
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
793
|
+
};
|
|
794
|
+
const log = (message, options) => {
|
|
795
|
+
logs.push(createLogEntry(message, options));
|
|
796
|
+
};
|
|
797
|
+
const createError = (message, options) => {
|
|
798
|
+
const entry = createLogEntry(message, options);
|
|
799
|
+
const error = message instanceof Error ? message : new Error(entry.message);
|
|
800
|
+
error[evaluatorErrorLogEntryKey] = entry;
|
|
801
|
+
return error;
|
|
802
|
+
};
|
|
803
|
+
try {
|
|
804
|
+
const ctx = yield* Effect.promise(
|
|
805
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
806
|
+
);
|
|
807
|
+
const result = yield* Effect.promise(
|
|
808
|
+
() => Promise.resolve().then(
|
|
809
|
+
() => evaluateFn({
|
|
810
|
+
input: testCaseItem.testCase.getInput(),
|
|
811
|
+
ctx,
|
|
812
|
+
output,
|
|
813
|
+
meta: {
|
|
814
|
+
triggerId: task.triggerId,
|
|
815
|
+
runId: evaluatorRunId,
|
|
816
|
+
datasetId: task.datasetId
|
|
817
|
+
},
|
|
818
|
+
logDiff,
|
|
819
|
+
log,
|
|
820
|
+
createError
|
|
821
|
+
})
|
|
822
|
+
)
|
|
823
|
+
);
|
|
824
|
+
if (result instanceof Error) {
|
|
825
|
+
const evaluatorError = result;
|
|
826
|
+
const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
|
|
827
|
+
logs.push(taggedEntry ?? createLogEntry(result));
|
|
828
|
+
testCaseError = result.message;
|
|
840
829
|
evaluatorScores.push({
|
|
841
830
|
evaluatorId,
|
|
842
831
|
scores: [],
|
|
843
832
|
passed: false,
|
|
844
833
|
logs: logs.length > 0 ? logs : void 0
|
|
845
834
|
});
|
|
835
|
+
continue;
|
|
836
|
+
}
|
|
837
|
+
const { scores, metrics } = normalizeResult(result);
|
|
838
|
+
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
839
|
+
evaluatorScores.push({
|
|
840
|
+
evaluatorId,
|
|
841
|
+
scores,
|
|
842
|
+
passed,
|
|
843
|
+
metrics,
|
|
844
|
+
logs: logs.length > 0 ? logs : void 0
|
|
845
|
+
});
|
|
846
|
+
} catch (error) {
|
|
847
|
+
if (error instanceof Error) {
|
|
848
|
+
const taggedEntry = error[evaluatorErrorLogEntryKey];
|
|
849
|
+
logs.push(taggedEntry ?? createLogEntry(error));
|
|
846
850
|
}
|
|
851
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
852
|
+
evaluatorScores.push({
|
|
853
|
+
evaluatorId,
|
|
854
|
+
scores: [],
|
|
855
|
+
passed: false,
|
|
856
|
+
logs: logs.length > 0 ? logs : void 0
|
|
857
|
+
});
|
|
847
858
|
}
|
|
848
|
-
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
849
|
-
rerunPassed.push(rerunPassedThis);
|
|
850
|
-
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
851
|
-
n + 1,
|
|
852
|
-
n + 1
|
|
853
|
-
]);
|
|
854
|
-
const progressEvent = {
|
|
855
|
-
type: "TestCaseProgress",
|
|
856
|
-
runId: task.runId,
|
|
857
|
-
testCaseId: testCaseItem.id,
|
|
858
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
859
|
-
completedTestCases: completedEvaluations,
|
|
860
|
-
totalTestCases: totalEvaluations,
|
|
861
|
-
rerunIndex: r + 1,
|
|
862
|
-
rerunTotal: reruns,
|
|
863
|
-
passed: rerunPassedThis,
|
|
864
|
-
durationMs: Date.now() - started,
|
|
865
|
-
evaluatorScores,
|
|
866
|
-
output,
|
|
867
|
-
errorMessage: testCaseError
|
|
868
|
-
};
|
|
869
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
870
|
-
...snapshot,
|
|
871
|
-
completedTestCases: completedEvaluations
|
|
872
|
-
}));
|
|
873
|
-
yield* publishEvent(progressEvent);
|
|
874
|
-
yield* Queue.offer(persistenceQueue, {
|
|
875
|
-
runId: task.runId,
|
|
876
|
-
artifactPath: task.snapshot.artifactPath,
|
|
877
|
-
payload: progressEvent
|
|
878
|
-
});
|
|
879
|
-
}
|
|
880
|
-
const testCasePassed = rerunPassed.every(Boolean);
|
|
881
|
-
if (testCasePassed) {
|
|
882
|
-
yield* Ref.update(passedRef, (n) => n + 1);
|
|
883
|
-
} else {
|
|
884
|
-
yield* Ref.update(failedRef, (n) => n + 1);
|
|
885
859
|
}
|
|
886
|
-
const
|
|
887
|
-
|
|
888
|
-
|
|
860
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
861
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
862
|
+
n + 1,
|
|
863
|
+
n + 1
|
|
889
864
|
]);
|
|
890
|
-
|
|
865
|
+
const progressEvent = {
|
|
866
|
+
type: "TestCaseProgress",
|
|
867
|
+
runId: task.runId,
|
|
868
|
+
testCaseId: testCaseItem.id,
|
|
869
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
870
|
+
completedTestCases: completedEvaluations,
|
|
871
|
+
totalTestCases: totalEvaluations,
|
|
872
|
+
rerunIndex,
|
|
873
|
+
rerunTotal,
|
|
874
|
+
passed: rerunPassedThis,
|
|
875
|
+
durationMs: Date.now() - started,
|
|
876
|
+
evaluatorScores,
|
|
877
|
+
output,
|
|
878
|
+
errorMessage: testCaseError
|
|
879
|
+
};
|
|
880
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
891
881
|
...snapshot,
|
|
892
|
-
|
|
893
|
-
failedTestCases: failed
|
|
882
|
+
completedTestCases: completedEvaluations
|
|
894
883
|
}));
|
|
884
|
+
yield* publishEvent(progressEvent);
|
|
885
|
+
yield* Queue.offer(persistenceQueue, {
|
|
886
|
+
runId: task.runId,
|
|
887
|
+
artifactPath: task.snapshot.artifactPath,
|
|
888
|
+
payload: progressEvent
|
|
889
|
+
});
|
|
890
|
+
const testCaseCompleted = yield* Ref.modify(
|
|
891
|
+
testCaseResultsRef,
|
|
892
|
+
(map) => {
|
|
893
|
+
const key = testCaseItem.id;
|
|
894
|
+
const existing = map.get(key) ?? { completedCount: 0, results: [] };
|
|
895
|
+
const newResults = [...existing.results, rerunPassedThis];
|
|
896
|
+
const newCompletedCount = existing.completedCount + 1;
|
|
897
|
+
const isLast = newCompletedCount === rerunTotal;
|
|
898
|
+
const newMap = new Map(map);
|
|
899
|
+
newMap.set(key, {
|
|
900
|
+
completedCount: newCompletedCount,
|
|
901
|
+
results: newResults
|
|
902
|
+
});
|
|
903
|
+
const outcome = isLast ? newResults.every(Boolean) : null;
|
|
904
|
+
return [outcome, newMap];
|
|
905
|
+
}
|
|
906
|
+
);
|
|
907
|
+
if (testCaseCompleted !== null) {
|
|
908
|
+
if (testCaseCompleted) {
|
|
909
|
+
yield* Ref.update(passedRef, (n) => n + 1);
|
|
910
|
+
} else {
|
|
911
|
+
yield* Ref.update(failedRef, (n) => n + 1);
|
|
912
|
+
}
|
|
913
|
+
const [passed, failed] = yield* Effect.all([
|
|
914
|
+
Ref.get(passedRef),
|
|
915
|
+
Ref.get(failedRef)
|
|
916
|
+
]);
|
|
917
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
918
|
+
...snapshot,
|
|
919
|
+
passedTestCases: passed,
|
|
920
|
+
failedTestCases: failed
|
|
921
|
+
}));
|
|
922
|
+
}
|
|
895
923
|
});
|
|
896
924
|
}
|
|
897
925
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
|
|
898
926
|
const startedAt = Date.now();
|
|
899
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
927
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
900
928
|
...snapshot,
|
|
901
929
|
status: "running",
|
|
902
930
|
startedAt
|
|
@@ -915,9 +943,13 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
915
943
|
const startedRef = yield* Ref.make(0);
|
|
916
944
|
const passedRef = yield* Ref.make(0);
|
|
917
945
|
const failedRef = yield* Ref.make(0);
|
|
918
|
-
const
|
|
946
|
+
const testCaseResultsRef = yield* Ref.make(
|
|
947
|
+
/* @__PURE__ */ new Map()
|
|
948
|
+
);
|
|
949
|
+
const evaluationUnits = buildEvaluationUnits(task.testCases);
|
|
950
|
+
const processEvaluation = (unit) => processOneEvaluation(
|
|
919
951
|
task,
|
|
920
|
-
|
|
952
|
+
unit,
|
|
921
953
|
totalEvaluations,
|
|
922
954
|
publishEvent,
|
|
923
955
|
persistenceQueue,
|
|
@@ -925,11 +957,12 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
925
957
|
startedRef,
|
|
926
958
|
completedRef,
|
|
927
959
|
passedRef,
|
|
928
|
-
failedRef
|
|
960
|
+
failedRef,
|
|
961
|
+
testCaseResultsRef
|
|
929
962
|
);
|
|
930
963
|
yield* Effect.forEach(
|
|
931
|
-
|
|
932
|
-
|
|
964
|
+
evaluationUnits,
|
|
965
|
+
processEvaluation,
|
|
933
966
|
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
934
967
|
);
|
|
935
968
|
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
@@ -947,7 +980,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
947
980
|
totalTestCases: task.testCases.length,
|
|
948
981
|
artifactPath: task.snapshot.artifactPath
|
|
949
982
|
};
|
|
950
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
983
|
+
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
951
984
|
...snapshot,
|
|
952
985
|
status: "completed",
|
|
953
986
|
completedTestCases: completedEvaluations,
|
|
@@ -1200,7 +1233,9 @@ var EffectRunner = class {
|
|
|
1200
1233
|
this.persistenceQueue = Effect.runSync(
|
|
1201
1234
|
Queue.unbounded()
|
|
1202
1235
|
);
|
|
1203
|
-
this.
|
|
1236
|
+
this.snapshotsRef = Effect.runSync(
|
|
1237
|
+
Ref.make(/* @__PURE__ */ new Map())
|
|
1238
|
+
);
|
|
1204
1239
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1205
1240
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1206
1241
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
@@ -1303,7 +1338,13 @@ var EffectRunner = class {
|
|
|
1303
1338
|
status: "queued",
|
|
1304
1339
|
artifactPath
|
|
1305
1340
|
};
|
|
1306
|
-
|
|
1341
|
+
await Effect.runPromise(
|
|
1342
|
+
Ref.update(this.snapshotsRef, (map) => {
|
|
1343
|
+
const next = new Map(map);
|
|
1344
|
+
next.set(runId, snapshot);
|
|
1345
|
+
return next;
|
|
1346
|
+
})
|
|
1347
|
+
);
|
|
1307
1348
|
const queuedEvent = {
|
|
1308
1349
|
type: "RunQueued",
|
|
1309
1350
|
runId,
|
|
@@ -1344,12 +1385,12 @@ var EffectRunner = class {
|
|
|
1344
1385
|
};
|
|
1345
1386
|
}
|
|
1346
1387
|
getRunSnapshot(runId) {
|
|
1347
|
-
return this.
|
|
1388
|
+
return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
|
|
1348
1389
|
}
|
|
1349
1390
|
getAllRunSnapshots() {
|
|
1350
|
-
return Array.from(
|
|
1351
|
-
(
|
|
1352
|
-
);
|
|
1391
|
+
return Array.from(
|
|
1392
|
+
Effect.runSync(Ref.get(this.snapshotsRef)).values()
|
|
1393
|
+
).sort((a, b) => b.queuedAt - a.queuedAt);
|
|
1353
1394
|
}
|
|
1354
1395
|
async loadRunSnapshotsFromArtifacts() {
|
|
1355
1396
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1378,11 +1419,15 @@ var EffectRunner = class {
|
|
|
1378
1419
|
);
|
|
1379
1420
|
}
|
|
1380
1421
|
updateSnapshot(runId, updater) {
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1422
|
+
return Ref.modify(this.snapshotsRef, (map) => {
|
|
1423
|
+
const existing = map.get(runId);
|
|
1424
|
+
if (!existing) {
|
|
1425
|
+
return [void 0, map];
|
|
1426
|
+
}
|
|
1427
|
+
const next = new Map(map);
|
|
1428
|
+
next.set(runId, updater(existing));
|
|
1429
|
+
return [void 0, next];
|
|
1430
|
+
}).pipe(Effect.asVoid);
|
|
1386
1431
|
}
|
|
1387
1432
|
publishEvent(event) {
|
|
1388
1433
|
return Effect.sync(() => {
|