@m4trix/evals 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +213 -38
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +214 -39
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +1136 -832
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +1137 -833
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +149 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +26 -13
- package/dist/index.js +150 -6
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -279,11 +279,17 @@ function toEvaluatorOption(item) {
|
|
|
279
279
|
};
|
|
280
280
|
}
|
|
281
281
|
async function loadRunnerData(runner) {
|
|
282
|
-
const [datasets, evaluators] = await Promise.all([
|
|
282
|
+
const [datasets, evaluators, diskSnapshots] = await Promise.all([
|
|
283
283
|
runner.collectDatasets(),
|
|
284
|
-
runner.collectEvaluators()
|
|
284
|
+
runner.collectEvaluators(),
|
|
285
|
+
runner.loadRunSnapshotsFromArtifacts()
|
|
285
286
|
]);
|
|
286
|
-
const
|
|
287
|
+
const memSnapshots = runner.getAllRunSnapshots();
|
|
288
|
+
const seen = new Set(memSnapshots.map((s) => s.runId));
|
|
289
|
+
const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
|
|
290
|
+
const snapshots = [...memSnapshots, ...fromDisk].sort(
|
|
291
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
292
|
+
);
|
|
287
293
|
if (datasets.length === 0 && evaluators.length === 0) {
|
|
288
294
|
return loadMockData();
|
|
289
295
|
}
|
|
@@ -576,6 +582,16 @@ var binaryScore = Score.of({
|
|
|
576
582
|
displayStrategy: "passFail",
|
|
577
583
|
format: (data) => data.passed ? "PASSED" : "NOT PASSED"
|
|
578
584
|
});
|
|
585
|
+
/**
 * Builds a structured "diff" log entry comparing an expected value with an
 * actual one.
 *
 * The textual diff comes from `jsonDiff.diffString` with colour output
 * disabled. When that call yields a falsy result, the entry carries the
 * placeholder text "(no differences)" instead.
 *
 * @param {*} expected - Reference value; stored on the entry as-is.
 * @param {*} actual - Observed value; stored on the entry as-is.
 * @param {{ label?: * }} [options] - Optional settings; only `label` is read.
 * @returns {{ type: string, label: *, expected: *, actual: *, diff: string }}
 *   The log entry, with `type` always set to "diff".
 */
function createDiffLogEntry(expected, actual, options) {
  const renderedDiff = jsonDiff.diffString(expected, actual, { color: false });
  const entry = {
    type: "diff",
    label: options?.label,
    expected,
    actual,
    diff: renderedDiff
  };
  if (!entry.diff) {
    entry.diff = "(no differences)";
  }
  return entry;
}
|
|
579
595
|
function printJsonDiff(expected, actual, options = {}) {
|
|
580
596
|
const { color = true } = options;
|
|
581
597
|
const diff = jsonDiff.diffString(expected, actual, { color });
|
|
@@ -937,6 +953,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
937
953
|
continue;
|
|
938
954
|
}
|
|
939
955
|
try {
|
|
956
|
+
const logs = [];
|
|
957
|
+
const logDiff = (expected, actual, options) => {
|
|
958
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
959
|
+
};
|
|
940
960
|
const ctx = yield* effect.Effect.promise(
|
|
941
961
|
() => Promise.resolve(evaluator.resolveContext())
|
|
942
962
|
);
|
|
@@ -945,13 +965,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
945
965
|
evaluateFn({
|
|
946
966
|
input: testCaseItem.testCase.getInput(),
|
|
947
967
|
ctx,
|
|
948
|
-
output
|
|
968
|
+
output,
|
|
969
|
+
logDiff
|
|
949
970
|
})
|
|
950
971
|
)
|
|
951
972
|
);
|
|
952
973
|
const { scores, metrics } = normalizeResult(result);
|
|
953
974
|
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
954
|
-
evaluatorScores.push({
|
|
975
|
+
evaluatorScores.push({
|
|
976
|
+
evaluatorId,
|
|
977
|
+
scores,
|
|
978
|
+
passed,
|
|
979
|
+
metrics,
|
|
980
|
+
logs: logs.length > 0 ? logs : void 0
|
|
981
|
+
});
|
|
955
982
|
} catch (error) {
|
|
956
983
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
957
984
|
evaluatorScores.push({
|
|
@@ -1024,6 +1051,120 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1024
1051
|
artifactPath: task.snapshot.artifactPath
|
|
1025
1052
|
});
|
|
1026
1053
|
});
|
|
1054
|
+
/**
 * Rebuilds run snapshots from the `.jsonl` artifact files found directly
 * inside `config.artifactDirectory`.
 *
 * The operation is best-effort: if the directory cannot be listed an empty
 * array is returned, and any file whose parsing rejects is skipped. Files are
 * processed one at a time, in the order `readdir` reports them.
 *
 * @param {{ artifactDirectory: string }} config - Runner configuration; it is
 *   also forwarded unchanged to `parseArtifactToSnapshot`.
 * @returns {Promise<Array<object>>} Snapshots ordered by `queuedAt`, newest
 *   first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = path.resolve(config.artifactDirectory);

  let dirEntries;
  try {
    dirEntries = await promises.readdir(artifactRoot);
  } catch {
    // Unreadable or missing directory: behave as if no runs were persisted.
    return [];
  }

  const loaded = [];
  for (const entryName of dirEntries) {
    if (!entryName.endsWith(".jsonl")) {
      continue;
    }
    const artifactFile = path.join(artifactRoot, entryName);
    let snapshot;
    try {
      snapshot = await parseArtifactToSnapshot(artifactFile, config);
    } catch {
      // A single unreadable artifact must not hide the remaining runs.
      continue;
    }
    if (snapshot) {
      loaded.push(snapshot);
    }
  }

  return loaded.sort((left, right) => right.queuedAt - left.queuedAt);
}
|
|
1076
|
+
/**
 * Reconstructs a run snapshot from a single JSONL artifact file.
 *
 * The file is read as UTF-8 and split on "\n"; blank lines are ignored and
 * lines that are not valid JSON (or do not hold a JSON object) are skipped.
 * For each of the event types "RunQueued", "RunStarted", "RunCompleted" and
 * "RunFailed" the last occurrence in the file wins.
 *
 * Status precedence is: "failed" (a RunFailed event exists), then "completed"
 * (RunCompleted), then "running" (RunStarted), otherwise "queued".
 *
 * Counters come from the RunCompleted event when it provides them; otherwise
 * they fall back to the values computed by `aggregateTestCaseProgress` over
 * the same lines. Note that `completedTestCases` is taken from RunCompleted's
 * `totalTestCases` field.
 *
 * The returned `artifactPath` is always the `filePath` that was read, not any
 * path recorded inside the events.
 *
 * @param {string} filePath - Path of the `.jsonl` artifact to read.
 * @param {*} _config - Accepted for signature compatibility; not used here.
 * @returns {Promise<object|null>} The snapshot, or `null` when the file has no
 *   non-blank lines or contains no "RunQueued" event.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const content = await promises.readFile(filePath, "utf8");
  const lines = content.split("\n").filter((line) => line.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }

  let runQueued = null;
  let runStarted = null;
  let runCompleted = null;
  let runFailed = null;

  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Malformed line (e.g. a partially written record): ignore it.
      continue;
    }
    // JSON.parse can yield null or a primitive; those carry no event type.
    if (event === null || typeof event !== "object") {
      continue;
    }
    switch (event.type) {
      case "RunQueued":
        runQueued = {
          runId: event.runId,
          datasetId: event.datasetId,
          datasetName: event.datasetName,
          evaluatorIds: event.evaluatorIds,
          totalTestCases: event.totalTestCases ?? 0,
          ts: event.ts
        };
        break;
      case "RunStarted":
        runStarted = { startedAt: event.startedAt };
        break;
      case "RunCompleted":
        runCompleted = {
          passedTestCases: event.passedTestCases,
          failedTestCases: event.failedTestCases,
          totalTestCases: event.totalTestCases,
          finishedAt: event.finishedAt
        };
        break;
      case "RunFailed":
        runFailed = {
          finishedAt: event.finishedAt,
          errorMessage: event.errorMessage
        };
        break;
      default:
        break;
    }
  }

  // Without the queue record there is no run identity to report.
  if (!runQueued) {
    return null;
  }

  let status = "queued";
  if (runFailed) {
    status = "failed";
  } else if (runCompleted) {
    status = "completed";
  } else if (runStarted) {
    status = "running";
  }

  const progress = aggregateTestCaseProgress(lines);

  return {
    runId: runQueued.runId,
    datasetId: runQueued.datasetId,
    datasetName: runQueued.datasetName,
    evaluatorIds: runQueued.evaluatorIds,
    queuedAt: runQueued.ts ?? 0,
    startedAt: runStarted?.startedAt,
    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
    totalTestCases: runQueued.totalTestCases,
    completedTestCases: runCompleted?.totalTestCases ?? progress.completedTestCases,
    passedTestCases: runCompleted?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: runCompleted?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: runFailed?.errorMessage
  };
}
|
|
1147
|
+
/**
 * Summarises the "TestCaseProgress" events found in a list of JSONL lines.
 *
 * Each line is parsed as JSON; lines that fail to parse, or that do not hold
 * a JSON object, are skipped. Events of any other type are ignored.
 *
 * For every progress event:
 *  - a truthy `passed` increments `passedTestCases`, anything else increments
 *    `failedTestCases`;
 *  - a finite numeric `completedTestCases` replaces the running completed
 *    count (so the last such event wins). When the field is missing or not a
 *    finite number, the running count is advanced by one instead, so the
 *    summary does not report zero completed cases next to non-zero
 *    passed/failed counts.
 *
 * @param {Iterable<string>} lines - Raw JSONL lines.
 * @returns {{ completedTestCases: number, passedTestCases: number, failedTestCases: number }}
 *   Aggregated counters; all zero when no progress event is present.
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  let passedTestCases = 0;
  let failedTestCases = 0;

  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Malformed line: nothing to count.
      continue;
    }
    // JSON.parse can yield null or a primitive; those are not events.
    if (event === null || typeof event !== "object") {
      continue;
    }
    if (event.type !== "TestCaseProgress") {
      continue;
    }

    if (event.passed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }

    completedTestCases = Number.isFinite(event.completedTestCases)
      ? event.completedTestCases
      : completedTestCases + 1;
  }

  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
1027
1168
|
async function appendJsonLine(artifactPath, payload) {
|
|
1028
1169
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
1029
1170
|
await promises.appendFile(artifactPath, `${JSON.stringify(payload)}
|
|
@@ -1280,6 +1421,9 @@ var EffectRunner = class {
|
|
|
1280
1421
|
(a, b) => b.queuedAt - a.queuedAt
|
|
1281
1422
|
);
|
|
1282
1423
|
}
|
|
1424
|
+
async loadRunSnapshotsFromArtifacts() {
|
|
1425
|
+
return loadRunSnapshotsFromArtifacts(this.config);
|
|
1426
|
+
}
|
|
1283
1427
|
async shutdown() {
|
|
1284
1428
|
await effect.Effect.runPromise(effect.Fiber.interrupt(this.schedulerFiber));
|
|
1285
1429
|
await effect.Effect.runPromise(effect.Fiber.interrupt(this.persistenceFiber));
|