@m4trix/evals 0.20.0 → 0.21.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +84 -13
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +84 -10
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +17 -1
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +17 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +17 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +9 -0
- package/dist/index.js +17 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -1128,13 +1128,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1128
1128
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1129
1129
|
);
|
|
1130
1130
|
}
|
|
1131
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1131
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
1132
1132
|
return effect.Effect.gen(function* () {
|
|
1133
1133
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1134
1134
|
const rerunPassed = [];
|
|
1135
1135
|
for (let r = 0; r < reruns; r++) {
|
|
1136
1136
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1137
1137
|
const started = Date.now();
|
|
1138
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
1139
|
+
n + 1,
|
|
1140
|
+
n + 1
|
|
1141
|
+
]);
|
|
1142
|
+
yield* publishEvent({
|
|
1143
|
+
type: "TestCaseStarted",
|
|
1144
|
+
runId: task.runId,
|
|
1145
|
+
testCaseId: testCaseItem.id,
|
|
1146
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1147
|
+
startedTestCases: startedEvaluations,
|
|
1148
|
+
totalTestCases: totalEvaluations,
|
|
1149
|
+
rerunIndex: r + 1,
|
|
1150
|
+
rerunTotal: reruns
|
|
1151
|
+
});
|
|
1138
1152
|
const evaluatorScores = [];
|
|
1139
1153
|
let testCaseError;
|
|
1140
1154
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -1280,6 +1294,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1280
1294
|
);
|
|
1281
1295
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1282
1296
|
const completedRef = yield* effect.Ref.make(0);
|
|
1297
|
+
const startedRef = yield* effect.Ref.make(0);
|
|
1283
1298
|
const passedRef = yield* effect.Ref.make(0);
|
|
1284
1299
|
const failedRef = yield* effect.Ref.make(0);
|
|
1285
1300
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -1289,6 +1304,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1289
1304
|
publishEvent,
|
|
1290
1305
|
persistenceQueue,
|
|
1291
1306
|
updateSnapshot,
|
|
1307
|
+
startedRef,
|
|
1292
1308
|
completedRef,
|
|
1293
1309
|
passedRef,
|
|
1294
1310
|
failedRef
|