@m4trix/evals 0.20.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +84 -13
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +84 -10
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +17 -1
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +17 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +17 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +9 -0
- package/dist/index.js +17 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -437,6 +437,15 @@ type RunnerEvent = {
|
|
|
437
437
|
type: 'RunStarted';
|
|
438
438
|
runId: string;
|
|
439
439
|
startedAt: number;
|
|
440
|
+
} | {
|
|
441
|
+
type: 'TestCaseStarted';
|
|
442
|
+
runId: string;
|
|
443
|
+
testCaseId: string;
|
|
444
|
+
testCaseName: string;
|
|
445
|
+
startedTestCases: number;
|
|
446
|
+
totalTestCases: number;
|
|
447
|
+
rerunIndex: number;
|
|
448
|
+
rerunTotal: number;
|
|
440
449
|
} | {
|
|
441
450
|
type: 'TestCaseProgress';
|
|
442
451
|
runId: string;
|
package/dist/index.js
CHANGED
|
@@ -1106,13 +1106,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1106
1106
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1107
1107
|
);
|
|
1108
1108
|
}
|
|
1109
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1109
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
1110
1110
|
return Effect.gen(function* () {
|
|
1111
1111
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1112
1112
|
const rerunPassed = [];
|
|
1113
1113
|
for (let r = 0; r < reruns; r++) {
|
|
1114
1114
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1115
1115
|
const started = Date.now();
|
|
1116
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
1117
|
+
n + 1,
|
|
1118
|
+
n + 1
|
|
1119
|
+
]);
|
|
1120
|
+
yield* publishEvent({
|
|
1121
|
+
type: "TestCaseStarted",
|
|
1122
|
+
runId: task.runId,
|
|
1123
|
+
testCaseId: testCaseItem.id,
|
|
1124
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1125
|
+
startedTestCases: startedEvaluations,
|
|
1126
|
+
totalTestCases: totalEvaluations,
|
|
1127
|
+
rerunIndex: r + 1,
|
|
1128
|
+
rerunTotal: reruns
|
|
1129
|
+
});
|
|
1116
1130
|
const evaluatorScores = [];
|
|
1117
1131
|
let testCaseError;
|
|
1118
1132
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -1258,6 +1272,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1258
1272
|
);
|
|
1259
1273
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1260
1274
|
const completedRef = yield* Ref.make(0);
|
|
1275
|
+
const startedRef = yield* Ref.make(0);
|
|
1261
1276
|
const passedRef = yield* Ref.make(0);
|
|
1262
1277
|
const failedRef = yield* Ref.make(0);
|
|
1263
1278
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -1267,6 +1282,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1267
1282
|
publishEvent,
|
|
1268
1283
|
persistenceQueue,
|
|
1269
1284
|
updateSnapshot,
|
|
1285
|
+
startedRef,
|
|
1270
1286
|
completedRef,
|
|
1271
1287
|
passedRef,
|
|
1272
1288
|
failedRef
|