@m4trix/evals 0.20.0 → 0.21.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +84 -13
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +84 -10
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +17 -1
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +17 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +17 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +9 -0
- package/dist/index.js +17 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -1332,13 +1332,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1332
1332
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1333
1333
|
);
|
|
1334
1334
|
}
|
|
1335
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1335
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
1336
1336
|
return Effect.gen(function* () {
|
|
1337
1337
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1338
1338
|
const rerunPassed = [];
|
|
1339
1339
|
for (let r = 0; r < reruns; r++) {
|
|
1340
1340
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1341
1341
|
const started = Date.now();
|
|
1342
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
1343
|
+
n + 1,
|
|
1344
|
+
n + 1
|
|
1345
|
+
]);
|
|
1346
|
+
yield* publishEvent({
|
|
1347
|
+
type: "TestCaseStarted",
|
|
1348
|
+
runId: task.runId,
|
|
1349
|
+
testCaseId: testCaseItem.id,
|
|
1350
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1351
|
+
startedTestCases: startedEvaluations,
|
|
1352
|
+
totalTestCases: totalEvaluations,
|
|
1353
|
+
rerunIndex: r + 1,
|
|
1354
|
+
rerunTotal: reruns
|
|
1355
|
+
});
|
|
1342
1356
|
const evaluatorScores = [];
|
|
1343
1357
|
let testCaseError;
|
|
1344
1358
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -1484,6 +1498,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1484
1498
|
);
|
|
1485
1499
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1486
1500
|
const completedRef = yield* Ref.make(0);
|
|
1501
|
+
const startedRef = yield* Ref.make(0);
|
|
1487
1502
|
const passedRef = yield* Ref.make(0);
|
|
1488
1503
|
const failedRef = yield* Ref.make(0);
|
|
1489
1504
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -1493,6 +1508,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1493
1508
|
publishEvent,
|
|
1494
1509
|
persistenceQueue,
|
|
1495
1510
|
updateSnapshot,
|
|
1511
|
+
startedRef,
|
|
1496
1512
|
completedRef,
|
|
1497
1513
|
passedRef,
|
|
1498
1514
|
failedRef
|