@m4trix/evals 0.20.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +84 -13
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +84 -10
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +17 -1
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +17 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +17 -1
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +9 -0
- package/dist/index.js +17 -1
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.cjs
CHANGED
|
@@ -1358,13 +1358,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1358
1358
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1359
1359
|
);
|
|
1360
1360
|
}
|
|
1361
|
-
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1361
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
|
|
1362
1362
|
return effect.Effect.gen(function* () {
|
|
1363
1363
|
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1364
1364
|
const rerunPassed = [];
|
|
1365
1365
|
for (let r = 0; r < reruns; r++) {
|
|
1366
1366
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1367
1367
|
const started = Date.now();
|
|
1368
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
1369
|
+
n + 1,
|
|
1370
|
+
n + 1
|
|
1371
|
+
]);
|
|
1372
|
+
yield* publishEvent({
|
|
1373
|
+
type: "TestCaseStarted",
|
|
1374
|
+
runId: task.runId,
|
|
1375
|
+
testCaseId: testCaseItem.id,
|
|
1376
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1377
|
+
startedTestCases: startedEvaluations,
|
|
1378
|
+
totalTestCases: totalEvaluations,
|
|
1379
|
+
rerunIndex: r + 1,
|
|
1380
|
+
rerunTotal: reruns
|
|
1381
|
+
});
|
|
1368
1382
|
const evaluatorScores = [];
|
|
1369
1383
|
let testCaseError;
|
|
1370
1384
|
const output = readOutput(testCaseItem.testCase);
|
|
@@ -1510,6 +1524,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1510
1524
|
);
|
|
1511
1525
|
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1512
1526
|
const completedRef = yield* effect.Ref.make(0);
|
|
1527
|
+
const startedRef = yield* effect.Ref.make(0);
|
|
1513
1528
|
const passedRef = yield* effect.Ref.make(0);
|
|
1514
1529
|
const failedRef = yield* effect.Ref.make(0);
|
|
1515
1530
|
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
@@ -1519,6 +1534,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1519
1534
|
publishEvent,
|
|
1520
1535
|
persistenceQueue,
|
|
1521
1536
|
updateSnapshot,
|
|
1537
|
+
startedRef,
|
|
1522
1538
|
completedRef,
|
|
1523
1539
|
passedRef,
|
|
1524
1540
|
failedRef
|