@m4trix/evals 0.20.0 → 0.21.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1332,13 +1332,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1332
1332
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1333
1333
  );
1334
1334
  }
1335
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1335
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1336
1336
  return Effect.gen(function* () {
1337
1337
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1338
1338
  const rerunPassed = [];
1339
1339
  for (let r = 0; r < reruns; r++) {
1340
1340
  const evaluatorRunId = `run-${randomUUID()}`;
1341
1341
  const started = Date.now();
1342
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1343
+ n + 1,
1344
+ n + 1
1345
+ ]);
1346
+ yield* publishEvent({
1347
+ type: "TestCaseStarted",
1348
+ runId: task.runId,
1349
+ testCaseId: testCaseItem.id,
1350
+ testCaseName: testCaseItem.testCase.getName(),
1351
+ startedTestCases: startedEvaluations,
1352
+ totalTestCases: totalEvaluations,
1353
+ rerunIndex: r + 1,
1354
+ rerunTotal: reruns
1355
+ });
1342
1356
  const evaluatorScores = [];
1343
1357
  let testCaseError;
1344
1358
  const output = readOutput(testCaseItem.testCase);
@@ -1484,6 +1498,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1484
1498
  );
1485
1499
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1486
1500
  const completedRef = yield* Ref.make(0);
1501
+ const startedRef = yield* Ref.make(0);
1487
1502
  const passedRef = yield* Ref.make(0);
1488
1503
  const failedRef = yield* Ref.make(0);
1489
1504
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -1493,6 +1508,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1493
1508
  publishEvent,
1494
1509
  persistenceQueue,
1495
1510
  updateSnapshot,
1511
+ startedRef,
1496
1512
  completedRef,
1497
1513
  passedRef,
1498
1514
  failedRef