@m4trix/evals 0.20.0 → 0.21.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -437,6 +437,15 @@ type RunnerEvent = {
437
437
  type: 'RunStarted';
438
438
  runId: string;
439
439
  startedAt: number;
440
+ } | {
441
+ type: 'TestCaseStarted';
442
+ runId: string;
443
+ testCaseId: string;
444
+ testCaseName: string;
445
+ startedTestCases: number;
446
+ totalTestCases: number;
447
+ rerunIndex: number;
448
+ rerunTotal: number;
440
449
  } | {
441
450
  type: 'TestCaseProgress';
442
451
  runId: string;
package/dist/index.js CHANGED
@@ -1106,13 +1106,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
1106
1106
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
1107
1107
  );
1108
1108
  }
1109
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
1109
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
1110
1110
  return Effect.gen(function* () {
1111
1111
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
1112
1112
  const rerunPassed = [];
1113
1113
  for (let r = 0; r < reruns; r++) {
1114
1114
  const evaluatorRunId = `run-${randomUUID()}`;
1115
1115
  const started = Date.now();
1116
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
1117
+ n + 1,
1118
+ n + 1
1119
+ ]);
1120
+ yield* publishEvent({
1121
+ type: "TestCaseStarted",
1122
+ runId: task.runId,
1123
+ testCaseId: testCaseItem.id,
1124
+ testCaseName: testCaseItem.testCase.getName(),
1125
+ startedTestCases: startedEvaluations,
1126
+ totalTestCases: totalEvaluations,
1127
+ rerunIndex: r + 1,
1128
+ rerunTotal: reruns
1129
+ });
1116
1130
  const evaluatorScores = [];
1117
1131
  let testCaseError;
1118
1132
  const output = readOutput(testCaseItem.testCase);
@@ -1258,6 +1272,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1258
1272
  );
1259
1273
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
1260
1274
  const completedRef = yield* Ref.make(0);
1275
+ const startedRef = yield* Ref.make(0);
1261
1276
  const passedRef = yield* Ref.make(0);
1262
1277
  const failedRef = yield* Ref.make(0);
1263
1278
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -1267,6 +1282,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
1267
1282
  publishEvent,
1268
1283
  persistenceQueue,
1269
1284
  updateSnapshot,
1285
+ startedRef,
1270
1286
  completedRef,
1271
1287
  passedRef,
1272
1288
  failedRef