@m4trix/evals 0.20.0 → 0.21.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
@@ -7,7 +7,8 @@ import * as jitiModule from 'jiti';
7
7
  import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
9
  import { diffString } from 'json-diff';
10
- import React2, { useState, useEffect, useCallback } from 'react';
10
+ import * as React2 from 'react';
11
+ import React2__default, { useState, useEffect, useCallback } from 'react';
11
12
  import { render, Box, Text } from 'ink';
12
13
  import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
13
14
 
@@ -651,13 +652,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
651
652
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
652
653
  );
653
654
  }
654
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
655
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
655
656
  return Effect.gen(function* () {
656
657
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
657
658
  const rerunPassed = [];
658
659
  for (let r = 0; r < reruns; r++) {
659
660
  const evaluatorRunId = `run-${randomUUID()}`;
660
661
  const started = Date.now();
662
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
663
+ n + 1,
664
+ n + 1
665
+ ]);
666
+ yield* publishEvent({
667
+ type: "TestCaseStarted",
668
+ runId: task.runId,
669
+ testCaseId: testCaseItem.id,
670
+ testCaseName: testCaseItem.testCase.getName(),
671
+ startedTestCases: startedEvaluations,
672
+ totalTestCases: totalEvaluations,
673
+ rerunIndex: r + 1,
674
+ rerunTotal: reruns
675
+ });
661
676
  const evaluatorScores = [];
662
677
  let testCaseError;
663
678
  const output = readOutput(testCaseItem.testCase);
@@ -803,6 +818,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
803
818
  );
804
819
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
805
820
  const completedRef = yield* Ref.make(0);
821
+ const startedRef = yield* Ref.make(0);
806
822
  const passedRef = yield* Ref.make(0);
807
823
  const failedRef = yield* Ref.make(0);
808
824
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -812,6 +828,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
812
828
  publishEvent,
813
829
  persistenceQueue,
814
830
  updateSnapshot,
831
+ startedRef,
815
832
  completedRef,
816
833
  passedRef,
817
834
  failedRef
@@ -1463,7 +1480,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1463
1480
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1464
1481
  return new Promise((resolve5, reject) => {
1465
1482
  const app = render(
1466
- React2.createElement(GenerateView, {
1483
+ React2__default.createElement(GenerateView, {
1467
1484
  runner,
1468
1485
  datasetName,
1469
1486
  onComplete: (err) => {
@@ -1620,7 +1637,9 @@ function RunView({
1620
1637
  );
1621
1638
  const [runInfo, setRunInfo] = useState(null);
1622
1639
  const [testCases, setTestCases] = useState([]);
1640
+ const [startedEvaluations, setStartedEvaluations] = useState(0);
1623
1641
  const [completedEvaluations, setCompletedEvaluations] = useState(0);
1642
+ const [runningEvaluations, setRunningEvaluations] = useState([]);
1624
1643
  const [summary, setSummary] = useState(null);
1625
1644
  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
1626
1645
  const runEval = useCallback(async () => {
@@ -1657,6 +1676,25 @@ function RunView({
1657
1676
  let overallScoreCount = 0;
1658
1677
  const done = new Promise((resolve5) => {
1659
1678
  const unsubscribe = runner.subscribeRunEvents((event) => {
1679
+ if (event.type === "TestCaseStarted") {
1680
+ setStartedEvaluations(event.startedTestCases);
1681
+ setRunningEvaluations((prev) => {
1682
+ const withoutDuplicate = prev.filter(
1683
+ (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1684
+ );
1685
+ return [
1686
+ ...withoutDuplicate,
1687
+ {
1688
+ testCaseId: event.testCaseId,
1689
+ name: event.testCaseName,
1690
+ rerunIndex: event.rerunIndex,
1691
+ rerunTotal: event.rerunTotal,
1692
+ startedTestCases: event.startedTestCases,
1693
+ totalTestCases: event.totalTestCases
1694
+ }
1695
+ ];
1696
+ });
1697
+ }
1660
1698
  if (event.type === "TestCaseProgress") {
1661
1699
  for (const item of event.evaluatorScores) {
1662
1700
  const numeric = toNumericScoreFromScores(item.scores);
@@ -1723,6 +1761,11 @@ function RunView({
1723
1761
  };
1724
1762
  byId.set(event.testCaseId, merged);
1725
1763
  setCompletedEvaluations(event.completedTestCases);
1764
+ setRunningEvaluations(
1765
+ (running) => running.filter(
1766
+ (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1767
+ )
1768
+ );
1726
1769
  return Array.from(byId.values());
1727
1770
  });
1728
1771
  }
@@ -1798,12 +1841,30 @@ function RunView({
1798
1841
  runInfo.totalTestCases
1799
1842
  ] })
1800
1843
  ] }),
1801
- phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
1802
- Spinner,
1803
- {
1804
- label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
1805
- }
1806
- ) }),
1844
+ phase === "running" && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
1845
+ /* @__PURE__ */ jsx(
1846
+ Spinner,
1847
+ {
1848
+ label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
1849
+ }
1850
+ ),
1851
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(Text, { color: "yellow", children: [
1852
+ "[running ",
1853
+ item.startedTestCases,
1854
+ "/",
1855
+ item.totalTestCases,
1856
+ "] ",
1857
+ item.name,
1858
+ " ",
1859
+ /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1860
+ "(",
1861
+ item.rerunIndex,
1862
+ "/",
1863
+ item.rerunTotal,
1864
+ ")"
1865
+ ] })
1866
+ ] }, `${item.testCaseId}:${item.rerunIndex}`)) })
1867
+ ] }),
1807
1868
  testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
1808
1869
  /* @__PURE__ */ jsxs(Text, { children: [
1809
1870
  /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
@@ -2278,9 +2339,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2278
2339
  let overallScoreTotal = 0;
2279
2340
  let overallScoreSumSq = 0;
2280
2341
  let overallScoreCount = 0;
2342
+ let startedCount = 0;
2281
2343
  let completedCount = 0;
2282
2344
  let totalCount = 0;
2283
2345
  let runFinished = false;
2346
+ const inFlightReruns = /* @__PURE__ */ new Set();
2284
2347
  const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
2285
2348
  let spinnerIndex = 0;
2286
2349
  function clearLine() {
@@ -2304,7 +2367,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2304
2367
  `\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
2305
2368
  `${completedCount}/${totalCount}`,
2306
2369
  ansi2.bold
2307
- )} ${colorize("(live)", ansi2.dim)}`
2370
+ )} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
2308
2371
  );
2309
2372
  }
2310
2373
  let lastPrintedTestCaseId = null;
@@ -2312,8 +2375,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2312
2375
  let spinnerTimer;
2313
2376
  const done = new Promise((resolve5) => {
2314
2377
  const unsubscribe = runner.subscribeRunEvents((event) => {
2378
+ if (event.type === "TestCaseStarted") {
2379
+ startedCount = event.startedTestCases;
2380
+ inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
2381
+ clearLine();
2382
+ process.stdout.write(
2383
+ `${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
2384
+ `
2385
+ );
2386
+ drawSpinner();
2387
+ }
2315
2388
  if (event.type === "TestCaseProgress") {
2316
2389
  completedCount = event.completedTestCases;
2390
+ inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
2317
2391
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
2318
2392
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
2319
2393
  const testCaseId = event.testCaseId;