@m4trix/evals 0.20.0 → 0.21.0

This diff shows the differences between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -14,8 +14,6 @@ var ink = require('ink');
14
14
  var jsxRuntime = require('react/jsx-runtime');
15
15
 
16
16
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
17
- function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
18
-
19
17
  function _interopNamespace(e) {
20
18
  if (e && e.__esModule) return e;
21
19
  var n = Object.create(null);
@@ -35,7 +33,7 @@ function _interopNamespace(e) {
35
33
  }
36
34
 
37
35
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
38
- var React2__default = /*#__PURE__*/_interopDefault(React2);
36
+ var React2__namespace = /*#__PURE__*/_interopNamespace(React2);
39
37
 
40
38
  // src/runner/config.ts
41
39
  var defaultRunnerConfig = {
@@ -677,13 +675,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
677
675
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
678
676
  );
679
677
  }
680
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
678
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
681
679
  return effect.Effect.gen(function* () {
682
680
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
683
681
  const rerunPassed = [];
684
682
  for (let r = 0; r < reruns; r++) {
685
683
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
686
684
  const started = Date.now();
685
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
686
+ n + 1,
687
+ n + 1
688
+ ]);
689
+ yield* publishEvent({
690
+ type: "TestCaseStarted",
691
+ runId: task.runId,
692
+ testCaseId: testCaseItem.id,
693
+ testCaseName: testCaseItem.testCase.getName(),
694
+ startedTestCases: startedEvaluations,
695
+ totalTestCases: totalEvaluations,
696
+ rerunIndex: r + 1,
697
+ rerunTotal: reruns
698
+ });
687
699
  const evaluatorScores = [];
688
700
  let testCaseError;
689
701
  const output = readOutput(testCaseItem.testCase);
@@ -829,6 +841,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
829
841
  );
830
842
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
831
843
  const completedRef = yield* effect.Ref.make(0);
844
+ const startedRef = yield* effect.Ref.make(0);
832
845
  const passedRef = yield* effect.Ref.make(0);
833
846
  const failedRef = yield* effect.Ref.make(0);
834
847
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -838,6 +851,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
838
851
  publishEvent,
839
852
  persistenceQueue,
840
853
  updateSnapshot,
854
+ startedRef,
841
855
  completedRef,
842
856
  passedRef,
843
857
  failedRef
@@ -1489,7 +1503,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1489
1503
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1490
1504
  return new Promise((resolve5, reject) => {
1491
1505
  const app = ink.render(
1492
- React2__default.default.createElement(GenerateView, {
1506
+ React2__namespace.default.createElement(GenerateView, {
1493
1507
  runner,
1494
1508
  datasetName,
1495
1509
  onComplete: (err) => {
@@ -1646,7 +1660,9 @@ function RunView({
1646
1660
  );
1647
1661
  const [runInfo, setRunInfo] = React2.useState(null);
1648
1662
  const [testCases, setTestCases] = React2.useState([]);
1663
+ const [startedEvaluations, setStartedEvaluations] = React2.useState(0);
1649
1664
  const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
1665
+ const [runningEvaluations, setRunningEvaluations] = React2.useState([]);
1650
1666
  const [summary, setSummary] = React2.useState(null);
1651
1667
  const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
1652
1668
  const runEval = React2.useCallback(async () => {
@@ -1683,6 +1699,25 @@ function RunView({
1683
1699
  let overallScoreCount = 0;
1684
1700
  const done = new Promise((resolve5) => {
1685
1701
  const unsubscribe = runner.subscribeRunEvents((event) => {
1702
+ if (event.type === "TestCaseStarted") {
1703
+ setStartedEvaluations(event.startedTestCases);
1704
+ setRunningEvaluations((prev) => {
1705
+ const withoutDuplicate = prev.filter(
1706
+ (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1707
+ );
1708
+ return [
1709
+ ...withoutDuplicate,
1710
+ {
1711
+ testCaseId: event.testCaseId,
1712
+ name: event.testCaseName,
1713
+ rerunIndex: event.rerunIndex,
1714
+ rerunTotal: event.rerunTotal,
1715
+ startedTestCases: event.startedTestCases,
1716
+ totalTestCases: event.totalTestCases
1717
+ }
1718
+ ];
1719
+ });
1720
+ }
1686
1721
  if (event.type === "TestCaseProgress") {
1687
1722
  for (const item of event.evaluatorScores) {
1688
1723
  const numeric = toNumericScoreFromScores(item.scores);
@@ -1749,6 +1784,11 @@ function RunView({
1749
1784
  };
1750
1785
  byId.set(event.testCaseId, merged);
1751
1786
  setCompletedEvaluations(event.completedTestCases);
1787
+ setRunningEvaluations(
1788
+ (running) => running.filter(
1789
+ (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1790
+ )
1791
+ );
1752
1792
  return Array.from(byId.values());
1753
1793
  });
1754
1794
  }
@@ -1824,12 +1864,30 @@ function RunView({
1824
1864
  runInfo.totalTestCases
1825
1865
  ] })
1826
1866
  ] }),
1827
- phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
1828
- Spinner,
1829
- {
1830
- label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
1831
- }
1832
- ) }),
1867
+ phase === "running" && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
1868
+ /* @__PURE__ */ jsxRuntime.jsx(
1869
+ Spinner,
1870
+ {
1871
+ label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
1872
+ }
1873
+ ),
1874
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "yellow", children: [
1875
+ "[running ",
1876
+ item.startedTestCases,
1877
+ "/",
1878
+ item.totalTestCases,
1879
+ "] ",
1880
+ item.name,
1881
+ " ",
1882
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1883
+ "(",
1884
+ item.rerunIndex,
1885
+ "/",
1886
+ item.rerunTotal,
1887
+ ")"
1888
+ ] })
1889
+ ] }, `${item.testCaseId}:${item.rerunIndex}`)) })
1890
+ ] }),
1833
1891
  testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
1834
1892
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1835
1893
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
@@ -2304,9 +2362,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2304
2362
  let overallScoreTotal = 0;
2305
2363
  let overallScoreSumSq = 0;
2306
2364
  let overallScoreCount = 0;
2365
+ let startedCount = 0;
2307
2366
  let completedCount = 0;
2308
2367
  let totalCount = 0;
2309
2368
  let runFinished = false;
2369
+ const inFlightReruns = /* @__PURE__ */ new Set();
2310
2370
  const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
2311
2371
  let spinnerIndex = 0;
2312
2372
  function clearLine() {
@@ -2330,7 +2390,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2330
2390
  `\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
2331
2391
  `${completedCount}/${totalCount}`,
2332
2392
  ansi2.bold
2333
- )} ${colorize("(live)", ansi2.dim)}`
2393
+ )} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
2334
2394
  );
2335
2395
  }
2336
2396
  let lastPrintedTestCaseId = null;
@@ -2338,8 +2398,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2338
2398
  let spinnerTimer;
2339
2399
  const done = new Promise((resolve5) => {
2340
2400
  const unsubscribe = runner.subscribeRunEvents((event) => {
2401
+ if (event.type === "TestCaseStarted") {
2402
+ startedCount = event.startedTestCases;
2403
+ inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
2404
+ clearLine();
2405
+ process.stdout.write(
2406
+ `${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
2407
+ `
2408
+ );
2409
+ drawSpinner();
2410
+ }
2341
2411
  if (event.type === "TestCaseProgress") {
2342
2412
  completedCount = event.completedTestCases;
2413
+ inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
2343
2414
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
2344
2415
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
2345
2416
  const testCaseId = event.testCaseId;
@@ -2547,7 +2618,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2547
2618
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2548
2619
  return new Promise((resolve5, reject) => {
2549
2620
  const app = ink.render(
2550
- React2__default.default.createElement(RunView, {
2621
+ React2__namespace.createElement(RunView, {
2551
2622
  runner,
2552
2623
  datasetName,
2553
2624
  evaluatorPattern,