@m4trix/evals 0.19.0 → 0.21.0

This diff shows the changes between publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
@@ -7,7 +7,8 @@ import * as jitiModule from 'jiti';
7
7
  import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
9
  import { diffString } from 'json-diff';
10
- import React2, { useState, useEffect, useCallback } from 'react';
10
+ import * as React2 from 'react';
11
+ import React2__default, { useState, useEffect, useCallback } from 'react';
11
12
  import { render, Box, Text } from 'ink';
12
13
  import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
13
14
 
@@ -268,6 +269,8 @@ function createDiffString(expected, actual, diffOptions) {
268
269
  function formatLogMessage(msg) {
269
270
  if (typeof msg === "string")
270
271
  return msg;
272
+ if (msg instanceof Error)
273
+ return msg.stack ?? msg.message;
271
274
  try {
272
275
  if (msg !== null && typeof msg === "object") {
273
276
  return JSON.stringify(msg, null, 2);
@@ -607,6 +610,7 @@ function toNumericScore(value) {
607
610
  }
608
611
 
609
612
  // src/runner/execution.ts
613
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
610
614
  function computeEvaluatorPassed(evaluator, result, scores) {
611
615
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
612
616
  if (scoresWithPassed.length > 0) {
@@ -648,13 +652,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
648
652
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
649
653
  );
650
654
  }
651
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
655
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
652
656
  return Effect.gen(function* () {
653
657
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
654
658
  const rerunPassed = [];
655
659
  for (let r = 0; r < reruns; r++) {
656
660
  const evaluatorRunId = `run-${randomUUID()}`;
657
661
  const started = Date.now();
662
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
663
+ n + 1,
664
+ n + 1
665
+ ]);
666
+ yield* publishEvent({
667
+ type: "TestCaseStarted",
668
+ runId: task.runId,
669
+ testCaseId: testCaseItem.id,
670
+ testCaseName: testCaseItem.testCase.getName(),
671
+ startedTestCases: startedEvaluations,
672
+ totalTestCases: totalEvaluations,
673
+ rerunIndex: r + 1,
674
+ rerunTotal: reruns
675
+ });
658
676
  const evaluatorScores = [];
659
677
  let testCaseError;
660
678
  const output = readOutput(testCaseItem.testCase);
@@ -663,20 +681,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
663
681
  if (!evaluateFn) {
664
682
  continue;
665
683
  }
684
+ const logs = [];
685
+ const logDiff = (expected, actual, options) => {
686
+ logs.push(createDiffLogEntry(expected, actual, options));
687
+ };
688
+ const log = (message, options) => {
689
+ logs.push(createLogEntry(message, options));
690
+ };
691
+ const createError = (message, options) => {
692
+ const entry = createLogEntry(message, options);
693
+ const error = message instanceof Error ? message : new Error(entry.message);
694
+ error[evaluatorErrorLogEntryKey] = entry;
695
+ return error;
696
+ };
666
697
  try {
667
- const logs = [];
668
- const logDiff = (expected, actual, options) => {
669
- logs.push(createDiffLogEntry(expected, actual, options));
670
- };
671
- const log = (message, options) => {
672
- logs.push(createLogEntry(message, options));
673
- };
674
698
  const ctx = yield* Effect.promise(
675
699
  () => Promise.resolve(evaluator.resolveContext())
676
700
  );
677
701
  const result = yield* Effect.promise(
678
- () => Promise.resolve(
679
- evaluateFn({
702
+ () => Promise.resolve().then(
703
+ () => evaluateFn({
680
704
  input: testCaseItem.testCase.getInput(),
681
705
  ctx,
682
706
  output,
@@ -686,10 +710,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
686
710
  datasetId: task.datasetId
687
711
  },
688
712
  logDiff,
689
- log
713
+ log,
714
+ createError
690
715
  })
691
716
  )
692
717
  );
718
+ if (result instanceof Error) {
719
+ const evaluatorError = result;
720
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
721
+ logs.push(taggedEntry ?? createLogEntry(result));
722
+ testCaseError = result.message;
723
+ evaluatorScores.push({
724
+ evaluatorId,
725
+ scores: [],
726
+ passed: false,
727
+ logs: logs.length > 0 ? logs : void 0
728
+ });
729
+ continue;
730
+ }
693
731
  const { scores, metrics } = normalizeResult(result);
694
732
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
695
733
  evaluatorScores.push({
@@ -700,11 +738,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
700
738
  logs: logs.length > 0 ? logs : void 0
701
739
  });
702
740
  } catch (error) {
741
+ if (error instanceof Error) {
742
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
743
+ logs.push(taggedEntry ?? createLogEntry(error));
744
+ }
703
745
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
704
746
  evaluatorScores.push({
705
747
  evaluatorId,
706
748
  scores: [],
707
- passed: false
749
+ passed: false,
750
+ logs: logs.length > 0 ? logs : void 0
708
751
  });
709
752
  }
710
753
  }
@@ -775,6 +818,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
775
818
  );
776
819
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
777
820
  const completedRef = yield* Ref.make(0);
821
+ const startedRef = yield* Ref.make(0);
778
822
  const passedRef = yield* Ref.make(0);
779
823
  const failedRef = yield* Ref.make(0);
780
824
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -784,6 +828,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
784
828
  publishEvent,
785
829
  persistenceQueue,
786
830
  updateSnapshot,
831
+ startedRef,
787
832
  completedRef,
788
833
  passedRef,
789
834
  failedRef
@@ -1435,7 +1480,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1435
1480
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1436
1481
  return new Promise((resolve5, reject) => {
1437
1482
  const app = render(
1438
- React2.createElement(GenerateView, {
1483
+ React2__default.createElement(GenerateView, {
1439
1484
  runner,
1440
1485
  datasetName,
1441
1486
  onComplete: (err) => {
@@ -1592,7 +1637,9 @@ function RunView({
1592
1637
  );
1593
1638
  const [runInfo, setRunInfo] = useState(null);
1594
1639
  const [testCases, setTestCases] = useState([]);
1640
+ const [startedEvaluations, setStartedEvaluations] = useState(0);
1595
1641
  const [completedEvaluations, setCompletedEvaluations] = useState(0);
1642
+ const [runningEvaluations, setRunningEvaluations] = useState([]);
1596
1643
  const [summary, setSummary] = useState(null);
1597
1644
  const [evaluatorNameById, setEvaluatorNameById] = useState(/* @__PURE__ */ new Map());
1598
1645
  const runEval = useCallback(async () => {
@@ -1629,6 +1676,25 @@ function RunView({
1629
1676
  let overallScoreCount = 0;
1630
1677
  const done = new Promise((resolve5) => {
1631
1678
  const unsubscribe = runner.subscribeRunEvents((event) => {
1679
+ if (event.type === "TestCaseStarted") {
1680
+ setStartedEvaluations(event.startedTestCases);
1681
+ setRunningEvaluations((prev) => {
1682
+ const withoutDuplicate = prev.filter(
1683
+ (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1684
+ );
1685
+ return [
1686
+ ...withoutDuplicate,
1687
+ {
1688
+ testCaseId: event.testCaseId,
1689
+ name: event.testCaseName,
1690
+ rerunIndex: event.rerunIndex,
1691
+ rerunTotal: event.rerunTotal,
1692
+ startedTestCases: event.startedTestCases,
1693
+ totalTestCases: event.totalTestCases
1694
+ }
1695
+ ];
1696
+ });
1697
+ }
1632
1698
  if (event.type === "TestCaseProgress") {
1633
1699
  for (const item of event.evaluatorScores) {
1634
1700
  const numeric = toNumericScoreFromScores(item.scores);
@@ -1688,12 +1754,18 @@ function RunView({
1688
1754
  rerunTotal: event.rerunTotal,
1689
1755
  durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1690
1756
  passed: events.every((e) => e.passed),
1757
+ errorMessage: event.errorMessage,
1691
1758
  events,
1692
1759
  aggregatedEvaluatorScores,
1693
1760
  isAggregated
1694
1761
  };
1695
1762
  byId.set(event.testCaseId, merged);
1696
1763
  setCompletedEvaluations(event.completedTestCases);
1764
+ setRunningEvaluations(
1765
+ (running) => running.filter(
1766
+ (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1767
+ )
1768
+ );
1697
1769
  return Array.from(byId.values());
1698
1770
  });
1699
1771
  }
@@ -1769,12 +1841,30 @@ function RunView({
1769
1841
  runInfo.totalTestCases
1770
1842
  ] })
1771
1843
  ] }),
1772
- phase === "running" && /* @__PURE__ */ jsx(Box, { marginBottom: 1, children: /* @__PURE__ */ jsx(
1773
- Spinner,
1774
- {
1775
- label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
1776
- }
1777
- ) }),
1844
+ phase === "running" && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 1, children: [
1845
+ /* @__PURE__ */ jsx(
1846
+ Spinner,
1847
+ {
1848
+ label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
1849
+ }
1850
+ ),
1851
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(Text, { color: "yellow", children: [
1852
+ "[running ",
1853
+ item.startedTestCases,
1854
+ "/",
1855
+ item.totalTestCases,
1856
+ "] ",
1857
+ item.name,
1858
+ " ",
1859
+ /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1860
+ "(",
1861
+ item.rerunIndex,
1862
+ "/",
1863
+ item.rerunTotal,
1864
+ ")"
1865
+ ] })
1866
+ ] }, `${item.testCaseId}:${item.rerunIndex}`)) })
1867
+ ] }),
1778
1868
  testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
1779
1869
  /* @__PURE__ */ jsxs(Text, { children: [
1780
1870
  /* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
@@ -1798,8 +1888,13 @@ function RunView({
1798
1888
  " (",
1799
1889
  tc.durationMs,
1800
1890
  "ms)"
1801
- ] })
1891
+ ] }),
1892
+ tc.errorMessage ? /* @__PURE__ */ jsxs(Text, { color: "red", bold: true, children: [
1893
+ " ",
1894
+ "ERROR"
1895
+ ] }) : null
1802
1896
  ] }),
1897
+ tc.errorMessage ? /* @__PURE__ */ jsx(Text, { color: "red", children: tc.errorMessage }) : null,
1803
1898
  tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
1804
1899
  Box,
1805
1900
  {
@@ -2244,9 +2339,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2244
2339
  let overallScoreTotal = 0;
2245
2340
  let overallScoreSumSq = 0;
2246
2341
  let overallScoreCount = 0;
2342
+ let startedCount = 0;
2247
2343
  let completedCount = 0;
2248
2344
  let totalCount = 0;
2249
2345
  let runFinished = false;
2346
+ const inFlightReruns = /* @__PURE__ */ new Set();
2250
2347
  const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
2251
2348
  let spinnerIndex = 0;
2252
2349
  function clearLine() {
@@ -2270,7 +2367,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2270
2367
  `\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
2271
2368
  `${completedCount}/${totalCount}`,
2272
2369
  ansi2.bold
2273
- )} ${colorize("(live)", ansi2.dim)}`
2370
+ )} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
2274
2371
  );
2275
2372
  }
2276
2373
  let lastPrintedTestCaseId = null;
@@ -2278,8 +2375,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2278
2375
  let spinnerTimer;
2279
2376
  const done = new Promise((resolve5) => {
2280
2377
  const unsubscribe = runner.subscribeRunEvents((event) => {
2378
+ if (event.type === "TestCaseStarted") {
2379
+ startedCount = event.startedTestCases;
2380
+ inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
2381
+ clearLine();
2382
+ process.stdout.write(
2383
+ `${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
2384
+ `
2385
+ );
2386
+ drawSpinner();
2387
+ }
2281
2388
  if (event.type === "TestCaseProgress") {
2282
2389
  completedCount = event.completedTestCases;
2390
+ inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
2283
2391
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
2284
2392
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
2285
2393
  const testCaseId = event.testCaseId;
@@ -2337,9 +2445,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2337
2445
  0
2338
2446
  );
2339
2447
  const lines = [];
2448
+ const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
2340
2449
  lines.push(
2341
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
2450
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
2342
2451
  );
2452
+ if (event.errorMessage) {
2453
+ lines.push(colorize(event.errorMessage, ansi2.red));
2454
+ }
2343
2455
  for (const item of aggregatedScores) {
2344
2456
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2345
2457
  lines.push(