@m4trix/evals 0.19.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -77,8 +77,15 @@ export const myEvaluator = Evaluator.define({
77
77
  inputSchema,
78
78
  outputSchema: S.Unknown,
79
79
  scoreSchema: S.Struct({ scores: S.Array(S.Unknown) }),
80
- }).evaluate(async ({ input, ctx: _ctx, output }) => {
80
+ }).evaluate(async ({ input, ctx: _ctx, output, createError }) => {
81
81
  const start = Date.now();
82
+ const value = 85;
83
+ if (value < 50) {
84
+ return createError(
85
+ { reason: 'score below minimum', value, prompt: input.prompt, output },
86
+ { label: 'quality-check' },
87
+ );
88
+ }
82
89
  const latencyMs = Date.now() - start;
83
90
  const minScore =
84
91
  typeof output === 'object' &&
@@ -90,7 +97,7 @@ export const myEvaluator = Evaluator.define({
90
97
  return {
91
98
  scores: [
92
99
  percentScore.make(
93
- { value: 85 },
100
+ { value },
94
101
  { definePassed: (d) => d.value >= (minScore ?? 50) },
95
102
  ),
96
103
  ],
@@ -14,8 +14,6 @@ var ink = require('ink');
14
14
  var jsxRuntime = require('react/jsx-runtime');
15
15
 
16
16
  var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
17
- function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
18
-
19
17
  function _interopNamespace(e) {
20
18
  if (e && e.__esModule) return e;
21
19
  var n = Object.create(null);
@@ -35,7 +33,7 @@ function _interopNamespace(e) {
35
33
  }
36
34
 
37
35
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
38
- var React2__default = /*#__PURE__*/_interopDefault(React2);
36
+ var React2__namespace = /*#__PURE__*/_interopNamespace(React2);
39
37
 
40
38
  // src/runner/config.ts
41
39
  var defaultRunnerConfig = {
@@ -294,6 +292,8 @@ function createDiffString(expected, actual, diffOptions) {
294
292
  function formatLogMessage(msg) {
295
293
  if (typeof msg === "string")
296
294
  return msg;
295
+ if (msg instanceof Error)
296
+ return msg.stack ?? msg.message;
297
297
  try {
298
298
  if (msg !== null && typeof msg === "object") {
299
299
  return JSON.stringify(msg, null, 2);
@@ -633,6 +633,7 @@ function toNumericScore(value) {
633
633
  }
634
634
 
635
635
  // src/runner/execution.ts
636
+ var evaluatorErrorLogEntryKey = "__m4trixEvaluatorLogEntry";
636
637
  function computeEvaluatorPassed(evaluator, result, scores) {
637
638
  const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
638
639
  if (scoresWithPassed.length > 0) {
@@ -674,13 +675,27 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
674
675
  `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
675
676
  );
676
677
  }
677
- function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
678
+ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef) {
678
679
  return effect.Effect.gen(function* () {
679
680
  const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
680
681
  const rerunPassed = [];
681
682
  for (let r = 0; r < reruns; r++) {
682
683
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
683
684
  const started = Date.now();
685
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
686
+ n + 1,
687
+ n + 1
688
+ ]);
689
+ yield* publishEvent({
690
+ type: "TestCaseStarted",
691
+ runId: task.runId,
692
+ testCaseId: testCaseItem.id,
693
+ testCaseName: testCaseItem.testCase.getName(),
694
+ startedTestCases: startedEvaluations,
695
+ totalTestCases: totalEvaluations,
696
+ rerunIndex: r + 1,
697
+ rerunTotal: reruns
698
+ });
684
699
  const evaluatorScores = [];
685
700
  let testCaseError;
686
701
  const output = readOutput(testCaseItem.testCase);
@@ -689,20 +704,26 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
689
704
  if (!evaluateFn) {
690
705
  continue;
691
706
  }
707
+ const logs = [];
708
+ const logDiff = (expected, actual, options) => {
709
+ logs.push(createDiffLogEntry(expected, actual, options));
710
+ };
711
+ const log = (message, options) => {
712
+ logs.push(createLogEntry(message, options));
713
+ };
714
+ const createError = (message, options) => {
715
+ const entry = createLogEntry(message, options);
716
+ const error = message instanceof Error ? message : new Error(entry.message);
717
+ error[evaluatorErrorLogEntryKey] = entry;
718
+ return error;
719
+ };
692
720
  try {
693
- const logs = [];
694
- const logDiff = (expected, actual, options) => {
695
- logs.push(createDiffLogEntry(expected, actual, options));
696
- };
697
- const log = (message, options) => {
698
- logs.push(createLogEntry(message, options));
699
- };
700
721
  const ctx = yield* effect.Effect.promise(
701
722
  () => Promise.resolve(evaluator.resolveContext())
702
723
  );
703
724
  const result = yield* effect.Effect.promise(
704
- () => Promise.resolve(
705
- evaluateFn({
725
+ () => Promise.resolve().then(
726
+ () => evaluateFn({
706
727
  input: testCaseItem.testCase.getInput(),
707
728
  ctx,
708
729
  output,
@@ -712,10 +733,24 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
712
733
  datasetId: task.datasetId
713
734
  },
714
735
  logDiff,
715
- log
736
+ log,
737
+ createError
716
738
  })
717
739
  )
718
740
  );
741
+ if (result instanceof Error) {
742
+ const evaluatorError = result;
743
+ const taggedEntry = evaluatorError[evaluatorErrorLogEntryKey];
744
+ logs.push(taggedEntry ?? createLogEntry(result));
745
+ testCaseError = result.message;
746
+ evaluatorScores.push({
747
+ evaluatorId,
748
+ scores: [],
749
+ passed: false,
750
+ logs: logs.length > 0 ? logs : void 0
751
+ });
752
+ continue;
753
+ }
719
754
  const { scores, metrics } = normalizeResult(result);
720
755
  const passed2 = computeEvaluatorPassed(evaluator, result, scores);
721
756
  evaluatorScores.push({
@@ -726,11 +761,16 @@ function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent,
726
761
  logs: logs.length > 0 ? logs : void 0
727
762
  });
728
763
  } catch (error) {
764
+ if (error instanceof Error) {
765
+ const taggedEntry = error[evaluatorErrorLogEntryKey];
766
+ logs.push(taggedEntry ?? createLogEntry(error));
767
+ }
729
768
  testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
730
769
  evaluatorScores.push({
731
770
  evaluatorId,
732
771
  scores: [],
733
- passed: false
772
+ passed: false,
773
+ logs: logs.length > 0 ? logs : void 0
734
774
  });
735
775
  }
736
776
  }
@@ -801,6 +841,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
801
841
  );
802
842
  const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
803
843
  const completedRef = yield* effect.Ref.make(0);
844
+ const startedRef = yield* effect.Ref.make(0);
804
845
  const passedRef = yield* effect.Ref.make(0);
805
846
  const failedRef = yield* effect.Ref.make(0);
806
847
  const processTestCase = (testCaseItem) => processOneTestCase(
@@ -810,6 +851,7 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
810
851
  publishEvent,
811
852
  persistenceQueue,
812
853
  updateSnapshot,
854
+ startedRef,
813
855
  completedRef,
814
856
  passedRef,
815
857
  failedRef
@@ -1461,7 +1503,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1461
1503
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1462
1504
  return new Promise((resolve5, reject) => {
1463
1505
  const app = ink.render(
1464
- React2__default.default.createElement(GenerateView, {
1506
+ React2__namespace.default.createElement(GenerateView, {
1465
1507
  runner,
1466
1508
  datasetName,
1467
1509
  onComplete: (err) => {
@@ -1618,7 +1660,9 @@ function RunView({
1618
1660
  );
1619
1661
  const [runInfo, setRunInfo] = React2.useState(null);
1620
1662
  const [testCases, setTestCases] = React2.useState([]);
1663
+ const [startedEvaluations, setStartedEvaluations] = React2.useState(0);
1621
1664
  const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
1665
+ const [runningEvaluations, setRunningEvaluations] = React2.useState([]);
1622
1666
  const [summary, setSummary] = React2.useState(null);
1623
1667
  const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
1624
1668
  const runEval = React2.useCallback(async () => {
@@ -1655,6 +1699,25 @@ function RunView({
1655
1699
  let overallScoreCount = 0;
1656
1700
  const done = new Promise((resolve5) => {
1657
1701
  const unsubscribe = runner.subscribeRunEvents((event) => {
1702
+ if (event.type === "TestCaseStarted") {
1703
+ setStartedEvaluations(event.startedTestCases);
1704
+ setRunningEvaluations((prev) => {
1705
+ const withoutDuplicate = prev.filter(
1706
+ (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1707
+ );
1708
+ return [
1709
+ ...withoutDuplicate,
1710
+ {
1711
+ testCaseId: event.testCaseId,
1712
+ name: event.testCaseName,
1713
+ rerunIndex: event.rerunIndex,
1714
+ rerunTotal: event.rerunTotal,
1715
+ startedTestCases: event.startedTestCases,
1716
+ totalTestCases: event.totalTestCases
1717
+ }
1718
+ ];
1719
+ });
1720
+ }
1658
1721
  if (event.type === "TestCaseProgress") {
1659
1722
  for (const item of event.evaluatorScores) {
1660
1723
  const numeric = toNumericScoreFromScores(item.scores);
@@ -1714,12 +1777,18 @@ function RunView({
1714
1777
  rerunTotal: event.rerunTotal,
1715
1778
  durationMs: events.reduce((s, e) => s + e.durationMs, 0),
1716
1779
  passed: events.every((e) => e.passed),
1780
+ errorMessage: event.errorMessage,
1717
1781
  events,
1718
1782
  aggregatedEvaluatorScores,
1719
1783
  isAggregated
1720
1784
  };
1721
1785
  byId.set(event.testCaseId, merged);
1722
1786
  setCompletedEvaluations(event.completedTestCases);
1787
+ setRunningEvaluations(
1788
+ (running) => running.filter(
1789
+ (item) => !(item.testCaseId === event.testCaseId && item.rerunIndex === event.rerunIndex)
1790
+ )
1791
+ );
1723
1792
  return Array.from(byId.values());
1724
1793
  });
1725
1794
  }
@@ -1795,12 +1864,30 @@ function RunView({
1795
1864
  runInfo.totalTestCases
1796
1865
  ] })
1797
1866
  ] }),
1798
- phase === "running" && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginBottom: 1, children: /* @__PURE__ */ jsxRuntime.jsx(
1799
- Spinner,
1800
- {
1801
- label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0}`
1802
- }
1803
- ) }),
1867
+ phase === "running" && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 1, children: [
1868
+ /* @__PURE__ */ jsxRuntime.jsx(
1869
+ Spinner,
1870
+ {
1871
+ label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
1872
+ }
1873
+ ),
1874
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "yellow", children: [
1875
+ "[running ",
1876
+ item.startedTestCases,
1877
+ "/",
1878
+ item.totalTestCases,
1879
+ "] ",
1880
+ item.name,
1881
+ " ",
1882
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1883
+ "(",
1884
+ item.rerunIndex,
1885
+ "/",
1886
+ item.rerunTotal,
1887
+ ")"
1888
+ ] })
1889
+ ] }, `${item.testCaseId}:${item.rerunIndex}`)) })
1890
+ ] }),
1804
1891
  testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
1805
1892
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
1806
1893
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
@@ -1824,8 +1911,13 @@ function RunView({
1824
1911
  " (",
1825
1912
  tc.durationMs,
1826
1913
  "ms)"
1827
- ] })
1914
+ ] }),
1915
+ tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "red", bold: true, children: [
1916
+ " ",
1917
+ "ERROR"
1918
+ ] }) : null
1828
1919
  ] }),
1920
+ tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: tc.errorMessage }) : null,
1829
1921
  tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
1830
1922
  ink.Box,
1831
1923
  {
@@ -2270,9 +2362,11 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2270
2362
  let overallScoreTotal = 0;
2271
2363
  let overallScoreSumSq = 0;
2272
2364
  let overallScoreCount = 0;
2365
+ let startedCount = 0;
2273
2366
  let completedCount = 0;
2274
2367
  let totalCount = 0;
2275
2368
  let runFinished = false;
2369
+ const inFlightReruns = /* @__PURE__ */ new Set();
2276
2370
  const spinnerFrames = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
2277
2371
  let spinnerIndex = 0;
2278
2372
  function clearLine() {
@@ -2296,7 +2390,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2296
2390
  `\r${colorize(frame, ansi2.cyan)} Running evaluations ${colorize(
2297
2391
  `${completedCount}/${totalCount}`,
2298
2392
  ansi2.bold
2299
- )} ${colorize("(live)", ansi2.dim)}`
2393
+ )} completed ${colorize(`${startedCount}/${totalCount}`, ansi2.bold)} started ${colorize(`(${inFlightReruns.size} running)`, ansi2.dim)}`
2300
2394
  );
2301
2395
  }
2302
2396
  let lastPrintedTestCaseId = null;
@@ -2304,8 +2398,19 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2304
2398
  let spinnerTimer;
2305
2399
  const done = new Promise((resolve5) => {
2306
2400
  const unsubscribe = runner.subscribeRunEvents((event) => {
2401
+ if (event.type === "TestCaseStarted") {
2402
+ startedCount = event.startedTestCases;
2403
+ inFlightReruns.add(`${event.testCaseId}:${event.rerunIndex}`);
2404
+ clearLine();
2405
+ process.stdout.write(
2406
+ `${colorize(`[started ${event.startedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize("(running)", ansi2.dim)}
2407
+ `
2408
+ );
2409
+ drawSpinner();
2410
+ }
2307
2411
  if (event.type === "TestCaseProgress") {
2308
2412
  completedCount = event.completedTestCases;
2413
+ inFlightReruns.delete(`${event.testCaseId}:${event.rerunIndex}`);
2309
2414
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
2310
2415
  const averageScore = numericScores.length > 0 ? numericScores.reduce((sum, value) => sum + value, 0) / numericScores.length : void 0;
2311
2416
  const testCaseId = event.testCaseId;
@@ -2363,9 +2468,13 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2363
2468
  0
2364
2469
  );
2365
2470
  const lines = [];
2471
+ const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
2366
2472
  lines.push(
2367
- `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}`
2473
+ `${colorize(`[${event.completedTestCases}/${event.totalTestCases}]`, ansi2.cyan)} ${event.testCaseName} ${colorize(`(${event.rerunIndex}/${event.rerunTotal})`, ansi2.cyan)} ${colorize(`(${durationMs}ms)`, ansi2.dim)}${statusSuffix}`
2368
2474
  );
2475
+ if (event.errorMessage) {
2476
+ lines.push(colorize(event.errorMessage, ansi2.red));
2477
+ }
2369
2478
  for (const item of aggregatedScores) {
2370
2479
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2371
2480
  lines.push(
@@ -2509,7 +2618,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
2509
2618
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
2510
2619
  return new Promise((resolve5, reject) => {
2511
2620
  const app = ink.render(
2512
- React2__default.default.createElement(RunView, {
2621
+ React2__namespace.createElement(RunView, {
2513
2622
  runner,
2514
2623
  datasetName,
2515
2624
  evaluatorPattern,