@m4trix/evals 0.24.0 → 0.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,7 +10,7 @@ var promises = require('fs/promises');
10
10
  var url = require('url');
11
11
  var diff = require('diff');
12
12
  var stringify = require('fast-json-stable-stringify');
13
- var React2 = require('react');
13
+ var React = require('react');
14
14
  var ink = require('ink');
15
15
  var jsxRuntime = require('react/jsx-runtime');
16
16
 
@@ -37,25 +37,15 @@ function _interopNamespace(e) {
37
37
 
38
38
  var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
39
39
  var stringify__default = /*#__PURE__*/_interopDefault(stringify);
40
- var React2__namespace = /*#__PURE__*/_interopNamespace(React2);
40
+ var React__namespace = /*#__PURE__*/_interopNamespace(React);
41
41
 
42
42
  // src/runner/config.ts
43
43
  var defaultRunnerConfig = {
44
44
  discovery: {
45
45
  rootDir: process.cwd(),
46
46
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
47
- evaluatorSuffixes: [
48
- ".evaluator.ts",
49
- ".evaluator.tsx",
50
- ".evaluator.js",
51
- ".evaluator.mjs"
52
- ],
53
- testCaseSuffixes: [
54
- ".test-case.ts",
55
- ".test-case.tsx",
56
- ".test-case.js",
57
- ".test-case.mjs"
58
- ],
47
+ evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
48
+ testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
59
49
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
60
50
  },
61
51
  artifactDirectory: ".eval-results",
@@ -122,14 +112,15 @@ function getJitiLoader() {
122
112
  }
123
113
  const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
124
114
  if (typeof createJiti2 !== "function") {
125
- throw new Error(
126
- "Failed to initialize jiti for m4trix eval config loading."
127
- );
115
+ throw new Error("Failed to initialize jiti for m4trix eval config loading.");
128
116
  }
129
- cachedLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
130
- interopDefault: true,
131
- moduleCache: true
132
- });
117
+ cachedLoader = createJiti2(
118
+ (typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
119
+ {
120
+ interopDefault: true,
121
+ moduleCache: true
122
+ }
123
+ );
133
124
  return cachedLoader;
134
125
  }
135
126
  function resolveConfigModuleExport(loadedModule) {
@@ -233,9 +224,7 @@ async function loadModuleExports(filePath) {
233
224
  }
234
225
  async function collectDatasetsFromFiles(config) {
235
226
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
236
- const matched = files.filter(
237
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
238
- );
227
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
239
228
  const found = await Promise.all(
240
229
  matched.map(async (absolutePath) => {
241
230
  const exports = await loadModuleExports(absolutePath);
@@ -252,9 +241,7 @@ async function collectDatasetsFromFiles(config) {
252
241
  }
253
242
  async function collectEvaluatorsFromFiles(config) {
254
243
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
255
- const matched = files.filter(
256
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
257
- );
244
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
258
245
  const found = await Promise.all(
259
246
  matched.map(async (absolutePath) => {
260
247
  const exports = await loadModuleExports(absolutePath);
@@ -271,9 +258,7 @@ async function collectEvaluatorsFromFiles(config) {
271
258
  }
272
259
  async function collectTestCasesFromFiles(config) {
273
260
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
274
- const matched = files.filter(
275
- (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
276
- );
261
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
277
262
  const found = await Promise.all(
278
263
  matched.map(async (absolutePath) => {
279
264
  const exports = await loadModuleExports(absolutePath);
@@ -345,16 +330,8 @@ function createDiffString(expected, actual, diffOptions) {
345
330
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
346
331
  const actualProcessed = preprocessForDiff(actual, diffOptions);
347
332
  if (diffOptions?.keysOnly) {
348
- const expectedKeys = JSON.stringify(
349
- extractKeys(expectedProcessed),
350
- null,
351
- 2
352
- );
353
- const actualKeys = JSON.stringify(
354
- extractKeys(actualProcessed),
355
- null,
356
- 2
357
- );
333
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
334
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
358
335
  const parts2 = diff.diffLines(expectedKeys, actualKeys);
359
336
  return formatDiffParts(parts2);
360
337
  }
@@ -365,9 +342,7 @@ function createDiffString(expected, actual, diffOptions) {
365
342
  }
366
343
  const parts = diff.diffLines(expectedStr, actualStr);
367
344
  if (diffOptions?.outputNewOnly) {
368
- const filtered = parts.filter(
369
- (p) => p.added === true
370
- );
345
+ const filtered = parts.filter((p) => p.added === true);
371
346
  return formatDiffParts(filtered);
372
347
  }
373
348
  return formatDiffParts(parts);
@@ -469,10 +444,7 @@ var ScoreAggregate = {
469
444
  const count = values.length || 1;
470
445
  const result = {};
471
446
  for (const field of fields) {
472
- result[field] = values.reduce(
473
- (s, v) => s + (v[field] ?? 0),
474
- 0
475
- ) / count;
447
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
476
448
  }
477
449
  return result;
478
450
  };
@@ -506,13 +478,10 @@ var ScoreAggregate = {
506
478
  (s, v) => s + (v[valueField] ?? 0),
507
479
  0
508
480
  );
509
- const sumSq = values.reduce(
510
- (s, v) => {
511
- const value = v[valueField] ?? 0;
512
- return s + value * value;
513
- },
514
- 0
515
- );
481
+ const sumSq = values.reduce((s, v) => {
482
+ const value = v[valueField] ?? 0;
483
+ return s + value * value;
484
+ }, 0);
516
485
  const mean = sum / count;
517
486
  const variance = (sumSq - count * mean * mean) / (count - 1);
518
487
  stdDev = variance > 0 ? Math.sqrt(variance) : 0;
@@ -780,20 +749,14 @@ function nowIsoForFile() {
780
749
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
781
750
  }
782
751
  function createArtifactPath(artifactDirectory, datasetId, runId) {
783
- return path.join(
784
- artifactDirectory,
785
- `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
786
- );
752
+ return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
787
753
  }
788
754
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
789
755
  const { testCaseItem, rerunIndex, rerunTotal } = unit;
790
756
  return effect.Effect.gen(function* () {
791
757
  const evaluatorRunId = `run-${crypto.randomUUID()}`;
792
758
  const started = Date.now();
793
- const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
794
- n + 1,
795
- n + 1
796
- ]);
759
+ const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
797
760
  yield* publishEvent({
798
761
  type: "TestCaseStarted",
799
762
  runId: task.runId,
@@ -826,9 +789,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
826
789
  return error;
827
790
  };
828
791
  try {
829
- const ctx = yield* effect.Effect.promise(
830
- () => Promise.resolve(evaluator.resolveContext())
831
- );
792
+ const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
832
793
  const result = yield* effect.Effect.promise(
833
794
  () => Promise.resolve().then(
834
795
  () => evaluateFn({
@@ -883,10 +844,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
883
844
  }
884
845
  }
885
846
  const rerunPassedThis = evaluatorScores.every((s) => s.passed);
886
- const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
887
- n + 1,
888
- n + 1
889
- ]);
847
+ const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
890
848
  const progressEvent = {
891
849
  type: "TestCaseProgress",
892
850
  runId: task.runId,
@@ -935,10 +893,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
935
893
  } else {
936
894
  yield* effect.Ref.update(failedRef, (n) => n + 1);
937
895
  }
938
- const [passed, failed] = yield* effect.Effect.all([
939
- effect.Ref.get(passedRef),
940
- effect.Ref.get(failedRef)
941
- ]);
896
+ const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
942
897
  yield* updateSnapshot(task.runId, (snapshot) => ({
943
898
  ...snapshot,
944
899
  passedTestCases: passed,
@@ -1258,15 +1213,11 @@ var EffectRunner = class {
1258
1213
  this.persistenceQueue = effect.Effect.runSync(
1259
1214
  effect.Queue.unbounded()
1260
1215
  );
1261
- this.snapshotsRef = effect.Effect.runSync(
1262
- effect.Ref.make(/* @__PURE__ */ new Map())
1263
- );
1216
+ this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
1264
1217
  this.listeners = /* @__PURE__ */ new Set();
1265
1218
  this.datasetsById = /* @__PURE__ */ new Map();
1266
1219
  this.evaluatorsById = /* @__PURE__ */ new Map();
1267
- this.schedulerFiber = effect.Effect.runFork(
1268
- this.createSchedulerEffect()
1269
- );
1220
+ this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
1270
1221
  this.persistenceFiber = effect.Effect.runFork(
1271
1222
  createPersistenceWorker(this.persistenceQueue)
1272
1223
  );
@@ -1413,9 +1364,9 @@ var EffectRunner = class {
1413
1364
  return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
1414
1365
  }
1415
1366
  getAllRunSnapshots() {
1416
- return Array.from(
1417
- effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()
1418
- ).sort((a, b) => b.queuedAt - a.queuedAt);
1367
+ return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
1368
+ (a, b) => b.queuedAt - a.queuedAt
1369
+ );
1419
1370
  }
1420
1371
  async loadRunSnapshotsFromArtifacts() {
1421
1372
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1557,9 +1508,9 @@ function GenerateView({
1557
1508
  datasetName,
1558
1509
  onComplete
1559
1510
  }) {
1560
- const [result, setResult] = React2.useState(null);
1561
- const [error, setError] = React2.useState(null);
1562
- React2.useEffect(() => {
1511
+ const [result, setResult] = React.useState(null);
1512
+ const [error, setError] = React.useState(null);
1513
+ React.useEffect(() => {
1563
1514
  let cancelled = false;
1564
1515
  async function run() {
1565
1516
  const dataset = await runner.resolveDatasetByName(datasetName);
@@ -1582,12 +1533,8 @@ function GenerateView({
1582
1533
  const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
1583
1534
  const parsed = parse2(absoluteDatasetPath);
1584
1535
  const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
1585
- await writeFile2(
1586
- outputPath,
1587
- `${JSON.stringify(payload, null, 2)}
1588
- `,
1589
- "utf8"
1590
- );
1536
+ await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}
1537
+ `, "utf8");
1591
1538
  if (!cancelled) {
1592
1539
  setResult({
1593
1540
  count: payload.length,
@@ -1658,7 +1605,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1658
1605
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1659
1606
  return new Promise((resolve5, reject) => {
1660
1607
  const app = ink.render(
1661
- React2__namespace.default.createElement(GenerateView, {
1608
+ React__namespace.default.createElement(GenerateView, {
1662
1609
  runner,
1663
1610
  datasetName,
1664
1611
  onComplete: (err) => {
@@ -1708,8 +1655,8 @@ function TextBar({
1708
1655
  }
1709
1656
  var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
1710
1657
  function Spinner({ label = "Running" }) {
1711
- const [frame, setFrame] = React2.useState(0);
1712
- React2.useEffect(() => {
1658
+ const [frame, setFrame] = React.useState(0);
1659
+ React.useEffect(() => {
1713
1660
  const timer = setInterval(() => {
1714
1661
  setFrame((f) => (f + 1) % FRAMES.length);
1715
1662
  }, 100);
@@ -1743,9 +1690,7 @@ function createBar(value, max = 100, width = 20) {
1743
1690
  function aggregateEvaluatorScores(events, nameById) {
1744
1691
  if (events.length === 0)
1745
1692
  return [];
1746
- const evaluatorIds = new Set(
1747
- events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
1748
- );
1693
+ const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
1749
1694
  const result = [];
1750
1695
  for (const evaluatorId of evaluatorIds) {
1751
1696
  const scoreIdToItems = /* @__PURE__ */ new Map();
@@ -1775,9 +1720,7 @@ function aggregateEvaluatorScores(events, nameById) {
1775
1720
  return es?.passed ?? false;
1776
1721
  });
1777
1722
  const lastEvent = events[events.length - 1];
1778
- const lastEs = lastEvent?.evaluatorScores.find(
1779
- (x) => x.evaluatorId === evaluatorId
1780
- );
1723
+ const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1781
1724
  result.push({
1782
1725
  evaluatorId,
1783
1726
  evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
@@ -1811,17 +1754,15 @@ function RunView({
1811
1754
  concurrency,
1812
1755
  onComplete
1813
1756
  }) {
1814
- const [phase, setPhase] = React2.useState(
1815
- "loading"
1816
- );
1817
- const [runInfo, setRunInfo] = React2.useState(null);
1818
- const [testCases, setTestCases] = React2.useState([]);
1819
- const [startedEvaluations, setStartedEvaluations] = React2.useState(0);
1820
- const [completedEvaluations, setCompletedEvaluations] = React2.useState(0);
1821
- const [runningEvaluations, setRunningEvaluations] = React2.useState([]);
1822
- const [summary, setSummary] = React2.useState(null);
1823
- const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
1824
- const runEval = React2.useCallback(async () => {
1757
+ const [phase, setPhase] = React.useState("loading");
1758
+ const [runInfo, setRunInfo] = React.useState(null);
1759
+ const [testCases, setTestCases] = React.useState([]);
1760
+ const [startedEvaluations, setStartedEvaluations] = React.useState(0);
1761
+ const [completedEvaluations, setCompletedEvaluations] = React.useState(0);
1762
+ const [runningEvaluations, setRunningEvaluations] = React.useState([]);
1763
+ const [summary, setSummary] = React.useState(null);
1764
+ const [evaluatorNameById, setEvaluatorNameById] = React.useState(/* @__PURE__ */ new Map());
1765
+ const runEval = React.useCallback(async () => {
1825
1766
  const dataset = await runner.resolveDatasetByName(datasetName);
1826
1767
  if (!dataset) {
1827
1768
  const known = await runner.collectDatasets();
@@ -1920,10 +1861,7 @@ function RunView({
1920
1861
  };
1921
1862
  const events = existing ? [...existing.events, newEvent] : [newEvent];
1922
1863
  const isAggregated = events.length > 1;
1923
- const aggregatedEvaluatorScores = aggregateEvaluatorScores(
1924
- events,
1925
- nameById
1926
- );
1864
+ const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
1927
1865
  const merged = {
1928
1866
  name: event.testCaseName,
1929
1867
  testCaseId: event.testCaseId,
@@ -1986,7 +1924,7 @@ function RunView({
1986
1924
  setPhase("completed");
1987
1925
  setTimeout(() => onComplete(), 200);
1988
1926
  }, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
1989
- React2.useEffect(() => {
1927
+ React.useEffect(() => {
1990
1928
  void runEval();
1991
1929
  }, [runEval]);
1992
1930
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
@@ -2028,30 +1966,22 @@ function RunView({
2028
1966
  label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
2029
1967
  }
2030
1968
  ),
2031
- runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
2032
- ink.Text,
2033
- {
2034
- color: "yellow",
2035
- children: [
2036
- "[running ",
2037
- item.startedTestCases,
2038
- "/",
2039
- item.totalTestCases,
2040
- "]",
2041
- " ",
2042
- item.name,
2043
- " ",
2044
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2045
- "(",
2046
- item.rerunIndex,
2047
- "/",
2048
- item.rerunTotal,
2049
- ")"
2050
- ] })
2051
- ]
2052
- },
2053
- `${item.testCaseId}:${item.rerunIndex}`
2054
- )) })
1969
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "yellow", children: [
1970
+ "[running ",
1971
+ item.startedTestCases,
1972
+ "/",
1973
+ item.totalTestCases,
1974
+ "] ",
1975
+ item.name,
1976
+ " ",
1977
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
1978
+ "(",
1979
+ item.rerunIndex,
1980
+ "/",
1981
+ item.rerunTotal,
1982
+ ")"
1983
+ ] })
1984
+ ] }, `${item.testCaseId}:${item.rerunIndex}`)) })
2055
1985
  ] }),
2056
1986
  testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
2057
1987
  /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
@@ -2083,73 +2013,63 @@ function RunView({
2083
2013
  ] }) : null
2084
2014
  ] }),
2085
2015
  tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: tc.errorMessage }) : null,
2086
- tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
2087
- ink.Box,
2088
- {
2089
- flexDirection: "column",
2090
- marginLeft: 2,
2091
- children: [
2092
- /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
2093
- item.evaluatorName,
2094
- ":",
2095
- " ",
2096
- /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
2097
- item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2016
+ tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
2017
+ /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
2018
+ item.evaluatorName,
2019
+ ":",
2020
+ " ",
2021
+ /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
2022
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
2023
+ " ",
2024
+ item.metrics.map((m) => {
2025
+ const def = getMetricById(m.id);
2026
+ if (!def)
2027
+ return null;
2028
+ const formatted = def.format(m.data, {
2029
+ isAggregated: tc.isAggregated
2030
+ });
2031
+ const label = m.name ?? def.name;
2032
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2033
+ "[",
2034
+ label ? `${label}: ` : "",
2035
+ formatted,
2036
+ "]",
2037
+ " "
2038
+ ] }, m.id);
2039
+ })
2040
+ ] }) : null
2041
+ ] }),
2042
+ item.scores.length > 0 ? item.scores.map((s, idx) => {
2043
+ const def = s.def ?? getScoreById(s.id);
2044
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2045
+ return /* @__PURE__ */ jsxRuntime.jsxs(
2046
+ ink.Text,
2047
+ {
2048
+ color: scoreColor(toNumericScore(s.data) ?? 0),
2049
+ children: [
2050
+ " ",
2051
+ scoreLabel,
2052
+ ":",
2098
2053
  " ",
2099
- item.metrics.map((m) => {
2100
- const def = getMetricById(m.id);
2101
- if (!def)
2102
- return null;
2103
- const formatted = def.format(m.data, {
2104
- isAggregated: tc.isAggregated
2105
- });
2106
- const label = m.name ?? def.name;
2107
- return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2108
- "[",
2109
- label ? `${label}: ` : "",
2110
- formatted,
2111
- "]",
2112
- " "
2113
- ] }, m.id);
2054
+ formatScorePart(s, scoreColor, {
2055
+ isAggregated: tc.isAggregated
2114
2056
  })
2115
- ] }) : null
2116
- ] }),
2117
- item.scores.length > 0 ? item.scores.map((s, idx) => {
2118
- const def = s.def ?? getScoreById(s.id);
2119
- const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2120
- return /* @__PURE__ */ jsxRuntime.jsxs(
2121
- ink.Text,
2122
- {
2123
- color: scoreColor(toNumericScore(s.data) ?? 0),
2124
- children: [
2125
- " ",
2126
- scoreLabel,
2127
- ":",
2128
- " ",
2129
- formatScorePart(s, scoreColor, {
2130
- isAggregated: tc.isAggregated
2131
- })
2132
- ]
2133
- },
2134
- `${item.evaluatorId}-${s.id}-${idx}`
2135
- );
2136
- }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
2137
- !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
2138
- (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
2139
- ({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
2140
- ink.Text,
2141
- {
2142
- color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
2143
- children: line
2144
- },
2145
- lineIdx
2146
- )
2147
- ) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
2148
- ) })
2149
- ]
2150
- },
2151
- item.evaluatorId
2152
- ))
2057
+ ]
2058
+ },
2059
+ `${item.evaluatorId}-${s.id}-${idx}`
2060
+ );
2061
+ }) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
2062
+ !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
2063
+ (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
2064
+ ink.Text,
2065
+ {
2066
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
2067
+ children: line
2068
+ },
2069
+ lineIdx
2070
+ )) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
2071
+ ) })
2072
+ ] }, item.evaluatorId))
2153
2073
  ] }, tc.testCaseId)) }),
2154
2074
  phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
2155
2075
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
@@ -2191,9 +2111,9 @@ function RunView({
2191
2111
  /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
2192
2112
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
2193
2113
  const agg = summary.aggregates.get(id);
2194
- const scoreKeys = [
2195
- ...summary.scoreItemsByEvaluatorScore?.keys() ?? []
2196
- ].filter((k) => k.startsWith(`${id}:`));
2114
+ const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
2115
+ (k) => k.startsWith(`${id}:`)
2116
+ );
2197
2117
  if (scoreKeys.length === 0) {
2198
2118
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
2199
2119
  "- ",
@@ -2223,19 +2143,12 @@ function RunView({
2223
2143
  const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
2224
2144
  const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
2225
2145
  const numeric = toNumericScore(aggregated.data);
2226
- return /* @__PURE__ */ jsxRuntime.jsxs(
2227
- ink.Text,
2228
- {
2229
- color: numeric !== void 0 ? scoreColor(numeric) : "gray",
2230
- children: [
2231
- " ",
2232
- label,
2233
- ": ",
2234
- formatted
2235
- ]
2236
- },
2237
- key
2238
- );
2146
+ return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: numeric !== void 0 ? scoreColor(numeric) : "gray", children: [
2147
+ " ",
2148
+ label,
2149
+ ": ",
2150
+ formatted
2151
+ ] }, key);
2239
2152
  })
2240
2153
  ] }, id);
2241
2154
  })
@@ -2311,9 +2224,7 @@ function buildTestCaseSummaries(byId) {
2311
2224
  for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
2312
2225
  const scoreIdToItems = /* @__PURE__ */ new Map();
2313
2226
  for (const ev of events) {
2314
- const es = ev.evaluatorScores.find(
2315
- (x) => x.evaluatorId === evaluatorScores.evaluatorId
2316
- );
2227
+ const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorScores.evaluatorId);
2317
2228
  for (const s of es?.scores ?? []) {
2318
2229
  const list = scoreIdToItems.get(s.id) ?? [];
2319
2230
  list.push(s);
@@ -2366,9 +2277,7 @@ function scoreToColor(score) {
2366
2277
  }
2367
2278
  function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
2368
2279
  const lines = [];
2369
- const scoreKeys = [...scoreItemsByKey.keys()].filter(
2370
- (k) => k.startsWith(`${evaluatorId}:`)
2371
- );
2280
+ const scoreKeys = [...scoreItemsByKey.keys()].filter((k) => k.startsWith(`${evaluatorId}:`));
2372
2281
  if (scoreKeys.length === 0) {
2373
2282
  lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
2374
2283
  return lines;
@@ -2403,9 +2312,7 @@ function createBar2(value, max = 100, width = 20) {
2403
2312
  function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
2404
2313
  if (events.length === 0)
2405
2314
  return [];
2406
- const evaluatorIds = new Set(
2407
- events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
2408
- );
2315
+ const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
2409
2316
  const result = [];
2410
2317
  for (const evaluatorId of evaluatorIds) {
2411
2318
  const scoreIdToItems = /* @__PURE__ */ new Map();
@@ -2452,9 +2359,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2452
2359
  if (def) {
2453
2360
  const formatted = def.format(m.data, options);
2454
2361
  const label = m.name ?? def.name;
2455
- metricParts.push(
2456
- label ? `[${label}: ${formatted}]` : `[${formatted}]`
2457
- );
2362
+ metricParts.push(label ? `[${label}: ${formatted}]` : `[${formatted}]`);
2458
2363
  }
2459
2364
  }
2460
2365
  }
@@ -2628,10 +2533,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2628
2533
  const aggregatedScores = aggregateEvaluatorScoresFromEvents(
2629
2534
  existing.events);
2630
2535
  const isAggregated = existing.events.length > 1;
2631
- const durationMs = existing.events.reduce(
2632
- (s, e) => s + e.durationMs,
2633
- 0
2634
- );
2536
+ const durationMs = existing.events.reduce((s, e) => s + e.durationMs, 0);
2635
2537
  const lines = [];
2636
2538
  const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
2637
2539
  lines.push(
@@ -2643,18 +2545,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2643
2545
  for (const item of aggregatedScores) {
2644
2546
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2645
2547
  lines.push(
2646
- ...formatEvaluatorScoreLine(
2647
- name,
2648
- item.scores,
2649
- item.passed,
2650
- item.metrics,
2651
- { isAggregated }
2652
- )
2548
+ ...formatEvaluatorScoreLine(name, item.scores, item.passed, item.metrics, {
2549
+ isAggregated
2550
+ })
2653
2551
  );
2654
2552
  const lastEvent = existing.events[existing.events.length - 1];
2655
- const lastEs = lastEvent?.evaluatorScores.find(
2656
- (x) => x.evaluatorId === item.evaluatorId
2657
- );
2553
+ const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === item.evaluatorId);
2658
2554
  if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
2659
2555
  for (const log of lastEs.logs) {
2660
2556
  if (log.type === "diff") {
@@ -2701,9 +2597,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2701
2597
  console.log(
2702
2598
  `Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
2703
2599
  );
2704
- console.log(
2705
- `Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`
2706
- );
2600
+ console.log(`Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`);
2707
2601
  console.log("");
2708
2602
  drawSpinner();
2709
2603
  spinnerTimer = setInterval(drawSpinner, 100);
@@ -2718,10 +2612,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2718
2612
  console.log("");
2719
2613
  console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
2720
2614
  console.log(
2721
- `- passed: ${colorize(
2722
- `${completed.passedTestCases}/${completed.totalTestCases}`,
2723
- ansi2.green
2724
- )}`
2615
+ `- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
2725
2616
  );
2726
2617
  console.log(
2727
2618
  `- failed: ${colorize(
@@ -2731,11 +2622,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2731
2622
  );
2732
2623
  if (overallScoreCount > 0) {
2733
2624
  const overallAverage = overallScoreTotal / overallScoreCount;
2734
- const overallSd = sampleStdDev2(
2735
- overallScoreTotal,
2736
- overallScoreSumSq,
2737
- overallScoreCount
2738
- );
2625
+ const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
2739
2626
  const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
2740
2627
  console.log(
2741
2628
  `- overall avg score: ${colorize(
@@ -2784,7 +2671,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2784
2671
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
2785
2672
  return new Promise((resolve5, reject) => {
2786
2673
  const app = ink.render(
2787
- React2__namespace.createElement(RunView, {
2674
+ React__namespace.createElement(RunView, {
2788
2675
  runner,
2789
2676
  datasetName,
2790
2677
  evaluatorPattern,