@m4trix/evals 0.25.0 → 0.25.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,8 +8,8 @@ import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
8
8
  import { pathToFileURL } from 'url';
9
9
  import { diffLines } from 'diff';
10
10
  import stringify from 'fast-json-stable-stringify';
11
- import * as React2 from 'react';
12
- import React2__default, { useState, useEffect, useCallback } from 'react';
11
+ import * as React from 'react';
12
+ import React__default, { useState, useEffect, useCallback } from 'react';
13
13
  import { render, Box, Text } from 'ink';
14
14
  import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
15
15
 
@@ -18,18 +18,8 @@ var defaultRunnerConfig = {
18
18
  discovery: {
19
19
  rootDir: process.cwd(),
20
20
  datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
21
- evaluatorSuffixes: [
22
- ".evaluator.ts",
23
- ".evaluator.tsx",
24
- ".evaluator.js",
25
- ".evaluator.mjs"
26
- ],
27
- testCaseSuffixes: [
28
- ".test-case.ts",
29
- ".test-case.tsx",
30
- ".test-case.js",
31
- ".test-case.mjs"
32
- ],
21
+ evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
22
+ testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
33
23
  excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
34
24
  },
35
25
  artifactDirectory: ".eval-results",
@@ -96,14 +86,15 @@ function getJitiLoader() {
96
86
  }
97
87
  const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
98
88
  if (typeof createJiti2 !== "function") {
99
- throw new Error(
100
- "Failed to initialize jiti for m4trix eval config loading."
101
- );
89
+ throw new Error("Failed to initialize jiti for m4trix eval config loading.");
102
90
  }
103
- cachedLoader = createJiti2(import.meta.url, {
104
- interopDefault: true,
105
- moduleCache: true
106
- });
91
+ cachedLoader = createJiti2(
92
+ import.meta.url,
93
+ {
94
+ interopDefault: true,
95
+ moduleCache: true
96
+ }
97
+ );
107
98
  return cachedLoader;
108
99
  }
109
100
  function resolveConfigModuleExport(loadedModule) {
@@ -207,9 +198,7 @@ async function loadModuleExports(filePath) {
207
198
  }
208
199
  async function collectDatasetsFromFiles(config) {
209
200
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
210
- const matched = files.filter(
211
- (filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
212
- );
201
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
213
202
  const found = await Promise.all(
214
203
  matched.map(async (absolutePath) => {
215
204
  const exports = await loadModuleExports(absolutePath);
@@ -226,9 +215,7 @@ async function collectDatasetsFromFiles(config) {
226
215
  }
227
216
  async function collectEvaluatorsFromFiles(config) {
228
217
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
229
- const matched = files.filter(
230
- (filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
231
- );
218
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
232
219
  const found = await Promise.all(
233
220
  matched.map(async (absolutePath) => {
234
221
  const exports = await loadModuleExports(absolutePath);
@@ -245,9 +232,7 @@ async function collectEvaluatorsFromFiles(config) {
245
232
  }
246
233
  async function collectTestCasesFromFiles(config) {
247
234
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
248
- const matched = files.filter(
249
- (filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
250
- );
235
+ const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
251
236
  const found = await Promise.all(
252
237
  matched.map(async (absolutePath) => {
253
238
  const exports = await loadModuleExports(absolutePath);
@@ -319,16 +304,8 @@ function createDiffString(expected, actual, diffOptions) {
319
304
  const expectedProcessed = preprocessForDiff(expected, diffOptions);
320
305
  const actualProcessed = preprocessForDiff(actual, diffOptions);
321
306
  if (diffOptions?.keysOnly) {
322
- const expectedKeys = JSON.stringify(
323
- extractKeys(expectedProcessed),
324
- null,
325
- 2
326
- );
327
- const actualKeys = JSON.stringify(
328
- extractKeys(actualProcessed),
329
- null,
330
- 2
331
- );
307
+ const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
308
+ const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
332
309
  const parts2 = diffLines(expectedKeys, actualKeys);
333
310
  return formatDiffParts(parts2);
334
311
  }
@@ -339,9 +316,7 @@ function createDiffString(expected, actual, diffOptions) {
339
316
  }
340
317
  const parts = diffLines(expectedStr, actualStr);
341
318
  if (diffOptions?.outputNewOnly) {
342
- const filtered = parts.filter(
343
- (p) => p.added === true
344
- );
319
+ const filtered = parts.filter((p) => p.added === true);
345
320
  return formatDiffParts(filtered);
346
321
  }
347
322
  return formatDiffParts(parts);
@@ -443,10 +418,7 @@ var ScoreAggregate = {
443
418
  const count = values.length || 1;
444
419
  const result = {};
445
420
  for (const field of fields) {
446
- result[field] = values.reduce(
447
- (s, v) => s + (v[field] ?? 0),
448
- 0
449
- ) / count;
421
+ result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
450
422
  }
451
423
  return result;
452
424
  };
@@ -480,13 +452,10 @@ var ScoreAggregate = {
480
452
  (s, v) => s + (v[valueField] ?? 0),
481
453
  0
482
454
  );
483
- const sumSq = values.reduce(
484
- (s, v) => {
485
- const value = v[valueField] ?? 0;
486
- return s + value * value;
487
- },
488
- 0
489
- );
455
+ const sumSq = values.reduce((s, v) => {
456
+ const value = v[valueField] ?? 0;
457
+ return s + value * value;
458
+ }, 0);
490
459
  const mean = sum / count;
491
460
  const variance = (sumSq - count * mean * mean) / (count - 1);
492
461
  stdDev = variance > 0 ? Math.sqrt(variance) : 0;
@@ -754,20 +723,14 @@ function nowIsoForFile() {
754
723
  return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
755
724
  }
756
725
  function createArtifactPath(artifactDirectory, datasetId, runId) {
757
- return join(
758
- artifactDirectory,
759
- `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
760
- );
726
+ return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
761
727
  }
762
728
  function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
763
729
  const { testCaseItem, rerunIndex, rerunTotal } = unit;
764
730
  return Effect.gen(function* () {
765
731
  const evaluatorRunId = `run-${randomUUID()}`;
766
732
  const started = Date.now();
767
- const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
768
- n + 1,
769
- n + 1
770
- ]);
733
+ const startedEvaluations = yield* Ref.modify(startedRef, (n) => [n + 1, n + 1]);
771
734
  yield* publishEvent({
772
735
  type: "TestCaseStarted",
773
736
  runId: task.runId,
@@ -800,9 +763,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
800
763
  return error;
801
764
  };
802
765
  try {
803
- const ctx = yield* Effect.promise(
804
- () => Promise.resolve(evaluator.resolveContext())
805
- );
766
+ const ctx = yield* Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
806
767
  const result = yield* Effect.promise(
807
768
  () => Promise.resolve().then(
808
769
  () => evaluateFn({
@@ -857,10 +818,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
857
818
  }
858
819
  }
859
820
  const rerunPassedThis = evaluatorScores.every((s) => s.passed);
860
- const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
861
- n + 1,
862
- n + 1
863
- ]);
821
+ const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
864
822
  const progressEvent = {
865
823
  type: "TestCaseProgress",
866
824
  runId: task.runId,
@@ -909,10 +867,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
909
867
  } else {
910
868
  yield* Ref.update(failedRef, (n) => n + 1);
911
869
  }
912
- const [passed, failed] = yield* Effect.all([
913
- Ref.get(passedRef),
914
- Ref.get(failedRef)
915
- ]);
870
+ const [passed, failed] = yield* Effect.all([Ref.get(passedRef), Ref.get(failedRef)]);
916
871
  yield* updateSnapshot(task.runId, (snapshot) => ({
917
872
  ...snapshot,
918
873
  passedTestCases: passed,
@@ -1232,15 +1187,11 @@ var EffectRunner = class {
1232
1187
  this.persistenceQueue = Effect.runSync(
1233
1188
  Queue.unbounded()
1234
1189
  );
1235
- this.snapshotsRef = Effect.runSync(
1236
- Ref.make(/* @__PURE__ */ new Map())
1237
- );
1190
+ this.snapshotsRef = Effect.runSync(Ref.make(/* @__PURE__ */ new Map()));
1238
1191
  this.listeners = /* @__PURE__ */ new Set();
1239
1192
  this.datasetsById = /* @__PURE__ */ new Map();
1240
1193
  this.evaluatorsById = /* @__PURE__ */ new Map();
1241
- this.schedulerFiber = Effect.runFork(
1242
- this.createSchedulerEffect()
1243
- );
1194
+ this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
1244
1195
  this.persistenceFiber = Effect.runFork(
1245
1196
  createPersistenceWorker(this.persistenceQueue)
1246
1197
  );
@@ -1387,9 +1338,9 @@ var EffectRunner = class {
1387
1338
  return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
1388
1339
  }
1389
1340
  getAllRunSnapshots() {
1390
- return Array.from(
1391
- Effect.runSync(Ref.get(this.snapshotsRef)).values()
1392
- ).sort((a, b) => b.queuedAt - a.queuedAt);
1341
+ return Array.from(Effect.runSync(Ref.get(this.snapshotsRef)).values()).sort(
1342
+ (a, b) => b.queuedAt - a.queuedAt
1343
+ );
1393
1344
  }
1394
1345
  async loadRunSnapshotsFromArtifacts() {
1395
1346
  return loadRunSnapshotsFromArtifacts(this.config);
@@ -1556,12 +1507,8 @@ function GenerateView({
1556
1507
  const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
1557
1508
  const parsed = parse2(absoluteDatasetPath);
1558
1509
  const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
1559
- await writeFile2(
1560
- outputPath,
1561
- `${JSON.stringify(payload, null, 2)}
1562
- `,
1563
- "utf8"
1564
- );
1510
+ await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}
1511
+ `, "utf8");
1565
1512
  if (!cancelled) {
1566
1513
  setResult({
1567
1514
  count: payload.length,
@@ -1632,7 +1579,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1632
1579
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1633
1580
  return new Promise((resolve5, reject) => {
1634
1581
  const app = render(
1635
- React2__default.createElement(GenerateView, {
1582
+ React__default.createElement(GenerateView, {
1636
1583
  runner,
1637
1584
  datasetName,
1638
1585
  onComplete: (err) => {
@@ -1717,9 +1664,7 @@ function createBar(value, max = 100, width = 20) {
1717
1664
  function aggregateEvaluatorScores(events, nameById) {
1718
1665
  if (events.length === 0)
1719
1666
  return [];
1720
- const evaluatorIds = new Set(
1721
- events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
1722
- );
1667
+ const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
1723
1668
  const result = [];
1724
1669
  for (const evaluatorId of evaluatorIds) {
1725
1670
  const scoreIdToItems = /* @__PURE__ */ new Map();
@@ -1749,9 +1694,7 @@ function aggregateEvaluatorScores(events, nameById) {
1749
1694
  return es?.passed ?? false;
1750
1695
  });
1751
1696
  const lastEvent = events[events.length - 1];
1752
- const lastEs = lastEvent?.evaluatorScores.find(
1753
- (x) => x.evaluatorId === evaluatorId
1754
- );
1697
+ const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
1755
1698
  result.push({
1756
1699
  evaluatorId,
1757
1700
  evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
@@ -1785,9 +1728,7 @@ function RunView({
1785
1728
  concurrency,
1786
1729
  onComplete
1787
1730
  }) {
1788
- const [phase, setPhase] = useState(
1789
- "loading"
1790
- );
1731
+ const [phase, setPhase] = useState("loading");
1791
1732
  const [runInfo, setRunInfo] = useState(null);
1792
1733
  const [testCases, setTestCases] = useState([]);
1793
1734
  const [startedEvaluations, setStartedEvaluations] = useState(0);
@@ -1894,10 +1835,7 @@ function RunView({
1894
1835
  };
1895
1836
  const events = existing ? [...existing.events, newEvent] : [newEvent];
1896
1837
  const isAggregated = events.length > 1;
1897
- const aggregatedEvaluatorScores = aggregateEvaluatorScores(
1898
- events,
1899
- nameById
1900
- );
1838
+ const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
1901
1839
  const merged = {
1902
1840
  name: event.testCaseName,
1903
1841
  testCaseId: event.testCaseId,
@@ -2002,30 +1940,22 @@ function RunView({
2002
1940
  label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
2003
1941
  }
2004
1942
  ),
2005
- runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
2006
- Text,
2007
- {
2008
- color: "yellow",
2009
- children: [
2010
- "[running ",
2011
- item.startedTestCases,
2012
- "/",
2013
- item.totalTestCases,
2014
- "]",
2015
- " ",
2016
- item.name,
2017
- " ",
2018
- /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2019
- "(",
2020
- item.rerunIndex,
2021
- "/",
2022
- item.rerunTotal,
2023
- ")"
2024
- ] })
2025
- ]
2026
- },
2027
- `${item.testCaseId}:${item.rerunIndex}`
2028
- )) })
1943
+ runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(Text, { color: "yellow", children: [
1944
+ "[running ",
1945
+ item.startedTestCases,
1946
+ "/",
1947
+ item.totalTestCases,
1948
+ "] ",
1949
+ item.name,
1950
+ " ",
1951
+ /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
1952
+ "(",
1953
+ item.rerunIndex,
1954
+ "/",
1955
+ item.rerunTotal,
1956
+ ")"
1957
+ ] })
1958
+ ] }, `${item.testCaseId}:${item.rerunIndex}`)) })
2029
1959
  ] }),
2030
1960
  testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
2031
1961
  /* @__PURE__ */ jsxs(Text, { children: [
@@ -2057,73 +1987,63 @@ function RunView({
2057
1987
  ] }) : null
2058
1988
  ] }),
2059
1989
  tc.errorMessage ? /* @__PURE__ */ jsx(Text, { color: "red", children: tc.errorMessage }) : null,
2060
- tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
2061
- Box,
2062
- {
2063
- flexDirection: "column",
2064
- marginLeft: 2,
2065
- children: [
2066
- /* @__PURE__ */ jsxs(Text, { children: [
2067
- item.evaluatorName,
2068
- ":",
2069
- " ",
2070
- /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
2071
- item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
1990
+ tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [
1991
+ /* @__PURE__ */ jsxs(Text, { children: [
1992
+ item.evaluatorName,
1993
+ ":",
1994
+ " ",
1995
+ /* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
1996
+ item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
1997
+ " ",
1998
+ item.metrics.map((m) => {
1999
+ const def = getMetricById(m.id);
2000
+ if (!def)
2001
+ return null;
2002
+ const formatted = def.format(m.data, {
2003
+ isAggregated: tc.isAggregated
2004
+ });
2005
+ const label = m.name ?? def.name;
2006
+ return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2007
+ "[",
2008
+ label ? `${label}: ` : "",
2009
+ formatted,
2010
+ "]",
2011
+ " "
2012
+ ] }, m.id);
2013
+ })
2014
+ ] }) : null
2015
+ ] }),
2016
+ item.scores.length > 0 ? item.scores.map((s, idx) => {
2017
+ const def = s.def ?? getScoreById(s.id);
2018
+ const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2019
+ return /* @__PURE__ */ jsxs(
2020
+ Text,
2021
+ {
2022
+ color: scoreColor(toNumericScore(s.data) ?? 0),
2023
+ children: [
2024
+ " ",
2025
+ scoreLabel,
2026
+ ":",
2072
2027
  " ",
2073
- item.metrics.map((m) => {
2074
- const def = getMetricById(m.id);
2075
- if (!def)
2076
- return null;
2077
- const formatted = def.format(m.data, {
2078
- isAggregated: tc.isAggregated
2079
- });
2080
- const label = m.name ?? def.name;
2081
- return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2082
- "[",
2083
- label ? `${label}: ` : "",
2084
- formatted,
2085
- "]",
2086
- " "
2087
- ] }, m.id);
2028
+ formatScorePart(s, scoreColor, {
2029
+ isAggregated: tc.isAggregated
2088
2030
  })
2089
- ] }) : null
2090
- ] }),
2091
- item.scores.length > 0 ? item.scores.map((s, idx) => {
2092
- const def = s.def ?? getScoreById(s.id);
2093
- const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
2094
- return /* @__PURE__ */ jsxs(
2095
- Text,
2096
- {
2097
- color: scoreColor(toNumericScore(s.data) ?? 0),
2098
- children: [
2099
- " ",
2100
- scoreLabel,
2101
- ":",
2102
- " ",
2103
- formatScorePart(s, scoreColor, {
2104
- isAggregated: tc.isAggregated
2105
- })
2106
- ]
2107
- },
2108
- `${item.evaluatorId}-${s.id}-${idx}`
2109
- );
2110
- }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
2111
- !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
2112
- (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
2113
- ({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
2114
- Text,
2115
- {
2116
- color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
2117
- children: line
2118
- },
2119
- lineIdx
2120
- )
2121
- ) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
2122
- ) })
2123
- ]
2124
- },
2125
- item.evaluatorId
2126
- ))
2031
+ ]
2032
+ },
2033
+ `${item.evaluatorId}-${s.id}-${idx}`
2034
+ );
2035
+ }) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
2036
+ !item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
2037
+ (log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
2038
+ Text,
2039
+ {
2040
+ color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
2041
+ children: line
2042
+ },
2043
+ lineIdx
2044
+ )) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
2045
+ ) })
2046
+ ] }, item.evaluatorId))
2127
2047
  ] }, tc.testCaseId)) }),
2128
2048
  phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
2129
2049
  /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
@@ -2165,9 +2085,9 @@ function RunView({
2165
2085
  /* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
2166
2086
  Array.from(evaluatorNameById.entries()).map(([id, name]) => {
2167
2087
  const agg = summary.aggregates.get(id);
2168
- const scoreKeys = [
2169
- ...summary.scoreItemsByEvaluatorScore?.keys() ?? []
2170
- ].filter((k) => k.startsWith(`${id}:`));
2088
+ const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
2089
+ (k) => k.startsWith(`${id}:`)
2090
+ );
2171
2091
  if (scoreKeys.length === 0) {
2172
2092
  return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
2173
2093
  "- ",
@@ -2197,19 +2117,12 @@ function RunView({
2197
2117
  const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
2198
2118
  const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
2199
2119
  const numeric = toNumericScore(aggregated.data);
2200
- return /* @__PURE__ */ jsxs(
2201
- Text,
2202
- {
2203
- color: numeric !== void 0 ? scoreColor(numeric) : "gray",
2204
- children: [
2205
- " ",
2206
- label,
2207
- ": ",
2208
- formatted
2209
- ]
2210
- },
2211
- key
2212
- );
2120
+ return /* @__PURE__ */ jsxs(Text, { color: numeric !== void 0 ? scoreColor(numeric) : "gray", children: [
2121
+ " ",
2122
+ label,
2123
+ ": ",
2124
+ formatted
2125
+ ] }, key);
2213
2126
  })
2214
2127
  ] }, id);
2215
2128
  })
@@ -2285,9 +2198,7 @@ function buildTestCaseSummaries(byId) {
2285
2198
  for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
2286
2199
  const scoreIdToItems = /* @__PURE__ */ new Map();
2287
2200
  for (const ev of events) {
2288
- const es = ev.evaluatorScores.find(
2289
- (x) => x.evaluatorId === evaluatorScores.evaluatorId
2290
- );
2201
+ const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorScores.evaluatorId);
2291
2202
  for (const s of es?.scores ?? []) {
2292
2203
  const list = scoreIdToItems.get(s.id) ?? [];
2293
2204
  list.push(s);
@@ -2340,9 +2251,7 @@ function scoreToColor(score) {
2340
2251
  }
2341
2252
  function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
2342
2253
  const lines = [];
2343
- const scoreKeys = [...scoreItemsByKey.keys()].filter(
2344
- (k) => k.startsWith(`${evaluatorId}:`)
2345
- );
2254
+ const scoreKeys = [...scoreItemsByKey.keys()].filter((k) => k.startsWith(`${evaluatorId}:`));
2346
2255
  if (scoreKeys.length === 0) {
2347
2256
  lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
2348
2257
  return lines;
@@ -2377,9 +2286,7 @@ function createBar2(value, max = 100, width = 20) {
2377
2286
  function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
2378
2287
  if (events.length === 0)
2379
2288
  return [];
2380
- const evaluatorIds = new Set(
2381
- events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
2382
- );
2289
+ const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
2383
2290
  const result = [];
2384
2291
  for (const evaluatorId of evaluatorIds) {
2385
2292
  const scoreIdToItems = /* @__PURE__ */ new Map();
@@ -2426,9 +2333,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
2426
2333
  if (def) {
2427
2334
  const formatted = def.format(m.data, options);
2428
2335
  const label = m.name ?? def.name;
2429
- metricParts.push(
2430
- label ? `[${label}: ${formatted}]` : `[${formatted}]`
2431
- );
2336
+ metricParts.push(label ? `[${label}: ${formatted}]` : `[${formatted}]`);
2432
2337
  }
2433
2338
  }
2434
2339
  }
@@ -2602,10 +2507,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2602
2507
  const aggregatedScores = aggregateEvaluatorScoresFromEvents(
2603
2508
  existing.events);
2604
2509
  const isAggregated = existing.events.length > 1;
2605
- const durationMs = existing.events.reduce(
2606
- (s, e) => s + e.durationMs,
2607
- 0
2608
- );
2510
+ const durationMs = existing.events.reduce((s, e) => s + e.durationMs, 0);
2609
2511
  const lines = [];
2610
2512
  const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
2611
2513
  lines.push(
@@ -2617,18 +2519,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2617
2519
  for (const item of aggregatedScores) {
2618
2520
  const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
2619
2521
  lines.push(
2620
- ...formatEvaluatorScoreLine(
2621
- name,
2622
- item.scores,
2623
- item.passed,
2624
- item.metrics,
2625
- { isAggregated }
2626
- )
2522
+ ...formatEvaluatorScoreLine(name, item.scores, item.passed, item.metrics, {
2523
+ isAggregated
2524
+ })
2627
2525
  );
2628
2526
  const lastEvent = existing.events[existing.events.length - 1];
2629
- const lastEs = lastEvent?.evaluatorScores.find(
2630
- (x) => x.evaluatorId === item.evaluatorId
2631
- );
2527
+ const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === item.evaluatorId);
2632
2528
  if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
2633
2529
  for (const log of lastEs.logs) {
2634
2530
  if (log.type === "diff") {
@@ -2675,9 +2571,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2675
2571
  console.log(
2676
2572
  `Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
2677
2573
  );
2678
- console.log(
2679
- `Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`
2680
- );
2574
+ console.log(`Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`);
2681
2575
  console.log("");
2682
2576
  drawSpinner();
2683
2577
  spinnerTimer = setInterval(drawSpinner, 100);
@@ -2692,10 +2586,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2692
2586
  console.log("");
2693
2587
  console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
2694
2588
  console.log(
2695
- `- passed: ${colorize(
2696
- `${completed.passedTestCases}/${completed.totalTestCases}`,
2697
- ansi2.green
2698
- )}`
2589
+ `- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
2699
2590
  );
2700
2591
  console.log(
2701
2592
  `- failed: ${colorize(
@@ -2705,11 +2596,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2705
2596
  );
2706
2597
  if (overallScoreCount > 0) {
2707
2598
  const overallAverage = overallScoreTotal / overallScoreCount;
2708
- const overallSd = sampleStdDev2(
2709
- overallScoreTotal,
2710
- overallScoreSumSq,
2711
- overallScoreCount
2712
- );
2599
+ const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
2713
2600
  const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
2714
2601
  console.log(
2715
2602
  `- overall avg score: ${colorize(
@@ -2758,7 +2645,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
2758
2645
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
2759
2646
  return new Promise((resolve5, reject) => {
2760
2647
  const app = render(
2761
- React2.createElement(RunView, {
2648
+ React.createElement(RunView, {
2762
2649
  runner,
2763
2650
  datasetName,
2764
2651
  evaluatorPattern,