@m4trix/evals 0.24.0 → 0.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +147 -260
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +133 -246
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +143 -291
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +130 -278
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +49 -92
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +49 -92
- package/dist/index.js.map +1 -1
- package/package.json +3 -5
package/dist/cli-simple.js
CHANGED
|
@@ -8,8 +8,8 @@ import { writeFile, readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
|
8
8
|
import { pathToFileURL } from 'url';
|
|
9
9
|
import { diffLines } from 'diff';
|
|
10
10
|
import stringify from 'fast-json-stable-stringify';
|
|
11
|
-
import * as
|
|
12
|
-
import
|
|
11
|
+
import * as React from 'react';
|
|
12
|
+
import React__default, { useState, useEffect, useCallback } from 'react';
|
|
13
13
|
import { render, Box, Text } from 'ink';
|
|
14
14
|
import { jsxs, jsx, Fragment } from 'react/jsx-runtime';
|
|
15
15
|
|
|
@@ -18,18 +18,8 @@ var defaultRunnerConfig = {
|
|
|
18
18
|
discovery: {
|
|
19
19
|
rootDir: process.cwd(),
|
|
20
20
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
21
|
-
evaluatorSuffixes: [
|
|
22
|
-
|
|
23
|
-
".evaluator.tsx",
|
|
24
|
-
".evaluator.js",
|
|
25
|
-
".evaluator.mjs"
|
|
26
|
-
],
|
|
27
|
-
testCaseSuffixes: [
|
|
28
|
-
".test-case.ts",
|
|
29
|
-
".test-case.tsx",
|
|
30
|
-
".test-case.js",
|
|
31
|
-
".test-case.mjs"
|
|
32
|
-
],
|
|
21
|
+
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
22
|
+
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
33
23
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
34
24
|
},
|
|
35
25
|
artifactDirectory: ".eval-results",
|
|
@@ -96,14 +86,15 @@ function getJitiLoader() {
|
|
|
96
86
|
}
|
|
97
87
|
const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
|
|
98
88
|
if (typeof createJiti2 !== "function") {
|
|
99
|
-
throw new Error(
|
|
100
|
-
"Failed to initialize jiti for m4trix eval config loading."
|
|
101
|
-
);
|
|
89
|
+
throw new Error("Failed to initialize jiti for m4trix eval config loading.");
|
|
102
90
|
}
|
|
103
|
-
cachedLoader = createJiti2(
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
91
|
+
cachedLoader = createJiti2(
|
|
92
|
+
import.meta.url,
|
|
93
|
+
{
|
|
94
|
+
interopDefault: true,
|
|
95
|
+
moduleCache: true
|
|
96
|
+
}
|
|
97
|
+
);
|
|
107
98
|
return cachedLoader;
|
|
108
99
|
}
|
|
109
100
|
function resolveConfigModuleExport(loadedModule) {
|
|
@@ -207,9 +198,7 @@ async function loadModuleExports(filePath) {
|
|
|
207
198
|
}
|
|
208
199
|
async function collectDatasetsFromFiles(config) {
|
|
209
200
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
210
|
-
const matched = files.filter(
|
|
211
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
212
|
-
);
|
|
201
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
|
|
213
202
|
const found = await Promise.all(
|
|
214
203
|
matched.map(async (absolutePath) => {
|
|
215
204
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -226,9 +215,7 @@ async function collectDatasetsFromFiles(config) {
|
|
|
226
215
|
}
|
|
227
216
|
async function collectEvaluatorsFromFiles(config) {
|
|
228
217
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
229
|
-
const matched = files.filter(
|
|
230
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
231
|
-
);
|
|
218
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
|
|
232
219
|
const found = await Promise.all(
|
|
233
220
|
matched.map(async (absolutePath) => {
|
|
234
221
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -245,9 +232,7 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
245
232
|
}
|
|
246
233
|
async function collectTestCasesFromFiles(config) {
|
|
247
234
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
248
|
-
const matched = files.filter(
|
|
249
|
-
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
250
|
-
);
|
|
235
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
251
236
|
const found = await Promise.all(
|
|
252
237
|
matched.map(async (absolutePath) => {
|
|
253
238
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -319,16 +304,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
319
304
|
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
320
305
|
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
321
306
|
if (diffOptions?.keysOnly) {
|
|
322
|
-
const expectedKeys = JSON.stringify(
|
|
323
|
-
|
|
324
|
-
null,
|
|
325
|
-
2
|
|
326
|
-
);
|
|
327
|
-
const actualKeys = JSON.stringify(
|
|
328
|
-
extractKeys(actualProcessed),
|
|
329
|
-
null,
|
|
330
|
-
2
|
|
331
|
-
);
|
|
307
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
308
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
332
309
|
const parts2 = diffLines(expectedKeys, actualKeys);
|
|
333
310
|
return formatDiffParts(parts2);
|
|
334
311
|
}
|
|
@@ -339,9 +316,7 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
339
316
|
}
|
|
340
317
|
const parts = diffLines(expectedStr, actualStr);
|
|
341
318
|
if (diffOptions?.outputNewOnly) {
|
|
342
|
-
const filtered = parts.filter(
|
|
343
|
-
(p) => p.added === true
|
|
344
|
-
);
|
|
319
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
345
320
|
return formatDiffParts(filtered);
|
|
346
321
|
}
|
|
347
322
|
return formatDiffParts(parts);
|
|
@@ -443,10 +418,7 @@ var ScoreAggregate = {
|
|
|
443
418
|
const count = values.length || 1;
|
|
444
419
|
const result = {};
|
|
445
420
|
for (const field of fields) {
|
|
446
|
-
result[field] = values.reduce(
|
|
447
|
-
(s, v) => s + (v[field] ?? 0),
|
|
448
|
-
0
|
|
449
|
-
) / count;
|
|
421
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
450
422
|
}
|
|
451
423
|
return result;
|
|
452
424
|
};
|
|
@@ -480,13 +452,10 @@ var ScoreAggregate = {
|
|
|
480
452
|
(s, v) => s + (v[valueField] ?? 0),
|
|
481
453
|
0
|
|
482
454
|
);
|
|
483
|
-
const sumSq = values.reduce(
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
},
|
|
488
|
-
0
|
|
489
|
-
);
|
|
455
|
+
const sumSq = values.reduce((s, v) => {
|
|
456
|
+
const value = v[valueField] ?? 0;
|
|
457
|
+
return s + value * value;
|
|
458
|
+
}, 0);
|
|
490
459
|
const mean = sum / count;
|
|
491
460
|
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
492
461
|
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
@@ -754,20 +723,14 @@ function nowIsoForFile() {
|
|
|
754
723
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
755
724
|
}
|
|
756
725
|
function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
757
|
-
return join(
|
|
758
|
-
artifactDirectory,
|
|
759
|
-
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
760
|
-
);
|
|
726
|
+
return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
761
727
|
}
|
|
762
728
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
763
729
|
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
764
730
|
return Effect.gen(function* () {
|
|
765
731
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
766
732
|
const started = Date.now();
|
|
767
|
-
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
768
|
-
n + 1,
|
|
769
|
-
n + 1
|
|
770
|
-
]);
|
|
733
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [n + 1, n + 1]);
|
|
771
734
|
yield* publishEvent({
|
|
772
735
|
type: "TestCaseStarted",
|
|
773
736
|
runId: task.runId,
|
|
@@ -800,9 +763,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
800
763
|
return error;
|
|
801
764
|
};
|
|
802
765
|
try {
|
|
803
|
-
const ctx = yield* Effect.promise(
|
|
804
|
-
() => Promise.resolve(evaluator.resolveContext())
|
|
805
|
-
);
|
|
766
|
+
const ctx = yield* Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
|
|
806
767
|
const result = yield* Effect.promise(
|
|
807
768
|
() => Promise.resolve().then(
|
|
808
769
|
() => evaluateFn({
|
|
@@ -857,10 +818,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
857
818
|
}
|
|
858
819
|
}
|
|
859
820
|
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
860
|
-
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
861
|
-
n + 1,
|
|
862
|
-
n + 1
|
|
863
|
-
]);
|
|
821
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
864
822
|
const progressEvent = {
|
|
865
823
|
type: "TestCaseProgress",
|
|
866
824
|
runId: task.runId,
|
|
@@ -909,10 +867,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
909
867
|
} else {
|
|
910
868
|
yield* Ref.update(failedRef, (n) => n + 1);
|
|
911
869
|
}
|
|
912
|
-
const [passed, failed] = yield* Effect.all([
|
|
913
|
-
Ref.get(passedRef),
|
|
914
|
-
Ref.get(failedRef)
|
|
915
|
-
]);
|
|
870
|
+
const [passed, failed] = yield* Effect.all([Ref.get(passedRef), Ref.get(failedRef)]);
|
|
916
871
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
917
872
|
...snapshot,
|
|
918
873
|
passedTestCases: passed,
|
|
@@ -1232,15 +1187,11 @@ var EffectRunner = class {
|
|
|
1232
1187
|
this.persistenceQueue = Effect.runSync(
|
|
1233
1188
|
Queue.unbounded()
|
|
1234
1189
|
);
|
|
1235
|
-
this.snapshotsRef = Effect.runSync(
|
|
1236
|
-
Ref.make(/* @__PURE__ */ new Map())
|
|
1237
|
-
);
|
|
1190
|
+
this.snapshotsRef = Effect.runSync(Ref.make(/* @__PURE__ */ new Map()));
|
|
1238
1191
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1239
1192
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1240
1193
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1241
|
-
this.schedulerFiber = Effect.runFork(
|
|
1242
|
-
this.createSchedulerEffect()
|
|
1243
|
-
);
|
|
1194
|
+
this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
|
|
1244
1195
|
this.persistenceFiber = Effect.runFork(
|
|
1245
1196
|
createPersistenceWorker(this.persistenceQueue)
|
|
1246
1197
|
);
|
|
@@ -1387,9 +1338,9 @@ var EffectRunner = class {
|
|
|
1387
1338
|
return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
|
|
1388
1339
|
}
|
|
1389
1340
|
getAllRunSnapshots() {
|
|
1390
|
-
return Array.from(
|
|
1391
|
-
|
|
1392
|
-
)
|
|
1341
|
+
return Array.from(Effect.runSync(Ref.get(this.snapshotsRef)).values()).sort(
|
|
1342
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
1343
|
+
);
|
|
1393
1344
|
}
|
|
1394
1345
|
async loadRunSnapshotsFromArtifacts() {
|
|
1395
1346
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1556,12 +1507,8 @@ function GenerateView({
|
|
|
1556
1507
|
const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
|
|
1557
1508
|
const parsed = parse2(absoluteDatasetPath);
|
|
1558
1509
|
const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
|
|
1559
|
-
await writeFile2(
|
|
1560
|
-
|
|
1561
|
-
`${JSON.stringify(payload, null, 2)}
|
|
1562
|
-
`,
|
|
1563
|
-
"utf8"
|
|
1564
|
-
);
|
|
1510
|
+
await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}
|
|
1511
|
+
`, "utf8");
|
|
1565
1512
|
if (!cancelled) {
|
|
1566
1513
|
setResult({
|
|
1567
1514
|
count: payload.length,
|
|
@@ -1632,7 +1579,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1632
1579
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
1633
1580
|
return new Promise((resolve5, reject) => {
|
|
1634
1581
|
const app = render(
|
|
1635
|
-
|
|
1582
|
+
React__default.createElement(GenerateView, {
|
|
1636
1583
|
runner,
|
|
1637
1584
|
datasetName,
|
|
1638
1585
|
onComplete: (err) => {
|
|
@@ -1717,9 +1664,7 @@ function createBar(value, max = 100, width = 20) {
|
|
|
1717
1664
|
function aggregateEvaluatorScores(events, nameById) {
|
|
1718
1665
|
if (events.length === 0)
|
|
1719
1666
|
return [];
|
|
1720
|
-
const evaluatorIds = new Set(
|
|
1721
|
-
events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
|
|
1722
|
-
);
|
|
1667
|
+
const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
|
|
1723
1668
|
const result = [];
|
|
1724
1669
|
for (const evaluatorId of evaluatorIds) {
|
|
1725
1670
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
@@ -1749,9 +1694,7 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1749
1694
|
return es?.passed ?? false;
|
|
1750
1695
|
});
|
|
1751
1696
|
const lastEvent = events[events.length - 1];
|
|
1752
|
-
const lastEs = lastEvent?.evaluatorScores.find(
|
|
1753
|
-
(x) => x.evaluatorId === evaluatorId
|
|
1754
|
-
);
|
|
1697
|
+
const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1755
1698
|
result.push({
|
|
1756
1699
|
evaluatorId,
|
|
1757
1700
|
evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
|
|
@@ -1785,9 +1728,7 @@ function RunView({
|
|
|
1785
1728
|
concurrency,
|
|
1786
1729
|
onComplete
|
|
1787
1730
|
}) {
|
|
1788
|
-
const [phase, setPhase] = useState(
|
|
1789
|
-
"loading"
|
|
1790
|
-
);
|
|
1731
|
+
const [phase, setPhase] = useState("loading");
|
|
1791
1732
|
const [runInfo, setRunInfo] = useState(null);
|
|
1792
1733
|
const [testCases, setTestCases] = useState([]);
|
|
1793
1734
|
const [startedEvaluations, setStartedEvaluations] = useState(0);
|
|
@@ -1894,10 +1835,7 @@ function RunView({
|
|
|
1894
1835
|
};
|
|
1895
1836
|
const events = existing ? [...existing.events, newEvent] : [newEvent];
|
|
1896
1837
|
const isAggregated = events.length > 1;
|
|
1897
|
-
const aggregatedEvaluatorScores = aggregateEvaluatorScores(
|
|
1898
|
-
events,
|
|
1899
|
-
nameById
|
|
1900
|
-
);
|
|
1838
|
+
const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
|
|
1901
1839
|
const merged = {
|
|
1902
1840
|
name: event.testCaseName,
|
|
1903
1841
|
testCaseId: event.testCaseId,
|
|
@@ -2002,30 +1940,22 @@ function RunView({
|
|
|
2002
1940
|
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
2003
1941
|
}
|
|
2004
1942
|
),
|
|
2005
|
-
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
|
|
2012
|
-
|
|
2013
|
-
|
|
2014
|
-
|
|
2015
|
-
|
|
2016
|
-
|
|
2017
|
-
|
|
2018
|
-
|
|
2019
|
-
|
|
2020
|
-
|
|
2021
|
-
"/",
|
|
2022
|
-
item.rerunTotal,
|
|
2023
|
-
")"
|
|
2024
|
-
] })
|
|
2025
|
-
]
|
|
2026
|
-
},
|
|
2027
|
-
`${item.testCaseId}:${item.rerunIndex}`
|
|
2028
|
-
)) })
|
|
1943
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxs(Text, { color: "yellow", children: [
|
|
1944
|
+
"[running ",
|
|
1945
|
+
item.startedTestCases,
|
|
1946
|
+
"/",
|
|
1947
|
+
item.totalTestCases,
|
|
1948
|
+
"] ",
|
|
1949
|
+
item.name,
|
|
1950
|
+
" ",
|
|
1951
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1952
|
+
"(",
|
|
1953
|
+
item.rerunIndex,
|
|
1954
|
+
"/",
|
|
1955
|
+
item.rerunTotal,
|
|
1956
|
+
")"
|
|
1957
|
+
] })
|
|
1958
|
+
] }, `${item.testCaseId}:${item.rerunIndex}`)) })
|
|
2029
1959
|
] }),
|
|
2030
1960
|
testCases.length > 0 && /* @__PURE__ */ jsx(Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
2031
1961
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
@@ -2057,73 +1987,63 @@ function RunView({
|
|
|
2057
1987
|
] }) : null
|
|
2058
1988
|
] }),
|
|
2059
1989
|
tc.errorMessage ? /* @__PURE__ */ jsx(Text, { color: "red", children: tc.errorMessage }) : null,
|
|
2060
|
-
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(
|
|
2061
|
-
|
|
2062
|
-
|
|
2063
|
-
|
|
2064
|
-
|
|
2065
|
-
children:
|
|
2066
|
-
|
|
2067
|
-
|
|
2068
|
-
|
|
2069
|
-
|
|
2070
|
-
|
|
2071
|
-
|
|
1990
|
+
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxs(Box, { flexDirection: "column", marginLeft: 2, children: [
|
|
1991
|
+
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1992
|
+
item.evaluatorName,
|
|
1993
|
+
":",
|
|
1994
|
+
" ",
|
|
1995
|
+
/* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1996
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
1997
|
+
" ",
|
|
1998
|
+
item.metrics.map((m) => {
|
|
1999
|
+
const def = getMetricById(m.id);
|
|
2000
|
+
if (!def)
|
|
2001
|
+
return null;
|
|
2002
|
+
const formatted = def.format(m.data, {
|
|
2003
|
+
isAggregated: tc.isAggregated
|
|
2004
|
+
});
|
|
2005
|
+
const label = m.name ?? def.name;
|
|
2006
|
+
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2007
|
+
"[",
|
|
2008
|
+
label ? `${label}: ` : "",
|
|
2009
|
+
formatted,
|
|
2010
|
+
"]",
|
|
2011
|
+
" "
|
|
2012
|
+
] }, m.id);
|
|
2013
|
+
})
|
|
2014
|
+
] }) : null
|
|
2015
|
+
] }),
|
|
2016
|
+
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
2017
|
+
const def = s.def ?? getScoreById(s.id);
|
|
2018
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2019
|
+
return /* @__PURE__ */ jsxs(
|
|
2020
|
+
Text,
|
|
2021
|
+
{
|
|
2022
|
+
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
2023
|
+
children: [
|
|
2024
|
+
" ",
|
|
2025
|
+
scoreLabel,
|
|
2026
|
+
":",
|
|
2072
2027
|
" ",
|
|
2073
|
-
|
|
2074
|
-
|
|
2075
|
-
if (!def)
|
|
2076
|
-
return null;
|
|
2077
|
-
const formatted = def.format(m.data, {
|
|
2078
|
-
isAggregated: tc.isAggregated
|
|
2079
|
-
});
|
|
2080
|
-
const label = m.name ?? def.name;
|
|
2081
|
-
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2082
|
-
"[",
|
|
2083
|
-
label ? `${label}: ` : "",
|
|
2084
|
-
formatted,
|
|
2085
|
-
"]",
|
|
2086
|
-
" "
|
|
2087
|
-
] }, m.id);
|
|
2028
|
+
formatScorePart(s, scoreColor, {
|
|
2029
|
+
isAggregated: tc.isAggregated
|
|
2088
2030
|
})
|
|
2089
|
-
]
|
|
2090
|
-
|
|
2091
|
-
item.
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
})
|
|
2106
|
-
]
|
|
2107
|
-
},
|
|
2108
|
-
`${item.evaluatorId}-${s.id}-${idx}`
|
|
2109
|
-
);
|
|
2110
|
-
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
|
|
2111
|
-
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
2112
|
-
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
2113
|
-
({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
|
|
2114
|
-
Text,
|
|
2115
|
-
{
|
|
2116
|
-
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
2117
|
-
children: line
|
|
2118
|
-
},
|
|
2119
|
-
lineIdx
|
|
2120
|
-
)
|
|
2121
|
-
) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
|
|
2122
|
-
) })
|
|
2123
|
-
]
|
|
2124
|
-
},
|
|
2125
|
-
item.evaluatorId
|
|
2126
|
-
))
|
|
2031
|
+
]
|
|
2032
|
+
},
|
|
2033
|
+
`${item.evaluatorId}-${s.id}-${idx}`
|
|
2034
|
+
);
|
|
2035
|
+
}) : /* @__PURE__ */ jsx(Text, { color: "gray", children: " n/a" }),
|
|
2036
|
+
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsx(Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
2037
|
+
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsx(
|
|
2038
|
+
Text,
|
|
2039
|
+
{
|
|
2040
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
2041
|
+
children: line
|
|
2042
|
+
},
|
|
2043
|
+
lineIdx
|
|
2044
|
+
)) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsx(Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
|
|
2045
|
+
) })
|
|
2046
|
+
] }, item.evaluatorId))
|
|
2127
2047
|
] }, tc.testCaseId)) }),
|
|
2128
2048
|
phase === "completed" && summary && /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
|
|
2129
2049
|
/* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children: "Run Summary" }),
|
|
@@ -2165,9 +2085,9 @@ function RunView({
|
|
|
2165
2085
|
/* @__PURE__ */ jsx(Text, { color: "magenta", children: "evaluator averages" }),
|
|
2166
2086
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
2167
2087
|
const agg = summary.aggregates.get(id);
|
|
2168
|
-
const scoreKeys = [
|
|
2169
|
-
|
|
2170
|
-
|
|
2088
|
+
const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
|
|
2089
|
+
(k) => k.startsWith(`${id}:`)
|
|
2090
|
+
);
|
|
2171
2091
|
if (scoreKeys.length === 0) {
|
|
2172
2092
|
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2173
2093
|
"- ",
|
|
@@ -2197,19 +2117,12 @@ function RunView({
|
|
|
2197
2117
|
const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
|
|
2198
2118
|
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
2199
2119
|
const numeric = toNumericScore(aggregated.data);
|
|
2200
|
-
return /* @__PURE__ */ jsxs(
|
|
2201
|
-
|
|
2202
|
-
|
|
2203
|
-
|
|
2204
|
-
|
|
2205
|
-
|
|
2206
|
-
label,
|
|
2207
|
-
": ",
|
|
2208
|
-
formatted
|
|
2209
|
-
]
|
|
2210
|
-
},
|
|
2211
|
-
key
|
|
2212
|
-
);
|
|
2120
|
+
return /* @__PURE__ */ jsxs(Text, { color: numeric !== void 0 ? scoreColor(numeric) : "gray", children: [
|
|
2121
|
+
" ",
|
|
2122
|
+
label,
|
|
2123
|
+
": ",
|
|
2124
|
+
formatted
|
|
2125
|
+
] }, key);
|
|
2213
2126
|
})
|
|
2214
2127
|
] }, id);
|
|
2215
2128
|
})
|
|
@@ -2285,9 +2198,7 @@ function buildTestCaseSummaries(byId) {
|
|
|
2285
2198
|
for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
|
|
2286
2199
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
2287
2200
|
for (const ev of events) {
|
|
2288
|
-
const es = ev.evaluatorScores.find(
|
|
2289
|
-
(x) => x.evaluatorId === evaluatorScores.evaluatorId
|
|
2290
|
-
);
|
|
2201
|
+
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorScores.evaluatorId);
|
|
2291
2202
|
for (const s of es?.scores ?? []) {
|
|
2292
2203
|
const list = scoreIdToItems.get(s.id) ?? [];
|
|
2293
2204
|
list.push(s);
|
|
@@ -2340,9 +2251,7 @@ function scoreToColor(score) {
|
|
|
2340
2251
|
}
|
|
2341
2252
|
function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
|
|
2342
2253
|
const lines = [];
|
|
2343
|
-
const scoreKeys = [...scoreItemsByKey.keys()].filter(
|
|
2344
|
-
(k) => k.startsWith(`${evaluatorId}:`)
|
|
2345
|
-
);
|
|
2254
|
+
const scoreKeys = [...scoreItemsByKey.keys()].filter((k) => k.startsWith(`${evaluatorId}:`));
|
|
2346
2255
|
if (scoreKeys.length === 0) {
|
|
2347
2256
|
lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
|
|
2348
2257
|
return lines;
|
|
@@ -2377,9 +2286,7 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
2377
2286
|
function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
|
|
2378
2287
|
if (events.length === 0)
|
|
2379
2288
|
return [];
|
|
2380
|
-
const evaluatorIds = new Set(
|
|
2381
|
-
events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
|
|
2382
|
-
);
|
|
2289
|
+
const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
|
|
2383
2290
|
const result = [];
|
|
2384
2291
|
for (const evaluatorId of evaluatorIds) {
|
|
2385
2292
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
@@ -2426,9 +2333,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2426
2333
|
if (def) {
|
|
2427
2334
|
const formatted = def.format(m.data, options);
|
|
2428
2335
|
const label = m.name ?? def.name;
|
|
2429
|
-
metricParts.push(
|
|
2430
|
-
label ? `[${label}: ${formatted}]` : `[${formatted}]`
|
|
2431
|
-
);
|
|
2336
|
+
metricParts.push(label ? `[${label}: ${formatted}]` : `[${formatted}]`);
|
|
2432
2337
|
}
|
|
2433
2338
|
}
|
|
2434
2339
|
}
|
|
@@ -2602,10 +2507,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2602
2507
|
const aggregatedScores = aggregateEvaluatorScoresFromEvents(
|
|
2603
2508
|
existing.events);
|
|
2604
2509
|
const isAggregated = existing.events.length > 1;
|
|
2605
|
-
const durationMs = existing.events.reduce(
|
|
2606
|
-
(s, e) => s + e.durationMs,
|
|
2607
|
-
0
|
|
2608
|
-
);
|
|
2510
|
+
const durationMs = existing.events.reduce((s, e) => s + e.durationMs, 0);
|
|
2609
2511
|
const lines = [];
|
|
2610
2512
|
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2611
2513
|
lines.push(
|
|
@@ -2617,18 +2519,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2617
2519
|
for (const item of aggregatedScores) {
|
|
2618
2520
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2619
2521
|
lines.push(
|
|
2620
|
-
...formatEvaluatorScoreLine(
|
|
2621
|
-
|
|
2622
|
-
|
|
2623
|
-
item.passed,
|
|
2624
|
-
item.metrics,
|
|
2625
|
-
{ isAggregated }
|
|
2626
|
-
)
|
|
2522
|
+
...formatEvaluatorScoreLine(name, item.scores, item.passed, item.metrics, {
|
|
2523
|
+
isAggregated
|
|
2524
|
+
})
|
|
2627
2525
|
);
|
|
2628
2526
|
const lastEvent = existing.events[existing.events.length - 1];
|
|
2629
|
-
const lastEs = lastEvent?.evaluatorScores.find(
|
|
2630
|
-
(x) => x.evaluatorId === item.evaluatorId
|
|
2631
|
-
);
|
|
2527
|
+
const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === item.evaluatorId);
|
|
2632
2528
|
if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
|
|
2633
2529
|
for (const log of lastEs.logs) {
|
|
2634
2530
|
if (log.type === "diff") {
|
|
@@ -2675,9 +2571,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2675
2571
|
console.log(
|
|
2676
2572
|
`Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
|
|
2677
2573
|
);
|
|
2678
|
-
console.log(
|
|
2679
|
-
`Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`
|
|
2680
|
-
);
|
|
2574
|
+
console.log(`Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`);
|
|
2681
2575
|
console.log("");
|
|
2682
2576
|
drawSpinner();
|
|
2683
2577
|
spinnerTimer = setInterval(drawSpinner, 100);
|
|
@@ -2692,10 +2586,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2692
2586
|
console.log("");
|
|
2693
2587
|
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2694
2588
|
console.log(
|
|
2695
|
-
`- passed: ${colorize(
|
|
2696
|
-
`${completed.passedTestCases}/${completed.totalTestCases}`,
|
|
2697
|
-
ansi2.green
|
|
2698
|
-
)}`
|
|
2589
|
+
`- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
|
|
2699
2590
|
);
|
|
2700
2591
|
console.log(
|
|
2701
2592
|
`- failed: ${colorize(
|
|
@@ -2705,11 +2596,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2705
2596
|
);
|
|
2706
2597
|
if (overallScoreCount > 0) {
|
|
2707
2598
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2708
|
-
const overallSd = sampleStdDev2(
|
|
2709
|
-
overallScoreTotal,
|
|
2710
|
-
overallScoreSumSq,
|
|
2711
|
-
overallScoreCount
|
|
2712
|
-
);
|
|
2599
|
+
const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
|
|
2713
2600
|
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
2714
2601
|
console.log(
|
|
2715
2602
|
`- overall avg score: ${colorize(
|
|
@@ -2758,7 +2645,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2758
2645
|
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2759
2646
|
return new Promise((resolve5, reject) => {
|
|
2760
2647
|
const app = render(
|
|
2761
|
-
|
|
2648
|
+
React.createElement(RunView, {
|
|
2762
2649
|
runner,
|
|
2763
2650
|
datasetName,
|
|
2764
2651
|
evaluatorPattern,
|