@m4trix/evals 0.24.0 → 0.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +147 -260
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +133 -246
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +143 -291
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +130 -278
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +49 -92
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +49 -92
- package/dist/index.js.map +1 -1
- package/package.json +3 -5
package/dist/cli-simple.cjs
CHANGED
|
@@ -10,7 +10,7 @@ var promises = require('fs/promises');
|
|
|
10
10
|
var url = require('url');
|
|
11
11
|
var diff = require('diff');
|
|
12
12
|
var stringify = require('fast-json-stable-stringify');
|
|
13
|
-
var
|
|
13
|
+
var React = require('react');
|
|
14
14
|
var ink = require('ink');
|
|
15
15
|
var jsxRuntime = require('react/jsx-runtime');
|
|
16
16
|
|
|
@@ -37,25 +37,15 @@ function _interopNamespace(e) {
|
|
|
37
37
|
|
|
38
38
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
39
39
|
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
40
|
-
var
|
|
40
|
+
var React__namespace = /*#__PURE__*/_interopNamespace(React);
|
|
41
41
|
|
|
42
42
|
// src/runner/config.ts
|
|
43
43
|
var defaultRunnerConfig = {
|
|
44
44
|
discovery: {
|
|
45
45
|
rootDir: process.cwd(),
|
|
46
46
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
47
|
-
evaluatorSuffixes: [
|
|
48
|
-
|
|
49
|
-
".evaluator.tsx",
|
|
50
|
-
".evaluator.js",
|
|
51
|
-
".evaluator.mjs"
|
|
52
|
-
],
|
|
53
|
-
testCaseSuffixes: [
|
|
54
|
-
".test-case.ts",
|
|
55
|
-
".test-case.tsx",
|
|
56
|
-
".test-case.js",
|
|
57
|
-
".test-case.mjs"
|
|
58
|
-
],
|
|
47
|
+
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
48
|
+
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
59
49
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
60
50
|
},
|
|
61
51
|
artifactDirectory: ".eval-results",
|
|
@@ -122,14 +112,15 @@ function getJitiLoader() {
|
|
|
122
112
|
}
|
|
123
113
|
const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
|
|
124
114
|
if (typeof createJiti2 !== "function") {
|
|
125
|
-
throw new Error(
|
|
126
|
-
"Failed to initialize jiti for m4trix eval config loading."
|
|
127
|
-
);
|
|
115
|
+
throw new Error("Failed to initialize jiti for m4trix eval config loading.");
|
|
128
116
|
}
|
|
129
|
-
cachedLoader = createJiti2(
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
117
|
+
cachedLoader = createJiti2(
|
|
118
|
+
(typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
|
|
119
|
+
{
|
|
120
|
+
interopDefault: true,
|
|
121
|
+
moduleCache: true
|
|
122
|
+
}
|
|
123
|
+
);
|
|
133
124
|
return cachedLoader;
|
|
134
125
|
}
|
|
135
126
|
function resolveConfigModuleExport(loadedModule) {
|
|
@@ -233,9 +224,7 @@ async function loadModuleExports(filePath) {
|
|
|
233
224
|
}
|
|
234
225
|
async function collectDatasetsFromFiles(config) {
|
|
235
226
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
236
|
-
const matched = files.filter(
|
|
237
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
238
|
-
);
|
|
227
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
|
|
239
228
|
const found = await Promise.all(
|
|
240
229
|
matched.map(async (absolutePath) => {
|
|
241
230
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -252,9 +241,7 @@ async function collectDatasetsFromFiles(config) {
|
|
|
252
241
|
}
|
|
253
242
|
async function collectEvaluatorsFromFiles(config) {
|
|
254
243
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
255
|
-
const matched = files.filter(
|
|
256
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
257
|
-
);
|
|
244
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
|
|
258
245
|
const found = await Promise.all(
|
|
259
246
|
matched.map(async (absolutePath) => {
|
|
260
247
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -271,9 +258,7 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
271
258
|
}
|
|
272
259
|
async function collectTestCasesFromFiles(config) {
|
|
273
260
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
274
|
-
const matched = files.filter(
|
|
275
|
-
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
276
|
-
);
|
|
261
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
277
262
|
const found = await Promise.all(
|
|
278
263
|
matched.map(async (absolutePath) => {
|
|
279
264
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -345,16 +330,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
345
330
|
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
346
331
|
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
347
332
|
if (diffOptions?.keysOnly) {
|
|
348
|
-
const expectedKeys = JSON.stringify(
|
|
349
|
-
|
|
350
|
-
null,
|
|
351
|
-
2
|
|
352
|
-
);
|
|
353
|
-
const actualKeys = JSON.stringify(
|
|
354
|
-
extractKeys(actualProcessed),
|
|
355
|
-
null,
|
|
356
|
-
2
|
|
357
|
-
);
|
|
333
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
334
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
358
335
|
const parts2 = diff.diffLines(expectedKeys, actualKeys);
|
|
359
336
|
return formatDiffParts(parts2);
|
|
360
337
|
}
|
|
@@ -365,9 +342,7 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
365
342
|
}
|
|
366
343
|
const parts = diff.diffLines(expectedStr, actualStr);
|
|
367
344
|
if (diffOptions?.outputNewOnly) {
|
|
368
|
-
const filtered = parts.filter(
|
|
369
|
-
(p) => p.added === true
|
|
370
|
-
);
|
|
345
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
371
346
|
return formatDiffParts(filtered);
|
|
372
347
|
}
|
|
373
348
|
return formatDiffParts(parts);
|
|
@@ -469,10 +444,7 @@ var ScoreAggregate = {
|
|
|
469
444
|
const count = values.length || 1;
|
|
470
445
|
const result = {};
|
|
471
446
|
for (const field of fields) {
|
|
472
|
-
result[field] = values.reduce(
|
|
473
|
-
(s, v) => s + (v[field] ?? 0),
|
|
474
|
-
0
|
|
475
|
-
) / count;
|
|
447
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
476
448
|
}
|
|
477
449
|
return result;
|
|
478
450
|
};
|
|
@@ -506,13 +478,10 @@ var ScoreAggregate = {
|
|
|
506
478
|
(s, v) => s + (v[valueField] ?? 0),
|
|
507
479
|
0
|
|
508
480
|
);
|
|
509
|
-
const sumSq = values.reduce(
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
},
|
|
514
|
-
0
|
|
515
|
-
);
|
|
481
|
+
const sumSq = values.reduce((s, v) => {
|
|
482
|
+
const value = v[valueField] ?? 0;
|
|
483
|
+
return s + value * value;
|
|
484
|
+
}, 0);
|
|
516
485
|
const mean = sum / count;
|
|
517
486
|
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
518
487
|
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
@@ -780,20 +749,14 @@ function nowIsoForFile() {
|
|
|
780
749
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
781
750
|
}
|
|
782
751
|
function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
783
|
-
return path.join(
|
|
784
|
-
artifactDirectory,
|
|
785
|
-
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
786
|
-
);
|
|
752
|
+
return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
787
753
|
}
|
|
788
754
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
789
755
|
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
790
756
|
return effect.Effect.gen(function* () {
|
|
791
757
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
792
758
|
const started = Date.now();
|
|
793
|
-
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
794
|
-
n + 1,
|
|
795
|
-
n + 1
|
|
796
|
-
]);
|
|
759
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
|
|
797
760
|
yield* publishEvent({
|
|
798
761
|
type: "TestCaseStarted",
|
|
799
762
|
runId: task.runId,
|
|
@@ -826,9 +789,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
826
789
|
return error;
|
|
827
790
|
};
|
|
828
791
|
try {
|
|
829
|
-
const ctx = yield* effect.Effect.promise(
|
|
830
|
-
() => Promise.resolve(evaluator.resolveContext())
|
|
831
|
-
);
|
|
792
|
+
const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
|
|
832
793
|
const result = yield* effect.Effect.promise(
|
|
833
794
|
() => Promise.resolve().then(
|
|
834
795
|
() => evaluateFn({
|
|
@@ -883,10 +844,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
883
844
|
}
|
|
884
845
|
}
|
|
885
846
|
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
886
|
-
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
887
|
-
n + 1,
|
|
888
|
-
n + 1
|
|
889
|
-
]);
|
|
847
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
890
848
|
const progressEvent = {
|
|
891
849
|
type: "TestCaseProgress",
|
|
892
850
|
runId: task.runId,
|
|
@@ -935,10 +893,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
935
893
|
} else {
|
|
936
894
|
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
937
895
|
}
|
|
938
|
-
const [passed, failed] = yield* effect.Effect.all([
|
|
939
|
-
effect.Ref.get(passedRef),
|
|
940
|
-
effect.Ref.get(failedRef)
|
|
941
|
-
]);
|
|
896
|
+
const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
|
|
942
897
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
943
898
|
...snapshot,
|
|
944
899
|
passedTestCases: passed,
|
|
@@ -1258,15 +1213,11 @@ var EffectRunner = class {
|
|
|
1258
1213
|
this.persistenceQueue = effect.Effect.runSync(
|
|
1259
1214
|
effect.Queue.unbounded()
|
|
1260
1215
|
);
|
|
1261
|
-
this.snapshotsRef = effect.Effect.runSync(
|
|
1262
|
-
effect.Ref.make(/* @__PURE__ */ new Map())
|
|
1263
|
-
);
|
|
1216
|
+
this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
|
|
1264
1217
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1265
1218
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1266
1219
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1267
|
-
this.schedulerFiber = effect.Effect.runFork(
|
|
1268
|
-
this.createSchedulerEffect()
|
|
1269
|
-
);
|
|
1220
|
+
this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
|
|
1270
1221
|
this.persistenceFiber = effect.Effect.runFork(
|
|
1271
1222
|
createPersistenceWorker(this.persistenceQueue)
|
|
1272
1223
|
);
|
|
@@ -1413,9 +1364,9 @@ var EffectRunner = class {
|
|
|
1413
1364
|
return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
|
|
1414
1365
|
}
|
|
1415
1366
|
getAllRunSnapshots() {
|
|
1416
|
-
return Array.from(
|
|
1417
|
-
|
|
1418
|
-
)
|
|
1367
|
+
return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
|
|
1368
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
1369
|
+
);
|
|
1419
1370
|
}
|
|
1420
1371
|
async loadRunSnapshotsFromArtifacts() {
|
|
1421
1372
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -1557,9 +1508,9 @@ function GenerateView({
|
|
|
1557
1508
|
datasetName,
|
|
1558
1509
|
onComplete
|
|
1559
1510
|
}) {
|
|
1560
|
-
const [result, setResult] =
|
|
1561
|
-
const [error, setError] =
|
|
1562
|
-
|
|
1511
|
+
const [result, setResult] = React.useState(null);
|
|
1512
|
+
const [error, setError] = React.useState(null);
|
|
1513
|
+
React.useEffect(() => {
|
|
1563
1514
|
let cancelled = false;
|
|
1564
1515
|
async function run() {
|
|
1565
1516
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
@@ -1582,12 +1533,8 @@ function GenerateView({
|
|
|
1582
1533
|
const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
|
|
1583
1534
|
const parsed = parse2(absoluteDatasetPath);
|
|
1584
1535
|
const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
|
|
1585
|
-
await writeFile2(
|
|
1586
|
-
|
|
1587
|
-
`${JSON.stringify(payload, null, 2)}
|
|
1588
|
-
`,
|
|
1589
|
-
"utf8"
|
|
1590
|
-
);
|
|
1536
|
+
await writeFile2(outputPath, `${JSON.stringify(payload, null, 2)}
|
|
1537
|
+
`, "utf8");
|
|
1591
1538
|
if (!cancelled) {
|
|
1592
1539
|
setResult({
|
|
1593
1540
|
count: payload.length,
|
|
@@ -1658,7 +1605,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1658
1605
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
1659
1606
|
return new Promise((resolve5, reject) => {
|
|
1660
1607
|
const app = ink.render(
|
|
1661
|
-
|
|
1608
|
+
React__namespace.default.createElement(GenerateView, {
|
|
1662
1609
|
runner,
|
|
1663
1610
|
datasetName,
|
|
1664
1611
|
onComplete: (err) => {
|
|
@@ -1708,8 +1655,8 @@ function TextBar({
|
|
|
1708
1655
|
}
|
|
1709
1656
|
var FRAMES = ["\u280B", "\u2819", "\u2838", "\u2834", "\u2826", "\u2807"];
|
|
1710
1657
|
function Spinner({ label = "Running" }) {
|
|
1711
|
-
const [frame, setFrame] =
|
|
1712
|
-
|
|
1658
|
+
const [frame, setFrame] = React.useState(0);
|
|
1659
|
+
React.useEffect(() => {
|
|
1713
1660
|
const timer = setInterval(() => {
|
|
1714
1661
|
setFrame((f) => (f + 1) % FRAMES.length);
|
|
1715
1662
|
}, 100);
|
|
@@ -1743,9 +1690,7 @@ function createBar(value, max = 100, width = 20) {
|
|
|
1743
1690
|
function aggregateEvaluatorScores(events, nameById) {
|
|
1744
1691
|
if (events.length === 0)
|
|
1745
1692
|
return [];
|
|
1746
|
-
const evaluatorIds = new Set(
|
|
1747
|
-
events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
|
|
1748
|
-
);
|
|
1693
|
+
const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
|
|
1749
1694
|
const result = [];
|
|
1750
1695
|
for (const evaluatorId of evaluatorIds) {
|
|
1751
1696
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
@@ -1775,9 +1720,7 @@ function aggregateEvaluatorScores(events, nameById) {
|
|
|
1775
1720
|
return es?.passed ?? false;
|
|
1776
1721
|
});
|
|
1777
1722
|
const lastEvent = events[events.length - 1];
|
|
1778
|
-
const lastEs = lastEvent?.evaluatorScores.find(
|
|
1779
|
-
(x) => x.evaluatorId === evaluatorId
|
|
1780
|
-
);
|
|
1723
|
+
const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === evaluatorId);
|
|
1781
1724
|
result.push({
|
|
1782
1725
|
evaluatorId,
|
|
1783
1726
|
evaluatorName: nameById.get(evaluatorId) ?? evaluatorId,
|
|
@@ -1811,17 +1754,15 @@ function RunView({
|
|
|
1811
1754
|
concurrency,
|
|
1812
1755
|
onComplete
|
|
1813
1756
|
}) {
|
|
1814
|
-
const [phase, setPhase] =
|
|
1815
|
-
|
|
1816
|
-
);
|
|
1817
|
-
const [
|
|
1818
|
-
const [
|
|
1819
|
-
const [
|
|
1820
|
-
const [
|
|
1821
|
-
const [
|
|
1822
|
-
const
|
|
1823
|
-
const [evaluatorNameById, setEvaluatorNameById] = React2.useState(/* @__PURE__ */ new Map());
|
|
1824
|
-
const runEval = React2.useCallback(async () => {
|
|
1757
|
+
const [phase, setPhase] = React.useState("loading");
|
|
1758
|
+
const [runInfo, setRunInfo] = React.useState(null);
|
|
1759
|
+
const [testCases, setTestCases] = React.useState([]);
|
|
1760
|
+
const [startedEvaluations, setStartedEvaluations] = React.useState(0);
|
|
1761
|
+
const [completedEvaluations, setCompletedEvaluations] = React.useState(0);
|
|
1762
|
+
const [runningEvaluations, setRunningEvaluations] = React.useState([]);
|
|
1763
|
+
const [summary, setSummary] = React.useState(null);
|
|
1764
|
+
const [evaluatorNameById, setEvaluatorNameById] = React.useState(/* @__PURE__ */ new Map());
|
|
1765
|
+
const runEval = React.useCallback(async () => {
|
|
1825
1766
|
const dataset = await runner.resolveDatasetByName(datasetName);
|
|
1826
1767
|
if (!dataset) {
|
|
1827
1768
|
const known = await runner.collectDatasets();
|
|
@@ -1920,10 +1861,7 @@ function RunView({
|
|
|
1920
1861
|
};
|
|
1921
1862
|
const events = existing ? [...existing.events, newEvent] : [newEvent];
|
|
1922
1863
|
const isAggregated = events.length > 1;
|
|
1923
|
-
const aggregatedEvaluatorScores = aggregateEvaluatorScores(
|
|
1924
|
-
events,
|
|
1925
|
-
nameById
|
|
1926
|
-
);
|
|
1864
|
+
const aggregatedEvaluatorScores = aggregateEvaluatorScores(events, nameById);
|
|
1927
1865
|
const merged = {
|
|
1928
1866
|
name: event.testCaseName,
|
|
1929
1867
|
testCaseId: event.testCaseId,
|
|
@@ -1986,7 +1924,7 @@ function RunView({
|
|
|
1986
1924
|
setPhase("completed");
|
|
1987
1925
|
setTimeout(() => onComplete(), 200);
|
|
1988
1926
|
}, [runner, datasetName, evaluatorPattern, concurrency, onComplete]);
|
|
1989
|
-
|
|
1927
|
+
React.useEffect(() => {
|
|
1990
1928
|
void runEval();
|
|
1991
1929
|
}, [runEval]);
|
|
1992
1930
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", padding: 1, children: [
|
|
@@ -2028,30 +1966,22 @@ function RunView({
|
|
|
2028
1966
|
label: `Evaluations ${completedEvaluations}/${runInfo?.totalTestCases ?? 0} completed \u2022 ${startedEvaluations}/${runInfo?.totalTestCases ?? 0} started`
|
|
2029
1967
|
}
|
|
2030
1968
|
),
|
|
2031
|
-
runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2032
|
-
|
|
2033
|
-
|
|
2034
|
-
|
|
2035
|
-
|
|
2036
|
-
|
|
2037
|
-
|
|
2038
|
-
|
|
2039
|
-
|
|
2040
|
-
|
|
2041
|
-
|
|
2042
|
-
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
|
|
2046
|
-
|
|
2047
|
-
"/",
|
|
2048
|
-
item.rerunTotal,
|
|
2049
|
-
")"
|
|
2050
|
-
] })
|
|
2051
|
-
]
|
|
2052
|
-
},
|
|
2053
|
-
`${item.testCaseId}:${item.rerunIndex}`
|
|
2054
|
-
)) })
|
|
1969
|
+
runningEvaluations.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginTop: 1, children: runningEvaluations.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "yellow", children: [
|
|
1970
|
+
"[running ",
|
|
1971
|
+
item.startedTestCases,
|
|
1972
|
+
"/",
|
|
1973
|
+
item.totalTestCases,
|
|
1974
|
+
"] ",
|
|
1975
|
+
item.name,
|
|
1976
|
+
" ",
|
|
1977
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1978
|
+
"(",
|
|
1979
|
+
item.rerunIndex,
|
|
1980
|
+
"/",
|
|
1981
|
+
item.rerunTotal,
|
|
1982
|
+
")"
|
|
1983
|
+
] })
|
|
1984
|
+
] }, `${item.testCaseId}:${item.rerunIndex}`)) })
|
|
2055
1985
|
] }),
|
|
2056
1986
|
testCases.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", marginBottom: 1, children: testCases.map((tc) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginBottom: 0, children: [
|
|
2057
1987
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
@@ -2083,73 +2013,63 @@ function RunView({
|
|
|
2083
2013
|
] }) : null
|
|
2084
2014
|
] }),
|
|
2085
2015
|
tc.errorMessage ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "red", children: tc.errorMessage }) : null,
|
|
2086
|
-
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2087
|
-
ink.
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
children:
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2016
|
+
tc.aggregatedEvaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
|
|
2017
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
2018
|
+
item.evaluatorName,
|
|
2019
|
+
":",
|
|
2020
|
+
" ",
|
|
2021
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
2022
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
2023
|
+
" ",
|
|
2024
|
+
item.metrics.map((m) => {
|
|
2025
|
+
const def = getMetricById(m.id);
|
|
2026
|
+
if (!def)
|
|
2027
|
+
return null;
|
|
2028
|
+
const formatted = def.format(m.data, {
|
|
2029
|
+
isAggregated: tc.isAggregated
|
|
2030
|
+
});
|
|
2031
|
+
const label = m.name ?? def.name;
|
|
2032
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2033
|
+
"[",
|
|
2034
|
+
label ? `${label}: ` : "",
|
|
2035
|
+
formatted,
|
|
2036
|
+
"]",
|
|
2037
|
+
" "
|
|
2038
|
+
] }, m.id);
|
|
2039
|
+
})
|
|
2040
|
+
] }) : null
|
|
2041
|
+
] }),
|
|
2042
|
+
item.scores.length > 0 ? item.scores.map((s, idx) => {
|
|
2043
|
+
const def = s.def ?? getScoreById(s.id);
|
|
2044
|
+
const scoreLabel = s.name ?? def?.name ?? def?.id ?? s.id;
|
|
2045
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2046
|
+
ink.Text,
|
|
2047
|
+
{
|
|
2048
|
+
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
2049
|
+
children: [
|
|
2050
|
+
" ",
|
|
2051
|
+
scoreLabel,
|
|
2052
|
+
":",
|
|
2098
2053
|
" ",
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
if (!def)
|
|
2102
|
-
return null;
|
|
2103
|
-
const formatted = def.format(m.data, {
|
|
2104
|
-
isAggregated: tc.isAggregated
|
|
2105
|
-
});
|
|
2106
|
-
const label = m.name ?? def.name;
|
|
2107
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2108
|
-
"[",
|
|
2109
|
-
label ? `${label}: ` : "",
|
|
2110
|
-
formatted,
|
|
2111
|
-
"]",
|
|
2112
|
-
" "
|
|
2113
|
-
] }, m.id);
|
|
2054
|
+
formatScorePart(s, scoreColor, {
|
|
2055
|
+
isAggregated: tc.isAggregated
|
|
2114
2056
|
})
|
|
2115
|
-
]
|
|
2116
|
-
|
|
2117
|
-
item.
|
|
2118
|
-
|
|
2119
|
-
|
|
2120
|
-
|
|
2121
|
-
|
|
2122
|
-
|
|
2123
|
-
|
|
2124
|
-
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
|
|
2128
|
-
|
|
2129
|
-
|
|
2130
|
-
|
|
2131
|
-
})
|
|
2132
|
-
]
|
|
2133
|
-
},
|
|
2134
|
-
`${item.evaluatorId}-${s.id}-${idx}`
|
|
2135
|
-
);
|
|
2136
|
-
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
|
|
2137
|
-
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
2138
|
-
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(
|
|
2139
|
-
({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2140
|
-
ink.Text,
|
|
2141
|
-
{
|
|
2142
|
-
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
2143
|
-
children: line
|
|
2144
|
-
},
|
|
2145
|
-
lineIdx
|
|
2146
|
-
)
|
|
2147
|
-
) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
|
|
2148
|
-
) })
|
|
2149
|
-
]
|
|
2150
|
-
},
|
|
2151
|
-
item.evaluatorId
|
|
2152
|
-
))
|
|
2057
|
+
]
|
|
2058
|
+
},
|
|
2059
|
+
`${item.evaluatorId}-${s.id}-${idx}`
|
|
2060
|
+
);
|
|
2061
|
+
}) : /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: " n/a" }),
|
|
2062
|
+
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
2063
|
+
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2064
|
+
ink.Text,
|
|
2065
|
+
{
|
|
2066
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
2067
|
+
children: line
|
|
2068
|
+
},
|
|
2069
|
+
lineIdx
|
|
2070
|
+
)) }, logIdx) : log.type === "log" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getLogLines(log).map((line, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: line }, lineIdx)) }, logIdx) : null
|
|
2071
|
+
) })
|
|
2072
|
+
] }, item.evaluatorId))
|
|
2153
2073
|
] }, tc.testCaseId)) }),
|
|
2154
2074
|
phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
2155
2075
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
|
|
@@ -2191,9 +2111,9 @@ function RunView({
|
|
|
2191
2111
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", children: "evaluator averages" }),
|
|
2192
2112
|
Array.from(evaluatorNameById.entries()).map(([id, name]) => {
|
|
2193
2113
|
const agg = summary.aggregates.get(id);
|
|
2194
|
-
const scoreKeys = [
|
|
2195
|
-
|
|
2196
|
-
|
|
2114
|
+
const scoreKeys = [...summary.scoreItemsByEvaluatorScore?.keys() ?? []].filter(
|
|
2115
|
+
(k) => k.startsWith(`${id}:`)
|
|
2116
|
+
);
|
|
2197
2117
|
if (scoreKeys.length === 0) {
|
|
2198
2118
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2199
2119
|
"- ",
|
|
@@ -2223,19 +2143,12 @@ function RunView({
|
|
|
2223
2143
|
const label = aggregated.name ?? def?.name ?? def?.id ?? aggregated.id;
|
|
2224
2144
|
const formatted = def ? def.formatAggregate(aggregated.data) : "n/a";
|
|
2225
2145
|
const numeric = toNumericScore(aggregated.data);
|
|
2226
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2227
|
-
|
|
2228
|
-
|
|
2229
|
-
|
|
2230
|
-
|
|
2231
|
-
|
|
2232
|
-
label,
|
|
2233
|
-
": ",
|
|
2234
|
-
formatted
|
|
2235
|
-
]
|
|
2236
|
-
},
|
|
2237
|
-
key
|
|
2238
|
-
);
|
|
2146
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: numeric !== void 0 ? scoreColor(numeric) : "gray", children: [
|
|
2147
|
+
" ",
|
|
2148
|
+
label,
|
|
2149
|
+
": ",
|
|
2150
|
+
formatted
|
|
2151
|
+
] }, key);
|
|
2239
2152
|
})
|
|
2240
2153
|
] }, id);
|
|
2241
2154
|
})
|
|
@@ -2311,9 +2224,7 @@ function buildTestCaseSummaries(byId) {
|
|
|
2311
2224
|
for (const evaluatorScores of events[0]?.evaluatorScores ?? []) {
|
|
2312
2225
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
2313
2226
|
for (const ev of events) {
|
|
2314
|
-
const es = ev.evaluatorScores.find(
|
|
2315
|
-
(x) => x.evaluatorId === evaluatorScores.evaluatorId
|
|
2316
|
-
);
|
|
2227
|
+
const es = ev.evaluatorScores.find((x) => x.evaluatorId === evaluatorScores.evaluatorId);
|
|
2317
2228
|
for (const s of es?.scores ?? []) {
|
|
2318
2229
|
const list = scoreIdToItems.get(s.id) ?? [];
|
|
2319
2230
|
list.push(s);
|
|
@@ -2366,9 +2277,7 @@ function scoreToColor(score) {
|
|
|
2366
2277
|
}
|
|
2367
2278
|
function getEvaluatorSummaryLines(evaluatorId, evaluatorName, aggregate, scoreItemsByKey) {
|
|
2368
2279
|
const lines = [];
|
|
2369
|
-
const scoreKeys = [...scoreItemsByKey.keys()].filter(
|
|
2370
|
-
(k) => k.startsWith(`${evaluatorId}:`)
|
|
2371
|
-
);
|
|
2280
|
+
const scoreKeys = [...scoreItemsByKey.keys()].filter((k) => k.startsWith(`${evaluatorId}:`));
|
|
2372
2281
|
if (scoreKeys.length === 0) {
|
|
2373
2282
|
lines.push(`- ${evaluatorName.padEnd(28)} no scores`);
|
|
2374
2283
|
return lines;
|
|
@@ -2403,9 +2312,7 @@ function createBar2(value, max = 100, width = 20) {
|
|
|
2403
2312
|
function aggregateEvaluatorScoresFromEvents(events, _evaluatorNameById) {
|
|
2404
2313
|
if (events.length === 0)
|
|
2405
2314
|
return [];
|
|
2406
|
-
const evaluatorIds = new Set(
|
|
2407
|
-
events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId))
|
|
2408
|
-
);
|
|
2315
|
+
const evaluatorIds = new Set(events.flatMap((e) => e.evaluatorScores.map((x) => x.evaluatorId)));
|
|
2409
2316
|
const result = [];
|
|
2410
2317
|
for (const evaluatorId of evaluatorIds) {
|
|
2411
2318
|
const scoreIdToItems = /* @__PURE__ */ new Map();
|
|
@@ -2452,9 +2359,7 @@ function formatEvaluatorScoreLine(name, scores, passed, metrics, options) {
|
|
|
2452
2359
|
if (def) {
|
|
2453
2360
|
const formatted = def.format(m.data, options);
|
|
2454
2361
|
const label = m.name ?? def.name;
|
|
2455
|
-
metricParts.push(
|
|
2456
|
-
label ? `[${label}: ${formatted}]` : `[${formatted}]`
|
|
2457
|
-
);
|
|
2362
|
+
metricParts.push(label ? `[${label}: ${formatted}]` : `[${formatted}]`);
|
|
2458
2363
|
}
|
|
2459
2364
|
}
|
|
2460
2365
|
}
|
|
@@ -2628,10 +2533,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2628
2533
|
const aggregatedScores = aggregateEvaluatorScoresFromEvents(
|
|
2629
2534
|
existing.events);
|
|
2630
2535
|
const isAggregated = existing.events.length > 1;
|
|
2631
|
-
const durationMs = existing.events.reduce(
|
|
2632
|
-
(s, e) => s + e.durationMs,
|
|
2633
|
-
0
|
|
2634
|
-
);
|
|
2536
|
+
const durationMs = existing.events.reduce((s, e) => s + e.durationMs, 0);
|
|
2635
2537
|
const lines = [];
|
|
2636
2538
|
const statusSuffix = event.errorMessage ? ` ${colorize("ERROR", `${ansi2.bold}${ansi2.red}`)}` : "";
|
|
2637
2539
|
lines.push(
|
|
@@ -2643,18 +2545,12 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2643
2545
|
for (const item of aggregatedScores) {
|
|
2644
2546
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
2645
2547
|
lines.push(
|
|
2646
|
-
...formatEvaluatorScoreLine(
|
|
2647
|
-
|
|
2648
|
-
|
|
2649
|
-
item.passed,
|
|
2650
|
-
item.metrics,
|
|
2651
|
-
{ isAggregated }
|
|
2652
|
-
)
|
|
2548
|
+
...formatEvaluatorScoreLine(name, item.scores, item.passed, item.metrics, {
|
|
2549
|
+
isAggregated
|
|
2550
|
+
})
|
|
2653
2551
|
);
|
|
2654
2552
|
const lastEvent = existing.events[existing.events.length - 1];
|
|
2655
|
-
const lastEs = lastEvent?.evaluatorScores.find(
|
|
2656
|
-
(x) => x.evaluatorId === item.evaluatorId
|
|
2657
|
-
);
|
|
2553
|
+
const lastEs = lastEvent?.evaluatorScores.find((x) => x.evaluatorId === item.evaluatorId);
|
|
2658
2554
|
if (!item.passed && lastEs?.logs && lastEs.logs.length > 0) {
|
|
2659
2555
|
for (const log of lastEs.logs) {
|
|
2660
2556
|
if (log.type === "diff") {
|
|
@@ -2701,9 +2597,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2701
2597
|
console.log(
|
|
2702
2598
|
`Evaluators: ${evaluators.map((item) => item.evaluator.getName() ?? item.id).join(", ")}`
|
|
2703
2599
|
);
|
|
2704
|
-
console.log(
|
|
2705
|
-
`Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`
|
|
2706
|
-
);
|
|
2600
|
+
console.log(`Total test cases: ${colorize(String(snapshot.totalTestCases), ansi2.bold)}`);
|
|
2707
2601
|
console.log("");
|
|
2708
2602
|
drawSpinner();
|
|
2709
2603
|
spinnerTimer = setInterval(drawSpinner, 100);
|
|
@@ -2718,10 +2612,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2718
2612
|
console.log("");
|
|
2719
2613
|
console.log(colorize("=== Run Summary ===", `${ansi2.bold}${ansi2.cyan}`));
|
|
2720
2614
|
console.log(
|
|
2721
|
-
`- passed: ${colorize(
|
|
2722
|
-
`${completed.passedTestCases}/${completed.totalTestCases}`,
|
|
2723
|
-
ansi2.green
|
|
2724
|
-
)}`
|
|
2615
|
+
`- passed: ${colorize(`${completed.passedTestCases}/${completed.totalTestCases}`, ansi2.green)}`
|
|
2725
2616
|
);
|
|
2726
2617
|
console.log(
|
|
2727
2618
|
`- failed: ${colorize(
|
|
@@ -2731,11 +2622,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2731
2622
|
);
|
|
2732
2623
|
if (overallScoreCount > 0) {
|
|
2733
2624
|
const overallAverage = overallScoreTotal / overallScoreCount;
|
|
2734
|
-
const overallSd = sampleStdDev2(
|
|
2735
|
-
overallScoreTotal,
|
|
2736
|
-
overallScoreSumSq,
|
|
2737
|
-
overallScoreCount
|
|
2738
|
-
);
|
|
2625
|
+
const overallSd = sampleStdDev2(overallScoreTotal, overallScoreSumSq, overallScoreCount);
|
|
2739
2626
|
const avgStr = overallSd !== void 0 ? `${overallAverage.toFixed(2)} \xB1 ${overallSd.toFixed(2)}` : overallAverage.toFixed(2);
|
|
2740
2627
|
console.log(
|
|
2741
2628
|
`- overall avg score: ${colorize(
|
|
@@ -2784,7 +2671,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern,
|
|
|
2784
2671
|
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern, concurrency) {
|
|
2785
2672
|
return new Promise((resolve5, reject) => {
|
|
2786
2673
|
const app = ink.render(
|
|
2787
|
-
|
|
2674
|
+
React__namespace.createElement(RunView, {
|
|
2788
2675
|
runner,
|
|
2789
2676
|
datasetName,
|
|
2790
2677
|
evaluatorPattern,
|