@m4trix/evals 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +213 -38
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +214 -39
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +1136 -832
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +1137 -833
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +149 -5
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +26 -13
- package/dist/index.js +150 -6
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -8,7 +8,7 @@ var path = require('path');
|
|
|
8
8
|
var jitiModule = require('jiti');
|
|
9
9
|
var promises = require('fs/promises');
|
|
10
10
|
var url = require('url');
|
|
11
|
-
require('json-diff');
|
|
11
|
+
var jsonDiff = require('json-diff');
|
|
12
12
|
var React2 = require('react');
|
|
13
13
|
var ink = require('ink');
|
|
14
14
|
var jsxRuntime = require('react/jsx-runtime');
|
|
@@ -282,6 +282,29 @@ async function collectTestCasesFromFiles(config) {
|
|
|
282
282
|
);
|
|
283
283
|
return found.flat();
|
|
284
284
|
}
|
|
285
|
+
/**
 * Build a log entry of type "diff" that records an expected value, the
 * actual value, and a textual rendering of how they differ.
 *
 * @param {*} expected - Reference value; stored on the entry unchanged.
 * @param {*} actual - Observed value; stored on the entry unchanged.
 * @param {object} [options] - Optional settings; only `label` is read.
 * @returns {{type: string, label: *, expected: *, actual: *, diff: string}}
 *   The entry. `diff` falls back to "(no differences)" when the rendering
 *   is empty (or otherwise falsy).
 */
function createDiffLogEntry(expected, actual, options) {
  // `color: false` asks json-diff for an uncoloured rendering.
  const rendered = jsonDiff.diffString(expected, actual, { color: false });
  const diffText = rendered ? rendered : "(no differences)";
  return {
    type: "diff",
    label: options?.label,
    expected,
    actual,
    diff: diffText
  };
}
|
|
295
|
+
/**
 * Split a diff log entry into display lines, each tagged as "remove",
 * "add" or "context" so callers can colour them.
 *
 * Changes from the previous implementation:
 *  - The rendering already stored in `entry.diff` (createDiffLogEntry in this
 *    file writes it with the same json-diff options) is reused when it is a
 *    non-empty string. The diff is only recomputed from `expected`/`actual`
 *    when that field is missing.
 *  - The +/- marker is read from the first column of the untrimmed line.
 *    Trimming first misclassified unchanged lines whose content starts with
 *    "-" or "+" (for example a negative number inside an array).
 *    NOTE(review): this relies on json-diff writing its marker in column 0 —
 *    confirm against the json-diff output format.
 *
 * @param {{expected: *, actual: *, diff?: string}} entry - Diff log entry.
 * @returns {Array<{type: string, line: string}>} One item per "\n"-separated
 *   line of the diff text; `line` is the original, unmodified text.
 */
function getDiffLines(entry) {
  const stored = typeof entry.diff === "string" && entry.diff.length > 0 ? entry.diff : void 0;
  const raw = stored ?? (jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)");
  return raw.split("\n").map((line) => {
    // "---" / "+++" prefixes are still excluded, as before, so header-style
    // lines are never coloured as changes.
    if (line.startsWith("-") && !line.startsWith("---")) {
      return { type: "remove", line };
    }
    if (line.startsWith("+") && !line.startsWith("+++")) {
      return { type: "add", line };
    }
    return { type: "context", line };
  });
}
|
|
285
308
|
|
|
286
309
|
// src/evals/metric.ts
|
|
287
310
|
var registry = /* @__PURE__ */ new Map();
|
|
@@ -465,6 +488,10 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
465
488
|
continue;
|
|
466
489
|
}
|
|
467
490
|
try {
|
|
491
|
+
const logs = [];
|
|
492
|
+
const logDiff = (expected, actual, options) => {
|
|
493
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
494
|
+
};
|
|
468
495
|
const ctx = yield* effect.Effect.promise(
|
|
469
496
|
() => Promise.resolve(evaluator.resolveContext())
|
|
470
497
|
);
|
|
@@ -473,13 +500,20 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
473
500
|
evaluateFn({
|
|
474
501
|
input: testCaseItem.testCase.getInput(),
|
|
475
502
|
ctx,
|
|
476
|
-
output
|
|
503
|
+
output,
|
|
504
|
+
logDiff
|
|
477
505
|
})
|
|
478
506
|
)
|
|
479
507
|
);
|
|
480
508
|
const { scores, metrics } = normalizeResult(result);
|
|
481
509
|
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
482
|
-
evaluatorScores.push({
|
|
510
|
+
evaluatorScores.push({
|
|
511
|
+
evaluatorId,
|
|
512
|
+
scores,
|
|
513
|
+
passed,
|
|
514
|
+
metrics,
|
|
515
|
+
logs: logs.length > 0 ? logs : void 0
|
|
516
|
+
});
|
|
483
517
|
} catch (error) {
|
|
484
518
|
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
485
519
|
evaluatorScores.push({
|
|
@@ -552,6 +586,120 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
552
586
|
artifactPath: task.snapshot.artifactPath
|
|
553
587
|
});
|
|
554
588
|
});
|
|
589
|
+
/**
 * Read every `.jsonl` file in the configured artifact directory and turn
 * each into a run snapshot via parseArtifactToSnapshot.
 *
 * Loading is best-effort: an unreadable directory yields an empty list, and
 * a file whose parsing throws (or yields a falsy result) is skipped.
 *
 * @param {{artifactDirectory: string}} config - Runner configuration; it is
 *   also forwarded to parseArtifactToSnapshot.
 * @returns {Promise<Array<object>>} Snapshots ordered by `queuedAt`, highest
 *   first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = path.resolve(config.artifactDirectory);

  let directoryListing;
  try {
    directoryListing = await promises.readdir(artifactRoot);
  } catch {
    // Directory missing or unreadable: report no runs.
    return [];
  }

  const collected = [];
  for (const candidate of directoryListing) {
    if (!candidate.endsWith(".jsonl")) {
      continue;
    }
    const artifactFile = path.join(artifactRoot, candidate);
    try {
      const parsed = await parseArtifactToSnapshot(artifactFile, config);
      if (parsed) {
        collected.push(parsed);
      }
    } catch {
      // A single broken artifact must not prevent the others from loading.
    }
  }

  collected.sort((left, right) => right.queuedAt - left.queuedAt);
  return collected;
}
|
|
611
|
+
/**
 * Rebuild a run snapshot from one JSONL artifact file.
 *
 * Each non-blank line is parsed as a JSON event. For each of the event types
 * "RunQueued", "RunStarted", "RunCompleted" and "RunFailed" the last
 * occurrence is kept; lines that fail to parse are ignored.
 *
 * @param {string} filePath - Artifact file to read (UTF-8).
 * @param {*} _config - Unused.
 * @returns {Promise<object|null>} The snapshot, or null when the file has no
 *   non-blank lines or no "RunQueued" event.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const text = await promises.readFile(filePath, "utf8");
  const eventLines = text.split("\n").filter((candidate) => candidate.trim().length > 0);
  if (eventLines.length === 0) {
    return null;
  }

  const seen = { queued: null, started: null, completed: null, failed: null };
  for (const eventLine of eventLines) {
    try {
      const event = JSON.parse(eventLine);
      switch (event.type) {
        case "RunQueued":
          seen.queued = {
            runId: event.runId,
            datasetId: event.datasetId,
            datasetName: event.datasetName,
            evaluatorIds: event.evaluatorIds,
            totalTestCases: event.totalTestCases ?? 0,
            artifactPath: event.artifactPath ?? filePath,
            ts: event.ts
          };
          break;
        case "RunStarted":
          seen.started = { startedAt: event.startedAt };
          break;
        case "RunCompleted":
          seen.completed = {
            passedTestCases: event.passedTestCases,
            failedTestCases: event.failedTestCases,
            totalTestCases: event.totalTestCases,
            finishedAt: event.finishedAt
          };
          break;
        case "RunFailed":
          seen.failed = {
            finishedAt: event.finishedAt,
            errorMessage: event.errorMessage
          };
          break;
        default:
          break;
      }
    } catch {
      // Malformed line: skip it and keep reading.
    }
  }

  const { queued, started, completed, failed } = seen;
  if (!queued) {
    return null;
  }

  // Precedence: failed > completed > running > queued.
  let status = "queued";
  if (failed) {
    status = "failed";
  } else if (completed) {
    status = "completed";
  } else if (started) {
    status = "running";
  }

  // Counts from a RunCompleted event win over the per-test-case tally.
  const progress = aggregateTestCaseProgress(eventLines);
  return {
    runId: queued.runId,
    datasetId: queued.datasetId,
    datasetName: queued.datasetName,
    evaluatorIds: queued.evaluatorIds,
    queuedAt: queued.ts ?? 0,
    startedAt: started?.startedAt,
    finishedAt: completed?.finishedAt ?? failed?.finishedAt,
    totalTestCases: queued.totalTestCases,
    completedTestCases: completed?.totalTestCases ?? progress.completedTestCases,
    passedTestCases: completed?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: completed?.failedTestCases ?? progress.failedTestCases,
    status,
    // The snapshot always points at the file that was actually read.
    artifactPath: filePath,
    errorMessage: failed?.errorMessage
  };
}
|
|
682
|
+
/**
 * Tally "TestCaseProgress" events found in a list of JSON-encoded lines.
 *
 * @param {string[]} lines - One JSON document per element; elements that do
 *   not parse, or that are not "TestCaseProgress" events, are ignored.
 * @returns {{completedTestCases: number, passedTestCases: number, failedTestCases: number}}
 *   `completedTestCases` is the value carried by the last progress event
 *   that provided one; the other two count events by truthiness of `passed`.
 */
function aggregateTestCaseProgress(lines) {
  const tally = { completedTestCases: 0, passedTestCases: 0, failedTestCases: 0 };
  for (const rawLine of lines) {
    let parsed;
    try {
      parsed = JSON.parse(rawLine);
    } catch {
      // Not JSON: nothing to count on this line.
      continue;
    }
    // `?.` also skips a literal JSON `null`.
    if (parsed?.type !== "TestCaseProgress") {
      continue;
    }
    tally.completedTestCases = parsed.completedTestCases ?? tally.completedTestCases;
    const bucket = parsed.passed ? "passedTestCases" : "failedTestCases";
    tally[bucket] += 1;
  }
  return tally;
}
|
|
555
703
|
async function appendJsonLine(artifactPath, payload) {
|
|
556
704
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
557
705
|
await promises.appendFile(artifactPath, `${JSON.stringify(payload)}
|
|
@@ -808,6 +956,9 @@ var EffectRunner = class {
|
|
|
808
956
|
(a, b) => b.queuedAt - a.queuedAt
|
|
809
957
|
);
|
|
810
958
|
}
|
|
959
|
+
async loadRunSnapshotsFromArtifacts() {
|
|
960
|
+
return loadRunSnapshotsFromArtifacts(this.config);
|
|
961
|
+
}
|
|
811
962
|
async shutdown() {
|
|
812
963
|
await effect.Effect.runPromise(effect.Fiber.interrupt(this.schedulerFiber));
|
|
813
964
|
await effect.Effect.runPromise(effect.Fiber.interrupt(this.persistenceFiber));
|
|
@@ -939,7 +1090,7 @@ function GenerateView({
|
|
|
939
1090
|
return;
|
|
940
1091
|
}
|
|
941
1092
|
const { writeFile: writeFile2 } = await import('fs/promises');
|
|
942
|
-
const { join:
|
|
1093
|
+
const { join: join4, parse: parse2, resolve: resolve5 } = await import('path');
|
|
943
1094
|
const testCases = await runner.collectDatasetTestCases(dataset.id);
|
|
944
1095
|
const payload = testCases.map((item) => {
|
|
945
1096
|
const tc = item.testCase;
|
|
@@ -949,9 +1100,9 @@ function GenerateView({
|
|
|
949
1100
|
output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
|
|
950
1101
|
};
|
|
951
1102
|
});
|
|
952
|
-
const absoluteDatasetPath =
|
|
1103
|
+
const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
|
|
953
1104
|
const parsed = parse2(absoluteDatasetPath);
|
|
954
|
-
const outputPath =
|
|
1105
|
+
const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
|
|
955
1106
|
await writeFile2(
|
|
956
1107
|
outputPath,
|
|
957
1108
|
`${JSON.stringify(payload, null, 2)}
|
|
@@ -1026,7 +1177,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1026
1177
|
console.log(`Wrote ${outputPath}`);
|
|
1027
1178
|
}
|
|
1028
1179
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
1029
|
-
return new Promise((
|
|
1180
|
+
return new Promise((resolve5, reject) => {
|
|
1030
1181
|
const app = ink.render(
|
|
1031
1182
|
React2__default.default.createElement(GenerateView, {
|
|
1032
1183
|
runner,
|
|
@@ -1036,7 +1187,7 @@ async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
|
1036
1187
|
if (err) {
|
|
1037
1188
|
reject(err);
|
|
1038
1189
|
} else {
|
|
1039
|
-
|
|
1190
|
+
resolve5();
|
|
1040
1191
|
}
|
|
1041
1192
|
}
|
|
1042
1193
|
})
|
|
@@ -1164,7 +1315,7 @@ function RunView({
|
|
|
1164
1315
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1165
1316
|
let overallScoreTotal = 0;
|
|
1166
1317
|
let overallScoreCount = 0;
|
|
1167
|
-
const done = new Promise((
|
|
1318
|
+
const done = new Promise((resolve5) => {
|
|
1168
1319
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1169
1320
|
if (event.type === "TestCaseProgress") {
|
|
1170
1321
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
@@ -1202,14 +1353,15 @@ function RunView({
|
|
|
1202
1353
|
evaluatorName: nameById.get(item.evaluatorId) ?? item.evaluatorId,
|
|
1203
1354
|
scores: item.scores,
|
|
1204
1355
|
passed: item.passed,
|
|
1205
|
-
metrics: item.metrics
|
|
1356
|
+
metrics: item.metrics,
|
|
1357
|
+
logs: item.logs
|
|
1206
1358
|
}))
|
|
1207
1359
|
}
|
|
1208
1360
|
]);
|
|
1209
1361
|
}
|
|
1210
1362
|
if (event.type === "RunCompleted" || event.type === "RunFailed") {
|
|
1211
1363
|
unsubscribe();
|
|
1212
|
-
|
|
1364
|
+
resolve5(event);
|
|
1213
1365
|
}
|
|
1214
1366
|
});
|
|
1215
1367
|
});
|
|
@@ -1289,30 +1441,42 @@ function RunView({
|
|
|
1289
1441
|
"ms)"
|
|
1290
1442
|
] })
|
|
1291
1443
|
] }),
|
|
1292
|
-
tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
] }, s.id)),
|
|
1302
|
-
item.metrics?.map((m) => {
|
|
1303
|
-
const def = getMetricById(m.id);
|
|
1304
|
-
if (!def)
|
|
1305
|
-
return null;
|
|
1306
|
-
const formatted = def.format(m.data);
|
|
1307
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1308
|
-
"[",
|
|
1309
|
-
def.name ? `${def.name}: ` : "",
|
|
1310
|
-
formatted,
|
|
1311
|
-
"]",
|
|
1444
|
+
tc.evaluatorScores.map((item) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", marginLeft: 2, children: [
|
|
1445
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1446
|
+
item.evaluatorName,
|
|
1447
|
+
":",
|
|
1448
|
+
" ",
|
|
1449
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
1450
|
+
" ",
|
|
1451
|
+
item.scores.map((s) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: scoreColor(toNumericScore(s.data) ?? 0), children: [
|
|
1452
|
+
formatScorePart(s),
|
|
1312
1453
|
" "
|
|
1313
|
-
] },
|
|
1314
|
-
|
|
1315
|
-
|
|
1454
|
+
] }, s.id)),
|
|
1455
|
+
item.metrics?.map((m) => {
|
|
1456
|
+
const def = getMetricById(m.id);
|
|
1457
|
+
if (!def)
|
|
1458
|
+
return null;
|
|
1459
|
+
const formatted = def.format(m.data);
|
|
1460
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1461
|
+
"[",
|
|
1462
|
+
def.name ? `${def.name}: ` : "",
|
|
1463
|
+
formatted,
|
|
1464
|
+
"]",
|
|
1465
|
+
" "
|
|
1466
|
+
] }, m.id);
|
|
1467
|
+
})
|
|
1468
|
+
] }),
|
|
1469
|
+
!item.passed && item.logs && item.logs.length > 0 && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginLeft: 2, flexDirection: "column", children: item.logs.map(
|
|
1470
|
+
(log, logIdx) => log.type === "diff" ? /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: getDiffLines(log).map(({ type, line }, lineIdx) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
1471
|
+
ink.Text,
|
|
1472
|
+
{
|
|
1473
|
+
color: type === "remove" ? "red" : type === "add" ? "green" : "gray",
|
|
1474
|
+
children: line
|
|
1475
|
+
},
|
|
1476
|
+
lineIdx
|
|
1477
|
+
)) }, logIdx) : null
|
|
1478
|
+
) })
|
|
1479
|
+
] }, item.evaluatorId))
|
|
1316
1480
|
] }, i)) }),
|
|
1317
1481
|
phase === "completed" && summary && /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
1318
1482
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: "Run Summary" }),
|
|
@@ -1541,7 +1705,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1541
1705
|
);
|
|
1542
1706
|
}
|
|
1543
1707
|
let spinnerTimer;
|
|
1544
|
-
const done = new Promise((
|
|
1708
|
+
const done = new Promise((resolve5) => {
|
|
1545
1709
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1546
1710
|
if (event.type === "TestCaseProgress") {
|
|
1547
1711
|
completedCount = event.completedTestCases;
|
|
@@ -1561,6 +1725,17 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1561
1725
|
item.metrics
|
|
1562
1726
|
)
|
|
1563
1727
|
);
|
|
1728
|
+
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
1729
|
+
for (const log of item.logs) {
|
|
1730
|
+
if (log.type === "diff") {
|
|
1731
|
+
const useColor = process.stdout.isTTY;
|
|
1732
|
+
for (const { type, line } of getDiffLines(log)) {
|
|
1733
|
+
const colored = useColor && type === "remove" ? colorize(` ${line}`, ansi2.red) : useColor && type === "add" ? colorize(` ${line}`, ansi2.green) : ` ${line}`;
|
|
1734
|
+
console.log(colored);
|
|
1735
|
+
}
|
|
1736
|
+
}
|
|
1737
|
+
}
|
|
1738
|
+
}
|
|
1564
1739
|
const numeric = toNumericScoreFromScores(item.scores);
|
|
1565
1740
|
if (numeric !== void 0) {
|
|
1566
1741
|
const current = aggregates.get(item.evaluatorId) ?? {
|
|
@@ -1591,7 +1766,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1591
1766
|
runFinished = true;
|
|
1592
1767
|
clearLine();
|
|
1593
1768
|
unsubscribe();
|
|
1594
|
-
|
|
1769
|
+
resolve5(event);
|
|
1595
1770
|
}
|
|
1596
1771
|
});
|
|
1597
1772
|
});
|
|
@@ -1669,7 +1844,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1669
1844
|
console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
|
|
1670
1845
|
}
|
|
1671
1846
|
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
1672
|
-
return new Promise((
|
|
1847
|
+
return new Promise((resolve5, reject) => {
|
|
1673
1848
|
const app = ink.render(
|
|
1674
1849
|
React2__default.default.createElement(RunView, {
|
|
1675
1850
|
runner,
|
|
@@ -1680,7 +1855,7 @@ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
|
1680
1855
|
if (err) {
|
|
1681
1856
|
reject(err);
|
|
1682
1857
|
} else {
|
|
1683
|
-
|
|
1858
|
+
resolve5();
|
|
1684
1859
|
}
|
|
1685
1860
|
}
|
|
1686
1861
|
})
|