@m4trix/evals 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +128 -11
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +129 -12
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +1120 -837
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +1121 -838
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +126 -3
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.js +127 -4
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/cli-simple.cjs
CHANGED
|
@@ -586,6 +586,120 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
586
586
|
artifactPath: task.snapshot.artifactPath
|
|
587
587
|
});
|
|
588
588
|
});
|
|
589
|
+
/**
 * Scans the configured artifact directory for `.jsonl` files and turns each
 * one into a run snapshot via `parseArtifactToSnapshot`.
 *
 * Loading is best-effort: an unreadable directory yields an empty list, and
 * any single artifact that fails to parse (or yields no snapshot) is skipped.
 *
 * @param {{ artifactDirectory: string }} config - Runner configuration; only
 *   `artifactDirectory` is read here, the whole object is forwarded to the parser.
 * @returns {Promise<object[]>} Snapshots ordered by `queuedAt`, newest first.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = path.resolve(config.artifactDirectory);

  let directoryListing;
  try {
    directoryListing = await promises.readdir(artifactRoot);
  } catch {
    // Directory missing or unreadable: treat as "no persisted runs".
    return [];
  }

  const loaded = [];
  for (const entryName of directoryListing) {
    if (!entryName.endsWith(".jsonl")) {
      continue;
    }
    const artifactFile = path.join(artifactRoot, entryName);
    let parsed;
    try {
      parsed = await parseArtifactToSnapshot(artifactFile, config);
    } catch {
      // A broken artifact must not prevent the remaining ones from loading.
      continue;
    }
    if (parsed) {
      loaded.push(parsed);
    }
  }

  loaded.sort((left, right) => right.queuedAt - left.queuedAt);
  return loaded;
}
|
|
611
|
+
/**
 * Rebuilds a run snapshot from a JSONL artifact file.
 *
 * Every non-blank line is parsed as JSON; lines that are not valid JSON are
 * ignored. The last `RunQueued`, `RunStarted`, `RunCompleted` and `RunFailed`
 * events found in the file determine the snapshot fields.
 *
 * @param {string} filePath - Path of the artifact file; also reported as the
 *   snapshot's `artifactPath`.
 * @param {object} _config - Accepted for signature compatibility; not read.
 * @returns {Promise<object|null>} The snapshot, or `null` when the file has no
 *   non-blank lines or contains no `RunQueued` event.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const rawText = await promises.readFile(filePath, "utf8");
  const lines = rawText.split("\n").filter((text) => text.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }

  let queuedEvent = null;
  let startedEvent = null;
  let completedEvent = null;
  let failedEvent = null;

  for (const text of lines) {
    let event;
    try {
      event = JSON.parse(text);
    } catch {
      // Not JSON: skip this line and keep scanning.
      continue;
    }
    // `?.` covers a literal `null` line, which carries no event type.
    switch (event?.type) {
      case "RunQueued":
        queuedEvent = event;
        break;
      case "RunStarted":
        startedEvent = event;
        break;
      case "RunCompleted":
        completedEvent = event;
        break;
      case "RunFailed":
        failedEvent = event;
        break;
      default:
        break;
    }
  }

  if (queuedEvent === null) {
    return null;
  }

  // Precedence: failure beats completion, which beats started, else still queued.
  let status = "queued";
  if (failedEvent) {
    status = "failed";
  } else if (completedEvent) {
    status = "completed";
  } else if (startedEvent) {
    status = "running";
  }

  // Counters from RunCompleted win; otherwise fall back to TestCaseProgress tallies.
  const progress = aggregateTestCaseProgress(lines);

  return {
    runId: queuedEvent.runId,
    datasetId: queuedEvent.datasetId,
    datasetName: queuedEvent.datasetName,
    evaluatorIds: queuedEvent.evaluatorIds,
    queuedAt: queuedEvent.ts ?? 0,
    startedAt: startedEvent?.startedAt,
    finishedAt: completedEvent?.finishedAt ?? failedEvent?.finishedAt,
    totalTestCases: queuedEvent.totalTestCases ?? 0,
    completedTestCases: completedEvent?.totalTestCases ?? progress.completedTestCases,
    passedTestCases: completedEvent?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: completedEvent?.failedTestCases ?? progress.failedTestCases,
    status,
    artifactPath: filePath,
    errorMessage: failedEvent?.errorMessage
  };
}
|
|
682
|
+
/**
 * Tallies `TestCaseProgress` events found in a list of JSONL lines.
 *
 * `completedTestCases` tracks the most recent non-nullish value reported by
 * an event; each event additionally counts once as passed (truthy `passed`)
 * or failed. Lines that are not valid JSON, or not progress events, are ignored.
 *
 * @param {string[]} lines - Raw JSON lines from an artifact file.
 * @returns {{ completedTestCases: number, passedTestCases: number, failedTestCases: number }}
 */
function aggregateTestCaseProgress(lines) {
  const tally = {
    completedTestCases: 0,
    passedTestCases: 0,
    failedTestCases: 0
  };

  for (const rawLine of lines) {
    let record;
    try {
      record = JSON.parse(rawLine);
    } catch {
      // Unparseable line: nothing to count.
      continue;
    }
    // `?.` covers a literal `null` line, which is not a progress event.
    if (record?.type !== "TestCaseProgress") {
      continue;
    }

    tally.completedTestCases = record.completedTestCases ?? tally.completedTestCases;
    const bucket = record.passed ? "passedTestCases" : "failedTestCases";
    tally[bucket] += 1;
  }

  return tally;
}
|
|
589
703
|
async function appendJsonLine(artifactPath, payload) {
|
|
590
704
|
await promises.mkdir(path.dirname(artifactPath), { recursive: true });
|
|
591
705
|
await promises.appendFile(artifactPath, `${JSON.stringify(payload)}
|
|
@@ -842,6 +956,9 @@ var EffectRunner = class {
|
|
|
842
956
|
(a, b) => b.queuedAt - a.queuedAt
|
|
843
957
|
);
|
|
844
958
|
}
|
|
959
|
+
async loadRunSnapshotsFromArtifacts() {
|
|
960
|
+
return loadRunSnapshotsFromArtifacts(this.config);
|
|
961
|
+
}
|
|
845
962
|
async shutdown() {
|
|
846
963
|
await effect.Effect.runPromise(effect.Fiber.interrupt(this.schedulerFiber));
|
|
847
964
|
await effect.Effect.runPromise(effect.Fiber.interrupt(this.persistenceFiber));
|
|
@@ -973,7 +1090,7 @@ function GenerateView({
|
|
|
973
1090
|
return;
|
|
974
1091
|
}
|
|
975
1092
|
const { writeFile: writeFile2 } = await import('fs/promises');
|
|
976
|
-
const { join:
|
|
1093
|
+
const { join: join4, parse: parse2, resolve: resolve5 } = await import('path');
|
|
977
1094
|
const testCases = await runner.collectDatasetTestCases(dataset.id);
|
|
978
1095
|
const payload = testCases.map((item) => {
|
|
979
1096
|
const tc = item.testCase;
|
|
@@ -983,9 +1100,9 @@ function GenerateView({
|
|
|
983
1100
|
output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
|
|
984
1101
|
};
|
|
985
1102
|
});
|
|
986
|
-
const absoluteDatasetPath =
|
|
1103
|
+
const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
|
|
987
1104
|
const parsed = parse2(absoluteDatasetPath);
|
|
988
|
-
const outputPath =
|
|
1105
|
+
const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
|
|
989
1106
|
await writeFile2(
|
|
990
1107
|
outputPath,
|
|
991
1108
|
`${JSON.stringify(payload, null, 2)}
|
|
@@ -1060,7 +1177,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
|
|
|
1060
1177
|
console.log(`Wrote ${outputPath}`);
|
|
1061
1178
|
}
|
|
1062
1179
|
async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
1063
|
-
return new Promise((
|
|
1180
|
+
return new Promise((resolve5, reject) => {
|
|
1064
1181
|
const app = ink.render(
|
|
1065
1182
|
React2__default.default.createElement(GenerateView, {
|
|
1066
1183
|
runner,
|
|
@@ -1070,7 +1187,7 @@ async function generateDatasetJsonCommandInk(runner, datasetName) {
|
|
|
1070
1187
|
if (err) {
|
|
1071
1188
|
reject(err);
|
|
1072
1189
|
} else {
|
|
1073
|
-
|
|
1190
|
+
resolve5();
|
|
1074
1191
|
}
|
|
1075
1192
|
}
|
|
1076
1193
|
})
|
|
@@ -1198,7 +1315,7 @@ function RunView({
|
|
|
1198
1315
|
const aggregates = /* @__PURE__ */ new Map();
|
|
1199
1316
|
let overallScoreTotal = 0;
|
|
1200
1317
|
let overallScoreCount = 0;
|
|
1201
|
-
const done = new Promise((
|
|
1318
|
+
const done = new Promise((resolve5) => {
|
|
1202
1319
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1203
1320
|
if (event.type === "TestCaseProgress") {
|
|
1204
1321
|
const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
|
|
@@ -1244,7 +1361,7 @@ function RunView({
|
|
|
1244
1361
|
}
|
|
1245
1362
|
if (event.type === "RunCompleted" || event.type === "RunFailed") {
|
|
1246
1363
|
unsubscribe();
|
|
1247
|
-
|
|
1364
|
+
resolve5(event);
|
|
1248
1365
|
}
|
|
1249
1366
|
});
|
|
1250
1367
|
});
|
|
@@ -1588,7 +1705,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1588
1705
|
);
|
|
1589
1706
|
}
|
|
1590
1707
|
let spinnerTimer;
|
|
1591
|
-
const done = new Promise((
|
|
1708
|
+
const done = new Promise((resolve5) => {
|
|
1592
1709
|
const unsubscribe = runner.subscribeRunEvents((event) => {
|
|
1593
1710
|
if (event.type === "TestCaseProgress") {
|
|
1594
1711
|
completedCount = event.completedTestCases;
|
|
@@ -1649,7 +1766,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1649
1766
|
runFinished = true;
|
|
1650
1767
|
clearLine();
|
|
1651
1768
|
unsubscribe();
|
|
1652
|
-
|
|
1769
|
+
resolve5(event);
|
|
1653
1770
|
}
|
|
1654
1771
|
});
|
|
1655
1772
|
});
|
|
@@ -1727,7 +1844,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
|
|
|
1727
1844
|
console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
|
|
1728
1845
|
}
|
|
1729
1846
|
async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
1730
|
-
return new Promise((
|
|
1847
|
+
return new Promise((resolve5, reject) => {
|
|
1731
1848
|
const app = ink.render(
|
|
1732
1849
|
React2__default.default.createElement(RunView, {
|
|
1733
1850
|
runner,
|
|
@@ -1738,7 +1855,7 @@ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
|
|
|
1738
1855
|
if (err) {
|
|
1739
1856
|
reject(err);
|
|
1740
1857
|
} else {
|
|
1741
|
-
|
|
1858
|
+
resolve5();
|
|
1742
1859
|
}
|
|
1743
1860
|
}
|
|
1744
1861
|
})
|