@m4trix/evals 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -586,6 +586,120 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
586
586
  artifactPath: task.snapshot.artifactPath
587
587
  });
588
588
  });
589
/**
 * Rebuilds run snapshots from the `.jsonl` artifact files found directly in
 * `config.artifactDirectory`.
 *
 * @param {{ artifactDirectory: string }} config - Only `artifactDirectory` is read here;
 *   the whole object is also forwarded to `parseArtifactToSnapshot`.
 * @returns {Promise<object[]>} Snapshots ordered by `queuedAt`, largest first. Resolves to an
 *   empty array when the directory cannot be listed.
 */
async function loadRunSnapshotsFromArtifacts(config) {
  const artifactRoot = path.resolve(config.artifactDirectory);

  let directoryEntries;
  try {
    directoryEntries = await promises.readdir(artifactRoot);
  } catch {
    // A directory that cannot be listed is treated the same as "no runs".
    return [];
  }

  const collected = [];
  for (const entryName of directoryEntries.filter((entry) => entry.endsWith(".jsonl"))) {
    let parsedSnapshot = null;
    try {
      parsedSnapshot = await parseArtifactToSnapshot(path.join(artifactRoot, entryName), config);
    } catch {
      // Best-effort: a file that cannot be read or parsed is skipped so the
      // remaining artifacts are still returned.
    }
    if (parsedSnapshot) {
      collected.push(parsedSnapshot);
    }
  }

  collected.sort((left, right) => right.queuedAt - left.queuedAt);
  return collected;
}
611
/**
 * Parses JSONL lines into event objects, once.
 *
 * Lines that are not valid JSON are skipped, as are lines whose JSON value is
 * not an object (`null`, numbers, strings, booleans): such values can never
 * carry an event `type`.
 *
 * @param {Iterable<string>} lines - Raw lines of an artifact file.
 * @returns {object[]} Parsed events in file order.
 */
function parseArtifactEventLines(lines) {
  const events = [];
  for (const line of lines) {
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Best-effort: one unparseable line must not discard the rest of the file.
      continue;
    }
    if (parsed !== null && typeof parsed === "object") {
      events.push(parsed);
    }
  }
  return events;
}

/**
 * Folds already-parsed events into test-case progress counters.
 *
 * @param {Iterable<object>} events - Parsed artifact events.
 * @returns {{ completedTestCases: number, passedTestCases: number, failedTestCases: number }}
 */
function summarizeTestCaseProgressEvents(events) {
  let completedTestCases = 0;
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const event of events) {
    if (event.type !== "TestCaseProgress") {
      continue;
    }
    // The event's own counter wins; when it is absent the previous value is kept.
    completedTestCases = event.completedTestCases ?? completedTestCases;
    if (event.passed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
  }
  return { completedTestCases, passedTestCases, failedTestCases };
}

/**
 * Reads one JSONL artifact file and reconstructs the run snapshot it describes.
 *
 * Each line is parsed exactly once; the parsed events drive both the run
 * lifecycle state and the test-case progress counters.
 *
 * @param {string} filePath - Path of the artifact file to read.
 * @param {object} _config - Accepted for interface compatibility; not used.
 * @returns {Promise<object|null>} The snapshot, or `null` when the file has no
 *   non-blank lines or contains no `RunQueued` event.
 * @throws Rejects with the underlying error when the file cannot be read.
 */
async function parseArtifactToSnapshot(filePath, _config) {
  const content = await promises.readFile(filePath, "utf8");
  const lines = content.split("\n").filter((line) => line.trim().length > 0);
  if (lines.length === 0) {
    return null;
  }

  const events = parseArtifactEventLines(lines);

  // For every lifecycle event type the last occurrence in the file wins.
  let runQueued = null;
  let runStarted = null;
  let runCompleted = null;
  let runFailed = null;
  for (const event of events) {
    switch (event.type) {
      case "RunQueued":
        runQueued = {
          runId: event.runId,
          datasetId: event.datasetId,
          datasetName: event.datasetName,
          evaluatorIds: event.evaluatorIds,
          totalTestCases: event.totalTestCases ?? 0,
          ts: event.ts
        };
        break;
      case "RunStarted":
        runStarted = { startedAt: event.startedAt };
        break;
      case "RunCompleted":
        runCompleted = {
          passedTestCases: event.passedTestCases,
          failedTestCases: event.failedTestCases,
          totalTestCases: event.totalTestCases,
          finishedAt: event.finishedAt
        };
        break;
      case "RunFailed":
        runFailed = {
          finishedAt: event.finishedAt,
          errorMessage: event.errorMessage
        };
        break;
      default:
        break;
    }
  }

  if (!runQueued) {
    return null;
  }

  // A failure outranks a completion, which outranks a start.
  const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";

  // Totals reported by RunCompleted take precedence; otherwise fall back to
  // what the individual TestCaseProgress events add up to.
  const progress = summarizeTestCaseProgressEvents(events);

  return {
    runId: runQueued.runId,
    datasetId: runQueued.datasetId,
    datasetName: runQueued.datasetName,
    evaluatorIds: runQueued.evaluatorIds,
    queuedAt: runQueued.ts ?? 0,
    startedAt: runStarted?.startedAt,
    finishedAt: runCompleted?.finishedAt ?? runFailed?.finishedAt,
    totalTestCases: runQueued.totalTestCases,
    completedTestCases: runCompleted?.totalTestCases ?? progress.completedTestCases,
    passedTestCases: runCompleted?.passedTestCases ?? progress.passedTestCases,
    failedTestCases: runCompleted?.failedTestCases ?? progress.failedTestCases,
    status,
    // The snapshot always points at the file that was actually read; an
    // `artifactPath` stored inside the RunQueued event is not consulted.
    artifactPath: filePath,
    errorMessage: runFailed?.errorMessage
  };
}

/**
 * Computes test-case progress counters from raw JSONL artifact lines.
 *
 * @param {Iterable<string>} lines - Raw lines of an artifact file; unparseable
 *   lines are ignored.
 * @returns {{ completedTestCases: number, passedTestCases: number, failedTestCases: number }}
 *   `completedTestCases` is the value carried by the last `TestCaseProgress`
 *   event that has one; passed/failed are counted per event from `passed`.
 */
function aggregateTestCaseProgress(lines) {
  return summarizeTestCaseProgressEvents(parseArtifactEventLines(lines));
}
589
703
  async function appendJsonLine(artifactPath, payload) {
590
704
  await promises.mkdir(path.dirname(artifactPath), { recursive: true });
591
705
  await promises.appendFile(artifactPath, `${JSON.stringify(payload)}
@@ -842,6 +956,9 @@ var EffectRunner = class {
842
956
  (a, b) => b.queuedAt - a.queuedAt
843
957
  );
844
958
  }
959
+ async loadRunSnapshotsFromArtifacts() {
960
+ return loadRunSnapshotsFromArtifacts(this.config);
961
+ }
845
962
  async shutdown() {
846
963
  await effect.Effect.runPromise(effect.Fiber.interrupt(this.schedulerFiber));
847
964
  await effect.Effect.runPromise(effect.Fiber.interrupt(this.persistenceFiber));
@@ -973,7 +1090,7 @@ function GenerateView({
973
1090
  return;
974
1091
  }
975
1092
  const { writeFile: writeFile2 } = await import('fs/promises');
976
- const { join: join3, parse: parse2, resolve: resolve4 } = await import('path');
1093
+ const { join: join4, parse: parse2, resolve: resolve5 } = await import('path');
977
1094
  const testCases = await runner.collectDatasetTestCases(dataset.id);
978
1095
  const payload = testCases.map((item) => {
979
1096
  const tc = item.testCase;
@@ -983,9 +1100,9 @@ function GenerateView({
983
1100
  output: typeof tc.getOutput === "function" ? tc.getOutput() : void 0
984
1101
  };
985
1102
  });
986
- const absoluteDatasetPath = resolve4(process.cwd(), dataset.filePath);
1103
+ const absoluteDatasetPath = resolve5(process.cwd(), dataset.filePath);
987
1104
  const parsed = parse2(absoluteDatasetPath);
988
- const outputPath = join3(parsed.dir, `${parsed.name}.cases.json`);
1105
+ const outputPath = join4(parsed.dir, `${parsed.name}.cases.json`);
989
1106
  await writeFile2(
990
1107
  outputPath,
991
1108
  `${JSON.stringify(payload, null, 2)}
@@ -1060,7 +1177,7 @@ async function generateDatasetJsonCommandPlain(runner, datasetName) {
1060
1177
  console.log(`Wrote ${outputPath}`);
1061
1178
  }
1062
1179
  async function generateDatasetJsonCommandInk(runner, datasetName) {
1063
- return new Promise((resolve4, reject) => {
1180
+ return new Promise((resolve5, reject) => {
1064
1181
  const app = ink.render(
1065
1182
  React2__default.default.createElement(GenerateView, {
1066
1183
  runner,
@@ -1070,7 +1187,7 @@ async function generateDatasetJsonCommandInk(runner, datasetName) {
1070
1187
  if (err) {
1071
1188
  reject(err);
1072
1189
  } else {
1073
- resolve4();
1190
+ resolve5();
1074
1191
  }
1075
1192
  }
1076
1193
  })
@@ -1198,7 +1315,7 @@ function RunView({
1198
1315
  const aggregates = /* @__PURE__ */ new Map();
1199
1316
  let overallScoreTotal = 0;
1200
1317
  let overallScoreCount = 0;
1201
- const done = new Promise((resolve4) => {
1318
+ const done = new Promise((resolve5) => {
1202
1319
  const unsubscribe = runner.subscribeRunEvents((event) => {
1203
1320
  if (event.type === "TestCaseProgress") {
1204
1321
  const numericScores = event.evaluatorScores.map((item) => toNumericScoreFromScores(item.scores)).filter((item) => item !== void 0);
@@ -1244,7 +1361,7 @@ function RunView({
1244
1361
  }
1245
1362
  if (event.type === "RunCompleted" || event.type === "RunFailed") {
1246
1363
  unsubscribe();
1247
- resolve4(event);
1364
+ resolve5(event);
1248
1365
  }
1249
1366
  });
1250
1367
  });
@@ -1588,7 +1705,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1588
1705
  );
1589
1706
  }
1590
1707
  let spinnerTimer;
1591
- const done = new Promise((resolve4) => {
1708
+ const done = new Promise((resolve5) => {
1592
1709
  const unsubscribe = runner.subscribeRunEvents((event) => {
1593
1710
  if (event.type === "TestCaseProgress") {
1594
1711
  completedCount = event.completedTestCases;
@@ -1649,7 +1766,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1649
1766
  runFinished = true;
1650
1767
  clearLine();
1651
1768
  unsubscribe();
1652
- resolve4(event);
1769
+ resolve5(event);
1653
1770
  }
1654
1771
  });
1655
1772
  });
@@ -1727,7 +1844,7 @@ async function runSimpleEvalCommandPlain(runner, datasetName, evaluatorPattern)
1727
1844
  console.log(`- artifact: ${colorize(finalEvent.artifactPath, ansi2.dim)}`);
1728
1845
  }
1729
1846
  async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
1730
- return new Promise((resolve4, reject) => {
1847
+ return new Promise((resolve5, reject) => {
1731
1848
  const app = ink.render(
1732
1849
  React2__default.default.createElement(RunView, {
1733
1850
  runner,
@@ -1738,7 +1855,7 @@ async function runSimpleEvalCommandInk(runner, datasetName, evaluatorPattern) {
1738
1855
  if (err) {
1739
1856
  reject(err);
1740
1857
  } else {
1741
- resolve4();
1858
+ resolve5();
1742
1859
  }
1743
1860
  }
1744
1861
  })