@m4trix/evals 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +706 -231
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +707 -232
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +710 -390
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +702 -382
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +289 -108
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +28 -5
- package/dist/index.js +290 -109
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli.js
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { withFullScreen, useScreenSize } from 'fullscreen-ink';
|
|
3
|
-
import
|
|
3
|
+
import React2, { useState, useRef, useReducer, useEffect, useMemo } from 'react';
|
|
4
4
|
import { useApp, useInput, Box, Text } from 'ink';
|
|
5
5
|
import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
|
|
6
6
|
import { resolve, relative, join, dirname } from 'path';
|
|
7
|
-
import {
|
|
7
|
+
import { LineGraph } from '@pppp606/ink-chart';
|
|
8
8
|
import { randomUUID } from 'crypto';
|
|
9
|
-
import { Effect, PubSub, Queue, Fiber } from 'effect';
|
|
9
|
+
import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
|
|
10
10
|
import { existsSync } from 'fs';
|
|
11
11
|
import * as jitiModule from 'jiti';
|
|
12
12
|
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
13
13
|
import { pathToFileURL } from 'url';
|
|
14
|
+
import { diffLines } from 'diff';
|
|
14
15
|
|
|
15
16
|
var SEP = " ";
|
|
16
17
|
var ARROW = "\u203A";
|
|
@@ -78,7 +79,7 @@ function getBreadcrumbText(state, datasetName, runLabel) {
|
|
|
78
79
|
// src/cli/components/Footer.tsx
|
|
79
80
|
function getFooterText(state) {
|
|
80
81
|
if (state.level === "datasets") {
|
|
81
|
-
return "\u2191\u2193
|
|
82
|
+
return state.focus === "right" ? "\u2191\u2193 scroll Tab focus left / search q quit" : "\u2191\u2193 move Enter open Tab focus right / search q quit";
|
|
82
83
|
}
|
|
83
84
|
if (state.level === "runs") {
|
|
84
85
|
return "\u2191\u2193 move Enter details Backspace datasets Tab focus q quit";
|
|
@@ -610,6 +611,7 @@ function createInitialState(data, args) {
|
|
|
610
611
|
datasetMenuIndex,
|
|
611
612
|
runMenuIndex,
|
|
612
613
|
detailsScrollOffset: 0,
|
|
614
|
+
overviewScrollOffset: 0,
|
|
613
615
|
selectedEvaluatorIds: data.evaluators.slice(0, 2).map((item) => item.id),
|
|
614
616
|
evaluatorMenuIndex: 0,
|
|
615
617
|
searchQuery,
|
|
@@ -625,8 +627,11 @@ function reduceCliState(state, action) {
|
|
|
625
627
|
if (state.level === "details" && state.focus === "right") {
|
|
626
628
|
return { ...state, detailsScrollOffset: Math.max(0, state.detailsScrollOffset - 1) };
|
|
627
629
|
}
|
|
630
|
+
if (state.level === "datasets" && state.focus === "right") {
|
|
631
|
+
return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
|
|
632
|
+
}
|
|
628
633
|
if (state.level === "datasets") {
|
|
629
|
-
return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1) };
|
|
634
|
+
return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1), overviewScrollOffset: 0 };
|
|
630
635
|
}
|
|
631
636
|
if (state.level === "runs") {
|
|
632
637
|
return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
|
|
@@ -643,8 +648,11 @@ function reduceCliState(state, action) {
|
|
|
643
648
|
if (state.level === "details" && state.focus === "right") {
|
|
644
649
|
return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
|
|
645
650
|
}
|
|
651
|
+
if (state.level === "datasets" && state.focus === "right") {
|
|
652
|
+
return { ...state, overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1) };
|
|
653
|
+
}
|
|
646
654
|
if (state.level === "datasets") {
|
|
647
|
-
return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1) };
|
|
655
|
+
return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1), overviewScrollOffset: 0 };
|
|
648
656
|
}
|
|
649
657
|
if (state.level === "runs") {
|
|
650
658
|
return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
|
|
@@ -720,249 +728,6 @@ function reduceCliState(state, action) {
|
|
|
720
728
|
}
|
|
721
729
|
return state;
|
|
722
730
|
}
|
|
723
|
-
var LEFT_PANE_WIDTH2 = 44;
|
|
724
|
-
function DatasetsView({
|
|
725
|
-
state,
|
|
726
|
-
filteredDatasets,
|
|
727
|
-
selectedDataset
|
|
728
|
-
}) {
|
|
729
|
-
const leftFocused = state.focus === "left";
|
|
730
|
-
const rightFocused = state.focus === "right";
|
|
731
|
-
return /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
732
|
-
/* @__PURE__ */ jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
|
|
733
|
-
/* @__PURE__ */ jsx(SectionHeader, { children: "Datasets" }),
|
|
734
|
-
/* @__PURE__ */ jsx(
|
|
735
|
-
ListItem,
|
|
736
|
-
{
|
|
737
|
-
selected: state.datasetMenuIndex === 0,
|
|
738
|
-
label: "New evaluation",
|
|
739
|
-
itemKey: "datasets-new-eval"
|
|
740
|
-
}
|
|
741
|
-
),
|
|
742
|
-
filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsx(
|
|
743
|
-
ListItem,
|
|
744
|
-
{
|
|
745
|
-
selected: state.datasetMenuIndex === index + 1,
|
|
746
|
-
label: dataset.name,
|
|
747
|
-
itemKey: `dataset-${dataset.id}`
|
|
748
|
-
},
|
|
749
|
-
dataset.id
|
|
750
|
-
))
|
|
751
|
-
] }),
|
|
752
|
-
/* @__PURE__ */ jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
|
|
753
|
-
/* @__PURE__ */ jsx(SectionHeader, { children: "Overview" }),
|
|
754
|
-
/* @__PURE__ */ jsx(Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." })
|
|
755
|
-
] })
|
|
756
|
-
] });
|
|
757
|
-
}
|
|
758
|
-
function RunsView({
|
|
759
|
-
state,
|
|
760
|
-
dataset,
|
|
761
|
-
selectedRun
|
|
762
|
-
}) {
|
|
763
|
-
const runs = dataset?.runs ?? [];
|
|
764
|
-
const rightFocused = state.focus === "right";
|
|
765
|
-
return /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
766
|
-
/* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
|
|
767
|
-
/* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: !selectedRun ? /* @__PURE__ */ jsx(Text, { color: "gray", children: "Select a run to see summary metrics." }) : /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
|
|
768
|
-
/* @__PURE__ */ jsxs(Text, { children: [
|
|
769
|
-
/* @__PURE__ */ jsx(Text, { color: "gray", children: "Run:" }),
|
|
770
|
-
" ",
|
|
771
|
-
selectedRun.label,
|
|
772
|
-
" ",
|
|
773
|
-
/* @__PURE__ */ jsx(StatusText, { status: selectedRun.status })
|
|
774
|
-
] }),
|
|
775
|
-
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
776
|
-
"Commit: ",
|
|
777
|
-
selectedRun.meta.commit,
|
|
778
|
-
" Branch: ",
|
|
779
|
-
selectedRun.meta.branch,
|
|
780
|
-
" ",
|
|
781
|
-
"Seed: ",
|
|
782
|
-
selectedRun.meta.seed
|
|
783
|
-
] }),
|
|
784
|
-
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
785
|
-
/* @__PURE__ */ jsx(SectionHeader, { children: "Overall" }),
|
|
786
|
-
/* @__PURE__ */ jsx(
|
|
787
|
-
TextBar,
|
|
788
|
-
{
|
|
789
|
-
label: "pass rate",
|
|
790
|
-
value: selectedRun.performance.passRate,
|
|
791
|
-
format: (v) => `${v}%`
|
|
792
|
-
}
|
|
793
|
-
),
|
|
794
|
-
/* @__PURE__ */ jsx(
|
|
795
|
-
TextBar,
|
|
796
|
-
{
|
|
797
|
-
label: "avg score",
|
|
798
|
-
value: Math.round(selectedRun.performance.avgScore * 100)
|
|
799
|
-
}
|
|
800
|
-
),
|
|
801
|
-
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
802
|
-
/* @__PURE__ */ jsx(SectionHeader, { children: "Dimensions" }),
|
|
803
|
-
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsx(
|
|
804
|
-
TextBar,
|
|
805
|
-
{
|
|
806
|
-
label: dimension.name,
|
|
807
|
-
value: dimension.score
|
|
808
|
-
},
|
|
809
|
-
dimension.name
|
|
810
|
-
)),
|
|
811
|
-
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
812
|
-
/* @__PURE__ */ jsx(SectionHeader, { children: "Latency trend" }),
|
|
813
|
-
/* @__PURE__ */ jsx(
|
|
814
|
-
Sparkline,
|
|
815
|
-
{
|
|
816
|
-
data: selectedRun.performance.latencyHistoryMs ?? [
|
|
817
|
-
selectedRun.performance.latencyAvgMs - 40,
|
|
818
|
-
selectedRun.performance.latencyAvgMs - 10,
|
|
819
|
-
selectedRun.performance.latencyAvgMs + 20,
|
|
820
|
-
selectedRun.performance.latencyP95Ms - 80,
|
|
821
|
-
selectedRun.performance.latencyP95Ms
|
|
822
|
-
],
|
|
823
|
-
width: 24
|
|
824
|
-
}
|
|
825
|
-
)
|
|
826
|
-
] }) })
|
|
827
|
-
] });
|
|
828
|
-
}
|
|
829
|
-
|
|
830
|
-
// src/evals/metric.ts
|
|
831
|
-
var registry = /* @__PURE__ */ new Map();
|
|
832
|
-
var Metric = {
|
|
833
|
-
of(config) {
|
|
834
|
-
const def = {
|
|
835
|
-
id: config.id,
|
|
836
|
-
name: config.name,
|
|
837
|
-
format: config.format,
|
|
838
|
-
make: (data) => ({ id: config.id, data })
|
|
839
|
-
};
|
|
840
|
-
registry.set(config.id, def);
|
|
841
|
-
return def;
|
|
842
|
-
}
|
|
843
|
-
};
|
|
844
|
-
function getMetricById(id) {
|
|
845
|
-
return registry.get(id);
|
|
846
|
-
}
|
|
847
|
-
|
|
848
|
-
// src/evals/score.ts
|
|
849
|
-
var registry2 = /* @__PURE__ */ new Map();
|
|
850
|
-
var Score = {
|
|
851
|
-
of(config) {
|
|
852
|
-
const def = {
|
|
853
|
-
id: config.id,
|
|
854
|
-
name: config.name,
|
|
855
|
-
displayStrategy: config.displayStrategy,
|
|
856
|
-
format: config.format,
|
|
857
|
-
make: (data, options) => {
|
|
858
|
-
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
859
|
-
return {
|
|
860
|
-
id: config.id,
|
|
861
|
-
data,
|
|
862
|
-
...passed !== void 0 && { passed }
|
|
863
|
-
};
|
|
864
|
-
}
|
|
865
|
-
};
|
|
866
|
-
registry2.set(config.id, def);
|
|
867
|
-
return def;
|
|
868
|
-
}
|
|
869
|
-
};
|
|
870
|
-
function getScoreById(id) {
|
|
871
|
-
return registry2.get(id);
|
|
872
|
-
}
|
|
873
|
-
|
|
874
|
-
// src/evals/metrics/standard.ts
|
|
875
|
-
Metric.of({
|
|
876
|
-
id: "token-count",
|
|
877
|
-
name: "Tokens",
|
|
878
|
-
format: (data) => {
|
|
879
|
-
const input = data.input ?? 0;
|
|
880
|
-
const output = data.output ?? 0;
|
|
881
|
-
const inputCached = data.inputCached ?? 0;
|
|
882
|
-
const outputCached = data.outputCached ?? 0;
|
|
883
|
-
const cached = inputCached + outputCached;
|
|
884
|
-
return `in:${input} out:${output} cached:${cached}`;
|
|
885
|
-
}
|
|
886
|
-
});
|
|
887
|
-
Metric.of({
|
|
888
|
-
id: "latency",
|
|
889
|
-
name: "Latency",
|
|
890
|
-
format: (data) => `${data.ms}ms`
|
|
891
|
-
});
|
|
892
|
-
|
|
893
|
-
// src/evals/scores/standard.ts
|
|
894
|
-
Score.of({
|
|
895
|
-
id: "percent",
|
|
896
|
-
name: "Score",
|
|
897
|
-
displayStrategy: "bar",
|
|
898
|
-
format: (data) => data.value.toFixed(2)
|
|
899
|
-
});
|
|
900
|
-
Score.of({
|
|
901
|
-
id: "binary",
|
|
902
|
-
name: "Result",
|
|
903
|
-
displayStrategy: "passFail",
|
|
904
|
-
format: (data) => data.passed ? "PASSED" : "NOT PASSED"
|
|
905
|
-
});
|
|
906
|
-
function createDiffLogEntry(expected, actual, options) {
|
|
907
|
-
const diff = diffString(expected, actual, { color: false });
|
|
908
|
-
return {
|
|
909
|
-
type: "diff",
|
|
910
|
-
label: options?.label,
|
|
911
|
-
expected,
|
|
912
|
-
actual,
|
|
913
|
-
diff: diff || "(no differences)"
|
|
914
|
-
};
|
|
915
|
-
}
|
|
916
|
-
function getDiffLines(entry) {
|
|
917
|
-
const raw = diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
|
|
918
|
-
return raw.split("\n").map((line) => {
|
|
919
|
-
const trimmed = line.trimStart();
|
|
920
|
-
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
921
|
-
return { type: "remove", line };
|
|
922
|
-
}
|
|
923
|
-
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
924
|
-
return { type: "add", line };
|
|
925
|
-
}
|
|
926
|
-
return { type: "context", line };
|
|
927
|
-
});
|
|
928
|
-
}
|
|
929
|
-
|
|
930
|
-
// src/runner/score-utils.ts
|
|
931
|
-
function toNumericScoreFromScores(scores) {
|
|
932
|
-
for (const item of scores) {
|
|
933
|
-
const def = getScoreById(item.id);
|
|
934
|
-
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
935
|
-
const value = item.data.value;
|
|
936
|
-
if (typeof value === "number" && Number.isFinite(value)) {
|
|
937
|
-
return value;
|
|
938
|
-
}
|
|
939
|
-
}
|
|
940
|
-
const numeric = toNumericScore(item.data);
|
|
941
|
-
if (numeric !== void 0) {
|
|
942
|
-
return numeric;
|
|
943
|
-
}
|
|
944
|
-
}
|
|
945
|
-
return void 0;
|
|
946
|
-
}
|
|
947
|
-
function toNumericScore(value) {
|
|
948
|
-
if (typeof value === "number" && Number.isFinite(value)) {
|
|
949
|
-
return value;
|
|
950
|
-
}
|
|
951
|
-
if (typeof value !== "object" || value === null) {
|
|
952
|
-
return void 0;
|
|
953
|
-
}
|
|
954
|
-
const obj = value;
|
|
955
|
-
if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
|
|
956
|
-
return obj.score;
|
|
957
|
-
}
|
|
958
|
-
const numberValues = Object.values(value).filter(
|
|
959
|
-
(entry) => typeof entry === "number" && Number.isFinite(entry)
|
|
960
|
-
);
|
|
961
|
-
if (numberValues.length === 0) {
|
|
962
|
-
return void 0;
|
|
963
|
-
}
|
|
964
|
-
return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
|
|
965
|
-
}
|
|
966
731
|
|
|
967
732
|
// src/runner/config.ts
|
|
968
733
|
var defaultRunnerConfig = {
|
|
@@ -983,7 +748,8 @@ var defaultRunnerConfig = {
|
|
|
983
748
|
],
|
|
984
749
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
985
750
|
},
|
|
986
|
-
artifactDirectory: ".eval-results"
|
|
751
|
+
artifactDirectory: ".eval-results",
|
|
752
|
+
maxConcurrency: 1
|
|
987
753
|
};
|
|
988
754
|
function toRunnerConfigOverrides(config) {
|
|
989
755
|
if (!config) {
|
|
@@ -1016,6 +782,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
1016
782
|
if (config.artifactDirectory !== void 0) {
|
|
1017
783
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
1018
784
|
}
|
|
785
|
+
if (config.maxConcurrency !== void 0) {
|
|
786
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
787
|
+
}
|
|
1019
788
|
if (Object.keys(discovery).length > 0) {
|
|
1020
789
|
overrides.discovery = discovery;
|
|
1021
790
|
}
|
|
@@ -1190,25 +959,261 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
1190
959
|
);
|
|
1191
960
|
return found.flat();
|
|
1192
961
|
}
|
|
1193
|
-
async function collectTestCasesFromFiles(config) {
|
|
1194
|
-
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1195
|
-
const matched = files.filter(
|
|
1196
|
-
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
1197
|
-
);
|
|
1198
|
-
const found = await Promise.all(
|
|
1199
|
-
matched.map(async (absolutePath) => {
|
|
1200
|
-
const exports = await loadModuleExports(absolutePath);
|
|
1201
|
-
const testCases = exports.filter(isTestCaseLike);
|
|
1202
|
-
const relPath = relative(config.rootDir, absolutePath);
|
|
1203
|
-
return testCases.map((testCase) => ({
|
|
1204
|
-
id: toId("test-case", relPath, testCase.getName()),
|
|
1205
|
-
filePath: relPath,
|
|
1206
|
-
testCase
|
|
1207
|
-
}));
|
|
1208
|
-
})
|
|
962
|
+
async function collectTestCasesFromFiles(config) {
|
|
963
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
964
|
+
const matched = files.filter(
|
|
965
|
+
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
966
|
+
);
|
|
967
|
+
const found = await Promise.all(
|
|
968
|
+
matched.map(async (absolutePath) => {
|
|
969
|
+
const exports = await loadModuleExports(absolutePath);
|
|
970
|
+
const testCases = exports.filter(isTestCaseLike);
|
|
971
|
+
const relPath = relative(config.rootDir, absolutePath);
|
|
972
|
+
return testCases.map((testCase) => ({
|
|
973
|
+
id: toId("test-case", relPath, testCase.getName()),
|
|
974
|
+
filePath: relPath,
|
|
975
|
+
testCase
|
|
976
|
+
}));
|
|
977
|
+
})
|
|
978
|
+
);
|
|
979
|
+
return found.flat();
|
|
980
|
+
}
|
|
981
|
+
function toJsonLines(value) {
|
|
982
|
+
try {
|
|
983
|
+
return JSON.stringify(value, null, 2);
|
|
984
|
+
} catch {
|
|
985
|
+
return String(value);
|
|
986
|
+
}
|
|
987
|
+
}
|
|
988
|
+
function formatDiffString(changes) {
|
|
989
|
+
const lines = [];
|
|
990
|
+
for (const part of changes) {
|
|
991
|
+
const prefix = part.added ? "+" : part.removed ? "-" : " ";
|
|
992
|
+
const partLines = part.value.split("\n");
|
|
993
|
+
if (partLines[partLines.length - 1] === "") {
|
|
994
|
+
partLines.pop();
|
|
995
|
+
}
|
|
996
|
+
for (const line of partLines) {
|
|
997
|
+
lines.push(`${prefix} ${line}`);
|
|
998
|
+
}
|
|
999
|
+
}
|
|
1000
|
+
return lines.join("\n");
|
|
1001
|
+
}
|
|
1002
|
+
function createDiffString(expected, actual) {
|
|
1003
|
+
const expectedStr = toJsonLines(expected);
|
|
1004
|
+
const actualStr = toJsonLines(actual);
|
|
1005
|
+
const changes = diffLines(expectedStr, actualStr);
|
|
1006
|
+
return formatDiffString(changes);
|
|
1007
|
+
}
|
|
1008
|
+
function createDiffLogEntry(expected, actual, options) {
|
|
1009
|
+
const diff = createDiffString(expected, actual);
|
|
1010
|
+
return {
|
|
1011
|
+
type: "diff",
|
|
1012
|
+
label: options?.label,
|
|
1013
|
+
expected,
|
|
1014
|
+
actual,
|
|
1015
|
+
diff: diff || "(no differences)"
|
|
1016
|
+
};
|
|
1017
|
+
}
|
|
1018
|
+
function getDiffLines(entry) {
|
|
1019
|
+
const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
|
|
1020
|
+
return raw.split("\n").map((line) => {
|
|
1021
|
+
const trimmed = line.trimStart();
|
|
1022
|
+
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
1023
|
+
return { type: "remove", line };
|
|
1024
|
+
}
|
|
1025
|
+
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
1026
|
+
return { type: "add", line };
|
|
1027
|
+
}
|
|
1028
|
+
return { type: "context", line };
|
|
1029
|
+
});
|
|
1030
|
+
}
|
|
1031
|
+
|
|
1032
|
+
// src/evals/metric.ts
|
|
1033
|
+
var registry = /* @__PURE__ */ new Map();
|
|
1034
|
+
var Metric = {
|
|
1035
|
+
of(config) {
|
|
1036
|
+
const def = {
|
|
1037
|
+
id: config.id,
|
|
1038
|
+
name: config.name,
|
|
1039
|
+
aggregate: config.aggregate,
|
|
1040
|
+
format: config.format,
|
|
1041
|
+
make: (data) => ({ id: config.id, data })
|
|
1042
|
+
};
|
|
1043
|
+
registry.set(config.id, def);
|
|
1044
|
+
return def;
|
|
1045
|
+
}
|
|
1046
|
+
};
|
|
1047
|
+
function getMetricById(id) {
|
|
1048
|
+
return registry.get(id);
|
|
1049
|
+
}
|
|
1050
|
+
|
|
1051
|
+
// src/evals/score.ts
|
|
1052
|
+
var registry2 = /* @__PURE__ */ new Map();
|
|
1053
|
+
var Score = {
|
|
1054
|
+
of(config) {
|
|
1055
|
+
const def = {
|
|
1056
|
+
id: config.id,
|
|
1057
|
+
name: config.name,
|
|
1058
|
+
displayStrategy: config.displayStrategy,
|
|
1059
|
+
aggregate: config.aggregate,
|
|
1060
|
+
format: config.format,
|
|
1061
|
+
make: (data, options) => {
|
|
1062
|
+
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
1063
|
+
return {
|
|
1064
|
+
id: config.id,
|
|
1065
|
+
data,
|
|
1066
|
+
...passed !== void 0 && { passed }
|
|
1067
|
+
};
|
|
1068
|
+
}
|
|
1069
|
+
};
|
|
1070
|
+
registry2.set(config.id, def);
|
|
1071
|
+
return def;
|
|
1072
|
+
}
|
|
1073
|
+
};
|
|
1074
|
+
function getScoreById(id) {
|
|
1075
|
+
return registry2.get(id);
|
|
1076
|
+
}
|
|
1077
|
+
|
|
1078
|
+
// src/evals/aggregators.ts
|
|
1079
|
+
function aggregateAverageWithVariance(values) {
|
|
1080
|
+
if (values.length === 0) {
|
|
1081
|
+
return { value: 0, count: 0 };
|
|
1082
|
+
}
|
|
1083
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
1084
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
1085
|
+
const mean = sum / values.length;
|
|
1086
|
+
let stdDev;
|
|
1087
|
+
if (values.length >= 2) {
|
|
1088
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
1089
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1090
|
+
}
|
|
1091
|
+
return { value: mean, stdDev, count: values.length };
|
|
1092
|
+
}
|
|
1093
|
+
function aggregateAll(values) {
|
|
1094
|
+
const total = values.length;
|
|
1095
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
1096
|
+
return {
|
|
1097
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
1098
|
+
passedCount,
|
|
1099
|
+
totalCount: total
|
|
1100
|
+
};
|
|
1101
|
+
}
|
|
1102
|
+
function aggregateTokenCountSum(values) {
|
|
1103
|
+
const initial = {
|
|
1104
|
+
input: 0,
|
|
1105
|
+
output: 0,
|
|
1106
|
+
inputCached: 0,
|
|
1107
|
+
outputCached: 0
|
|
1108
|
+
};
|
|
1109
|
+
return values.reduce(
|
|
1110
|
+
(acc, v) => ({
|
|
1111
|
+
input: acc.input + (v.input ?? 0),
|
|
1112
|
+
output: acc.output + (v.output ?? 0),
|
|
1113
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1114
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1115
|
+
}),
|
|
1116
|
+
initial
|
|
1117
|
+
);
|
|
1118
|
+
}
|
|
1119
|
+
function aggregateLatencyAverage(values) {
|
|
1120
|
+
if (values.length === 0) {
|
|
1121
|
+
return { ms: 0 };
|
|
1122
|
+
}
|
|
1123
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1124
|
+
return { ms: sum / values.length };
|
|
1125
|
+
}
|
|
1126
|
+
|
|
1127
|
+
// src/evals/metrics/standard.ts
|
|
1128
|
+
Metric.of({
|
|
1129
|
+
id: "token-count",
|
|
1130
|
+
name: "Tokens",
|
|
1131
|
+
aggregate: aggregateTokenCountSum,
|
|
1132
|
+
format: (data, options) => {
|
|
1133
|
+
const input = data.input ?? 0;
|
|
1134
|
+
const output = data.output ?? 0;
|
|
1135
|
+
const inputCached = data.inputCached ?? 0;
|
|
1136
|
+
const outputCached = data.outputCached ?? 0;
|
|
1137
|
+
const cached = inputCached + outputCached;
|
|
1138
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1139
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1140
|
+
}
|
|
1141
|
+
});
|
|
1142
|
+
Metric.of({
|
|
1143
|
+
id: "latency",
|
|
1144
|
+
name: "Latency",
|
|
1145
|
+
aggregate: aggregateLatencyAverage,
|
|
1146
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1147
|
+
});
|
|
1148
|
+
|
|
1149
|
+
// src/evals/scores/standard.ts
|
|
1150
|
+
Score.of({
|
|
1151
|
+
id: "percent",
|
|
1152
|
+
name: "Score",
|
|
1153
|
+
displayStrategy: "bar",
|
|
1154
|
+
format: (data, options) => {
|
|
1155
|
+
if (options?.isAggregated) {
|
|
1156
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
1157
|
+
}
|
|
1158
|
+
return data.value.toFixed(2);
|
|
1159
|
+
},
|
|
1160
|
+
aggregate: aggregateAverageWithVariance
|
|
1161
|
+
});
|
|
1162
|
+
Score.of({
|
|
1163
|
+
id: "binary",
|
|
1164
|
+
name: "Result",
|
|
1165
|
+
displayStrategy: "passFail",
|
|
1166
|
+
format: (data, options) => {
|
|
1167
|
+
if (options?.isAggregated) {
|
|
1168
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
1169
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
1170
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
1171
|
+
}
|
|
1172
|
+
return base;
|
|
1173
|
+
}
|
|
1174
|
+
return data.passed ? "PASSED" : "NOT PASSED";
|
|
1175
|
+
},
|
|
1176
|
+
aggregate: aggregateAll
|
|
1177
|
+
});
|
|
1178
|
+
|
|
1179
|
+
// src/runner/score-utils.ts
|
|
1180
|
+
function toNumericScoreFromScores(scores) {
|
|
1181
|
+
for (const item of scores) {
|
|
1182
|
+
const def = getScoreById(item.id);
|
|
1183
|
+
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
1184
|
+
const value = item.data.value;
|
|
1185
|
+
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1186
|
+
return value;
|
|
1187
|
+
}
|
|
1188
|
+
}
|
|
1189
|
+
const numeric = toNumericScore(item.data);
|
|
1190
|
+
if (numeric !== void 0) {
|
|
1191
|
+
return numeric;
|
|
1192
|
+
}
|
|
1193
|
+
}
|
|
1194
|
+
return void 0;
|
|
1195
|
+
}
|
|
1196
|
+
function toNumericScore(value) {
|
|
1197
|
+
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1198
|
+
return value;
|
|
1199
|
+
}
|
|
1200
|
+
if (typeof value !== "object" || value === null) {
|
|
1201
|
+
return void 0;
|
|
1202
|
+
}
|
|
1203
|
+
const obj = value;
|
|
1204
|
+
if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
|
|
1205
|
+
return obj.score;
|
|
1206
|
+
}
|
|
1207
|
+
const numberValues = Object.values(value).filter(
|
|
1208
|
+
(entry) => typeof entry === "number" && Number.isFinite(entry)
|
|
1209
1209
|
);
|
|
1210
|
-
|
|
1210
|
+
if (numberValues.length === 0) {
|
|
1211
|
+
return void 0;
|
|
1212
|
+
}
|
|
1213
|
+
return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
|
|
1211
1214
|
}
|
|
1215
|
+
|
|
1216
|
+
// src/runner/execution.ts
|
|
1212
1217
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1213
1218
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1214
1219
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1250,6 +1255,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1250
1255
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1251
1256
|
);
|
|
1252
1257
|
}
|
|
1258
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1259
|
+
return Effect.gen(function* () {
|
|
1260
|
+
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1261
|
+
const rerunPassed = [];
|
|
1262
|
+
for (let r = 0; r < reruns; r++) {
|
|
1263
|
+
const started = Date.now();
|
|
1264
|
+
const evaluatorScores = [];
|
|
1265
|
+
let testCaseError;
|
|
1266
|
+
const output = readOutput(testCaseItem.testCase);
|
|
1267
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
1268
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
1269
|
+
if (!evaluateFn) {
|
|
1270
|
+
continue;
|
|
1271
|
+
}
|
|
1272
|
+
try {
|
|
1273
|
+
const logs = [];
|
|
1274
|
+
const logDiff = (expected, actual, options) => {
|
|
1275
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1276
|
+
};
|
|
1277
|
+
const ctx = yield* Effect.promise(
|
|
1278
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
1279
|
+
);
|
|
1280
|
+
const result = yield* Effect.promise(
|
|
1281
|
+
() => Promise.resolve(
|
|
1282
|
+
evaluateFn({
|
|
1283
|
+
input: testCaseItem.testCase.getInput(),
|
|
1284
|
+
ctx,
|
|
1285
|
+
output,
|
|
1286
|
+
logDiff
|
|
1287
|
+
})
|
|
1288
|
+
)
|
|
1289
|
+
);
|
|
1290
|
+
const { scores, metrics } = normalizeResult(result);
|
|
1291
|
+
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1292
|
+
evaluatorScores.push({
|
|
1293
|
+
evaluatorId,
|
|
1294
|
+
scores,
|
|
1295
|
+
passed: passed2,
|
|
1296
|
+
metrics,
|
|
1297
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1298
|
+
});
|
|
1299
|
+
} catch (error) {
|
|
1300
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1301
|
+
evaluatorScores.push({
|
|
1302
|
+
evaluatorId,
|
|
1303
|
+
scores: [],
|
|
1304
|
+
passed: false
|
|
1305
|
+
});
|
|
1306
|
+
}
|
|
1307
|
+
}
|
|
1308
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1309
|
+
rerunPassed.push(rerunPassedThis);
|
|
1310
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
1311
|
+
n + 1,
|
|
1312
|
+
n + 1
|
|
1313
|
+
]);
|
|
1314
|
+
const progressEvent = {
|
|
1315
|
+
type: "TestCaseProgress",
|
|
1316
|
+
runId: task.runId,
|
|
1317
|
+
testCaseId: testCaseItem.id,
|
|
1318
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1319
|
+
completedTestCases: completedEvaluations,
|
|
1320
|
+
totalTestCases: totalEvaluations,
|
|
1321
|
+
rerunIndex: r + 1,
|
|
1322
|
+
rerunTotal: reruns,
|
|
1323
|
+
passed: rerunPassedThis,
|
|
1324
|
+
durationMs: Date.now() - started,
|
|
1325
|
+
evaluatorScores,
|
|
1326
|
+
output,
|
|
1327
|
+
errorMessage: testCaseError
|
|
1328
|
+
};
|
|
1329
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
1330
|
+
...snapshot,
|
|
1331
|
+
completedTestCases: completedEvaluations
|
|
1332
|
+
}));
|
|
1333
|
+
yield* publishEvent(progressEvent);
|
|
1334
|
+
yield* Queue.offer(persistenceQueue, {
|
|
1335
|
+
runId: task.runId,
|
|
1336
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1337
|
+
payload: progressEvent
|
|
1338
|
+
});
|
|
1339
|
+
}
|
|
1340
|
+
const testCasePassed = rerunPassed.every(Boolean);
|
|
1341
|
+
if (testCasePassed) {
|
|
1342
|
+
yield* Ref.update(passedRef, (n) => n + 1);
|
|
1343
|
+
} else {
|
|
1344
|
+
yield* Ref.update(failedRef, (n) => n + 1);
|
|
1345
|
+
}
|
|
1346
|
+
const [passed, failed] = yield* Effect.all([
|
|
1347
|
+
Ref.get(passedRef),
|
|
1348
|
+
Ref.get(failedRef)
|
|
1349
|
+
]);
|
|
1350
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
1351
|
+
...snapshot,
|
|
1352
|
+
passedTestCases: passed,
|
|
1353
|
+
failedTestCases: failed
|
|
1354
|
+
}));
|
|
1355
|
+
});
|
|
1356
|
+
}
|
|
1253
1357
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
|
|
1254
1358
|
const startedAt = Date.now();
|
|
1255
1359
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -1262,104 +1366,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => E
|
|
|
1262
1366
|
runId: task.runId,
|
|
1263
1367
|
startedAt
|
|
1264
1368
|
});
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
)
|
|
1295
|
-
);
|
|
1296
|
-
const { scores, metrics } = normalizeResult(result);
|
|
1297
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
1298
|
-
evaluatorScores.push({
|
|
1299
|
-
evaluatorId,
|
|
1300
|
-
scores,
|
|
1301
|
-
passed,
|
|
1302
|
-
metrics,
|
|
1303
|
-
logs: logs.length > 0 ? logs : void 0
|
|
1304
|
-
});
|
|
1305
|
-
} catch (error) {
|
|
1306
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1307
|
-
evaluatorScores.push({
|
|
1308
|
-
evaluatorId,
|
|
1309
|
-
scores: [],
|
|
1310
|
-
passed: false
|
|
1311
|
-
});
|
|
1312
|
-
}
|
|
1313
|
-
}
|
|
1314
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
1315
|
-
completedTestCases += 1;
|
|
1316
|
-
if (testCasePassed) {
|
|
1317
|
-
passedTestCases += 1;
|
|
1318
|
-
} else {
|
|
1319
|
-
failedTestCases += 1;
|
|
1320
|
-
}
|
|
1321
|
-
const progressEvent = {
|
|
1322
|
-
type: "TestCaseProgress",
|
|
1323
|
-
runId: task.runId,
|
|
1324
|
-
testCaseId: testCaseItem.id,
|
|
1325
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
1326
|
-
completedTestCases,
|
|
1327
|
-
totalTestCases: task.testCases.length,
|
|
1328
|
-
passed: testCasePassed,
|
|
1329
|
-
durationMs: Date.now() - started,
|
|
1330
|
-
evaluatorScores,
|
|
1331
|
-
output,
|
|
1332
|
-
errorMessage: testCaseError
|
|
1333
|
-
};
|
|
1334
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1335
|
-
...snapshot,
|
|
1336
|
-
completedTestCases,
|
|
1337
|
-
passedTestCases,
|
|
1338
|
-
failedTestCases
|
|
1339
|
-
}));
|
|
1340
|
-
yield* publishEvent(progressEvent);
|
|
1341
|
-
yield* Queue.offer(persistenceQueue, {
|
|
1342
|
-
runId: task.runId,
|
|
1343
|
-
artifactPath: task.snapshot.artifactPath,
|
|
1344
|
-
payload: progressEvent
|
|
1345
|
-
});
|
|
1346
|
-
}
|
|
1369
|
+
const totalEvaluations = task.testCases.reduce(
|
|
1370
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1371
|
+
0
|
|
1372
|
+
);
|
|
1373
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1374
|
+
const completedRef = yield* Ref.make(0);
|
|
1375
|
+
const passedRef = yield* Ref.make(0);
|
|
1376
|
+
const failedRef = yield* Ref.make(0);
|
|
1377
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
1378
|
+
task,
|
|
1379
|
+
testCaseItem,
|
|
1380
|
+
totalEvaluations,
|
|
1381
|
+
publishEvent,
|
|
1382
|
+
persistenceQueue,
|
|
1383
|
+
updateSnapshot,
|
|
1384
|
+
completedRef,
|
|
1385
|
+
passedRef,
|
|
1386
|
+
failedRef
|
|
1387
|
+
);
|
|
1388
|
+
yield* Effect.forEach(
|
|
1389
|
+
task.testCases,
|
|
1390
|
+
processTestCase,
|
|
1391
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1392
|
+
);
|
|
1393
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* Effect.all([
|
|
1394
|
+
Ref.get(completedRef),
|
|
1395
|
+
Ref.get(passedRef),
|
|
1396
|
+
Ref.get(failedRef)
|
|
1397
|
+
]);
|
|
1347
1398
|
const finishedAt = Date.now();
|
|
1348
1399
|
const completedEvent = {
|
|
1349
1400
|
type: "RunCompleted",
|
|
1350
1401
|
runId: task.runId,
|
|
1351
1402
|
finishedAt,
|
|
1352
|
-
passedTestCases,
|
|
1353
|
-
failedTestCases,
|
|
1403
|
+
passedTestCases: passedUniqueTestCases,
|
|
1404
|
+
failedTestCases: failedUniqueTestCases,
|
|
1354
1405
|
totalTestCases: task.testCases.length,
|
|
1355
1406
|
artifactPath: task.snapshot.artifactPath
|
|
1356
1407
|
};
|
|
1357
1408
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
1358
1409
|
...snapshot,
|
|
1359
1410
|
status: "completed",
|
|
1360
|
-
completedTestCases,
|
|
1361
|
-
passedTestCases,
|
|
1362
|
-
failedTestCases,
|
|
1411
|
+
completedTestCases: completedEvaluations,
|
|
1412
|
+
passedTestCases: passedUniqueTestCases,
|
|
1413
|
+
failedTestCases: failedUniqueTestCases,
|
|
1363
1414
|
finishedAt
|
|
1364
1415
|
}));
|
|
1365
1416
|
yield* publishEvent(completedEvent);
|
|
@@ -1447,7 +1498,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1447
1498
|
const artifactPath = filePath;
|
|
1448
1499
|
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1449
1500
|
const progress = aggregateTestCaseProgress(lines);
|
|
1450
|
-
const completedTestCases = runCompleted
|
|
1501
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1451
1502
|
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1452
1503
|
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1453
1504
|
return {
|
|
@@ -1469,23 +1520,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1469
1520
|
}
|
|
1470
1521
|
function aggregateTestCaseProgress(lines) {
|
|
1471
1522
|
let completedTestCases = 0;
|
|
1472
|
-
|
|
1473
|
-
let failedTestCases = 0;
|
|
1523
|
+
const testCasePassedBy = /* @__PURE__ */ new Map();
|
|
1474
1524
|
for (const line of lines) {
|
|
1475
1525
|
try {
|
|
1476
1526
|
const event = JSON.parse(line);
|
|
1477
1527
|
if (event.type === "TestCaseProgress") {
|
|
1478
1528
|
const ev = event;
|
|
1479
1529
|
completedTestCases = ev.completedTestCases ?? completedTestCases;
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
failedTestCases += 1;
|
|
1484
|
-
}
|
|
1530
|
+
const id = ev.testCaseId;
|
|
1531
|
+
const current = testCasePassedBy.get(id);
|
|
1532
|
+
testCasePassedBy.set(id, current === void 0 ? ev.passed : current && ev.passed);
|
|
1485
1533
|
}
|
|
1486
1534
|
} catch {
|
|
1487
1535
|
}
|
|
1488
1536
|
}
|
|
1537
|
+
let passedTestCases = 0;
|
|
1538
|
+
let failedTestCases = 0;
|
|
1539
|
+
for (const passed of testCasePassedBy.values()) {
|
|
1540
|
+
if (passed) {
|
|
1541
|
+
passedTestCases += 1;
|
|
1542
|
+
} else {
|
|
1543
|
+
failedTestCases += 1;
|
|
1544
|
+
}
|
|
1545
|
+
}
|
|
1489
1546
|
return { completedTestCases, passedTestCases, failedTestCases };
|
|
1490
1547
|
}
|
|
1491
1548
|
async function parseArtifactFile(artifactPath) {
|
|
@@ -1503,6 +1560,8 @@ async function parseArtifactFile(artifactPath) {
|
|
|
1503
1560
|
testCaseName: ev.testCaseName,
|
|
1504
1561
|
completedTestCases: ev.completedTestCases,
|
|
1505
1562
|
totalTestCases: ev.totalTestCases,
|
|
1563
|
+
rerunIndex: ev.rerunIndex,
|
|
1564
|
+
rerunTotal: ev.rerunTotal,
|
|
1506
1565
|
passed: ev.passed,
|
|
1507
1566
|
durationMs: ev.durationMs,
|
|
1508
1567
|
evaluatorScores: ev.evaluatorScores ?? []
|
|
@@ -1708,6 +1767,10 @@ var EffectRunner = class {
|
|
|
1708
1767
|
throw new Error("No evaluators selected for run");
|
|
1709
1768
|
}
|
|
1710
1769
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1770
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1771
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1772
|
+
0
|
|
1773
|
+
);
|
|
1711
1774
|
const runId = `run-${randomUUID()}`;
|
|
1712
1775
|
const artifactPath = createArtifactPath(
|
|
1713
1776
|
this.config.artifactDirectory,
|
|
@@ -1720,7 +1783,7 @@ var EffectRunner = class {
|
|
|
1720
1783
|
datasetName: dataset.dataset.getName(),
|
|
1721
1784
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1722
1785
|
queuedAt: Date.now(),
|
|
1723
|
-
totalTestCases:
|
|
1786
|
+
totalTestCases: totalEvaluations,
|
|
1724
1787
|
completedTestCases: 0,
|
|
1725
1788
|
passedTestCases: 0,
|
|
1726
1789
|
failedTestCases: 0,
|
|
@@ -1734,7 +1797,7 @@ var EffectRunner = class {
|
|
|
1734
1797
|
datasetId: request.datasetId,
|
|
1735
1798
|
datasetName: dataset.dataset.getName(),
|
|
1736
1799
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1737
|
-
totalTestCases:
|
|
1800
|
+
totalTestCases: totalEvaluations,
|
|
1738
1801
|
artifactPath
|
|
1739
1802
|
};
|
|
1740
1803
|
await Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -1745,6 +1808,7 @@ var EffectRunner = class {
|
|
|
1745
1808
|
payload: queuedEvent
|
|
1746
1809
|
})
|
|
1747
1810
|
);
|
|
1811
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1748
1812
|
await Effect.runPromise(
|
|
1749
1813
|
Queue.offer(this.runQueue, {
|
|
1750
1814
|
runId,
|
|
@@ -1752,7 +1816,8 @@ var EffectRunner = class {
|
|
|
1752
1816
|
dataset: dataset.dataset,
|
|
1753
1817
|
evaluators: selectedEvaluators,
|
|
1754
1818
|
testCases: selectedTestCases,
|
|
1755
|
-
snapshot
|
|
1819
|
+
snapshot,
|
|
1820
|
+
maxConcurrency
|
|
1756
1821
|
})
|
|
1757
1822
|
);
|
|
1758
1823
|
return snapshot;
|
|
@@ -1819,6 +1884,240 @@ var EffectRunner = class {
|
|
|
1819
1884
|
);
|
|
1820
1885
|
}
|
|
1821
1886
|
};
|
|
1887
|
+
var LEFT_PANE_WIDTH2 = 44;
|
|
1888
|
+
var MAX_RUNS_FOR_CHART = 12;
|
|
1889
|
+
var MAX_RUNS_FOR_TREND = 20;
|
|
1890
|
+
var TREND_BATCH_SIZE = 4;
|
|
1891
|
+
function extractRunAverageScore(testCases) {
|
|
1892
|
+
const scores = [];
|
|
1893
|
+
for (const tc of testCases) {
|
|
1894
|
+
for (const es of tc.evaluatorScores) {
|
|
1895
|
+
const n = toNumericScoreFromScores(es.scores);
|
|
1896
|
+
if (n !== void 0) {
|
|
1897
|
+
scores.push(n);
|
|
1898
|
+
}
|
|
1899
|
+
}
|
|
1900
|
+
}
|
|
1901
|
+
if (scores.length === 0)
|
|
1902
|
+
return void 0;
|
|
1903
|
+
return scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
1904
|
+
}
|
|
1905
|
+
async function loadRunScores(runs) {
|
|
1906
|
+
const results = [];
|
|
1907
|
+
for (const run of runs) {
|
|
1908
|
+
const artifact = run.meta?.artifact;
|
|
1909
|
+
if (!artifact)
|
|
1910
|
+
continue;
|
|
1911
|
+
try {
|
|
1912
|
+
const path = resolve(artifact);
|
|
1913
|
+
const testCases = await parseArtifactFile(path);
|
|
1914
|
+
const avg = extractRunAverageScore(testCases);
|
|
1915
|
+
if (avg !== void 0) {
|
|
1916
|
+
results.push({
|
|
1917
|
+
runId: run.id,
|
|
1918
|
+
label: run.label,
|
|
1919
|
+
value: avg
|
|
1920
|
+
});
|
|
1921
|
+
}
|
|
1922
|
+
} catch {
|
|
1923
|
+
}
|
|
1924
|
+
}
|
|
1925
|
+
return results;
|
|
1926
|
+
}
|
|
1927
|
+
function batchAverage(values, batchSize) {
|
|
1928
|
+
const batches = [];
|
|
1929
|
+
for (let i = 0; i < values.length; i += batchSize) {
|
|
1930
|
+
const slice = values.slice(i, i + batchSize);
|
|
1931
|
+
if (slice.length > 0) {
|
|
1932
|
+
batches.push(slice.reduce((a, b) => a + b, 0) / slice.length);
|
|
1933
|
+
}
|
|
1934
|
+
}
|
|
1935
|
+
return batches;
|
|
1936
|
+
}
|
|
1937
|
+
var OVERVIEW_PAGE_SIZE = 15;
|
|
1938
|
+
function DatasetsView({
|
|
1939
|
+
state,
|
|
1940
|
+
filteredDatasets,
|
|
1941
|
+
selectedDataset,
|
|
1942
|
+
overviewRowCountRef
|
|
1943
|
+
}) {
|
|
1944
|
+
const leftFocused = state.focus === "left";
|
|
1945
|
+
const rightFocused = state.focus === "right";
|
|
1946
|
+
const [runScores, setRunScores] = useState([]);
|
|
1947
|
+
const [loading, setLoading] = useState(false);
|
|
1948
|
+
useEffect(() => {
|
|
1949
|
+
if (!selectedDataset?.runs?.length) {
|
|
1950
|
+
setRunScores([]);
|
|
1951
|
+
return;
|
|
1952
|
+
}
|
|
1953
|
+
setLoading(true);
|
|
1954
|
+
const runs = selectedDataset.runs.slice(0, MAX_RUNS_FOR_TREND);
|
|
1955
|
+
loadRunScores(runs).then(setRunScores).finally(() => setLoading(false));
|
|
1956
|
+
}, [selectedDataset?.id, selectedDataset?.runs?.length]);
|
|
1957
|
+
const barData = runScores.slice(0, MAX_RUNS_FOR_CHART).reverse();
|
|
1958
|
+
const trendValues = runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse();
|
|
1959
|
+
const trendBatched = batchAverage(trendValues, TREND_BATCH_SIZE);
|
|
1960
|
+
const overviewRows = useMemo(() => {
|
|
1961
|
+
const rows = [];
|
|
1962
|
+
rows.push(
|
|
1963
|
+
/* @__PURE__ */ jsx(Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." }, "overview")
|
|
1964
|
+
);
|
|
1965
|
+
if (selectedDataset && selectedDataset.runs.length > 0) {
|
|
1966
|
+
if (loading) {
|
|
1967
|
+
rows.push(
|
|
1968
|
+
/* @__PURE__ */ jsx(Text, { color: "gray", children: "Loading run scores\u2026" }, "loading")
|
|
1969
|
+
);
|
|
1970
|
+
} else if (runScores.length > 0) {
|
|
1971
|
+
rows.push(
|
|
1972
|
+
/* @__PURE__ */ jsx(Text, { color: "gray", children: "Scores (last runs)" }, "scores-header")
|
|
1973
|
+
);
|
|
1974
|
+
for (const d of barData) {
|
|
1975
|
+
rows.push(
|
|
1976
|
+
/* @__PURE__ */ jsx(
|
|
1977
|
+
TextBar,
|
|
1978
|
+
{
|
|
1979
|
+
label: d.label,
|
|
1980
|
+
value: d.value,
|
|
1981
|
+
labelWidth: 14,
|
|
1982
|
+
barWidth: 24,
|
|
1983
|
+
max: 100,
|
|
1984
|
+
format: (v) => v.toFixed(1)
|
|
1985
|
+
},
|
|
1986
|
+
d.runId
|
|
1987
|
+
)
|
|
1988
|
+
);
|
|
1989
|
+
}
|
|
1990
|
+
if (trendBatched.length > 0) {
|
|
1991
|
+
rows.push(
|
|
1992
|
+
/* @__PURE__ */ jsx(Text, { color: "gray", children: "Avg trend (last 20, batched by 4)" }, "trend-header")
|
|
1993
|
+
);
|
|
1994
|
+
rows.push(
|
|
1995
|
+
/* @__PURE__ */ jsx(Box, { children: /* @__PURE__ */ jsx(
|
|
1996
|
+
LineGraph,
|
|
1997
|
+
{
|
|
1998
|
+
data: [{ values: trendBatched, color: "cyan" }],
|
|
1999
|
+
height: 5,
|
|
2000
|
+
width: 45,
|
|
2001
|
+
showYAxis: true,
|
|
2002
|
+
xLabels: ["older", "newer"]
|
|
2003
|
+
}
|
|
2004
|
+
) }, "trend-graph")
|
|
2005
|
+
);
|
|
2006
|
+
}
|
|
2007
|
+
}
|
|
2008
|
+
}
|
|
2009
|
+
return rows;
|
|
2010
|
+
}, [
|
|
2011
|
+
selectedDataset?.overview,
|
|
2012
|
+
selectedDataset?.runs?.length,
|
|
2013
|
+
loading,
|
|
2014
|
+
runScores,
|
|
2015
|
+
barData,
|
|
2016
|
+
trendBatched
|
|
2017
|
+
]);
|
|
2018
|
+
if (overviewRowCountRef) {
|
|
2019
|
+
overviewRowCountRef.current = overviewRows.length;
|
|
2020
|
+
}
|
|
2021
|
+
const offset = Math.max(0, state.overviewScrollOffset);
|
|
2022
|
+
const visibleRows = overviewRows.slice(offset, offset + OVERVIEW_PAGE_SIZE);
|
|
2023
|
+
return /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
2024
|
+
/* @__PURE__ */ jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
|
|
2025
|
+
/* @__PURE__ */ jsx(SectionHeader, { children: "Datasets" }),
|
|
2026
|
+
/* @__PURE__ */ jsx(
|
|
2027
|
+
ListItem,
|
|
2028
|
+
{
|
|
2029
|
+
selected: state.datasetMenuIndex === 0,
|
|
2030
|
+
label: "New evaluation",
|
|
2031
|
+
itemKey: "datasets-new-eval"
|
|
2032
|
+
}
|
|
2033
|
+
),
|
|
2034
|
+
filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsx(
|
|
2035
|
+
ListItem,
|
|
2036
|
+
{
|
|
2037
|
+
selected: state.datasetMenuIndex === index + 1,
|
|
2038
|
+
label: dataset.name,
|
|
2039
|
+
itemKey: `dataset-${dataset.id}`
|
|
2040
|
+
},
|
|
2041
|
+
dataset.id
|
|
2042
|
+
))
|
|
2043
|
+
] }),
|
|
2044
|
+
/* @__PURE__ */ jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
|
|
2045
|
+
/* @__PURE__ */ jsx(SectionHeader, { children: "Overview" }),
|
|
2046
|
+
/* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visibleRows.map((row, i) => /* @__PURE__ */ jsx(Box, { children: row }, offset + i)) })
|
|
2047
|
+
] })
|
|
2048
|
+
] });
|
|
2049
|
+
}
|
|
2050
|
+
function RunsView({
|
|
2051
|
+
state,
|
|
2052
|
+
dataset,
|
|
2053
|
+
selectedRun
|
|
2054
|
+
}) {
|
|
2055
|
+
const runs = dataset?.runs ?? [];
|
|
2056
|
+
const rightFocused = state.focus === "right";
|
|
2057
|
+
return /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
2058
|
+
/* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
|
|
2059
|
+
/* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: !selectedRun ? /* @__PURE__ */ jsx(Text, { color: "gray", children: "Select a run to see summary metrics." }) : /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
|
|
2060
|
+
/* @__PURE__ */ jsxs(Text, { children: [
|
|
2061
|
+
/* @__PURE__ */ jsx(Text, { color: "gray", children: "Run:" }),
|
|
2062
|
+
" ",
|
|
2063
|
+
selectedRun.label,
|
|
2064
|
+
" ",
|
|
2065
|
+
/* @__PURE__ */ jsx(StatusText, { status: selectedRun.status })
|
|
2066
|
+
] }),
|
|
2067
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2068
|
+
"Commit: ",
|
|
2069
|
+
selectedRun.meta.commit,
|
|
2070
|
+
" Branch: ",
|
|
2071
|
+
selectedRun.meta.branch,
|
|
2072
|
+
" ",
|
|
2073
|
+
"Seed: ",
|
|
2074
|
+
selectedRun.meta.seed
|
|
2075
|
+
] }),
|
|
2076
|
+
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
2077
|
+
/* @__PURE__ */ jsx(SectionHeader, { children: "Overall" }),
|
|
2078
|
+
/* @__PURE__ */ jsx(
|
|
2079
|
+
TextBar,
|
|
2080
|
+
{
|
|
2081
|
+
label: "pass rate",
|
|
2082
|
+
value: selectedRun.performance.passRate,
|
|
2083
|
+
format: (v) => `${v}%`
|
|
2084
|
+
}
|
|
2085
|
+
),
|
|
2086
|
+
/* @__PURE__ */ jsx(
|
|
2087
|
+
TextBar,
|
|
2088
|
+
{
|
|
2089
|
+
label: "avg score",
|
|
2090
|
+
value: Math.round(selectedRun.performance.avgScore * 100)
|
|
2091
|
+
}
|
|
2092
|
+
),
|
|
2093
|
+
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
2094
|
+
/* @__PURE__ */ jsx(SectionHeader, { children: "Dimensions" }),
|
|
2095
|
+
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsx(
|
|
2096
|
+
TextBar,
|
|
2097
|
+
{
|
|
2098
|
+
label: dimension.name,
|
|
2099
|
+
value: dimension.score
|
|
2100
|
+
},
|
|
2101
|
+
dimension.name
|
|
2102
|
+
)),
|
|
2103
|
+
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
2104
|
+
/* @__PURE__ */ jsx(SectionHeader, { children: "Latency trend" }),
|
|
2105
|
+
/* @__PURE__ */ jsx(
|
|
2106
|
+
Sparkline,
|
|
2107
|
+
{
|
|
2108
|
+
data: selectedRun.performance.latencyHistoryMs ?? [
|
|
2109
|
+
selectedRun.performance.latencyAvgMs - 40,
|
|
2110
|
+
selectedRun.performance.latencyAvgMs - 10,
|
|
2111
|
+
selectedRun.performance.latencyAvgMs + 20,
|
|
2112
|
+
selectedRun.performance.latencyP95Ms - 80,
|
|
2113
|
+
selectedRun.performance.latencyP95Ms
|
|
2114
|
+
],
|
|
2115
|
+
width: 24
|
|
2116
|
+
}
|
|
2117
|
+
)
|
|
2118
|
+
] }) })
|
|
2119
|
+
] });
|
|
2120
|
+
}
|
|
1822
2121
|
var DETAILS_PAGE_SIZE = 20;
|
|
1823
2122
|
function scoreColor(score) {
|
|
1824
2123
|
if (score >= 80)
|
|
@@ -1827,7 +2126,7 @@ function scoreColor(score) {
|
|
|
1827
2126
|
return "yellow";
|
|
1828
2127
|
return "red";
|
|
1829
2128
|
}
|
|
1830
|
-
function formatScorePart(item
|
|
2129
|
+
function formatScorePart(item) {
|
|
1831
2130
|
const def = getScoreById(item.id);
|
|
1832
2131
|
if (!def) {
|
|
1833
2132
|
const numeric = toNumericScore(item.data);
|
|
@@ -1857,7 +2156,7 @@ function CheckRow({
|
|
|
1857
2156
|
" ",
|
|
1858
2157
|
/* @__PURE__ */ jsx(Text, { color, bold: true, children: status }),
|
|
1859
2158
|
detail ? /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1860
|
-
"
|
|
2159
|
+
" (",
|
|
1861
2160
|
detail,
|
|
1862
2161
|
")"
|
|
1863
2162
|
] }) : null
|
|
@@ -1877,21 +2176,21 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1877
2176
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1878
2177
|
"Model: ",
|
|
1879
2178
|
meta.model,
|
|
1880
|
-
"
|
|
2179
|
+
" Provider: ",
|
|
1881
2180
|
meta.provider
|
|
1882
2181
|
] }, "meta-1"),
|
|
1883
2182
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1884
2183
|
"Commit: ",
|
|
1885
2184
|
meta.commit,
|
|
1886
|
-
"
|
|
2185
|
+
" Branch: ",
|
|
1887
2186
|
meta.branch,
|
|
1888
|
-
"
|
|
2187
|
+
" Seed: ",
|
|
1889
2188
|
meta.seed
|
|
1890
2189
|
] }, "meta-2"),
|
|
1891
2190
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1892
2191
|
"Duration: ",
|
|
1893
2192
|
meta.duration,
|
|
1894
|
-
"
|
|
2193
|
+
" Concurrency: ",
|
|
1895
2194
|
meta.concurrency
|
|
1896
2195
|
] }, "meta-3"),
|
|
1897
2196
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
@@ -1903,7 +2202,15 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1903
2202
|
...dimensions.map((d) => /* @__PURE__ */ jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
|
|
1904
2203
|
/* @__PURE__ */ jsx(Text, { children: " " }, "sp2"),
|
|
1905
2204
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
|
|
1906
|
-
...checks.map((c) => /* @__PURE__ */ jsx(
|
|
2205
|
+
...checks.map((c) => /* @__PURE__ */ jsx(
|
|
2206
|
+
CheckRow,
|
|
2207
|
+
{
|
|
2208
|
+
name: c.name,
|
|
2209
|
+
passed: c.passed,
|
|
2210
|
+
detail: c.detail
|
|
2211
|
+
},
|
|
2212
|
+
`chk-${c.name}`
|
|
2213
|
+
)),
|
|
1907
2214
|
/* @__PURE__ */ jsx(Text, { children: " " }, "sp3"),
|
|
1908
2215
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Performance" }, "perf-h"),
|
|
1909
2216
|
/* @__PURE__ */ jsx(
|
|
@@ -1916,16 +2223,16 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1916
2223
|
"perf-rate"
|
|
1917
2224
|
),
|
|
1918
2225
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1919
|
-
"latency avg
|
|
2226
|
+
"latency avg ",
|
|
1920
2227
|
performance.latencyAvgMs,
|
|
1921
|
-
"ms
|
|
2228
|
+
"ms p95 ",
|
|
1922
2229
|
performance.latencyP95Ms,
|
|
1923
2230
|
"ms"
|
|
1924
2231
|
] }, "perf-lat"),
|
|
1925
2232
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1926
|
-
"tokens avg
|
|
2233
|
+
"tokens avg ",
|
|
1927
2234
|
performance.tokensAvg,
|
|
1928
|
-
"
|
|
2235
|
+
" p95 ",
|
|
1929
2236
|
performance.tokensP95
|
|
1930
2237
|
] }, "perf-tok"),
|
|
1931
2238
|
/* @__PURE__ */ jsx(Text, { children: " " }, "sp4"),
|
|
@@ -1949,6 +2256,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1949
2256
|
rows.push(/* @__PURE__ */ jsx(Text, { children: " " }, "sp6"));
|
|
1950
2257
|
rows.push(/* @__PURE__ */ jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
|
|
1951
2258
|
for (const tc of testCases) {
|
|
2259
|
+
const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
|
|
1952
2260
|
rows.push(
|
|
1953
2261
|
/* @__PURE__ */ jsxs(Text, { children: [
|
|
1954
2262
|
/* @__PURE__ */ jsxs(Text, { color: "cyan", children: [
|
|
@@ -1960,12 +2268,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1960
2268
|
] }),
|
|
1961
2269
|
" ",
|
|
1962
2270
|
tc.testCaseName,
|
|
2271
|
+
rerunPart ? /* @__PURE__ */ jsx(Text, { color: "cyan", children: rerunPart }) : null,
|
|
1963
2272
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1964
2273
|
" (",
|
|
1965
2274
|
tc.durationMs,
|
|
1966
2275
|
"ms)"
|
|
1967
2276
|
] })
|
|
1968
|
-
] }, `tc-${tc.testCaseId}`)
|
|
2277
|
+
] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
|
|
1969
2278
|
);
|
|
1970
2279
|
for (const item of tc.evaluatorScores) {
|
|
1971
2280
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
@@ -2034,7 +2343,7 @@ function RunDetailsView({
|
|
|
2034
2343
|
const runs = dataset?.runs ?? [];
|
|
2035
2344
|
const rightFocused = state.focus === "right";
|
|
2036
2345
|
const [testCases, setTestCases] = useState([]);
|
|
2037
|
-
const evaluatorNameById =
|
|
2346
|
+
const evaluatorNameById = React2.useMemo(
|
|
2038
2347
|
() => new Map(evaluators.map((e) => [e.id, e.name])),
|
|
2039
2348
|
[evaluators]
|
|
2040
2349
|
);
|
|
@@ -2057,7 +2366,7 @@ function RunDetailsView({
|
|
|
2057
2366
|
const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
|
|
2058
2367
|
return /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
2059
2368
|
/* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
|
|
2060
|
-
/* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(
|
|
2369
|
+
/* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(React2.Fragment, { children: row }, i)) }) })
|
|
2061
2370
|
] });
|
|
2062
2371
|
}
|
|
2063
2372
|
var LEFT_PANE_WIDTH3 = 44;
|
|
@@ -2139,6 +2448,7 @@ function EvalsCliApp({
|
|
|
2139
2448
|
const { width: stdoutWidth, height: stdoutHeight } = useScreenSize();
|
|
2140
2449
|
const [liveData, setLiveData] = useState(data);
|
|
2141
2450
|
const [runtimeMessage, setRuntimeMessage] = useState();
|
|
2451
|
+
const overviewRowCountRef = useRef(0);
|
|
2142
2452
|
const [state, dispatch] = useReducer(
|
|
2143
2453
|
reduceCliState,
|
|
2144
2454
|
createInitialState(data, args)
|
|
@@ -2218,7 +2528,16 @@ function EvalsCliApp({
|
|
|
2218
2528
|
return;
|
|
2219
2529
|
}
|
|
2220
2530
|
if (key.downArrow) {
|
|
2221
|
-
|
|
2531
|
+
let max;
|
|
2532
|
+
if (clampedState.level === "datasets") {
|
|
2533
|
+
max = clampedState.focus === "right" ? Math.max(0, overviewRowCountRef.current - OVERVIEW_PAGE_SIZE) : filteredDatasets.length;
|
|
2534
|
+
} else if (clampedState.level === "runs") {
|
|
2535
|
+
max = selectedDataset?.runs.length ?? 0;
|
|
2536
|
+
} else if (clampedState.level === "new-evaluation") {
|
|
2537
|
+
max = Math.max(0, visibleEvaluators.length - 1);
|
|
2538
|
+
} else {
|
|
2539
|
+
max = 100;
|
|
2540
|
+
}
|
|
2222
2541
|
dispatch({ type: "MOVE_DOWN", max });
|
|
2223
2542
|
return;
|
|
2224
2543
|
}
|
|
@@ -2236,7 +2555,7 @@ function EvalsCliApp({
|
|
|
2236
2555
|
}
|
|
2237
2556
|
return;
|
|
2238
2557
|
}
|
|
2239
|
-
if (isBackKey(key)) {
|
|
2558
|
+
if (isBackKey(key) || input === "\x7F" || input === "\b") {
|
|
2240
2559
|
dispatch({ type: "BACK" });
|
|
2241
2560
|
return;
|
|
2242
2561
|
}
|
|
@@ -2289,7 +2608,8 @@ function EvalsCliApp({
|
|
|
2289
2608
|
{
|
|
2290
2609
|
state: clampedState,
|
|
2291
2610
|
filteredDatasets,
|
|
2292
|
-
selectedDataset
|
|
2611
|
+
selectedDataset,
|
|
2612
|
+
overviewRowCountRef
|
|
2293
2613
|
}
|
|
2294
2614
|
);
|
|
2295
2615
|
}
|