@m4trix/evals 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +706 -231
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +707 -232
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +710 -390
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +702 -382
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +289 -108
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +28 -5
- package/dist/index.js +290 -109
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli.cjs
CHANGED
|
@@ -2,17 +2,18 @@
|
|
|
2
2
|
'use strict';
|
|
3
3
|
|
|
4
4
|
var fullscreenInk = require('fullscreen-ink');
|
|
5
|
-
var
|
|
5
|
+
var React2 = require('react');
|
|
6
6
|
var ink = require('ink');
|
|
7
7
|
var jsxRuntime = require('react/jsx-runtime');
|
|
8
8
|
var path = require('path');
|
|
9
|
-
var
|
|
9
|
+
var inkChart = require('@pppp606/ink-chart');
|
|
10
10
|
var crypto = require('crypto');
|
|
11
11
|
var effect = require('effect');
|
|
12
12
|
var fs = require('fs');
|
|
13
13
|
var jitiModule = require('jiti');
|
|
14
14
|
var promises = require('fs/promises');
|
|
15
15
|
var url = require('url');
|
|
16
|
+
var diff = require('diff');
|
|
16
17
|
|
|
17
18
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
18
19
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
@@ -35,7 +36,7 @@ function _interopNamespace(e) {
|
|
|
35
36
|
return Object.freeze(n);
|
|
36
37
|
}
|
|
37
38
|
|
|
38
|
-
var
|
|
39
|
+
var React2__default = /*#__PURE__*/_interopDefault(React2);
|
|
39
40
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
40
41
|
|
|
41
42
|
var SEP = " ";
|
|
@@ -104,7 +105,7 @@ function getBreadcrumbText(state, datasetName, runLabel) {
|
|
|
104
105
|
// src/cli/components/Footer.tsx
|
|
105
106
|
function getFooterText(state) {
|
|
106
107
|
if (state.level === "datasets") {
|
|
107
|
-
return "\u2191\u2193
|
|
108
|
+
return state.focus === "right" ? "\u2191\u2193 scroll Tab focus left / search q quit" : "\u2191\u2193 move Enter open Tab focus right / search q quit";
|
|
108
109
|
}
|
|
109
110
|
if (state.level === "runs") {
|
|
110
111
|
return "\u2191\u2193 move Enter details Backspace datasets Tab focus q quit";
|
|
@@ -636,6 +637,7 @@ function createInitialState(data, args) {
|
|
|
636
637
|
datasetMenuIndex,
|
|
637
638
|
runMenuIndex,
|
|
638
639
|
detailsScrollOffset: 0,
|
|
640
|
+
overviewScrollOffset: 0,
|
|
639
641
|
selectedEvaluatorIds: data.evaluators.slice(0, 2).map((item) => item.id),
|
|
640
642
|
evaluatorMenuIndex: 0,
|
|
641
643
|
searchQuery,
|
|
@@ -651,8 +653,11 @@ function reduceCliState(state, action) {
|
|
|
651
653
|
if (state.level === "details" && state.focus === "right") {
|
|
652
654
|
return { ...state, detailsScrollOffset: Math.max(0, state.detailsScrollOffset - 1) };
|
|
653
655
|
}
|
|
656
|
+
if (state.level === "datasets" && state.focus === "right") {
|
|
657
|
+
return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
|
|
658
|
+
}
|
|
654
659
|
if (state.level === "datasets") {
|
|
655
|
-
return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1) };
|
|
660
|
+
return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1), overviewScrollOffset: 0 };
|
|
656
661
|
}
|
|
657
662
|
if (state.level === "runs") {
|
|
658
663
|
return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
|
|
@@ -669,8 +674,11 @@ function reduceCliState(state, action) {
|
|
|
669
674
|
if (state.level === "details" && state.focus === "right") {
|
|
670
675
|
return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
|
|
671
676
|
}
|
|
677
|
+
if (state.level === "datasets" && state.focus === "right") {
|
|
678
|
+
return { ...state, overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1) };
|
|
679
|
+
}
|
|
672
680
|
if (state.level === "datasets") {
|
|
673
|
-
return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1) };
|
|
681
|
+
return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1), overviewScrollOffset: 0 };
|
|
674
682
|
}
|
|
675
683
|
if (state.level === "runs") {
|
|
676
684
|
return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
|
|
@@ -746,249 +754,6 @@ function reduceCliState(state, action) {
|
|
|
746
754
|
}
|
|
747
755
|
return state;
|
|
748
756
|
}
|
|
749
|
-
var LEFT_PANE_WIDTH2 = 44;
|
|
750
|
-
function DatasetsView({
|
|
751
|
-
state,
|
|
752
|
-
filteredDatasets,
|
|
753
|
-
selectedDataset
|
|
754
|
-
}) {
|
|
755
|
-
const leftFocused = state.focus === "left";
|
|
756
|
-
const rightFocused = state.focus === "right";
|
|
757
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
758
|
-
/* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
|
|
759
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
|
|
760
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
761
|
-
ListItem,
|
|
762
|
-
{
|
|
763
|
-
selected: state.datasetMenuIndex === 0,
|
|
764
|
-
label: "New evaluation",
|
|
765
|
-
itemKey: "datasets-new-eval"
|
|
766
|
-
}
|
|
767
|
-
),
|
|
768
|
-
filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
769
|
-
ListItem,
|
|
770
|
-
{
|
|
771
|
-
selected: state.datasetMenuIndex === index + 1,
|
|
772
|
-
label: dataset.name,
|
|
773
|
-
itemKey: `dataset-${dataset.id}`
|
|
774
|
-
},
|
|
775
|
-
dataset.id
|
|
776
|
-
))
|
|
777
|
-
] }),
|
|
778
|
-
/* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
|
|
779
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
|
|
780
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." })
|
|
781
|
-
] })
|
|
782
|
-
] });
|
|
783
|
-
}
|
|
784
|
-
function RunsView({
|
|
785
|
-
state,
|
|
786
|
-
dataset,
|
|
787
|
-
selectedRun
|
|
788
|
-
}) {
|
|
789
|
-
const runs = dataset?.runs ?? [];
|
|
790
|
-
const rightFocused = state.focus === "right";
|
|
791
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
792
|
-
/* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
|
|
793
|
-
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: !selectedRun ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." }) : /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
794
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
795
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
|
|
796
|
-
" ",
|
|
797
|
-
selectedRun.label,
|
|
798
|
-
" ",
|
|
799
|
-
/* @__PURE__ */ jsxRuntime.jsx(StatusText, { status: selectedRun.status })
|
|
800
|
-
] }),
|
|
801
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
802
|
-
"Commit: ",
|
|
803
|
-
selectedRun.meta.commit,
|
|
804
|
-
" Branch: ",
|
|
805
|
-
selectedRun.meta.branch,
|
|
806
|
-
" ",
|
|
807
|
-
"Seed: ",
|
|
808
|
-
selectedRun.meta.seed
|
|
809
|
-
] }),
|
|
810
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
811
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overall" }),
|
|
812
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
813
|
-
TextBar,
|
|
814
|
-
{
|
|
815
|
-
label: "pass rate",
|
|
816
|
-
value: selectedRun.performance.passRate,
|
|
817
|
-
format: (v) => `${v}%`
|
|
818
|
-
}
|
|
819
|
-
),
|
|
820
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
821
|
-
TextBar,
|
|
822
|
-
{
|
|
823
|
-
label: "avg score",
|
|
824
|
-
value: Math.round(selectedRun.performance.avgScore * 100)
|
|
825
|
-
}
|
|
826
|
-
),
|
|
827
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
828
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
|
|
829
|
-
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
830
|
-
TextBar,
|
|
831
|
-
{
|
|
832
|
-
label: dimension.name,
|
|
833
|
-
value: dimension.score
|
|
834
|
-
},
|
|
835
|
-
dimension.name
|
|
836
|
-
)),
|
|
837
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
838
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
|
|
839
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
840
|
-
Sparkline,
|
|
841
|
-
{
|
|
842
|
-
data: selectedRun.performance.latencyHistoryMs ?? [
|
|
843
|
-
selectedRun.performance.latencyAvgMs - 40,
|
|
844
|
-
selectedRun.performance.latencyAvgMs - 10,
|
|
845
|
-
selectedRun.performance.latencyAvgMs + 20,
|
|
846
|
-
selectedRun.performance.latencyP95Ms - 80,
|
|
847
|
-
selectedRun.performance.latencyP95Ms
|
|
848
|
-
],
|
|
849
|
-
width: 24
|
|
850
|
-
}
|
|
851
|
-
)
|
|
852
|
-
] }) })
|
|
853
|
-
] });
|
|
854
|
-
}
|
|
855
|
-
|
|
856
|
-
// src/evals/metric.ts
|
|
857
|
-
var registry = /* @__PURE__ */ new Map();
|
|
858
|
-
var Metric = {
|
|
859
|
-
of(config) {
|
|
860
|
-
const def = {
|
|
861
|
-
id: config.id,
|
|
862
|
-
name: config.name,
|
|
863
|
-
format: config.format,
|
|
864
|
-
make: (data) => ({ id: config.id, data })
|
|
865
|
-
};
|
|
866
|
-
registry.set(config.id, def);
|
|
867
|
-
return def;
|
|
868
|
-
}
|
|
869
|
-
};
|
|
870
|
-
function getMetricById(id) {
|
|
871
|
-
return registry.get(id);
|
|
872
|
-
}
|
|
873
|
-
|
|
874
|
-
// src/evals/score.ts
|
|
875
|
-
var registry2 = /* @__PURE__ */ new Map();
|
|
876
|
-
var Score = {
|
|
877
|
-
of(config) {
|
|
878
|
-
const def = {
|
|
879
|
-
id: config.id,
|
|
880
|
-
name: config.name,
|
|
881
|
-
displayStrategy: config.displayStrategy,
|
|
882
|
-
format: config.format,
|
|
883
|
-
make: (data, options) => {
|
|
884
|
-
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
885
|
-
return {
|
|
886
|
-
id: config.id,
|
|
887
|
-
data,
|
|
888
|
-
...passed !== void 0 && { passed }
|
|
889
|
-
};
|
|
890
|
-
}
|
|
891
|
-
};
|
|
892
|
-
registry2.set(config.id, def);
|
|
893
|
-
return def;
|
|
894
|
-
}
|
|
895
|
-
};
|
|
896
|
-
function getScoreById(id) {
|
|
897
|
-
return registry2.get(id);
|
|
898
|
-
}
|
|
899
|
-
|
|
900
|
-
// src/evals/metrics/standard.ts
|
|
901
|
-
Metric.of({
|
|
902
|
-
id: "token-count",
|
|
903
|
-
name: "Tokens",
|
|
904
|
-
format: (data) => {
|
|
905
|
-
const input = data.input ?? 0;
|
|
906
|
-
const output = data.output ?? 0;
|
|
907
|
-
const inputCached = data.inputCached ?? 0;
|
|
908
|
-
const outputCached = data.outputCached ?? 0;
|
|
909
|
-
const cached = inputCached + outputCached;
|
|
910
|
-
return `in:${input} out:${output} cached:${cached}`;
|
|
911
|
-
}
|
|
912
|
-
});
|
|
913
|
-
Metric.of({
|
|
914
|
-
id: "latency",
|
|
915
|
-
name: "Latency",
|
|
916
|
-
format: (data) => `${data.ms}ms`
|
|
917
|
-
});
|
|
918
|
-
|
|
919
|
-
// src/evals/scores/standard.ts
|
|
920
|
-
Score.of({
|
|
921
|
-
id: "percent",
|
|
922
|
-
name: "Score",
|
|
923
|
-
displayStrategy: "bar",
|
|
924
|
-
format: (data) => data.value.toFixed(2)
|
|
925
|
-
});
|
|
926
|
-
Score.of({
|
|
927
|
-
id: "binary",
|
|
928
|
-
name: "Result",
|
|
929
|
-
displayStrategy: "passFail",
|
|
930
|
-
format: (data) => data.passed ? "PASSED" : "NOT PASSED"
|
|
931
|
-
});
|
|
932
|
-
function createDiffLogEntry(expected, actual, options) {
|
|
933
|
-
const diff = jsonDiff.diffString(expected, actual, { color: false });
|
|
934
|
-
return {
|
|
935
|
-
type: "diff",
|
|
936
|
-
label: options?.label,
|
|
937
|
-
expected,
|
|
938
|
-
actual,
|
|
939
|
-
diff: diff || "(no differences)"
|
|
940
|
-
};
|
|
941
|
-
}
|
|
942
|
-
function getDiffLines(entry) {
|
|
943
|
-
const raw = jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
|
|
944
|
-
return raw.split("\n").map((line) => {
|
|
945
|
-
const trimmed = line.trimStart();
|
|
946
|
-
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
947
|
-
return { type: "remove", line };
|
|
948
|
-
}
|
|
949
|
-
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
950
|
-
return { type: "add", line };
|
|
951
|
-
}
|
|
952
|
-
return { type: "context", line };
|
|
953
|
-
});
|
|
954
|
-
}
|
|
955
|
-
|
|
956
|
-
// src/runner/score-utils.ts
|
|
957
|
-
function toNumericScoreFromScores(scores) {
|
|
958
|
-
for (const item of scores) {
|
|
959
|
-
const def = getScoreById(item.id);
|
|
960
|
-
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
961
|
-
const value = item.data.value;
|
|
962
|
-
if (typeof value === "number" && Number.isFinite(value)) {
|
|
963
|
-
return value;
|
|
964
|
-
}
|
|
965
|
-
}
|
|
966
|
-
const numeric = toNumericScore(item.data);
|
|
967
|
-
if (numeric !== void 0) {
|
|
968
|
-
return numeric;
|
|
969
|
-
}
|
|
970
|
-
}
|
|
971
|
-
return void 0;
|
|
972
|
-
}
|
|
973
|
-
function toNumericScore(value) {
|
|
974
|
-
if (typeof value === "number" && Number.isFinite(value)) {
|
|
975
|
-
return value;
|
|
976
|
-
}
|
|
977
|
-
if (typeof value !== "object" || value === null) {
|
|
978
|
-
return void 0;
|
|
979
|
-
}
|
|
980
|
-
const obj = value;
|
|
981
|
-
if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
|
|
982
|
-
return obj.score;
|
|
983
|
-
}
|
|
984
|
-
const numberValues = Object.values(value).filter(
|
|
985
|
-
(entry) => typeof entry === "number" && Number.isFinite(entry)
|
|
986
|
-
);
|
|
987
|
-
if (numberValues.length === 0) {
|
|
988
|
-
return void 0;
|
|
989
|
-
}
|
|
990
|
-
return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
|
|
991
|
-
}
|
|
992
757
|
|
|
993
758
|
// src/runner/config.ts
|
|
994
759
|
var defaultRunnerConfig = {
|
|
@@ -1009,7 +774,8 @@ var defaultRunnerConfig = {
|
|
|
1009
774
|
],
|
|
1010
775
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
1011
776
|
},
|
|
1012
|
-
artifactDirectory: ".eval-results"
|
|
777
|
+
artifactDirectory: ".eval-results",
|
|
778
|
+
maxConcurrency: 1
|
|
1013
779
|
};
|
|
1014
780
|
function toRunnerConfigOverrides(config) {
|
|
1015
781
|
if (!config) {
|
|
@@ -1042,6 +808,9 @@ function toRunnerConfigOverrides(config) {
|
|
|
1042
808
|
if (config.artifactDirectory !== void 0) {
|
|
1043
809
|
overrides.artifactDirectory = config.artifactDirectory;
|
|
1044
810
|
}
|
|
811
|
+
if (config.maxConcurrency !== void 0) {
|
|
812
|
+
overrides.maxConcurrency = config.maxConcurrency;
|
|
813
|
+
}
|
|
1045
814
|
if (Object.keys(discovery).length > 0) {
|
|
1046
815
|
overrides.discovery = discovery;
|
|
1047
816
|
}
|
|
@@ -1216,25 +985,261 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
1216
985
|
);
|
|
1217
986
|
return found.flat();
|
|
1218
987
|
}
|
|
1219
|
-
async function collectTestCasesFromFiles(config) {
|
|
1220
|
-
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1221
|
-
const matched = files.filter(
|
|
1222
|
-
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
1223
|
-
);
|
|
1224
|
-
const found = await Promise.all(
|
|
1225
|
-
matched.map(async (absolutePath) => {
|
|
1226
|
-
const exports = await loadModuleExports(absolutePath);
|
|
1227
|
-
const testCases = exports.filter(isTestCaseLike);
|
|
1228
|
-
const relPath = path.relative(config.rootDir, absolutePath);
|
|
1229
|
-
return testCases.map((testCase) => ({
|
|
1230
|
-
id: toId("test-case", relPath, testCase.getName()),
|
|
1231
|
-
filePath: relPath,
|
|
1232
|
-
testCase
|
|
1233
|
-
}));
|
|
1234
|
-
})
|
|
988
|
+
async function collectTestCasesFromFiles(config) {
|
|
989
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
990
|
+
const matched = files.filter(
|
|
991
|
+
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
992
|
+
);
|
|
993
|
+
const found = await Promise.all(
|
|
994
|
+
matched.map(async (absolutePath) => {
|
|
995
|
+
const exports = await loadModuleExports(absolutePath);
|
|
996
|
+
const testCases = exports.filter(isTestCaseLike);
|
|
997
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
998
|
+
return testCases.map((testCase) => ({
|
|
999
|
+
id: toId("test-case", relPath, testCase.getName()),
|
|
1000
|
+
filePath: relPath,
|
|
1001
|
+
testCase
|
|
1002
|
+
}));
|
|
1003
|
+
})
|
|
1004
|
+
);
|
|
1005
|
+
return found.flat();
|
|
1006
|
+
}
|
|
1007
|
+
function toJsonLines(value) {
|
|
1008
|
+
try {
|
|
1009
|
+
return JSON.stringify(value, null, 2);
|
|
1010
|
+
} catch {
|
|
1011
|
+
return String(value);
|
|
1012
|
+
}
|
|
1013
|
+
}
|
|
1014
|
+
function formatDiffString(changes) {
|
|
1015
|
+
const lines = [];
|
|
1016
|
+
for (const part of changes) {
|
|
1017
|
+
const prefix = part.added ? "+" : part.removed ? "-" : " ";
|
|
1018
|
+
const partLines = part.value.split("\n");
|
|
1019
|
+
if (partLines[partLines.length - 1] === "") {
|
|
1020
|
+
partLines.pop();
|
|
1021
|
+
}
|
|
1022
|
+
for (const line of partLines) {
|
|
1023
|
+
lines.push(`${prefix} ${line}`);
|
|
1024
|
+
}
|
|
1025
|
+
}
|
|
1026
|
+
return lines.join("\n");
|
|
1027
|
+
}
|
|
1028
|
+
function createDiffString(expected, actual) {
|
|
1029
|
+
const expectedStr = toJsonLines(expected);
|
|
1030
|
+
const actualStr = toJsonLines(actual);
|
|
1031
|
+
const changes = diff.diffLines(expectedStr, actualStr);
|
|
1032
|
+
return formatDiffString(changes);
|
|
1033
|
+
}
|
|
1034
|
+
function createDiffLogEntry(expected, actual, options) {
|
|
1035
|
+
const diff = createDiffString(expected, actual);
|
|
1036
|
+
return {
|
|
1037
|
+
type: "diff",
|
|
1038
|
+
label: options?.label,
|
|
1039
|
+
expected,
|
|
1040
|
+
actual,
|
|
1041
|
+
diff: diff || "(no differences)"
|
|
1042
|
+
};
|
|
1043
|
+
}
|
|
1044
|
+
function getDiffLines(entry) {
|
|
1045
|
+
const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
|
|
1046
|
+
return raw.split("\n").map((line) => {
|
|
1047
|
+
const trimmed = line.trimStart();
|
|
1048
|
+
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
1049
|
+
return { type: "remove", line };
|
|
1050
|
+
}
|
|
1051
|
+
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
1052
|
+
return { type: "add", line };
|
|
1053
|
+
}
|
|
1054
|
+
return { type: "context", line };
|
|
1055
|
+
});
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
// src/evals/metric.ts
|
|
1059
|
+
var registry = /* @__PURE__ */ new Map();
|
|
1060
|
+
var Metric = {
|
|
1061
|
+
of(config) {
|
|
1062
|
+
const def = {
|
|
1063
|
+
id: config.id,
|
|
1064
|
+
name: config.name,
|
|
1065
|
+
aggregate: config.aggregate,
|
|
1066
|
+
format: config.format,
|
|
1067
|
+
make: (data) => ({ id: config.id, data })
|
|
1068
|
+
};
|
|
1069
|
+
registry.set(config.id, def);
|
|
1070
|
+
return def;
|
|
1071
|
+
}
|
|
1072
|
+
};
|
|
1073
|
+
function getMetricById(id) {
|
|
1074
|
+
return registry.get(id);
|
|
1075
|
+
}
|
|
1076
|
+
|
|
1077
|
+
// src/evals/score.ts
|
|
1078
|
+
var registry2 = /* @__PURE__ */ new Map();
|
|
1079
|
+
var Score = {
|
|
1080
|
+
of(config) {
|
|
1081
|
+
const def = {
|
|
1082
|
+
id: config.id,
|
|
1083
|
+
name: config.name,
|
|
1084
|
+
displayStrategy: config.displayStrategy,
|
|
1085
|
+
aggregate: config.aggregate,
|
|
1086
|
+
format: config.format,
|
|
1087
|
+
make: (data, options) => {
|
|
1088
|
+
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
1089
|
+
return {
|
|
1090
|
+
id: config.id,
|
|
1091
|
+
data,
|
|
1092
|
+
...passed !== void 0 && { passed }
|
|
1093
|
+
};
|
|
1094
|
+
}
|
|
1095
|
+
};
|
|
1096
|
+
registry2.set(config.id, def);
|
|
1097
|
+
return def;
|
|
1098
|
+
}
|
|
1099
|
+
};
|
|
1100
|
+
function getScoreById(id) {
|
|
1101
|
+
return registry2.get(id);
|
|
1102
|
+
}
|
|
1103
|
+
|
|
1104
|
+
// src/evals/aggregators.ts
|
|
1105
|
+
function aggregateAverageWithVariance(values) {
|
|
1106
|
+
if (values.length === 0) {
|
|
1107
|
+
return { value: 0, count: 0 };
|
|
1108
|
+
}
|
|
1109
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
1110
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
1111
|
+
const mean = sum / values.length;
|
|
1112
|
+
let stdDev;
|
|
1113
|
+
if (values.length >= 2) {
|
|
1114
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
1115
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1116
|
+
}
|
|
1117
|
+
return { value: mean, stdDev, count: values.length };
|
|
1118
|
+
}
|
|
1119
|
+
function aggregateAll(values) {
|
|
1120
|
+
const total = values.length;
|
|
1121
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
1122
|
+
return {
|
|
1123
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
1124
|
+
passedCount,
|
|
1125
|
+
totalCount: total
|
|
1126
|
+
};
|
|
1127
|
+
}
|
|
1128
|
+
function aggregateTokenCountSum(values) {
|
|
1129
|
+
const initial = {
|
|
1130
|
+
input: 0,
|
|
1131
|
+
output: 0,
|
|
1132
|
+
inputCached: 0,
|
|
1133
|
+
outputCached: 0
|
|
1134
|
+
};
|
|
1135
|
+
return values.reduce(
|
|
1136
|
+
(acc, v) => ({
|
|
1137
|
+
input: acc.input + (v.input ?? 0),
|
|
1138
|
+
output: acc.output + (v.output ?? 0),
|
|
1139
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1140
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1141
|
+
}),
|
|
1142
|
+
initial
|
|
1143
|
+
);
|
|
1144
|
+
}
|
|
1145
|
+
function aggregateLatencyAverage(values) {
|
|
1146
|
+
if (values.length === 0) {
|
|
1147
|
+
return { ms: 0 };
|
|
1148
|
+
}
|
|
1149
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1150
|
+
return { ms: sum / values.length };
|
|
1151
|
+
}
|
|
1152
|
+
|
|
1153
|
+
// src/evals/metrics/standard.ts
|
|
1154
|
+
Metric.of({
|
|
1155
|
+
id: "token-count",
|
|
1156
|
+
name: "Tokens",
|
|
1157
|
+
aggregate: aggregateTokenCountSum,
|
|
1158
|
+
format: (data, options) => {
|
|
1159
|
+
const input = data.input ?? 0;
|
|
1160
|
+
const output = data.output ?? 0;
|
|
1161
|
+
const inputCached = data.inputCached ?? 0;
|
|
1162
|
+
const outputCached = data.outputCached ?? 0;
|
|
1163
|
+
const cached = inputCached + outputCached;
|
|
1164
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1165
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1166
|
+
}
|
|
1167
|
+
});
|
|
1168
|
+
Metric.of({
|
|
1169
|
+
id: "latency",
|
|
1170
|
+
name: "Latency",
|
|
1171
|
+
aggregate: aggregateLatencyAverage,
|
|
1172
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1173
|
+
});
|
|
1174
|
+
|
|
1175
|
+
// src/evals/scores/standard.ts
|
|
1176
|
+
Score.of({
|
|
1177
|
+
id: "percent",
|
|
1178
|
+
name: "Score",
|
|
1179
|
+
displayStrategy: "bar",
|
|
1180
|
+
format: (data, options) => {
|
|
1181
|
+
if (options?.isAggregated) {
|
|
1182
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
1183
|
+
}
|
|
1184
|
+
return data.value.toFixed(2);
|
|
1185
|
+
},
|
|
1186
|
+
aggregate: aggregateAverageWithVariance
|
|
1187
|
+
});
|
|
1188
|
+
Score.of({
|
|
1189
|
+
id: "binary",
|
|
1190
|
+
name: "Result",
|
|
1191
|
+
displayStrategy: "passFail",
|
|
1192
|
+
format: (data, options) => {
|
|
1193
|
+
if (options?.isAggregated) {
|
|
1194
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
1195
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
1196
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
1197
|
+
}
|
|
1198
|
+
return base;
|
|
1199
|
+
}
|
|
1200
|
+
return data.passed ? "PASSED" : "NOT PASSED";
|
|
1201
|
+
},
|
|
1202
|
+
aggregate: aggregateAll
|
|
1203
|
+
});
|
|
1204
|
+
|
|
1205
|
+
// src/runner/score-utils.ts
|
|
1206
|
+
function toNumericScoreFromScores(scores) {
|
|
1207
|
+
for (const item of scores) {
|
|
1208
|
+
const def = getScoreById(item.id);
|
|
1209
|
+
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
1210
|
+
const value = item.data.value;
|
|
1211
|
+
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1212
|
+
return value;
|
|
1213
|
+
}
|
|
1214
|
+
}
|
|
1215
|
+
const numeric = toNumericScore(item.data);
|
|
1216
|
+
if (numeric !== void 0) {
|
|
1217
|
+
return numeric;
|
|
1218
|
+
}
|
|
1219
|
+
}
|
|
1220
|
+
return void 0;
|
|
1221
|
+
}
|
|
1222
|
+
function toNumericScore(value) {
|
|
1223
|
+
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1224
|
+
return value;
|
|
1225
|
+
}
|
|
1226
|
+
if (typeof value !== "object" || value === null) {
|
|
1227
|
+
return void 0;
|
|
1228
|
+
}
|
|
1229
|
+
const obj = value;
|
|
1230
|
+
if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
|
|
1231
|
+
return obj.score;
|
|
1232
|
+
}
|
|
1233
|
+
const numberValues = Object.values(value).filter(
|
|
1234
|
+
(entry) => typeof entry === "number" && Number.isFinite(entry)
|
|
1235
1235
|
);
|
|
1236
|
-
|
|
1236
|
+
if (numberValues.length === 0) {
|
|
1237
|
+
return void 0;
|
|
1238
|
+
}
|
|
1239
|
+
return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
|
|
1237
1240
|
}
|
|
1241
|
+
|
|
1242
|
+
// src/runner/execution.ts
|
|
1238
1243
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1239
1244
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1240
1245
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1276,6 +1281,105 @@ function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
|
1276
1281
|
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1277
1282
|
);
|
|
1278
1283
|
}
|
|
1284
|
+
function processOneTestCase(task, testCaseItem, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, completedRef, passedRef, failedRef) {
|
|
1285
|
+
return effect.Effect.gen(function* () {
|
|
1286
|
+
const reruns = typeof testCaseItem.testCase.getReruns === "function" ? testCaseItem.testCase.getReruns() : 1;
|
|
1287
|
+
const rerunPassed = [];
|
|
1288
|
+
for (let r = 0; r < reruns; r++) {
|
|
1289
|
+
const started = Date.now();
|
|
1290
|
+
const evaluatorScores = [];
|
|
1291
|
+
let testCaseError;
|
|
1292
|
+
const output = readOutput(testCaseItem.testCase);
|
|
1293
|
+
for (const { id: evaluatorId, evaluator } of task.evaluators) {
|
|
1294
|
+
const evaluateFn = evaluator.getEvaluateFn();
|
|
1295
|
+
if (!evaluateFn) {
|
|
1296
|
+
continue;
|
|
1297
|
+
}
|
|
1298
|
+
try {
|
|
1299
|
+
const logs = [];
|
|
1300
|
+
const logDiff = (expected, actual, options) => {
|
|
1301
|
+
logs.push(createDiffLogEntry(expected, actual, options));
|
|
1302
|
+
};
|
|
1303
|
+
const ctx = yield* effect.Effect.promise(
|
|
1304
|
+
() => Promise.resolve(evaluator.resolveContext())
|
|
1305
|
+
);
|
|
1306
|
+
const result = yield* effect.Effect.promise(
|
|
1307
|
+
() => Promise.resolve(
|
|
1308
|
+
evaluateFn({
|
|
1309
|
+
input: testCaseItem.testCase.getInput(),
|
|
1310
|
+
ctx,
|
|
1311
|
+
output,
|
|
1312
|
+
logDiff
|
|
1313
|
+
})
|
|
1314
|
+
)
|
|
1315
|
+
);
|
|
1316
|
+
const { scores, metrics } = normalizeResult(result);
|
|
1317
|
+
const passed2 = computeEvaluatorPassed(evaluator, result, scores);
|
|
1318
|
+
evaluatorScores.push({
|
|
1319
|
+
evaluatorId,
|
|
1320
|
+
scores,
|
|
1321
|
+
passed: passed2,
|
|
1322
|
+
metrics,
|
|
1323
|
+
logs: logs.length > 0 ? logs : void 0
|
|
1324
|
+
});
|
|
1325
|
+
} catch (error) {
|
|
1326
|
+
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1327
|
+
evaluatorScores.push({
|
|
1328
|
+
evaluatorId,
|
|
1329
|
+
scores: [],
|
|
1330
|
+
passed: false
|
|
1331
|
+
});
|
|
1332
|
+
}
|
|
1333
|
+
}
|
|
1334
|
+
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1335
|
+
rerunPassed.push(rerunPassedThis);
|
|
1336
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
1337
|
+
n + 1,
|
|
1338
|
+
n + 1
|
|
1339
|
+
]);
|
|
1340
|
+
const progressEvent = {
|
|
1341
|
+
type: "TestCaseProgress",
|
|
1342
|
+
runId: task.runId,
|
|
1343
|
+
testCaseId: testCaseItem.id,
|
|
1344
|
+
testCaseName: testCaseItem.testCase.getName(),
|
|
1345
|
+
completedTestCases: completedEvaluations,
|
|
1346
|
+
totalTestCases: totalEvaluations,
|
|
1347
|
+
rerunIndex: r + 1,
|
|
1348
|
+
rerunTotal: reruns,
|
|
1349
|
+
passed: rerunPassedThis,
|
|
1350
|
+
durationMs: Date.now() - started,
|
|
1351
|
+
evaluatorScores,
|
|
1352
|
+
output,
|
|
1353
|
+
errorMessage: testCaseError
|
|
1354
|
+
};
|
|
1355
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
1356
|
+
...snapshot,
|
|
1357
|
+
completedTestCases: completedEvaluations
|
|
1358
|
+
}));
|
|
1359
|
+
yield* publishEvent(progressEvent);
|
|
1360
|
+
yield* effect.Queue.offer(persistenceQueue, {
|
|
1361
|
+
runId: task.runId,
|
|
1362
|
+
artifactPath: task.snapshot.artifactPath,
|
|
1363
|
+
payload: progressEvent
|
|
1364
|
+
});
|
|
1365
|
+
}
|
|
1366
|
+
const testCasePassed = rerunPassed.every(Boolean);
|
|
1367
|
+
if (testCasePassed) {
|
|
1368
|
+
yield* effect.Ref.update(passedRef, (n) => n + 1);
|
|
1369
|
+
} else {
|
|
1370
|
+
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
1371
|
+
}
|
|
1372
|
+
const [passed, failed] = yield* effect.Effect.all([
|
|
1373
|
+
effect.Ref.get(passedRef),
|
|
1374
|
+
effect.Ref.get(failedRef)
|
|
1375
|
+
]);
|
|
1376
|
+
updateSnapshot(task.runId, (snapshot) => ({
|
|
1377
|
+
...snapshot,
|
|
1378
|
+
passedTestCases: passed,
|
|
1379
|
+
failedTestCases: failed
|
|
1380
|
+
}));
|
|
1381
|
+
});
|
|
1382
|
+
}
|
|
1279
1383
|
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
|
|
1280
1384
|
const startedAt = Date.now();
|
|
1281
1385
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
@@ -1288,104 +1392,51 @@ var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => e
|
|
|
1288
1392
|
runId: task.runId,
|
|
1289
1393
|
startedAt
|
|
1290
1394
|
});
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
)
|
|
1321
|
-
);
|
|
1322
|
-
const { scores, metrics } = normalizeResult(result);
|
|
1323
|
-
const passed = computeEvaluatorPassed(evaluator, result, scores);
|
|
1324
|
-
evaluatorScores.push({
|
|
1325
|
-
evaluatorId,
|
|
1326
|
-
scores,
|
|
1327
|
-
passed,
|
|
1328
|
-
metrics,
|
|
1329
|
-
logs: logs.length > 0 ? logs : void 0
|
|
1330
|
-
});
|
|
1331
|
-
} catch (error) {
|
|
1332
|
-
testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
|
|
1333
|
-
evaluatorScores.push({
|
|
1334
|
-
evaluatorId,
|
|
1335
|
-
scores: [],
|
|
1336
|
-
passed: false
|
|
1337
|
-
});
|
|
1338
|
-
}
|
|
1339
|
-
}
|
|
1340
|
-
const testCasePassed = evaluatorScores.every((s) => s.passed);
|
|
1341
|
-
completedTestCases += 1;
|
|
1342
|
-
if (testCasePassed) {
|
|
1343
|
-
passedTestCases += 1;
|
|
1344
|
-
} else {
|
|
1345
|
-
failedTestCases += 1;
|
|
1346
|
-
}
|
|
1347
|
-
const progressEvent = {
|
|
1348
|
-
type: "TestCaseProgress",
|
|
1349
|
-
runId: task.runId,
|
|
1350
|
-
testCaseId: testCaseItem.id,
|
|
1351
|
-
testCaseName: testCaseItem.testCase.getName(),
|
|
1352
|
-
completedTestCases,
|
|
1353
|
-
totalTestCases: task.testCases.length,
|
|
1354
|
-
passed: testCasePassed,
|
|
1355
|
-
durationMs: Date.now() - started,
|
|
1356
|
-
evaluatorScores,
|
|
1357
|
-
output,
|
|
1358
|
-
errorMessage: testCaseError
|
|
1359
|
-
};
|
|
1360
|
-
updateSnapshot(task.runId, (snapshot) => ({
|
|
1361
|
-
...snapshot,
|
|
1362
|
-
completedTestCases,
|
|
1363
|
-
passedTestCases,
|
|
1364
|
-
failedTestCases
|
|
1365
|
-
}));
|
|
1366
|
-
yield* publishEvent(progressEvent);
|
|
1367
|
-
yield* effect.Queue.offer(persistenceQueue, {
|
|
1368
|
-
runId: task.runId,
|
|
1369
|
-
artifactPath: task.snapshot.artifactPath,
|
|
1370
|
-
payload: progressEvent
|
|
1371
|
-
});
|
|
1372
|
-
}
|
|
1395
|
+
const totalEvaluations = task.testCases.reduce(
|
|
1396
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1397
|
+
0
|
|
1398
|
+
);
|
|
1399
|
+
const maxConcurrency = Math.max(1, task.maxConcurrency ?? 1);
|
|
1400
|
+
const completedRef = yield* effect.Ref.make(0);
|
|
1401
|
+
const passedRef = yield* effect.Ref.make(0);
|
|
1402
|
+
const failedRef = yield* effect.Ref.make(0);
|
|
1403
|
+
const processTestCase = (testCaseItem) => processOneTestCase(
|
|
1404
|
+
task,
|
|
1405
|
+
testCaseItem,
|
|
1406
|
+
totalEvaluations,
|
|
1407
|
+
publishEvent,
|
|
1408
|
+
persistenceQueue,
|
|
1409
|
+
updateSnapshot,
|
|
1410
|
+
completedRef,
|
|
1411
|
+
passedRef,
|
|
1412
|
+
failedRef
|
|
1413
|
+
);
|
|
1414
|
+
yield* effect.Effect.forEach(
|
|
1415
|
+
task.testCases,
|
|
1416
|
+
processTestCase,
|
|
1417
|
+
maxConcurrency > 1 ? { concurrency: maxConcurrency } : void 0
|
|
1418
|
+
);
|
|
1419
|
+
const [completedEvaluations, passedUniqueTestCases, failedUniqueTestCases] = yield* effect.Effect.all([
|
|
1420
|
+
effect.Ref.get(completedRef),
|
|
1421
|
+
effect.Ref.get(passedRef),
|
|
1422
|
+
effect.Ref.get(failedRef)
|
|
1423
|
+
]);
|
|
1373
1424
|
const finishedAt = Date.now();
|
|
1374
1425
|
const completedEvent = {
|
|
1375
1426
|
type: "RunCompleted",
|
|
1376
1427
|
runId: task.runId,
|
|
1377
1428
|
finishedAt,
|
|
1378
|
-
passedTestCases,
|
|
1379
|
-
failedTestCases,
|
|
1429
|
+
passedTestCases: passedUniqueTestCases,
|
|
1430
|
+
failedTestCases: failedUniqueTestCases,
|
|
1380
1431
|
totalTestCases: task.testCases.length,
|
|
1381
1432
|
artifactPath: task.snapshot.artifactPath
|
|
1382
1433
|
};
|
|
1383
1434
|
updateSnapshot(task.runId, (snapshot) => ({
|
|
1384
1435
|
...snapshot,
|
|
1385
1436
|
status: "completed",
|
|
1386
|
-
completedTestCases,
|
|
1387
|
-
passedTestCases,
|
|
1388
|
-
failedTestCases,
|
|
1437
|
+
completedTestCases: completedEvaluations,
|
|
1438
|
+
passedTestCases: passedUniqueTestCases,
|
|
1439
|
+
failedTestCases: failedUniqueTestCases,
|
|
1389
1440
|
finishedAt
|
|
1390
1441
|
}));
|
|
1391
1442
|
yield* publishEvent(completedEvent);
|
|
@@ -1473,7 +1524,7 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1473
1524
|
const artifactPath = filePath;
|
|
1474
1525
|
const status = runFailed ? "failed" : runCompleted ? "completed" : runStarted ? "running" : "queued";
|
|
1475
1526
|
const progress = aggregateTestCaseProgress(lines);
|
|
1476
|
-
const completedTestCases = runCompleted
|
|
1527
|
+
const completedTestCases = runCompleted ? runQueued.totalTestCases : progress.completedTestCases;
|
|
1477
1528
|
const passedTestCases = runCompleted?.passedTestCases ?? progress.passedTestCases;
|
|
1478
1529
|
const failedTestCases = runCompleted?.failedTestCases ?? progress.failedTestCases;
|
|
1479
1530
|
return {
|
|
@@ -1495,23 +1546,29 @@ async function parseArtifactToSnapshot(filePath, _config) {
|
|
|
1495
1546
|
}
|
|
1496
1547
|
/**
 * Folds the raw JSONL lines of a run artifact into progress counters.
 *
 * Only events whose `type` is "TestCaseProgress" are considered. Lines that
 * are not valid JSON (or do not decode to an object) are skipped on purpose,
 * so a partially written artifact still yields a usable summary.
 *
 * A test case may appear several times (once per rerun). It counts as passed
 * only when every one of its events reports a truthy `passed`; an event
 * without `passed` is treated as a failure.
 *
 * @param {Iterable<string>} lines - Artifact lines, one JSON event per line.
 * @returns {{completedTestCases: number, passedTestCases: number, failedTestCases: number}}
 *   `completedTestCases` is the last value reported by a progress event;
 *   passed/failed count unique `testCaseId`s.
 */
function aggregateTestCaseProgress(lines) {
  let completedTestCases = 0;
  const passedByTestCase = /* @__PURE__ */ new Map();
  for (const line of lines) {
    let event;
    try {
      event = JSON.parse(line);
    } catch {
      // Best effort: ignore lines that are not valid JSON.
      continue;
    }
    if (event?.type !== "TestCaseProgress") {
      continue;
    }
    completedTestCases = event.completedTestCases ?? completedTestCases;
    const id = event.testCaseId;
    // Use has() rather than comparing the stored value with undefined, so an
    // event lacking `passed` cannot reset the verdict accumulated so far.
    const passedSoFar = passedByTestCase.has(id) ? passedByTestCase.get(id) : true;
    passedByTestCase.set(id, passedSoFar && Boolean(event.passed));
  }
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const passed of passedByTestCase.values()) {
    if (passed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
  }
  return { completedTestCases, passedTestCases, failedTestCases };
}
|
|
1517
1574
|
async function parseArtifactFile(artifactPath) {
|
|
@@ -1529,6 +1586,8 @@ async function parseArtifactFile(artifactPath) {
|
|
|
1529
1586
|
testCaseName: ev.testCaseName,
|
|
1530
1587
|
completedTestCases: ev.completedTestCases,
|
|
1531
1588
|
totalTestCases: ev.totalTestCases,
|
|
1589
|
+
rerunIndex: ev.rerunIndex,
|
|
1590
|
+
rerunTotal: ev.rerunTotal,
|
|
1532
1591
|
passed: ev.passed,
|
|
1533
1592
|
durationMs: ev.durationMs,
|
|
1534
1593
|
evaluatorScores: ev.evaluatorScores ?? []
|
|
@@ -1734,6 +1793,10 @@ var EffectRunner = class {
|
|
|
1734
1793
|
throw new Error("No evaluators selected for run");
|
|
1735
1794
|
}
|
|
1736
1795
|
const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
|
|
1796
|
+
const totalEvaluations = selectedTestCases.reduce(
|
|
1797
|
+
(sum, tc) => sum + (typeof tc.testCase.getReruns === "function" ? tc.testCase.getReruns() : 1),
|
|
1798
|
+
0
|
|
1799
|
+
);
|
|
1737
1800
|
const runId = `run-${crypto.randomUUID()}`;
|
|
1738
1801
|
const artifactPath = createArtifactPath(
|
|
1739
1802
|
this.config.artifactDirectory,
|
|
@@ -1746,7 +1809,7 @@ var EffectRunner = class {
|
|
|
1746
1809
|
datasetName: dataset.dataset.getName(),
|
|
1747
1810
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1748
1811
|
queuedAt: Date.now(),
|
|
1749
|
-
totalTestCases:
|
|
1812
|
+
totalTestCases: totalEvaluations,
|
|
1750
1813
|
completedTestCases: 0,
|
|
1751
1814
|
passedTestCases: 0,
|
|
1752
1815
|
failedTestCases: 0,
|
|
@@ -1760,7 +1823,7 @@ var EffectRunner = class {
|
|
|
1760
1823
|
datasetId: request.datasetId,
|
|
1761
1824
|
datasetName: dataset.dataset.getName(),
|
|
1762
1825
|
evaluatorIds: selectedEvaluators.map((item) => item.id),
|
|
1763
|
-
totalTestCases:
|
|
1826
|
+
totalTestCases: totalEvaluations,
|
|
1764
1827
|
artifactPath
|
|
1765
1828
|
};
|
|
1766
1829
|
await effect.Effect.runPromise(this.publishEvent(queuedEvent));
|
|
@@ -1771,6 +1834,7 @@ var EffectRunner = class {
|
|
|
1771
1834
|
payload: queuedEvent
|
|
1772
1835
|
})
|
|
1773
1836
|
);
|
|
1837
|
+
const maxConcurrency = request.concurrency ?? this.config.maxConcurrency ?? 1;
|
|
1774
1838
|
await effect.Effect.runPromise(
|
|
1775
1839
|
effect.Queue.offer(this.runQueue, {
|
|
1776
1840
|
runId,
|
|
@@ -1778,7 +1842,8 @@ var EffectRunner = class {
|
|
|
1778
1842
|
dataset: dataset.dataset,
|
|
1779
1843
|
evaluators: selectedEvaluators,
|
|
1780
1844
|
testCases: selectedTestCases,
|
|
1781
|
-
snapshot
|
|
1845
|
+
snapshot,
|
|
1846
|
+
maxConcurrency
|
|
1782
1847
|
})
|
|
1783
1848
|
);
|
|
1784
1849
|
return snapshot;
|
|
@@ -1845,6 +1910,240 @@ var EffectRunner = class {
|
|
|
1845
1910
|
);
|
|
1846
1911
|
}
|
|
1847
1912
|
};
|
|
1913
|
+
// Width passed to the left-hand (dataset list) Pane of DatasetsView.
var LEFT_PANE_WIDTH2 = 44;
|
|
1914
|
+
// Upper bound on how many loaded run scores DatasetsView renders as bar rows.
var MAX_RUNS_FOR_CHART = 12;
|
|
1915
|
+
// Upper bound on how many of a dataset's runs DatasetsView loads scores for and feeds into the trend line.
var MAX_RUNS_FOR_TREND = 20;
|
|
1916
|
+
// Batch size DatasetsView passes to batchAverage when building the trend line.
var TREND_BATCH_SIZE = 4;
|
|
1917
|
+
/**
 * Averages the numeric scores of every evaluator result across the given
 * test cases.
 *
 * Each evaluator entry's `scores` is converted with
 * `toNumericScoreFromScores`; entries for which that returns `undefined`
 * are left out of the average.
 *
 * @param {Iterable<{evaluatorScores: Iterable<{scores: unknown}>}>} testCases
 * @returns {number | undefined} The mean score, or `undefined` when no entry
 *   produced a numeric score.
 */
function extractRunAverageScore(testCases) {
  let total = 0;
  let counted = 0;
  for (const { evaluatorScores } of testCases) {
    for (const { scores } of evaluatorScores) {
      const numeric = toNumericScoreFromScores(scores);
      if (numeric === void 0) {
        continue;
      }
      total += numeric;
      counted += 1;
    }
  }
  return counted > 0 ? total / counted : void 0;
}
|
|
1931
|
+
/**
 * Loads the average score of each run from its artifact file.
 *
 * The artifact reads are independent, so they are started together and
 * awaited with `Promise.all`; the result keeps the order of `runs`.
 * Loading is best effort: a run is left out when it has no artifact path,
 * when its artifact cannot be parsed, or when it yields no numeric score.
 *
 * @param {Iterable<{id: string, label: string, meta?: {artifact?: string}}>} runs
 * @returns {Promise<Array<{runId: string, label: string, value: number}>>}
 */
async function loadRunScores(runs) {
  const loadOne = async (run) => {
    const artifact = run.meta?.artifact;
    if (!artifact) {
      return void 0;
    }
    try {
      const testCases = await parseArtifactFile(path.resolve(artifact));
      const avg = extractRunAverageScore(testCases);
      return avg === void 0 ? void 0 : { runId: run.id, label: run.label, value: avg };
    } catch {
      // Deliberate best effort: an unreadable artifact only drops that run.
      return void 0;
    }
  };
  const loaded = await Promise.all(Array.from(runs, (run) => loadOne(run)));
  return loaded.filter((entry) => entry !== void 0);
}
|
|
1953
|
+
/**
 * Splits `values` into consecutive batches of `batchSize` items and returns
 * the arithmetic mean of each batch. The last batch may be shorter.
 *
 * @param {number[]} values - Values to average, in order.
 * @param {number} batchSize - Number of items per batch; a positive integer.
 * @returns {number[]} One mean per batch; empty when `values` is empty.
 * @throws {RangeError} When `batchSize` is not a positive integer. A zero or
 *   negative step would otherwise never advance the loop.
 */
function batchAverage(values, batchSize) {
  if (!Number.isInteger(batchSize) || batchSize < 1) {
    throw new RangeError(`batchSize must be a positive integer, got ${batchSize}`);
  }
  const batches = [];
  for (let start = 0; start < values.length; start += batchSize) {
    // start < values.length guarantees the slice holds at least one item.
    const slice = values.slice(start, start + batchSize);
    batches.push(slice.reduce((a, b) => a + b, 0) / slice.length);
  }
  return batches;
}
|
|
1963
|
+
// Number of overview rows DatasetsView shows at once; the remainder is reached by scrolling.
var OVERVIEW_PAGE_SIZE = 15;
|
|
1964
|
+
/**
 * Two-pane datasets screen: the dataset list on the left and a scrollable
 * overview of the selected dataset (description, per-run score bars and a
 * batched score trend) on the right.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; reads `focus`, `datasetMenuIndex`
 *   and `overviewScrollOffset`.
 * @param {Array<{id: string, name: string}>} props.filteredDatasets - Datasets
 *   listed in the left pane.
 * @param {object} [props.selectedDataset] - Dataset whose overview is shown.
 * @param {{current: number}} [props.overviewRowCountRef] - When given, receives
 *   the total number of overview rows on every render so the parent can bound
 *   scrolling.
 */
function DatasetsView({
  state,
  filteredDatasets,
  selectedDataset,
  overviewRowCountRef
}) {
  const leftFocused = state.focus === "left";
  const rightFocused = state.focus === "right";
  const [runScores, setRunScores] = React2.useState([]);
  const [loading, setLoading] = React2.useState(false);
  React2.useEffect(() => {
    if (!selectedDataset?.runs?.length) {
      setRunScores([]);
      // A load for a previously selected dataset may have left this set.
      setLoading(false);
      return void 0;
    }
    // Set by the cleanup so a load that finishes after the selection changed
    // (or after unmount) cannot overwrite the state of the current dataset.
    let cancelled = false;
    setLoading(true);
    const runs = selectedDataset.runs.slice(0, MAX_RUNS_FOR_TREND);
    loadRunScores(runs).then(
      (scores) => {
        if (!cancelled) {
          setRunScores(scores);
        }
      },
      () => {
        // The charts are best effort: a failed load shows no scores.
        if (!cancelled) {
          setRunScores([]);
        }
      }
    ).finally(() => {
      if (!cancelled) {
        setLoading(false);
      }
    });
    return () => {
      cancelled = true;
    };
  }, [selectedDataset?.id, selectedDataset?.runs?.length]);
  // Derived chart data is memoized on runScores so the overviewRows memo
  // below receives stable dependencies instead of fresh arrays each render.
  const barData = React2.useMemo(
    () => runScores.slice(0, MAX_RUNS_FOR_CHART).reverse(),
    [runScores]
  );
  const trendBatched = React2.useMemo(
    () => batchAverage(
      runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse(),
      TREND_BATCH_SIZE
    ),
    [runScores]
  );
  const overviewRows = React2.useMemo(() => {
    const rows = [];
    rows.push(
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." }, "overview")
    );
    if (selectedDataset && selectedDataset.runs.length > 0) {
      if (loading) {
        rows.push(
          /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Loading run scores\u2026" }, "loading")
        );
      } else if (runScores.length > 0) {
        rows.push(
          /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Scores (last runs)" }, "scores-header")
        );
        for (const d of barData) {
          rows.push(
            /* @__PURE__ */ jsxRuntime.jsx(
              TextBar,
              {
                label: d.label,
                value: d.value,
                labelWidth: 14,
                barWidth: 24,
                max: 100,
                format: (v) => v.toFixed(1)
              },
              d.runId
            )
          );
        }
        if (trendBatched.length > 0) {
          rows.push(
            /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Avg trend (last 20, batched by 4)" }, "trend-header")
          );
          rows.push(
            /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { children: /* @__PURE__ */ jsxRuntime.jsx(
              inkChart.LineGraph,
              {
                data: [{ values: trendBatched, color: "cyan" }],
                height: 5,
                width: 45,
                showYAxis: true,
                xLabels: ["older", "newer"]
              }
            ) }, "trend-graph")
          );
        }
      }
    }
    return rows;
  }, [
    selectedDataset?.overview,
    selectedDataset?.runs?.length,
    loading,
    runScores,
    barData,
    trendBatched
  ]);
  if (overviewRowCountRef) {
    // Published on every render so the parent's key handler sees the current row count.
    overviewRowCountRef.current = overviewRows.length;
  }
  const offset = Math.max(0, state.overviewScrollOffset);
  const visibleRows = overviewRows.slice(offset, offset + OVERVIEW_PAGE_SIZE);
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
    /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
      /* @__PURE__ */ jsxRuntime.jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === 0,
          label: "New evaluation",
          itemKey: "datasets-new-eval"
        }
      ),
      filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsxRuntime.jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === index + 1,
          label: dataset.name,
          itemKey: `dataset-${dataset.id}`
        },
        dataset.id
      ))
    ] }),
    /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visibleRows.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { children: row }, offset + i)) })
    ] })
  ] });
}
|
|
2076
|
+
/**
 * Runs screen: the runs sidebar next to a summary pane for the selected run
 * (header, overall bars, per-dimension bars and a latency sparkline).
 *
 * @param {object} props
 * @param {object} props.state - CLI state; `focus` decides whether the
 *   summary pane is rendered as focused.
 * @param {object} [props.dataset] - Dataset whose `runs` feed the sidebar.
 * @param {object} [props.selectedRun] - Run to summarize; a hint is shown
 *   when no run is selected.
 */
function RunsView({
  state,
  dataset,
  selectedRun
}) {
  const sidebar = /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs: dataset?.runs ?? [] });
  let summary;
  if (!selectedRun) {
    summary = /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." });
  } else {
    const { meta, performance } = selectedRun;
    const blankLine = () => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " });
    const heading = (title) => /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: title });
    const titleLine = /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
      " ",
      selectedRun.label,
      " ",
      /* @__PURE__ */ jsxRuntime.jsx(StatusText, { status: selectedRun.status })
    ] });
    const metaLine = /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
      "Commit: ",
      meta.commit,
      " Branch: ",
      meta.branch,
      " ",
      "Seed: ",
      meta.seed
    ] });
    const passRateBar = /* @__PURE__ */ jsxRuntime.jsx(TextBar, {
      label: "pass rate",
      value: performance.passRate,
      format: (v) => `${v}%`
    });
    const avgScoreBar = /* @__PURE__ */ jsxRuntime.jsx(TextBar, {
      label: "avg score",
      value: Math.round(performance.avgScore * 100)
    });
    const dimensionBars = selectedRun.dimensions.map(({ name, score }) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: name, value: score }, name));
    // Without recorded history the sparkline is fed points derived from the
    // average and p95 latency.
    const latencyPoints = performance.latencyHistoryMs ?? [
      performance.latencyAvgMs - 40,
      performance.latencyAvgMs - 10,
      performance.latencyAvgMs + 20,
      performance.latencyP95Ms - 80,
      performance.latencyP95Ms
    ];
    summary = /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      titleLine,
      metaLine,
      blankLine(),
      heading("Overall"),
      passRateBar,
      avgScoreBar,
      blankLine(),
      heading("Dimensions"),
      dimensionBars,
      blankLine(),
      heading("Latency trend"),
      /* @__PURE__ */ jsxRuntime.jsx(Sparkline, { data: latencyPoints, width: 24 })
    ] });
  }
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
    sidebar,
    /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: state.focus === "right", children: summary })
  ] });
}
|
|
1848
2147
|
var DETAILS_PAGE_SIZE = 20;
|
|
1849
2148
|
function scoreColor(score) {
|
|
1850
2149
|
if (score >= 80)
|
|
@@ -1853,7 +2152,7 @@ function scoreColor(score) {
|
|
|
1853
2152
|
return "yellow";
|
|
1854
2153
|
return "red";
|
|
1855
2154
|
}
|
|
1856
|
-
function formatScorePart(item
|
|
2155
|
+
function formatScorePart(item) {
|
|
1857
2156
|
const def = getScoreById(item.id);
|
|
1858
2157
|
if (!def) {
|
|
1859
2158
|
const numeric = toNumericScore(item.data);
|
|
@@ -1883,7 +2182,7 @@ function CheckRow({
|
|
|
1883
2182
|
" ",
|
|
1884
2183
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color, bold: true, children: status }),
|
|
1885
2184
|
detail ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1886
|
-
"
|
|
2185
|
+
" (",
|
|
1887
2186
|
detail,
|
|
1888
2187
|
")"
|
|
1889
2188
|
] }) : null
|
|
@@ -1903,21 +2202,21 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1903
2202
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1904
2203
|
"Model: ",
|
|
1905
2204
|
meta.model,
|
|
1906
|
-
"
|
|
2205
|
+
" Provider: ",
|
|
1907
2206
|
meta.provider
|
|
1908
2207
|
] }, "meta-1"),
|
|
1909
2208
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1910
2209
|
"Commit: ",
|
|
1911
2210
|
meta.commit,
|
|
1912
|
-
"
|
|
2211
|
+
" Branch: ",
|
|
1913
2212
|
meta.branch,
|
|
1914
|
-
"
|
|
2213
|
+
" Seed: ",
|
|
1915
2214
|
meta.seed
|
|
1916
2215
|
] }, "meta-2"),
|
|
1917
2216
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1918
2217
|
"Duration: ",
|
|
1919
2218
|
meta.duration,
|
|
1920
|
-
"
|
|
2219
|
+
" Concurrency: ",
|
|
1921
2220
|
meta.concurrency
|
|
1922
2221
|
] }, "meta-3"),
|
|
1923
2222
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
@@ -1929,7 +2228,15 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1929
2228
|
...dimensions.map((d) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
|
|
1930
2229
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp2"),
|
|
1931
2230
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
|
|
1932
|
-
...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2231
|
+
...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2232
|
+
CheckRow,
|
|
2233
|
+
{
|
|
2234
|
+
name: c.name,
|
|
2235
|
+
passed: c.passed,
|
|
2236
|
+
detail: c.detail
|
|
2237
|
+
},
|
|
2238
|
+
`chk-${c.name}`
|
|
2239
|
+
)),
|
|
1933
2240
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp3"),
|
|
1934
2241
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Performance" }, "perf-h"),
|
|
1935
2242
|
/* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -1942,16 +2249,16 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1942
2249
|
"perf-rate"
|
|
1943
2250
|
),
|
|
1944
2251
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1945
|
-
"latency avg
|
|
2252
|
+
"latency avg ",
|
|
1946
2253
|
performance.latencyAvgMs,
|
|
1947
|
-
"ms
|
|
2254
|
+
"ms p95 ",
|
|
1948
2255
|
performance.latencyP95Ms,
|
|
1949
2256
|
"ms"
|
|
1950
2257
|
] }, "perf-lat"),
|
|
1951
2258
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1952
|
-
"tokens avg
|
|
2259
|
+
"tokens avg ",
|
|
1953
2260
|
performance.tokensAvg,
|
|
1954
|
-
"
|
|
2261
|
+
" p95 ",
|
|
1955
2262
|
performance.tokensP95
|
|
1956
2263
|
] }, "perf-tok"),
|
|
1957
2264
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp4"),
|
|
@@ -1975,6 +2282,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1975
2282
|
rows.push(/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp6"));
|
|
1976
2283
|
rows.push(/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Test cases" }, "tc-h"));
|
|
1977
2284
|
for (const tc of testCases) {
|
|
2285
|
+
const rerunPart = tc.rerunTotal != null && tc.rerunIndex != null ? ` (${tc.rerunIndex}/${tc.rerunTotal})` : "";
|
|
1978
2286
|
rows.push(
|
|
1979
2287
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
1980
2288
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "cyan", children: [
|
|
@@ -1986,12 +2294,13 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1986
2294
|
] }),
|
|
1987
2295
|
" ",
|
|
1988
2296
|
tc.testCaseName,
|
|
2297
|
+
rerunPart ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", children: rerunPart }) : null,
|
|
1989
2298
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1990
2299
|
" (",
|
|
1991
2300
|
tc.durationMs,
|
|
1992
2301
|
"ms)"
|
|
1993
2302
|
] })
|
|
1994
|
-
] }, `tc-${tc.testCaseId}`)
|
|
2303
|
+
] }, `tc-${tc.testCaseId}-${tc.rerunIndex ?? 0}`)
|
|
1995
2304
|
);
|
|
1996
2305
|
for (const item of tc.evaluatorScores) {
|
|
1997
2306
|
const name = evaluatorNameById.get(item.evaluatorId) ?? item.evaluatorId;
|
|
@@ -2059,12 +2368,12 @@ function RunDetailsView({
|
|
|
2059
2368
|
}) {
|
|
2060
2369
|
const runs = dataset?.runs ?? [];
|
|
2061
2370
|
const rightFocused = state.focus === "right";
|
|
2062
|
-
const [testCases, setTestCases] =
|
|
2063
|
-
const evaluatorNameById =
|
|
2371
|
+
const [testCases, setTestCases] = React2.useState([]);
|
|
2372
|
+
const evaluatorNameById = React2__default.default.useMemo(
|
|
2064
2373
|
() => new Map(evaluators.map((e) => [e.id, e.name])),
|
|
2065
2374
|
[evaluators]
|
|
2066
2375
|
);
|
|
2067
|
-
|
|
2376
|
+
React2.useEffect(() => {
|
|
2068
2377
|
if (!selectedRun?.meta?.artifact) {
|
|
2069
2378
|
setTestCases([]);
|
|
2070
2379
|
return;
|
|
@@ -2083,7 +2392,7 @@ function RunDetailsView({
|
|
|
2083
2392
|
const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
|
|
2084
2393
|
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
2085
2394
|
/* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
|
|
2086
|
-
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2395
|
+
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React2__default.default.Fragment, { children: row }, i)) }) })
|
|
2087
2396
|
] });
|
|
2088
2397
|
}
|
|
2089
2398
|
var LEFT_PANE_WIDTH3 = 44;
|
|
@@ -2163,16 +2472,17 @@ function EvalsCliApp({
|
|
|
2163
2472
|
}) {
|
|
2164
2473
|
const { exit } = ink.useApp();
|
|
2165
2474
|
const { width: stdoutWidth, height: stdoutHeight } = fullscreenInk.useScreenSize();
|
|
2166
|
-
const [liveData, setLiveData] =
|
|
2167
|
-
const [runtimeMessage, setRuntimeMessage] =
|
|
2168
|
-
const
|
|
2475
|
+
const [liveData, setLiveData] = React2.useState(data);
|
|
2476
|
+
const [runtimeMessage, setRuntimeMessage] = React2.useState();
|
|
2477
|
+
const overviewRowCountRef = React2.useRef(0);
|
|
2478
|
+
const [state, dispatch] = React2.useReducer(
|
|
2169
2479
|
reduceCliState,
|
|
2170
2480
|
createInitialState(data, args)
|
|
2171
2481
|
);
|
|
2172
|
-
|
|
2482
|
+
React2.useEffect(() => {
|
|
2173
2483
|
setLiveData(data);
|
|
2174
2484
|
}, [data]);
|
|
2175
|
-
|
|
2485
|
+
React2.useEffect(() => {
|
|
2176
2486
|
if (!runner) {
|
|
2177
2487
|
return void 0;
|
|
2178
2488
|
}
|
|
@@ -2191,7 +2501,7 @@ function EvalsCliApp({
|
|
|
2191
2501
|
}
|
|
2192
2502
|
});
|
|
2193
2503
|
}, [runner]);
|
|
2194
|
-
const filteredDatasets =
|
|
2504
|
+
const filteredDatasets = React2.useMemo(
|
|
2195
2505
|
() => getFilteredDatasets(liveData, state.searchQuery),
|
|
2196
2506
|
[liveData, state.searchQuery]
|
|
2197
2507
|
);
|
|
@@ -2244,7 +2554,16 @@ function EvalsCliApp({
|
|
|
2244
2554
|
return;
|
|
2245
2555
|
}
|
|
2246
2556
|
if (key.downArrow) {
|
|
2247
|
-
|
|
2557
|
+
let max;
|
|
2558
|
+
if (clampedState.level === "datasets") {
|
|
2559
|
+
max = clampedState.focus === "right" ? Math.max(0, overviewRowCountRef.current - OVERVIEW_PAGE_SIZE) : filteredDatasets.length;
|
|
2560
|
+
} else if (clampedState.level === "runs") {
|
|
2561
|
+
max = selectedDataset?.runs.length ?? 0;
|
|
2562
|
+
} else if (clampedState.level === "new-evaluation") {
|
|
2563
|
+
max = Math.max(0, visibleEvaluators.length - 1);
|
|
2564
|
+
} else {
|
|
2565
|
+
max = 100;
|
|
2566
|
+
}
|
|
2248
2567
|
dispatch({ type: "MOVE_DOWN", max });
|
|
2249
2568
|
return;
|
|
2250
2569
|
}
|
|
@@ -2262,7 +2581,7 @@ function EvalsCliApp({
|
|
|
2262
2581
|
}
|
|
2263
2582
|
return;
|
|
2264
2583
|
}
|
|
2265
|
-
if (isBackKey(key)) {
|
|
2584
|
+
if (isBackKey(key) || input === "\x7F" || input === "\b") {
|
|
2266
2585
|
dispatch({ type: "BACK" });
|
|
2267
2586
|
return;
|
|
2268
2587
|
}
|
|
@@ -2315,7 +2634,8 @@ function EvalsCliApp({
|
|
|
2315
2634
|
{
|
|
2316
2635
|
state: clampedState,
|
|
2317
2636
|
filteredDatasets,
|
|
2318
|
-
selectedDataset
|
|
2637
|
+
selectedDataset,
|
|
2638
|
+
overviewRowCountRef
|
|
2319
2639
|
}
|
|
2320
2640
|
);
|
|
2321
2641
|
}
|