@m4trix/evals 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +129 -29
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +129 -29
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +591 -380
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +582 -371
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +79 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +5 -0
- package/dist/index.js +79 -11
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli.cjs
CHANGED
|
@@ -2,17 +2,18 @@
|
|
|
2
2
|
'use strict';
|
|
3
3
|
|
|
4
4
|
var fullscreenInk = require('fullscreen-ink');
|
|
5
|
-
var
|
|
5
|
+
var React2 = require('react');
|
|
6
6
|
var ink = require('ink');
|
|
7
7
|
var jsxRuntime = require('react/jsx-runtime');
|
|
8
8
|
var path = require('path');
|
|
9
|
-
var
|
|
9
|
+
var inkChart = require('@pppp606/ink-chart');
|
|
10
10
|
var crypto = require('crypto');
|
|
11
11
|
var effect = require('effect');
|
|
12
12
|
var fs = require('fs');
|
|
13
13
|
var jitiModule = require('jiti');
|
|
14
14
|
var promises = require('fs/promises');
|
|
15
15
|
var url = require('url');
|
|
16
|
+
var diff = require('diff');
|
|
16
17
|
|
|
17
18
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
18
19
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
@@ -35,7 +36,7 @@ function _interopNamespace(e) {
|
|
|
35
36
|
return Object.freeze(n);
|
|
36
37
|
}
|
|
37
38
|
|
|
38
|
-
var
|
|
39
|
+
var React2__default = /*#__PURE__*/_interopDefault(React2);
|
|
39
40
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
40
41
|
|
|
41
42
|
var SEP = " ";
|
|
@@ -104,7 +105,7 @@ function getBreadcrumbText(state, datasetName, runLabel) {
|
|
|
104
105
|
// src/cli/components/Footer.tsx
|
|
105
106
|
function getFooterText(state) {
|
|
106
107
|
if (state.level === "datasets") {
|
|
107
|
-
return "\u2191\u2193
|
|
108
|
+
return state.focus === "right" ? "\u2191\u2193 scroll Tab focus left / search q quit" : "\u2191\u2193 move Enter open Tab focus right / search q quit";
|
|
108
109
|
}
|
|
109
110
|
if (state.level === "runs") {
|
|
110
111
|
return "\u2191\u2193 move Enter details Backspace datasets Tab focus q quit";
|
|
@@ -636,6 +637,7 @@ function createInitialState(data, args) {
|
|
|
636
637
|
datasetMenuIndex,
|
|
637
638
|
runMenuIndex,
|
|
638
639
|
detailsScrollOffset: 0,
|
|
640
|
+
overviewScrollOffset: 0,
|
|
639
641
|
selectedEvaluatorIds: data.evaluators.slice(0, 2).map((item) => item.id),
|
|
640
642
|
evaluatorMenuIndex: 0,
|
|
641
643
|
searchQuery,
|
|
@@ -651,8 +653,11 @@ function reduceCliState(state, action) {
|
|
|
651
653
|
if (state.level === "details" && state.focus === "right") {
|
|
652
654
|
return { ...state, detailsScrollOffset: Math.max(0, state.detailsScrollOffset - 1) };
|
|
653
655
|
}
|
|
656
|
+
if (state.level === "datasets" && state.focus === "right") {
|
|
657
|
+
return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
|
|
658
|
+
}
|
|
654
659
|
if (state.level === "datasets") {
|
|
655
|
-
return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1) };
|
|
660
|
+
return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1), overviewScrollOffset: 0 };
|
|
656
661
|
}
|
|
657
662
|
if (state.level === "runs") {
|
|
658
663
|
return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
|
|
@@ -669,8 +674,11 @@ function reduceCliState(state, action) {
|
|
|
669
674
|
if (state.level === "details" && state.focus === "right") {
|
|
670
675
|
return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
|
|
671
676
|
}
|
|
677
|
+
if (state.level === "datasets" && state.focus === "right") {
|
|
678
|
+
return { ...state, overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1) };
|
|
679
|
+
}
|
|
672
680
|
if (state.level === "datasets") {
|
|
673
|
-
return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1) };
|
|
681
|
+
return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1), overviewScrollOffset: 0 };
|
|
674
682
|
}
|
|
675
683
|
if (state.level === "runs") {
|
|
676
684
|
return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
|
|
@@ -746,292 +754,6 @@ function reduceCliState(state, action) {
|
|
|
746
754
|
}
|
|
747
755
|
return state;
|
|
748
756
|
}
|
|
749
|
-
var LEFT_PANE_WIDTH2 = 44;
|
|
750
|
-
function DatasetsView({
|
|
751
|
-
state,
|
|
752
|
-
filteredDatasets,
|
|
753
|
-
selectedDataset
|
|
754
|
-
}) {
|
|
755
|
-
const leftFocused = state.focus === "left";
|
|
756
|
-
const rightFocused = state.focus === "right";
|
|
757
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
758
|
-
/* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
|
|
759
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
|
|
760
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
761
|
-
ListItem,
|
|
762
|
-
{
|
|
763
|
-
selected: state.datasetMenuIndex === 0,
|
|
764
|
-
label: "New evaluation",
|
|
765
|
-
itemKey: "datasets-new-eval"
|
|
766
|
-
}
|
|
767
|
-
),
|
|
768
|
-
filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
769
|
-
ListItem,
|
|
770
|
-
{
|
|
771
|
-
selected: state.datasetMenuIndex === index + 1,
|
|
772
|
-
label: dataset.name,
|
|
773
|
-
itemKey: `dataset-${dataset.id}`
|
|
774
|
-
},
|
|
775
|
-
dataset.id
|
|
776
|
-
))
|
|
777
|
-
] }),
|
|
778
|
-
/* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
|
|
779
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
|
|
780
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." })
|
|
781
|
-
] })
|
|
782
|
-
] });
|
|
783
|
-
}
|
|
784
|
-
function RunsView({
|
|
785
|
-
state,
|
|
786
|
-
dataset,
|
|
787
|
-
selectedRun
|
|
788
|
-
}) {
|
|
789
|
-
const runs = dataset?.runs ?? [];
|
|
790
|
-
const rightFocused = state.focus === "right";
|
|
791
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
792
|
-
/* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
|
|
793
|
-
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: !selectedRun ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." }) : /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
794
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
795
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
|
|
796
|
-
" ",
|
|
797
|
-
selectedRun.label,
|
|
798
|
-
" ",
|
|
799
|
-
/* @__PURE__ */ jsxRuntime.jsx(StatusText, { status: selectedRun.status })
|
|
800
|
-
] }),
|
|
801
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
802
|
-
"Commit: ",
|
|
803
|
-
selectedRun.meta.commit,
|
|
804
|
-
" Branch: ",
|
|
805
|
-
selectedRun.meta.branch,
|
|
806
|
-
" ",
|
|
807
|
-
"Seed: ",
|
|
808
|
-
selectedRun.meta.seed
|
|
809
|
-
] }),
|
|
810
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
811
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overall" }),
|
|
812
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
813
|
-
TextBar,
|
|
814
|
-
{
|
|
815
|
-
label: "pass rate",
|
|
816
|
-
value: selectedRun.performance.passRate,
|
|
817
|
-
format: (v) => `${v}%`
|
|
818
|
-
}
|
|
819
|
-
),
|
|
820
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
821
|
-
TextBar,
|
|
822
|
-
{
|
|
823
|
-
label: "avg score",
|
|
824
|
-
value: Math.round(selectedRun.performance.avgScore * 100)
|
|
825
|
-
}
|
|
826
|
-
),
|
|
827
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
828
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
|
|
829
|
-
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
830
|
-
TextBar,
|
|
831
|
-
{
|
|
832
|
-
label: dimension.name,
|
|
833
|
-
value: dimension.score
|
|
834
|
-
},
|
|
835
|
-
dimension.name
|
|
836
|
-
)),
|
|
837
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
838
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
|
|
839
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
840
|
-
Sparkline,
|
|
841
|
-
{
|
|
842
|
-
data: selectedRun.performance.latencyHistoryMs ?? [
|
|
843
|
-
selectedRun.performance.latencyAvgMs - 40,
|
|
844
|
-
selectedRun.performance.latencyAvgMs - 10,
|
|
845
|
-
selectedRun.performance.latencyAvgMs + 20,
|
|
846
|
-
selectedRun.performance.latencyP95Ms - 80,
|
|
847
|
-
selectedRun.performance.latencyP95Ms
|
|
848
|
-
],
|
|
849
|
-
width: 24
|
|
850
|
-
}
|
|
851
|
-
)
|
|
852
|
-
] }) })
|
|
853
|
-
] });
|
|
854
|
-
}
|
|
855
|
-
|
|
856
|
-
// src/evals/metric.ts
|
|
857
|
-
var registry = /* @__PURE__ */ new Map();
|
|
858
|
-
var Metric = {
|
|
859
|
-
of(config) {
|
|
860
|
-
const def = {
|
|
861
|
-
id: config.id,
|
|
862
|
-
name: config.name,
|
|
863
|
-
aggregate: config.aggregate,
|
|
864
|
-
format: config.format,
|
|
865
|
-
make: (data) => ({ id: config.id, data })
|
|
866
|
-
};
|
|
867
|
-
registry.set(config.id, def);
|
|
868
|
-
return def;
|
|
869
|
-
}
|
|
870
|
-
};
|
|
871
|
-
function getMetricById(id) {
|
|
872
|
-
return registry.get(id);
|
|
873
|
-
}
|
|
874
|
-
|
|
875
|
-
// src/evals/score.ts
|
|
876
|
-
var registry2 = /* @__PURE__ */ new Map();
|
|
877
|
-
var Score = {
|
|
878
|
-
of(config) {
|
|
879
|
-
const def = {
|
|
880
|
-
id: config.id,
|
|
881
|
-
name: config.name,
|
|
882
|
-
displayStrategy: config.displayStrategy,
|
|
883
|
-
aggregate: config.aggregate,
|
|
884
|
-
format: config.format,
|
|
885
|
-
make: (data, options) => {
|
|
886
|
-
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
887
|
-
return {
|
|
888
|
-
id: config.id,
|
|
889
|
-
data,
|
|
890
|
-
...passed !== void 0 && { passed }
|
|
891
|
-
};
|
|
892
|
-
}
|
|
893
|
-
};
|
|
894
|
-
registry2.set(config.id, def);
|
|
895
|
-
return def;
|
|
896
|
-
}
|
|
897
|
-
};
|
|
898
|
-
function getScoreById(id) {
|
|
899
|
-
return registry2.get(id);
|
|
900
|
-
}
|
|
901
|
-
|
|
902
|
-
// src/evals/aggregators.ts
|
|
903
|
-
function aggregateAverage(values) {
|
|
904
|
-
if (values.length === 0) {
|
|
905
|
-
return { value: 0 };
|
|
906
|
-
}
|
|
907
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
908
|
-
return { value: sum / values.length };
|
|
909
|
-
}
|
|
910
|
-
function aggregateAll(values) {
|
|
911
|
-
return { passed: values.length > 0 && values.every((v) => v.passed) };
|
|
912
|
-
}
|
|
913
|
-
function aggregateTokenCountSum(values) {
|
|
914
|
-
const initial = {
|
|
915
|
-
input: 0,
|
|
916
|
-
output: 0,
|
|
917
|
-
inputCached: 0,
|
|
918
|
-
outputCached: 0
|
|
919
|
-
};
|
|
920
|
-
return values.reduce(
|
|
921
|
-
(acc, v) => ({
|
|
922
|
-
input: acc.input + (v.input ?? 0),
|
|
923
|
-
output: acc.output + (v.output ?? 0),
|
|
924
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
925
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
926
|
-
}),
|
|
927
|
-
initial
|
|
928
|
-
);
|
|
929
|
-
}
|
|
930
|
-
function aggregateLatencyAverage(values) {
|
|
931
|
-
if (values.length === 0) {
|
|
932
|
-
return { ms: 0 };
|
|
933
|
-
}
|
|
934
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
935
|
-
return { ms: sum / values.length };
|
|
936
|
-
}
|
|
937
|
-
|
|
938
|
-
// src/evals/metrics/standard.ts
|
|
939
|
-
Metric.of({
|
|
940
|
-
id: "token-count",
|
|
941
|
-
name: "Tokens",
|
|
942
|
-
aggregate: aggregateTokenCountSum,
|
|
943
|
-
format: (data, options) => {
|
|
944
|
-
const input = data.input ?? 0;
|
|
945
|
-
const output = data.output ?? 0;
|
|
946
|
-
const inputCached = data.inputCached ?? 0;
|
|
947
|
-
const outputCached = data.outputCached ?? 0;
|
|
948
|
-
const cached = inputCached + outputCached;
|
|
949
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
950
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
951
|
-
}
|
|
952
|
-
});
|
|
953
|
-
Metric.of({
|
|
954
|
-
id: "latency",
|
|
955
|
-
name: "Latency",
|
|
956
|
-
aggregate: aggregateLatencyAverage,
|
|
957
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
958
|
-
});
|
|
959
|
-
|
|
960
|
-
// src/evals/scores/standard.ts
|
|
961
|
-
Score.of({
|
|
962
|
-
id: "percent",
|
|
963
|
-
name: "Score",
|
|
964
|
-
displayStrategy: "bar",
|
|
965
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
|
|
966
|
-
aggregate: aggregateAverage
|
|
967
|
-
});
|
|
968
|
-
Score.of({
|
|
969
|
-
id: "binary",
|
|
970
|
-
name: "Result",
|
|
971
|
-
displayStrategy: "passFail",
|
|
972
|
-
format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
|
|
973
|
-
aggregate: aggregateAll
|
|
974
|
-
});
|
|
975
|
-
function createDiffLogEntry(expected, actual, options) {
|
|
976
|
-
const diff = jsonDiff.diffString(expected, actual, { color: false });
|
|
977
|
-
return {
|
|
978
|
-
type: "diff",
|
|
979
|
-
label: options?.label,
|
|
980
|
-
expected,
|
|
981
|
-
actual,
|
|
982
|
-
diff: diff || "(no differences)"
|
|
983
|
-
};
|
|
984
|
-
}
|
|
985
|
-
function getDiffLines(entry) {
|
|
986
|
-
const raw = jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
|
|
987
|
-
return raw.split("\n").map((line) => {
|
|
988
|
-
const trimmed = line.trimStart();
|
|
989
|
-
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
990
|
-
return { type: "remove", line };
|
|
991
|
-
}
|
|
992
|
-
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
993
|
-
return { type: "add", line };
|
|
994
|
-
}
|
|
995
|
-
return { type: "context", line };
|
|
996
|
-
});
|
|
997
|
-
}
|
|
998
|
-
|
|
999
|
-
// src/runner/score-utils.ts
|
|
1000
|
-
function toNumericScoreFromScores(scores) {
|
|
1001
|
-
for (const item of scores) {
|
|
1002
|
-
const def = getScoreById(item.id);
|
|
1003
|
-
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
1004
|
-
const value = item.data.value;
|
|
1005
|
-
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1006
|
-
return value;
|
|
1007
|
-
}
|
|
1008
|
-
}
|
|
1009
|
-
const numeric = toNumericScore(item.data);
|
|
1010
|
-
if (numeric !== void 0) {
|
|
1011
|
-
return numeric;
|
|
1012
|
-
}
|
|
1013
|
-
}
|
|
1014
|
-
return void 0;
|
|
1015
|
-
}
|
|
1016
|
-
function toNumericScore(value) {
|
|
1017
|
-
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1018
|
-
return value;
|
|
1019
|
-
}
|
|
1020
|
-
if (typeof value !== "object" || value === null) {
|
|
1021
|
-
return void 0;
|
|
1022
|
-
}
|
|
1023
|
-
const obj = value;
|
|
1024
|
-
if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
|
|
1025
|
-
return obj.score;
|
|
1026
|
-
}
|
|
1027
|
-
const numberValues = Object.values(value).filter(
|
|
1028
|
-
(entry) => typeof entry === "number" && Number.isFinite(entry)
|
|
1029
|
-
);
|
|
1030
|
-
if (numberValues.length === 0) {
|
|
1031
|
-
return void 0;
|
|
1032
|
-
}
|
|
1033
|
-
return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
|
|
1034
|
-
}
|
|
1035
757
|
|
|
1036
758
|
// src/runner/config.ts
|
|
1037
759
|
var defaultRunnerConfig = {
|
|
@@ -1213,75 +935,311 @@ async function loadModuleExports(filePath) {
|
|
|
1213
935
|
if (!createJiti2) {
|
|
1214
936
|
throw new Error("Failed to initialize jiti TypeScript loader");
|
|
1215
937
|
}
|
|
1216
|
-
jitiLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
|
|
1217
|
-
interopDefault: true,
|
|
1218
|
-
moduleCache: true
|
|
1219
|
-
});
|
|
938
|
+
jitiLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
|
|
939
|
+
interopDefault: true,
|
|
940
|
+
moduleCache: true
|
|
941
|
+
});
|
|
942
|
+
}
|
|
943
|
+
const loaded2 = jitiLoader.import ? await jitiLoader.import(filePath) : await Promise.resolve(jitiLoader(filePath));
|
|
944
|
+
return Object.values(loaded2);
|
|
945
|
+
}
|
|
946
|
+
const moduleUrl = url.pathToFileURL(filePath).href;
|
|
947
|
+
const loaded = await import(moduleUrl);
|
|
948
|
+
return Object.values(loaded);
|
|
949
|
+
}
|
|
950
|
+
async function collectDatasetsFromFiles(config) {
|
|
951
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
952
|
+
const matched = files.filter(
|
|
953
|
+
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
954
|
+
);
|
|
955
|
+
const found = await Promise.all(
|
|
956
|
+
matched.map(async (absolutePath) => {
|
|
957
|
+
const exports = await loadModuleExports(absolutePath);
|
|
958
|
+
const datasets = exports.filter(isDatasetLike);
|
|
959
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
960
|
+
return datasets.map((dataset) => ({
|
|
961
|
+
id: toId("dataset", relPath, dataset.getName()),
|
|
962
|
+
filePath: relPath,
|
|
963
|
+
dataset
|
|
964
|
+
}));
|
|
965
|
+
})
|
|
966
|
+
);
|
|
967
|
+
return found.flat();
|
|
968
|
+
}
|
|
969
|
+
async function collectEvaluatorsFromFiles(config) {
|
|
970
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
971
|
+
const matched = files.filter(
|
|
972
|
+
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
973
|
+
);
|
|
974
|
+
const found = await Promise.all(
|
|
975
|
+
matched.map(async (absolutePath) => {
|
|
976
|
+
const exports = await loadModuleExports(absolutePath);
|
|
977
|
+
const evaluators = exports.filter(isEvaluatorLike);
|
|
978
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
979
|
+
return evaluators.map((evaluator) => ({
|
|
980
|
+
id: toId("evaluator", relPath, evaluator.getName()),
|
|
981
|
+
filePath: relPath,
|
|
982
|
+
evaluator
|
|
983
|
+
}));
|
|
984
|
+
})
|
|
985
|
+
);
|
|
986
|
+
return found.flat();
|
|
987
|
+
}
|
|
988
|
+
async function collectTestCasesFromFiles(config) {
|
|
989
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
990
|
+
const matched = files.filter(
|
|
991
|
+
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
992
|
+
);
|
|
993
|
+
const found = await Promise.all(
|
|
994
|
+
matched.map(async (absolutePath) => {
|
|
995
|
+
const exports = await loadModuleExports(absolutePath);
|
|
996
|
+
const testCases = exports.filter(isTestCaseLike);
|
|
997
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
998
|
+
return testCases.map((testCase) => ({
|
|
999
|
+
id: toId("test-case", relPath, testCase.getName()),
|
|
1000
|
+
filePath: relPath,
|
|
1001
|
+
testCase
|
|
1002
|
+
}));
|
|
1003
|
+
})
|
|
1004
|
+
);
|
|
1005
|
+
return found.flat();
|
|
1006
|
+
}
|
|
1007
|
+
function toJsonLines(value) {
|
|
1008
|
+
try {
|
|
1009
|
+
return JSON.stringify(value, null, 2);
|
|
1010
|
+
} catch {
|
|
1011
|
+
return String(value);
|
|
1012
|
+
}
|
|
1013
|
+
}
|
|
1014
|
+
function formatDiffString(changes) {
|
|
1015
|
+
const lines = [];
|
|
1016
|
+
for (const part of changes) {
|
|
1017
|
+
const prefix = part.added ? "+" : part.removed ? "-" : " ";
|
|
1018
|
+
const partLines = part.value.split("\n");
|
|
1019
|
+
if (partLines[partLines.length - 1] === "") {
|
|
1020
|
+
partLines.pop();
|
|
1021
|
+
}
|
|
1022
|
+
for (const line of partLines) {
|
|
1023
|
+
lines.push(`${prefix} ${line}`);
|
|
1024
|
+
}
|
|
1025
|
+
}
|
|
1026
|
+
return lines.join("\n");
|
|
1027
|
+
}
|
|
1028
|
+
function createDiffString(expected, actual) {
|
|
1029
|
+
const expectedStr = toJsonLines(expected);
|
|
1030
|
+
const actualStr = toJsonLines(actual);
|
|
1031
|
+
const changes = diff.diffLines(expectedStr, actualStr);
|
|
1032
|
+
return formatDiffString(changes);
|
|
1033
|
+
}
|
|
1034
|
+
function createDiffLogEntry(expected, actual, options) {
|
|
1035
|
+
const diff = createDiffString(expected, actual);
|
|
1036
|
+
return {
|
|
1037
|
+
type: "diff",
|
|
1038
|
+
label: options?.label,
|
|
1039
|
+
expected,
|
|
1040
|
+
actual,
|
|
1041
|
+
diff: diff || "(no differences)"
|
|
1042
|
+
};
|
|
1043
|
+
}
|
|
1044
|
+
function getDiffLines(entry) {
|
|
1045
|
+
const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
|
|
1046
|
+
return raw.split("\n").map((line) => {
|
|
1047
|
+
const trimmed = line.trimStart();
|
|
1048
|
+
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
1049
|
+
return { type: "remove", line };
|
|
1050
|
+
}
|
|
1051
|
+
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
1052
|
+
return { type: "add", line };
|
|
1053
|
+
}
|
|
1054
|
+
return { type: "context", line };
|
|
1055
|
+
});
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
// src/evals/metric.ts
|
|
1059
|
+
var registry = /* @__PURE__ */ new Map();
|
|
1060
|
+
var Metric = {
|
|
1061
|
+
of(config) {
|
|
1062
|
+
const def = {
|
|
1063
|
+
id: config.id,
|
|
1064
|
+
name: config.name,
|
|
1065
|
+
aggregate: config.aggregate,
|
|
1066
|
+
format: config.format,
|
|
1067
|
+
make: (data) => ({ id: config.id, data })
|
|
1068
|
+
};
|
|
1069
|
+
registry.set(config.id, def);
|
|
1070
|
+
return def;
|
|
1071
|
+
}
|
|
1072
|
+
};
|
|
1073
|
+
function getMetricById(id) {
|
|
1074
|
+
return registry.get(id);
|
|
1075
|
+
}
|
|
1076
|
+
|
|
1077
|
+
// src/evals/score.ts
|
|
1078
|
+
var registry2 = /* @__PURE__ */ new Map();
|
|
1079
|
+
var Score = {
|
|
1080
|
+
of(config) {
|
|
1081
|
+
const def = {
|
|
1082
|
+
id: config.id,
|
|
1083
|
+
name: config.name,
|
|
1084
|
+
displayStrategy: config.displayStrategy,
|
|
1085
|
+
aggregate: config.aggregate,
|
|
1086
|
+
format: config.format,
|
|
1087
|
+
make: (data, options) => {
|
|
1088
|
+
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
1089
|
+
return {
|
|
1090
|
+
id: config.id,
|
|
1091
|
+
data,
|
|
1092
|
+
...passed !== void 0 && { passed }
|
|
1093
|
+
};
|
|
1094
|
+
}
|
|
1095
|
+
};
|
|
1096
|
+
registry2.set(config.id, def);
|
|
1097
|
+
return def;
|
|
1098
|
+
}
|
|
1099
|
+
};
|
|
1100
|
+
function getScoreById(id) {
|
|
1101
|
+
return registry2.get(id);
|
|
1102
|
+
}
|
|
1103
|
+
|
|
1104
|
+
// src/evals/aggregators.ts
|
|
1105
|
+
function aggregateAverageWithVariance(values) {
|
|
1106
|
+
if (values.length === 0) {
|
|
1107
|
+
return { value: 0, count: 0 };
|
|
1108
|
+
}
|
|
1109
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
1110
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
1111
|
+
const mean = sum / values.length;
|
|
1112
|
+
let stdDev;
|
|
1113
|
+
if (values.length >= 2) {
|
|
1114
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
1115
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1116
|
+
}
|
|
1117
|
+
return { value: mean, stdDev, count: values.length };
|
|
1118
|
+
}
|
|
1119
|
+
function aggregateAll(values) {
|
|
1120
|
+
const total = values.length;
|
|
1121
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
1122
|
+
return {
|
|
1123
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
1124
|
+
passedCount,
|
|
1125
|
+
totalCount: total
|
|
1126
|
+
};
|
|
1127
|
+
}
|
|
1128
|
+
function aggregateTokenCountSum(values) {
|
|
1129
|
+
const initial = {
|
|
1130
|
+
input: 0,
|
|
1131
|
+
output: 0,
|
|
1132
|
+
inputCached: 0,
|
|
1133
|
+
outputCached: 0
|
|
1134
|
+
};
|
|
1135
|
+
return values.reduce(
|
|
1136
|
+
(acc, v) => ({
|
|
1137
|
+
input: acc.input + (v.input ?? 0),
|
|
1138
|
+
output: acc.output + (v.output ?? 0),
|
|
1139
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1140
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1141
|
+
}),
|
|
1142
|
+
initial
|
|
1143
|
+
);
|
|
1144
|
+
}
|
|
1145
|
+
function aggregateLatencyAverage(values) {
|
|
1146
|
+
if (values.length === 0) {
|
|
1147
|
+
return { ms: 0 };
|
|
1148
|
+
}
|
|
1149
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1150
|
+
return { ms: sum / values.length };
|
|
1151
|
+
}
|
|
1152
|
+
|
|
1153
|
+
// src/evals/metrics/standard.ts
|
|
1154
|
+
Metric.of({
|
|
1155
|
+
id: "token-count",
|
|
1156
|
+
name: "Tokens",
|
|
1157
|
+
aggregate: aggregateTokenCountSum,
|
|
1158
|
+
format: (data, options) => {
|
|
1159
|
+
const input = data.input ?? 0;
|
|
1160
|
+
const output = data.output ?? 0;
|
|
1161
|
+
const inputCached = data.inputCached ?? 0;
|
|
1162
|
+
const outputCached = data.outputCached ?? 0;
|
|
1163
|
+
const cached = inputCached + outputCached;
|
|
1164
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1165
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1166
|
+
}
|
|
1167
|
+
});
|
|
1168
|
+
Metric.of({
|
|
1169
|
+
id: "latency",
|
|
1170
|
+
name: "Latency",
|
|
1171
|
+
aggregate: aggregateLatencyAverage,
|
|
1172
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1173
|
+
});
|
|
1174
|
+
|
|
1175
|
+
// src/evals/scores/standard.ts
|
|
1176
|
+
Score.of({
|
|
1177
|
+
id: "percent",
|
|
1178
|
+
name: "Score",
|
|
1179
|
+
displayStrategy: "bar",
|
|
1180
|
+
format: (data, options) => {
|
|
1181
|
+
if (options?.isAggregated) {
|
|
1182
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
1183
|
+
}
|
|
1184
|
+
return data.value.toFixed(2);
|
|
1185
|
+
},
|
|
1186
|
+
aggregate: aggregateAverageWithVariance
|
|
1187
|
+
});
|
|
1188
|
+
Score.of({
|
|
1189
|
+
id: "binary",
|
|
1190
|
+
name: "Result",
|
|
1191
|
+
displayStrategy: "passFail",
|
|
1192
|
+
format: (data, options) => {
|
|
1193
|
+
if (options?.isAggregated) {
|
|
1194
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
1195
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
1196
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
1197
|
+
}
|
|
1198
|
+
return base;
|
|
1199
|
+
}
|
|
1200
|
+
return data.passed ? "PASSED" : "NOT PASSED";
|
|
1201
|
+
},
|
|
1202
|
+
aggregate: aggregateAll
|
|
1203
|
+
});
|
|
1204
|
+
|
|
1205
|
+
// src/runner/score-utils.ts
|
|
1206
|
+
function toNumericScoreFromScores(scores) {
|
|
1207
|
+
for (const item of scores) {
|
|
1208
|
+
const def = getScoreById(item.id);
|
|
1209
|
+
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
1210
|
+
const value = item.data.value;
|
|
1211
|
+
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1212
|
+
return value;
|
|
1213
|
+
}
|
|
1214
|
+
}
|
|
1215
|
+
const numeric = toNumericScore(item.data);
|
|
1216
|
+
if (numeric !== void 0) {
|
|
1217
|
+
return numeric;
|
|
1220
1218
|
}
|
|
1221
|
-
const loaded2 = jitiLoader.import ? await jitiLoader.import(filePath) : await Promise.resolve(jitiLoader(filePath));
|
|
1222
|
-
return Object.values(loaded2);
|
|
1223
1219
|
}
|
|
1224
|
-
|
|
1225
|
-
const loaded = await import(moduleUrl);
|
|
1226
|
-
return Object.values(loaded);
|
|
1227
|
-
}
|
|
1228
|
-
async function collectDatasetsFromFiles(config) {
|
|
1229
|
-
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1230
|
-
const matched = files.filter(
|
|
1231
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
1232
|
-
);
|
|
1233
|
-
const found = await Promise.all(
|
|
1234
|
-
matched.map(async (absolutePath) => {
|
|
1235
|
-
const exports = await loadModuleExports(absolutePath);
|
|
1236
|
-
const datasets = exports.filter(isDatasetLike);
|
|
1237
|
-
const relPath = path.relative(config.rootDir, absolutePath);
|
|
1238
|
-
return datasets.map((dataset) => ({
|
|
1239
|
-
id: toId("dataset", relPath, dataset.getName()),
|
|
1240
|
-
filePath: relPath,
|
|
1241
|
-
dataset
|
|
1242
|
-
}));
|
|
1243
|
-
})
|
|
1244
|
-
);
|
|
1245
|
-
return found.flat();
|
|
1246
|
-
}
|
|
1247
|
-
async function collectEvaluatorsFromFiles(config) {
|
|
1248
|
-
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1249
|
-
const matched = files.filter(
|
|
1250
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
1251
|
-
);
|
|
1252
|
-
const found = await Promise.all(
|
|
1253
|
-
matched.map(async (absolutePath) => {
|
|
1254
|
-
const exports = await loadModuleExports(absolutePath);
|
|
1255
|
-
const evaluators = exports.filter(isEvaluatorLike);
|
|
1256
|
-
const relPath = path.relative(config.rootDir, absolutePath);
|
|
1257
|
-
return evaluators.map((evaluator) => ({
|
|
1258
|
-
id: toId("evaluator", relPath, evaluator.getName()),
|
|
1259
|
-
filePath: relPath,
|
|
1260
|
-
evaluator
|
|
1261
|
-
}));
|
|
1262
|
-
})
|
|
1263
|
-
);
|
|
1264
|
-
return found.flat();
|
|
1220
|
+
return void 0;
|
|
1265
1221
|
}
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
)
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
testCase
|
|
1280
|
-
}));
|
|
1281
|
-
})
|
|
1222
|
+
function toNumericScore(value) {
|
|
1223
|
+
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1224
|
+
return value;
|
|
1225
|
+
}
|
|
1226
|
+
if (typeof value !== "object" || value === null) {
|
|
1227
|
+
return void 0;
|
|
1228
|
+
}
|
|
1229
|
+
const obj = value;
|
|
1230
|
+
if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
|
|
1231
|
+
return obj.score;
|
|
1232
|
+
}
|
|
1233
|
+
const numberValues = Object.values(value).filter(
|
|
1234
|
+
(entry) => typeof entry === "number" && Number.isFinite(entry)
|
|
1282
1235
|
);
|
|
1283
|
-
|
|
1236
|
+
if (numberValues.length === 0) {
|
|
1237
|
+
return void 0;
|
|
1238
|
+
}
|
|
1239
|
+
return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
|
|
1284
1240
|
}
|
|
1241
|
+
|
|
1242
|
+
// src/runner/execution.ts
|
|
1285
1243
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1286
1244
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1287
1245
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1952,6 +1910,240 @@ var EffectRunner = class {
|
|
|
1952
1910
|
);
|
|
1953
1911
|
}
|
|
1954
1912
|
};
|
|
1913
|
+
var LEFT_PANE_WIDTH2 = 44;
|
|
1914
|
+
var MAX_RUNS_FOR_CHART = 12;
|
|
1915
|
+
var MAX_RUNS_FOR_TREND = 20;
|
|
1916
|
+
var TREND_BATCH_SIZE = 4;
|
|
1917
|
+
/**
 * Compute the mean numeric score across every evaluator result of a run.
 *
 * Each entry of `testCase.evaluatorScores` is converted with
 * `toNumericScoreFromScores(entry.scores)`; entries that convert to
 * `undefined` are left out of the mean.
 *
 * @param {Iterable<{ evaluatorScores: Iterable<{ scores: unknown }> }>} testCases
 *   Parsed test cases of one run.
 * @returns {number | undefined} The mean score, or `undefined` when no
 *   evaluator entry produced a numeric score.
 */
function extractRunAverageScore(testCases) {
  let total = 0;
  let counted = 0;
  for (const testCase of testCases) {
    for (const evaluatorScore of testCase.evaluatorScores) {
      const numeric = toNumericScoreFromScores(evaluatorScore.scores);
      if (numeric === void 0) {
        continue;
      }
      total += numeric;
      counted += 1;
    }
  }
  return counted === 0 ? void 0 : total / counted;
}
|
|
1931
|
+
/**
 * Load the average score of each run from its artifact file.
 *
 * Artifacts are read concurrently, since each read is independent of the
 * others; the returned array still follows the order of `runs`.
 *
 * Loading is best-effort: a run is left out of the result when it has no
 * `meta.artifact`, when its artifact cannot be parsed, or when no numeric
 * score can be derived from it. This function therefore does not reject
 * because of a single bad artifact.
 *
 * @param {Iterable<{ id: string, label: string, meta?: { artifact?: string } }>} runs
 *   Runs to load scores for.
 * @returns {Promise<Array<{ runId: string, label: string, value: number }>>}
 *   One entry per run that produced a score.
 */
async function loadRunScores(runs) {
  const loaded = await Promise.all(
    Array.from(runs, async (run) => {
      const artifact = run.meta?.artifact;
      if (!artifact) {
        return void 0;
      }
      try {
        const testCases = await parseArtifactFile(path.resolve(artifact));
        const avg = extractRunAverageScore(testCases);
        if (avg === void 0) {
          return void 0;
        }
        return {
          runId: run.id,
          label: run.label,
          value: avg
        };
      } catch {
        // Deliberately ignored: a missing or corrupt artifact must only hide
        // that run's score, not fail the whole overview.
        return void 0;
      }
    })
  );
  return loaded.filter((entry) => entry !== void 0);
}
|
|
1953
|
+
/**
 * Average `values` in consecutive groups of `batchSize`.
 *
 * The final group may be shorter than `batchSize`; it is averaged over the
 * elements it actually contains.
 *
 * @param {number[]} values - Numbers to group, in order.
 * @param {number} batchSize - Group size; must be greater than zero.
 * @returns {number[]} One mean per group, in input order.
 * @throws {RangeError} When `batchSize` is not greater than zero. A zero or
 *   negative size would never advance the loop index and so never terminate.
 */
function batchAverage(values, batchSize) {
  if (!(batchSize > 0)) {
    throw new RangeError(`batchSize must be greater than 0, received ${batchSize}`);
  }
  const batches = [];
  for (let start = 0; start < values.length; start += batchSize) {
    const batch = values.slice(start, start + batchSize);
    // A fractional batchSize can yield an empty slice; skip it rather than
    // emitting NaN from a 0 / 0 division.
    if (batch.length > 0) {
      batches.push(batch.reduce((sum, entry) => sum + entry, 0) / batch.length);
    }
  }
  return batches;
}
|
|
1963
|
+
// Number of overview rows shown at once in the datasets view's right pane.
const OVERVIEW_PAGE_SIZE = 15;
|
|
1964
|
+
/**
 * Two-pane datasets screen: a dataset list on the left and, on the right, a
 * scrollable overview of the selected dataset (description, per-run score
 * bars and a batched score trend line).
 *
 * @param {object} props
 * @param {object} props.state - CLI state; reads `focus`, `datasetMenuIndex`
 *   and `overviewScrollOffset`.
 * @param {Array<{ id: string, name: string }>} props.filteredDatasets -
 *   Datasets listed in the left pane.
 * @param {object} [props.selectedDataset] - Dataset whose overview is shown;
 *   reads `id`, `overview` and `runs`.
 * @param {{ current: number }} [props.overviewRowCountRef] - When provided,
 *   receives the total number of overview rows on every render.
 * @returns {JSX.Element} The rendered panes.
 */
function DatasetsView({
  state,
  filteredDatasets,
  selectedDataset,
  overviewRowCountRef
}) {
  const leftFocused = state.focus === "left";
  const rightFocused = state.focus === "right";
  const [runScores, setRunScores] = React2.useState([]);
  const [loading, setLoading] = React2.useState(false);
  React2.useEffect(() => {
    if (!selectedDataset?.runs?.length) {
      setRunScores([]);
      // A load for a previous dataset may have been cancelled before it could
      // reset the flag, so reset it here.
      setLoading(false);
      return void 0;
    }
    // Set by the cleanup below. Guards against a slow load for a previously
    // selected dataset overwriting the scores of the current one, and against
    // state updates after unmount.
    let cancelled = false;
    setLoading(true);
    const runs = selectedDataset.runs.slice(0, MAX_RUNS_FOR_TREND);
    loadRunScores(runs).then(
      (scores) => {
        if (!cancelled) {
          setRunScores(scores);
        }
      },
      () => {
        if (!cancelled) {
          setRunScores([]);
        }
      }
    ).finally(() => {
      if (!cancelled) {
        setLoading(false);
      }
    });
    return () => {
      cancelled = true;
    };
  }, [selectedDataset?.id, selectedDataset?.runs?.length]);
  // Memoized on `runScores` so the arrays keep a stable identity between
  // renders; otherwise the `overviewRows` memo below would never hit.
  const barData = React2.useMemo(
    () => runScores.slice(0, MAX_RUNS_FOR_CHART).reverse(),
    [runScores]
  );
  const trendBatched = React2.useMemo(
    () => batchAverage(
      runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse(),
      TREND_BATCH_SIZE
    ),
    [runScores]
  );
  const runCount = selectedDataset?.runs?.length ?? 0;
  const overviewRows = React2.useMemo(() => {
    const rows = [];
    rows.push(
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." }, "overview")
    );
    if (runCount > 0) {
      if (loading) {
        rows.push(
          /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Loading run scores\u2026" }, "loading")
        );
      } else if (runScores.length > 0) {
        rows.push(
          /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Scores (last runs)" }, "scores-header")
        );
        for (const d of barData) {
          rows.push(
            /* @__PURE__ */ jsxRuntime.jsx(
              TextBar,
              {
                label: d.label,
                value: d.value,
                labelWidth: 14,
                barWidth: 24,
                max: 100,
                format: (v) => v.toFixed(1)
              },
              d.runId
            )
          );
        }
        if (trendBatched.length > 0) {
          rows.push(
            /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: `Avg trend (last ${MAX_RUNS_FOR_TREND}, batched by ${TREND_BATCH_SIZE})` }, "trend-header")
          );
          rows.push(
            /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { children: /* @__PURE__ */ jsxRuntime.jsx(
              inkChart.LineGraph,
              {
                data: [{ values: trendBatched, color: "cyan" }],
                height: 5,
                width: 45,
                showYAxis: true,
                xLabels: ["older", "newer"]
              }
            ) }, "trend-graph")
          );
        }
      }
    }
    return rows;
  }, [
    selectedDataset?.overview,
    runCount,
    loading,
    runScores,
    barData,
    trendBatched
  ]);
  // Publish the total row count through the caller-supplied ref.
  if (overviewRowCountRef) {
    overviewRowCountRef.current = overviewRows.length;
  }
  const offset = Math.max(0, state.overviewScrollOffset);
  const visibleRows = overviewRows.slice(offset, offset + OVERVIEW_PAGE_SIZE);
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
    /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
      /* @__PURE__ */ jsxRuntime.jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === 0,
          label: "New evaluation",
          itemKey: "datasets-new-eval"
        }
      ),
      // Menu index 0 is the "New evaluation" entry, so datasets start at 1.
      filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsxRuntime.jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === index + 1,
          label: dataset.name,
          itemKey: `dataset-${dataset.id}`
        },
        dataset.id
      ))
    ] }),
    /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visibleRows.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { children: row }, offset + i)) })
    ] })
  ] });
}
|
|
2076
|
+
/**
 * Runs screen: the runs sidebar on the left and a summary of the selected
 * run (identity, overall metrics, per-dimension scores, latency trend) on
 * the right. Shows a placeholder message when no run is selected.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; `focus` decides whether the
 *   right pane is rendered as focused.
 * @param {object} [props.dataset] - Dataset whose `runs` are listed.
 * @param {object} [props.selectedRun] - Run to summarise; reads `label`,
 *   `status`, `meta`, `performance` and `dimensions`.
 * @returns {JSX.Element} The rendered panes.
 */
function RunsView({
  state,
  dataset,
  selectedRun
}) {
  const runs = dataset?.runs ?? [];
  const rightFocused = state.focus === "right";

  let paneContent;
  if (!selectedRun) {
    paneContent = /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." });
  } else {
    const { meta, performance } = selectedRun;

    const titleLine = /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
      " ",
      selectedRun.label,
      " ",
      /* @__PURE__ */ jsxRuntime.jsx(StatusText, { status: selectedRun.status })
    ] });

    const metaLine = /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
      "Commit: ",
      meta.commit,
      " Branch: ",
      meta.branch,
      " ",
      "Seed: ",
      meta.seed
    ] });

    const passRateBar = /* @__PURE__ */ jsxRuntime.jsx(
      TextBar,
      {
        label: "pass rate",
        value: performance.passRate,
        format: (v) => `${v}%`
      }
    );

    const avgScoreBar = /* @__PURE__ */ jsxRuntime.jsx(
      TextBar,
      {
        label: "avg score",
        value: Math.round(performance.avgScore * 100)
      }
    );

    const dimensionBars = selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
      TextBar,
      {
        label: dimension.name,
        value: dimension.score
      },
      dimension.name
    ));

    // When the run carries no latency history, a five-point series is
    // derived from its average and p95 latency instead.
    const latencySeries = performance.latencyHistoryMs ?? [
      performance.latencyAvgMs - 40,
      performance.latencyAvgMs - 10,
      performance.latencyAvgMs + 20,
      performance.latencyP95Ms - 80,
      performance.latencyP95Ms
    ];

    const latencySparkline = /* @__PURE__ */ jsxRuntime.jsx(
      Sparkline,
      {
        data: latencySeries,
        width: 24
      }
    );

    paneContent = /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      titleLine,
      metaLine,
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overall" }),
      passRateBar,
      avgScoreBar,
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
      dimensionBars,
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
      latencySparkline
    ] });
  }

  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
    /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
    /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: paneContent })
  ] });
}
|
|
1955
2147
|
var DETAILS_PAGE_SIZE = 20;
|
|
1956
2148
|
function scoreColor(score) {
|
|
1957
2149
|
if (score >= 80)
|
|
@@ -1960,7 +2152,7 @@ function scoreColor(score) {
|
|
|
1960
2152
|
return "yellow";
|
|
1961
2153
|
return "red";
|
|
1962
2154
|
}
|
|
1963
|
-
function formatScorePart(item
|
|
2155
|
+
function formatScorePart(item) {
|
|
1964
2156
|
const def = getScoreById(item.id);
|
|
1965
2157
|
if (!def) {
|
|
1966
2158
|
const numeric = toNumericScore(item.data);
|
|
@@ -1990,7 +2182,7 @@ function CheckRow({
|
|
|
1990
2182
|
" ",
|
|
1991
2183
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color, bold: true, children: status }),
|
|
1992
2184
|
detail ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1993
|
-
"
|
|
2185
|
+
" (",
|
|
1994
2186
|
detail,
|
|
1995
2187
|
")"
|
|
1996
2188
|
] }) : null
|
|
@@ -2010,21 +2202,21 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2010
2202
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2011
2203
|
"Model: ",
|
|
2012
2204
|
meta.model,
|
|
2013
|
-
"
|
|
2205
|
+
" Provider: ",
|
|
2014
2206
|
meta.provider
|
|
2015
2207
|
] }, "meta-1"),
|
|
2016
2208
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2017
2209
|
"Commit: ",
|
|
2018
2210
|
meta.commit,
|
|
2019
|
-
"
|
|
2211
|
+
" Branch: ",
|
|
2020
2212
|
meta.branch,
|
|
2021
|
-
"
|
|
2213
|
+
" Seed: ",
|
|
2022
2214
|
meta.seed
|
|
2023
2215
|
] }, "meta-2"),
|
|
2024
2216
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2025
2217
|
"Duration: ",
|
|
2026
2218
|
meta.duration,
|
|
2027
|
-
"
|
|
2219
|
+
" Concurrency: ",
|
|
2028
2220
|
meta.concurrency
|
|
2029
2221
|
] }, "meta-3"),
|
|
2030
2222
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
@@ -2036,7 +2228,15 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2036
2228
|
...dimensions.map((d) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
|
|
2037
2229
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp2"),
|
|
2038
2230
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
|
|
2039
|
-
...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2231
|
+
...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2232
|
+
CheckRow,
|
|
2233
|
+
{
|
|
2234
|
+
name: c.name,
|
|
2235
|
+
passed: c.passed,
|
|
2236
|
+
detail: c.detail
|
|
2237
|
+
},
|
|
2238
|
+
`chk-${c.name}`
|
|
2239
|
+
)),
|
|
2040
2240
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp3"),
|
|
2041
2241
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Performance" }, "perf-h"),
|
|
2042
2242
|
/* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -2049,16 +2249,16 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2049
2249
|
"perf-rate"
|
|
2050
2250
|
),
|
|
2051
2251
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2052
|
-
"latency avg
|
|
2252
|
+
"latency avg ",
|
|
2053
2253
|
performance.latencyAvgMs,
|
|
2054
|
-
"ms
|
|
2254
|
+
"ms p95 ",
|
|
2055
2255
|
performance.latencyP95Ms,
|
|
2056
2256
|
"ms"
|
|
2057
2257
|
] }, "perf-lat"),
|
|
2058
2258
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2059
|
-
"tokens avg
|
|
2259
|
+
"tokens avg ",
|
|
2060
2260
|
performance.tokensAvg,
|
|
2061
|
-
"
|
|
2261
|
+
" p95 ",
|
|
2062
2262
|
performance.tokensP95
|
|
2063
2263
|
] }, "perf-tok"),
|
|
2064
2264
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp4"),
|
|
@@ -2168,12 +2368,12 @@ function RunDetailsView({
|
|
|
2168
2368
|
}) {
|
|
2169
2369
|
const runs = dataset?.runs ?? [];
|
|
2170
2370
|
const rightFocused = state.focus === "right";
|
|
2171
|
-
const [testCases, setTestCases] =
|
|
2172
|
-
const evaluatorNameById =
|
|
2371
|
+
const [testCases, setTestCases] = React2.useState([]);
|
|
2372
|
+
const evaluatorNameById = React2__default.default.useMemo(
|
|
2173
2373
|
() => new Map(evaluators.map((e) => [e.id, e.name])),
|
|
2174
2374
|
[evaluators]
|
|
2175
2375
|
);
|
|
2176
|
-
|
|
2376
|
+
React2.useEffect(() => {
|
|
2177
2377
|
if (!selectedRun?.meta?.artifact) {
|
|
2178
2378
|
setTestCases([]);
|
|
2179
2379
|
return;
|
|
@@ -2192,7 +2392,7 @@ function RunDetailsView({
|
|
|
2192
2392
|
const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
|
|
2193
2393
|
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
2194
2394
|
/* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
|
|
2195
|
-
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2395
|
+
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React2__default.default.Fragment, { children: row }, i)) }) })
|
|
2196
2396
|
] });
|
|
2197
2397
|
}
|
|
2198
2398
|
var LEFT_PANE_WIDTH3 = 44;
|
|
@@ -2272,16 +2472,17 @@ function EvalsCliApp({
|
|
|
2272
2472
|
}) {
|
|
2273
2473
|
const { exit } = ink.useApp();
|
|
2274
2474
|
const { width: stdoutWidth, height: stdoutHeight } = fullscreenInk.useScreenSize();
|
|
2275
|
-
const [liveData, setLiveData] =
|
|
2276
|
-
const [runtimeMessage, setRuntimeMessage] =
|
|
2277
|
-
const
|
|
2475
|
+
const [liveData, setLiveData] = React2.useState(data);
|
|
2476
|
+
const [runtimeMessage, setRuntimeMessage] = React2.useState();
|
|
2477
|
+
const overviewRowCountRef = React2.useRef(0);
|
|
2478
|
+
const [state, dispatch] = React2.useReducer(
|
|
2278
2479
|
reduceCliState,
|
|
2279
2480
|
createInitialState(data, args)
|
|
2280
2481
|
);
|
|
2281
|
-
|
|
2482
|
+
React2.useEffect(() => {
|
|
2282
2483
|
setLiveData(data);
|
|
2283
2484
|
}, [data]);
|
|
2284
|
-
|
|
2485
|
+
React2.useEffect(() => {
|
|
2285
2486
|
if (!runner) {
|
|
2286
2487
|
return void 0;
|
|
2287
2488
|
}
|
|
@@ -2300,7 +2501,7 @@ function EvalsCliApp({
|
|
|
2300
2501
|
}
|
|
2301
2502
|
});
|
|
2302
2503
|
}, [runner]);
|
|
2303
|
-
const filteredDatasets =
|
|
2504
|
+
const filteredDatasets = React2.useMemo(
|
|
2304
2505
|
() => getFilteredDatasets(liveData, state.searchQuery),
|
|
2305
2506
|
[liveData, state.searchQuery]
|
|
2306
2507
|
);
|
|
@@ -2353,7 +2554,16 @@ function EvalsCliApp({
|
|
|
2353
2554
|
return;
|
|
2354
2555
|
}
|
|
2355
2556
|
if (key.downArrow) {
|
|
2356
|
-
|
|
2557
|
+
let max;
|
|
2558
|
+
if (clampedState.level === "datasets") {
|
|
2559
|
+
max = clampedState.focus === "right" ? Math.max(0, overviewRowCountRef.current - OVERVIEW_PAGE_SIZE) : filteredDatasets.length;
|
|
2560
|
+
} else if (clampedState.level === "runs") {
|
|
2561
|
+
max = selectedDataset?.runs.length ?? 0;
|
|
2562
|
+
} else if (clampedState.level === "new-evaluation") {
|
|
2563
|
+
max = Math.max(0, visibleEvaluators.length - 1);
|
|
2564
|
+
} else {
|
|
2565
|
+
max = 100;
|
|
2566
|
+
}
|
|
2357
2567
|
dispatch({ type: "MOVE_DOWN", max });
|
|
2358
2568
|
return;
|
|
2359
2569
|
}
|
|
@@ -2371,7 +2581,7 @@ function EvalsCliApp({
|
|
|
2371
2581
|
}
|
|
2372
2582
|
return;
|
|
2373
2583
|
}
|
|
2374
|
-
if (isBackKey(key)) {
|
|
2584
|
+
if (isBackKey(key) || input === "\x7F" || input === "\b") {
|
|
2375
2585
|
dispatch({ type: "BACK" });
|
|
2376
2586
|
return;
|
|
2377
2587
|
}
|
|
@@ -2424,7 +2634,8 @@ function EvalsCliApp({
|
|
|
2424
2634
|
{
|
|
2425
2635
|
state: clampedState,
|
|
2426
2636
|
filteredDatasets,
|
|
2427
|
-
selectedDataset
|
|
2637
|
+
selectedDataset,
|
|
2638
|
+
overviewRowCountRef
|
|
2428
2639
|
}
|
|
2429
2640
|
);
|
|
2430
2641
|
}
|