@m4trix/evals 0.13.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +287 -107
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +287 -107
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +643 -398
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +634 -389
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +79 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +5 -0
- package/dist/index.js +79 -11
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli.cjs
CHANGED
|
@@ -2,17 +2,18 @@
|
|
|
2
2
|
'use strict';
|
|
3
3
|
|
|
4
4
|
var fullscreenInk = require('fullscreen-ink');
|
|
5
|
-
var
|
|
5
|
+
var React2 = require('react');
|
|
6
6
|
var ink = require('ink');
|
|
7
7
|
var jsxRuntime = require('react/jsx-runtime');
|
|
8
8
|
var path = require('path');
|
|
9
|
-
var
|
|
9
|
+
var inkChart = require('@pppp606/ink-chart');
|
|
10
10
|
var crypto = require('crypto');
|
|
11
11
|
var effect = require('effect');
|
|
12
12
|
var fs = require('fs');
|
|
13
13
|
var jitiModule = require('jiti');
|
|
14
14
|
var promises = require('fs/promises');
|
|
15
15
|
var url = require('url');
|
|
16
|
+
var diff = require('diff');
|
|
16
17
|
|
|
17
18
|
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
18
19
|
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
@@ -35,7 +36,7 @@ function _interopNamespace(e) {
|
|
|
35
36
|
return Object.freeze(n);
|
|
36
37
|
}
|
|
37
38
|
|
|
38
|
-
var
|
|
39
|
+
var React2__default = /*#__PURE__*/_interopDefault(React2);
|
|
39
40
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
40
41
|
|
|
41
42
|
var SEP = " ";
|
|
@@ -104,7 +105,7 @@ function getBreadcrumbText(state, datasetName, runLabel) {
|
|
|
104
105
|
// src/cli/components/Footer.tsx
|
|
105
106
|
function getFooterText(state) {
|
|
106
107
|
if (state.level === "datasets") {
|
|
107
|
-
return "\u2191\u2193
|
|
108
|
+
return state.focus === "right" ? "\u2191\u2193 scroll Tab focus left / search q quit" : "\u2191\u2193 move Enter open Tab focus right / search q quit";
|
|
108
109
|
}
|
|
109
110
|
if (state.level === "runs") {
|
|
110
111
|
return "\u2191\u2193 move Enter details Backspace datasets Tab focus q quit";
|
|
@@ -636,6 +637,7 @@ function createInitialState(data, args) {
|
|
|
636
637
|
datasetMenuIndex,
|
|
637
638
|
runMenuIndex,
|
|
638
639
|
detailsScrollOffset: 0,
|
|
640
|
+
overviewScrollOffset: 0,
|
|
639
641
|
selectedEvaluatorIds: data.evaluators.slice(0, 2).map((item) => item.id),
|
|
640
642
|
evaluatorMenuIndex: 0,
|
|
641
643
|
searchQuery,
|
|
@@ -651,8 +653,11 @@ function reduceCliState(state, action) {
|
|
|
651
653
|
if (state.level === "details" && state.focus === "right") {
|
|
652
654
|
return { ...state, detailsScrollOffset: Math.max(0, state.detailsScrollOffset - 1) };
|
|
653
655
|
}
|
|
656
|
+
if (state.level === "datasets" && state.focus === "right") {
|
|
657
|
+
return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
|
|
658
|
+
}
|
|
654
659
|
if (state.level === "datasets") {
|
|
655
|
-
return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1) };
|
|
660
|
+
return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1), overviewScrollOffset: 0 };
|
|
656
661
|
}
|
|
657
662
|
if (state.level === "runs") {
|
|
658
663
|
return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
|
|
@@ -669,8 +674,11 @@ function reduceCliState(state, action) {
|
|
|
669
674
|
if (state.level === "details" && state.focus === "right") {
|
|
670
675
|
return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
|
|
671
676
|
}
|
|
677
|
+
if (state.level === "datasets" && state.focus === "right") {
|
|
678
|
+
return { ...state, overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1) };
|
|
679
|
+
}
|
|
672
680
|
if (state.level === "datasets") {
|
|
673
|
-
return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1) };
|
|
681
|
+
return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1), overviewScrollOffset: 0 };
|
|
674
682
|
}
|
|
675
683
|
if (state.level === "runs") {
|
|
676
684
|
return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
|
|
@@ -746,292 +754,6 @@ function reduceCliState(state, action) {
|
|
|
746
754
|
}
|
|
747
755
|
return state;
|
|
748
756
|
}
|
|
749
|
-
var LEFT_PANE_WIDTH2 = 44;
|
|
750
|
-
function DatasetsView({
|
|
751
|
-
state,
|
|
752
|
-
filteredDatasets,
|
|
753
|
-
selectedDataset
|
|
754
|
-
}) {
|
|
755
|
-
const leftFocused = state.focus === "left";
|
|
756
|
-
const rightFocused = state.focus === "right";
|
|
757
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
758
|
-
/* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
|
|
759
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
|
|
760
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
761
|
-
ListItem,
|
|
762
|
-
{
|
|
763
|
-
selected: state.datasetMenuIndex === 0,
|
|
764
|
-
label: "New evaluation",
|
|
765
|
-
itemKey: "datasets-new-eval"
|
|
766
|
-
}
|
|
767
|
-
),
|
|
768
|
-
filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
769
|
-
ListItem,
|
|
770
|
-
{
|
|
771
|
-
selected: state.datasetMenuIndex === index + 1,
|
|
772
|
-
label: dataset.name,
|
|
773
|
-
itemKey: `dataset-${dataset.id}`
|
|
774
|
-
},
|
|
775
|
-
dataset.id
|
|
776
|
-
))
|
|
777
|
-
] }),
|
|
778
|
-
/* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
|
|
779
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
|
|
780
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." })
|
|
781
|
-
] })
|
|
782
|
-
] });
|
|
783
|
-
}
|
|
784
|
-
function RunsView({
|
|
785
|
-
state,
|
|
786
|
-
dataset,
|
|
787
|
-
selectedRun
|
|
788
|
-
}) {
|
|
789
|
-
const runs = dataset?.runs ?? [];
|
|
790
|
-
const rightFocused = state.focus === "right";
|
|
791
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
792
|
-
/* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
|
|
793
|
-
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: !selectedRun ? /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." }) : /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
|
|
794
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
|
|
795
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
|
|
796
|
-
" ",
|
|
797
|
-
selectedRun.label,
|
|
798
|
-
" ",
|
|
799
|
-
/* @__PURE__ */ jsxRuntime.jsx(StatusText, { status: selectedRun.status })
|
|
800
|
-
] }),
|
|
801
|
-
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
802
|
-
"Commit: ",
|
|
803
|
-
selectedRun.meta.commit,
|
|
804
|
-
" Branch: ",
|
|
805
|
-
selectedRun.meta.branch,
|
|
806
|
-
" ",
|
|
807
|
-
"Seed: ",
|
|
808
|
-
selectedRun.meta.seed
|
|
809
|
-
] }),
|
|
810
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
811
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overall" }),
|
|
812
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
813
|
-
TextBar,
|
|
814
|
-
{
|
|
815
|
-
label: "pass rate",
|
|
816
|
-
value: selectedRun.performance.passRate,
|
|
817
|
-
format: (v) => `${v}%`
|
|
818
|
-
}
|
|
819
|
-
),
|
|
820
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
821
|
-
TextBar,
|
|
822
|
-
{
|
|
823
|
-
label: "avg score",
|
|
824
|
-
value: Math.round(selectedRun.performance.avgScore * 100)
|
|
825
|
-
}
|
|
826
|
-
),
|
|
827
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
828
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
|
|
829
|
-
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
830
|
-
TextBar,
|
|
831
|
-
{
|
|
832
|
-
label: dimension.name,
|
|
833
|
-
value: dimension.score
|
|
834
|
-
},
|
|
835
|
-
dimension.name
|
|
836
|
-
)),
|
|
837
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
838
|
-
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
|
|
839
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
840
|
-
Sparkline,
|
|
841
|
-
{
|
|
842
|
-
data: selectedRun.performance.latencyHistoryMs ?? [
|
|
843
|
-
selectedRun.performance.latencyAvgMs - 40,
|
|
844
|
-
selectedRun.performance.latencyAvgMs - 10,
|
|
845
|
-
selectedRun.performance.latencyAvgMs + 20,
|
|
846
|
-
selectedRun.performance.latencyP95Ms - 80,
|
|
847
|
-
selectedRun.performance.latencyP95Ms
|
|
848
|
-
],
|
|
849
|
-
width: 24
|
|
850
|
-
}
|
|
851
|
-
)
|
|
852
|
-
] }) })
|
|
853
|
-
] });
|
|
854
|
-
}
|
|
855
|
-
|
|
856
|
-
// src/evals/metric.ts
|
|
857
|
-
var registry = /* @__PURE__ */ new Map();
|
|
858
|
-
var Metric = {
|
|
859
|
-
of(config) {
|
|
860
|
-
const def = {
|
|
861
|
-
id: config.id,
|
|
862
|
-
name: config.name,
|
|
863
|
-
aggregate: config.aggregate,
|
|
864
|
-
format: config.format,
|
|
865
|
-
make: (data) => ({ id: config.id, data })
|
|
866
|
-
};
|
|
867
|
-
registry.set(config.id, def);
|
|
868
|
-
return def;
|
|
869
|
-
}
|
|
870
|
-
};
|
|
871
|
-
function getMetricById(id) {
|
|
872
|
-
return registry.get(id);
|
|
873
|
-
}
|
|
874
|
-
|
|
875
|
-
// src/evals/score.ts
|
|
876
|
-
var registry2 = /* @__PURE__ */ new Map();
|
|
877
|
-
var Score = {
|
|
878
|
-
of(config) {
|
|
879
|
-
const def = {
|
|
880
|
-
id: config.id,
|
|
881
|
-
name: config.name,
|
|
882
|
-
displayStrategy: config.displayStrategy,
|
|
883
|
-
aggregate: config.aggregate,
|
|
884
|
-
format: config.format,
|
|
885
|
-
make: (data, options) => {
|
|
886
|
-
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
887
|
-
return {
|
|
888
|
-
id: config.id,
|
|
889
|
-
data,
|
|
890
|
-
...passed !== void 0 && { passed }
|
|
891
|
-
};
|
|
892
|
-
}
|
|
893
|
-
};
|
|
894
|
-
registry2.set(config.id, def);
|
|
895
|
-
return def;
|
|
896
|
-
}
|
|
897
|
-
};
|
|
898
|
-
function getScoreById(id) {
|
|
899
|
-
return registry2.get(id);
|
|
900
|
-
}
|
|
901
|
-
|
|
902
|
-
// src/evals/aggregators.ts
|
|
903
|
-
function aggregateAverage(values) {
|
|
904
|
-
if (values.length === 0) {
|
|
905
|
-
return { value: 0 };
|
|
906
|
-
}
|
|
907
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
908
|
-
return { value: sum / values.length };
|
|
909
|
-
}
|
|
910
|
-
function aggregateAll(values) {
|
|
911
|
-
return { passed: values.length > 0 && values.every((v) => v.passed) };
|
|
912
|
-
}
|
|
913
|
-
function aggregateTokenCountSum(values) {
|
|
914
|
-
const initial = {
|
|
915
|
-
input: 0,
|
|
916
|
-
output: 0,
|
|
917
|
-
inputCached: 0,
|
|
918
|
-
outputCached: 0
|
|
919
|
-
};
|
|
920
|
-
return values.reduce(
|
|
921
|
-
(acc, v) => ({
|
|
922
|
-
input: acc.input + (v.input ?? 0),
|
|
923
|
-
output: acc.output + (v.output ?? 0),
|
|
924
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
925
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
926
|
-
}),
|
|
927
|
-
initial
|
|
928
|
-
);
|
|
929
|
-
}
|
|
930
|
-
function aggregateLatencyAverage(values) {
|
|
931
|
-
if (values.length === 0) {
|
|
932
|
-
return { ms: 0 };
|
|
933
|
-
}
|
|
934
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
935
|
-
return { ms: sum / values.length };
|
|
936
|
-
}
|
|
937
|
-
|
|
938
|
-
// src/evals/metrics/standard.ts
|
|
939
|
-
Metric.of({
|
|
940
|
-
id: "token-count",
|
|
941
|
-
name: "Tokens",
|
|
942
|
-
aggregate: aggregateTokenCountSum,
|
|
943
|
-
format: (data, options) => {
|
|
944
|
-
const input = data.input ?? 0;
|
|
945
|
-
const output = data.output ?? 0;
|
|
946
|
-
const inputCached = data.inputCached ?? 0;
|
|
947
|
-
const outputCached = data.outputCached ?? 0;
|
|
948
|
-
const cached = inputCached + outputCached;
|
|
949
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
950
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
951
|
-
}
|
|
952
|
-
});
|
|
953
|
-
Metric.of({
|
|
954
|
-
id: "latency",
|
|
955
|
-
name: "Latency",
|
|
956
|
-
aggregate: aggregateLatencyAverage,
|
|
957
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
958
|
-
});
|
|
959
|
-
|
|
960
|
-
// src/evals/scores/standard.ts
|
|
961
|
-
Score.of({
|
|
962
|
-
id: "percent",
|
|
963
|
-
name: "Score",
|
|
964
|
-
displayStrategy: "bar",
|
|
965
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
|
|
966
|
-
aggregate: aggregateAverage
|
|
967
|
-
});
|
|
968
|
-
Score.of({
|
|
969
|
-
id: "binary",
|
|
970
|
-
name: "Result",
|
|
971
|
-
displayStrategy: "passFail",
|
|
972
|
-
format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
|
|
973
|
-
aggregate: aggregateAll
|
|
974
|
-
});
|
|
975
|
-
function createDiffLogEntry(expected, actual, options) {
|
|
976
|
-
const diff = jsonDiff.diffString(expected, actual, { color: false });
|
|
977
|
-
return {
|
|
978
|
-
type: "diff",
|
|
979
|
-
label: options?.label,
|
|
980
|
-
expected,
|
|
981
|
-
actual,
|
|
982
|
-
diff: diff || "(no differences)"
|
|
983
|
-
};
|
|
984
|
-
}
|
|
985
|
-
function getDiffLines(entry) {
|
|
986
|
-
const raw = jsonDiff.diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
|
|
987
|
-
return raw.split("\n").map((line) => {
|
|
988
|
-
const trimmed = line.trimStart();
|
|
989
|
-
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
990
|
-
return { type: "remove", line };
|
|
991
|
-
}
|
|
992
|
-
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
993
|
-
return { type: "add", line };
|
|
994
|
-
}
|
|
995
|
-
return { type: "context", line };
|
|
996
|
-
});
|
|
997
|
-
}
|
|
998
|
-
|
|
999
|
-
// src/runner/score-utils.ts
|
|
1000
|
-
function toNumericScoreFromScores(scores) {
|
|
1001
|
-
for (const item of scores) {
|
|
1002
|
-
const def = getScoreById(item.id);
|
|
1003
|
-
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
1004
|
-
const value = item.data.value;
|
|
1005
|
-
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1006
|
-
return value;
|
|
1007
|
-
}
|
|
1008
|
-
}
|
|
1009
|
-
const numeric = toNumericScore(item.data);
|
|
1010
|
-
if (numeric !== void 0) {
|
|
1011
|
-
return numeric;
|
|
1012
|
-
}
|
|
1013
|
-
}
|
|
1014
|
-
return void 0;
|
|
1015
|
-
}
|
|
1016
|
-
function toNumericScore(value) {
|
|
1017
|
-
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1018
|
-
return value;
|
|
1019
|
-
}
|
|
1020
|
-
if (typeof value !== "object" || value === null) {
|
|
1021
|
-
return void 0;
|
|
1022
|
-
}
|
|
1023
|
-
const obj = value;
|
|
1024
|
-
if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
|
|
1025
|
-
return obj.score;
|
|
1026
|
-
}
|
|
1027
|
-
const numberValues = Object.values(value).filter(
|
|
1028
|
-
(entry) => typeof entry === "number" && Number.isFinite(entry)
|
|
1029
|
-
);
|
|
1030
|
-
if (numberValues.length === 0) {
|
|
1031
|
-
return void 0;
|
|
1032
|
-
}
|
|
1033
|
-
return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
|
|
1034
|
-
}
|
|
1035
757
|
|
|
1036
758
|
// src/runner/config.ts
|
|
1037
759
|
var defaultRunnerConfig = {
|
|
@@ -1213,75 +935,311 @@ async function loadModuleExports(filePath) {
|
|
|
1213
935
|
if (!createJiti2) {
|
|
1214
936
|
throw new Error("Failed to initialize jiti TypeScript loader");
|
|
1215
937
|
}
|
|
1216
|
-
jitiLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
|
|
1217
|
-
interopDefault: true,
|
|
1218
|
-
moduleCache: true
|
|
1219
|
-
});
|
|
938
|
+
jitiLoader = createJiti2((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
|
|
939
|
+
interopDefault: true,
|
|
940
|
+
moduleCache: true
|
|
941
|
+
});
|
|
942
|
+
}
|
|
943
|
+
const loaded2 = jitiLoader.import ? await jitiLoader.import(filePath) : await Promise.resolve(jitiLoader(filePath));
|
|
944
|
+
return Object.values(loaded2);
|
|
945
|
+
}
|
|
946
|
+
const moduleUrl = url.pathToFileURL(filePath).href;
|
|
947
|
+
const loaded = await import(moduleUrl);
|
|
948
|
+
return Object.values(loaded);
|
|
949
|
+
}
|
|
950
|
+
async function collectDatasetsFromFiles(config) {
|
|
951
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
952
|
+
const matched = files.filter(
|
|
953
|
+
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
954
|
+
);
|
|
955
|
+
const found = await Promise.all(
|
|
956
|
+
matched.map(async (absolutePath) => {
|
|
957
|
+
const exports = await loadModuleExports(absolutePath);
|
|
958
|
+
const datasets = exports.filter(isDatasetLike);
|
|
959
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
960
|
+
return datasets.map((dataset) => ({
|
|
961
|
+
id: toId("dataset", relPath, dataset.getName()),
|
|
962
|
+
filePath: relPath,
|
|
963
|
+
dataset
|
|
964
|
+
}));
|
|
965
|
+
})
|
|
966
|
+
);
|
|
967
|
+
return found.flat();
|
|
968
|
+
}
|
|
969
|
+
async function collectEvaluatorsFromFiles(config) {
|
|
970
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
971
|
+
const matched = files.filter(
|
|
972
|
+
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
973
|
+
);
|
|
974
|
+
const found = await Promise.all(
|
|
975
|
+
matched.map(async (absolutePath) => {
|
|
976
|
+
const exports = await loadModuleExports(absolutePath);
|
|
977
|
+
const evaluators = exports.filter(isEvaluatorLike);
|
|
978
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
979
|
+
return evaluators.map((evaluator) => ({
|
|
980
|
+
id: toId("evaluator", relPath, evaluator.getName()),
|
|
981
|
+
filePath: relPath,
|
|
982
|
+
evaluator
|
|
983
|
+
}));
|
|
984
|
+
})
|
|
985
|
+
);
|
|
986
|
+
return found.flat();
|
|
987
|
+
}
|
|
988
|
+
async function collectTestCasesFromFiles(config) {
|
|
989
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
990
|
+
const matched = files.filter(
|
|
991
|
+
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
992
|
+
);
|
|
993
|
+
const found = await Promise.all(
|
|
994
|
+
matched.map(async (absolutePath) => {
|
|
995
|
+
const exports = await loadModuleExports(absolutePath);
|
|
996
|
+
const testCases = exports.filter(isTestCaseLike);
|
|
997
|
+
const relPath = path.relative(config.rootDir, absolutePath);
|
|
998
|
+
return testCases.map((testCase) => ({
|
|
999
|
+
id: toId("test-case", relPath, testCase.getName()),
|
|
1000
|
+
filePath: relPath,
|
|
1001
|
+
testCase
|
|
1002
|
+
}));
|
|
1003
|
+
})
|
|
1004
|
+
);
|
|
1005
|
+
return found.flat();
|
|
1006
|
+
}
|
|
1007
|
+
function toJsonLines(value) {
|
|
1008
|
+
try {
|
|
1009
|
+
return JSON.stringify(value, null, 2);
|
|
1010
|
+
} catch {
|
|
1011
|
+
return String(value);
|
|
1012
|
+
}
|
|
1013
|
+
}
|
|
1014
|
+
function formatDiffString(changes) {
|
|
1015
|
+
const lines = [];
|
|
1016
|
+
for (const part of changes) {
|
|
1017
|
+
const prefix = part.added ? "+" : part.removed ? "-" : " ";
|
|
1018
|
+
const partLines = part.value.split("\n");
|
|
1019
|
+
if (partLines[partLines.length - 1] === "") {
|
|
1020
|
+
partLines.pop();
|
|
1021
|
+
}
|
|
1022
|
+
for (const line of partLines) {
|
|
1023
|
+
lines.push(`${prefix} ${line}`);
|
|
1024
|
+
}
|
|
1025
|
+
}
|
|
1026
|
+
return lines.join("\n");
|
|
1027
|
+
}
|
|
1028
|
+
function createDiffString(expected, actual) {
|
|
1029
|
+
const expectedStr = toJsonLines(expected);
|
|
1030
|
+
const actualStr = toJsonLines(actual);
|
|
1031
|
+
const changes = diff.diffLines(expectedStr, actualStr);
|
|
1032
|
+
return formatDiffString(changes);
|
|
1033
|
+
}
|
|
1034
|
+
function createDiffLogEntry(expected, actual, options) {
|
|
1035
|
+
const diff = createDiffString(expected, actual);
|
|
1036
|
+
return {
|
|
1037
|
+
type: "diff",
|
|
1038
|
+
label: options?.label,
|
|
1039
|
+
expected,
|
|
1040
|
+
actual,
|
|
1041
|
+
diff: diff || "(no differences)"
|
|
1042
|
+
};
|
|
1043
|
+
}
|
|
1044
|
+
function getDiffLines(entry) {
|
|
1045
|
+
const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
|
|
1046
|
+
return raw.split("\n").map((line) => {
|
|
1047
|
+
const trimmed = line.trimStart();
|
|
1048
|
+
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
1049
|
+
return { type: "remove", line };
|
|
1050
|
+
}
|
|
1051
|
+
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
1052
|
+
return { type: "add", line };
|
|
1053
|
+
}
|
|
1054
|
+
return { type: "context", line };
|
|
1055
|
+
});
|
|
1056
|
+
}
|
|
1057
|
+
|
|
1058
|
+
// src/evals/metric.ts
|
|
1059
|
+
var registry = /* @__PURE__ */ new Map();
|
|
1060
|
+
var Metric = {
|
|
1061
|
+
of(config) {
|
|
1062
|
+
const def = {
|
|
1063
|
+
id: config.id,
|
|
1064
|
+
name: config.name,
|
|
1065
|
+
aggregate: config.aggregate,
|
|
1066
|
+
format: config.format,
|
|
1067
|
+
make: (data) => ({ id: config.id, data })
|
|
1068
|
+
};
|
|
1069
|
+
registry.set(config.id, def);
|
|
1070
|
+
return def;
|
|
1071
|
+
}
|
|
1072
|
+
};
|
|
1073
|
+
function getMetricById(id) {
|
|
1074
|
+
return registry.get(id);
|
|
1075
|
+
}
|
|
1076
|
+
|
|
1077
|
+
// src/evals/score.ts
|
|
1078
|
+
var registry2 = /* @__PURE__ */ new Map();
|
|
1079
|
+
var Score = {
|
|
1080
|
+
of(config) {
|
|
1081
|
+
const def = {
|
|
1082
|
+
id: config.id,
|
|
1083
|
+
name: config.name,
|
|
1084
|
+
displayStrategy: config.displayStrategy,
|
|
1085
|
+
aggregate: config.aggregate,
|
|
1086
|
+
format: config.format,
|
|
1087
|
+
make: (data, options) => {
|
|
1088
|
+
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
1089
|
+
return {
|
|
1090
|
+
id: config.id,
|
|
1091
|
+
data,
|
|
1092
|
+
...passed !== void 0 && { passed }
|
|
1093
|
+
};
|
|
1094
|
+
}
|
|
1095
|
+
};
|
|
1096
|
+
registry2.set(config.id, def);
|
|
1097
|
+
return def;
|
|
1098
|
+
}
|
|
1099
|
+
};
|
|
1100
|
+
function getScoreById(id) {
|
|
1101
|
+
return registry2.get(id);
|
|
1102
|
+
}
|
|
1103
|
+
|
|
1104
|
+
// src/evals/aggregators.ts
|
|
1105
|
+
function aggregateAverageWithVariance(values) {
|
|
1106
|
+
if (values.length === 0) {
|
|
1107
|
+
return { value: 0, count: 0 };
|
|
1108
|
+
}
|
|
1109
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
1110
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
1111
|
+
const mean = sum / values.length;
|
|
1112
|
+
let stdDev;
|
|
1113
|
+
if (values.length >= 2) {
|
|
1114
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
1115
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1116
|
+
}
|
|
1117
|
+
return { value: mean, stdDev, count: values.length };
|
|
1118
|
+
}
|
|
1119
|
+
function aggregateAll(values) {
|
|
1120
|
+
const total = values.length;
|
|
1121
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
1122
|
+
return {
|
|
1123
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
1124
|
+
passedCount,
|
|
1125
|
+
totalCount: total
|
|
1126
|
+
};
|
|
1127
|
+
}
|
|
1128
|
+
function aggregateTokenCountSum(values) {
|
|
1129
|
+
const initial = {
|
|
1130
|
+
input: 0,
|
|
1131
|
+
output: 0,
|
|
1132
|
+
inputCached: 0,
|
|
1133
|
+
outputCached: 0
|
|
1134
|
+
};
|
|
1135
|
+
return values.reduce(
|
|
1136
|
+
(acc, v) => ({
|
|
1137
|
+
input: acc.input + (v.input ?? 0),
|
|
1138
|
+
output: acc.output + (v.output ?? 0),
|
|
1139
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1140
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1141
|
+
}),
|
|
1142
|
+
initial
|
|
1143
|
+
);
|
|
1144
|
+
}
|
|
1145
|
+
function aggregateLatencyAverage(values) {
|
|
1146
|
+
if (values.length === 0) {
|
|
1147
|
+
return { ms: 0 };
|
|
1148
|
+
}
|
|
1149
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1150
|
+
return { ms: sum / values.length };
|
|
1151
|
+
}
|
|
1152
|
+
|
|
1153
|
+
// src/evals/metrics/standard.ts
|
|
1154
|
+
Metric.of({
|
|
1155
|
+
id: "token-count",
|
|
1156
|
+
name: "Tokens",
|
|
1157
|
+
aggregate: aggregateTokenCountSum,
|
|
1158
|
+
format: (data, options) => {
|
|
1159
|
+
const input = data.input ?? 0;
|
|
1160
|
+
const output = data.output ?? 0;
|
|
1161
|
+
const inputCached = data.inputCached ?? 0;
|
|
1162
|
+
const outputCached = data.outputCached ?? 0;
|
|
1163
|
+
const cached = inputCached + outputCached;
|
|
1164
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1165
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1166
|
+
}
|
|
1167
|
+
});
|
|
1168
|
+
Metric.of({
|
|
1169
|
+
id: "latency",
|
|
1170
|
+
name: "Latency",
|
|
1171
|
+
aggregate: aggregateLatencyAverage,
|
|
1172
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1173
|
+
});
|
|
1174
|
+
|
|
1175
|
+
// src/evals/scores/standard.ts
|
|
1176
|
+
Score.of({
|
|
1177
|
+
id: "percent",
|
|
1178
|
+
name: "Score",
|
|
1179
|
+
displayStrategy: "bar",
|
|
1180
|
+
format: (data, options) => {
|
|
1181
|
+
if (options?.isAggregated) {
|
|
1182
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
1183
|
+
}
|
|
1184
|
+
return data.value.toFixed(2);
|
|
1185
|
+
},
|
|
1186
|
+
aggregate: aggregateAverageWithVariance
|
|
1187
|
+
});
|
|
1188
|
+
Score.of({
|
|
1189
|
+
id: "binary",
|
|
1190
|
+
name: "Result",
|
|
1191
|
+
displayStrategy: "passFail",
|
|
1192
|
+
format: (data, options) => {
|
|
1193
|
+
if (options?.isAggregated) {
|
|
1194
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
1195
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
1196
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
1197
|
+
}
|
|
1198
|
+
return base;
|
|
1199
|
+
}
|
|
1200
|
+
return data.passed ? "PASSED" : "NOT PASSED";
|
|
1201
|
+
},
|
|
1202
|
+
aggregate: aggregateAll
|
|
1203
|
+
});
|
|
1204
|
+
|
|
1205
|
+
// src/runner/score-utils.ts
|
|
1206
|
+
function toNumericScoreFromScores(scores) {
|
|
1207
|
+
for (const item of scores) {
|
|
1208
|
+
const def = getScoreById(item.id);
|
|
1209
|
+
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
1210
|
+
const value = item.data.value;
|
|
1211
|
+
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1212
|
+
return value;
|
|
1213
|
+
}
|
|
1214
|
+
}
|
|
1215
|
+
const numeric = toNumericScore(item.data);
|
|
1216
|
+
if (numeric !== void 0) {
|
|
1217
|
+
return numeric;
|
|
1220
1218
|
}
|
|
1221
|
-
const loaded2 = jitiLoader.import ? await jitiLoader.import(filePath) : await Promise.resolve(jitiLoader(filePath));
|
|
1222
|
-
return Object.values(loaded2);
|
|
1223
1219
|
}
|
|
1224
|
-
|
|
1225
|
-
const loaded = await import(moduleUrl);
|
|
1226
|
-
return Object.values(loaded);
|
|
1227
|
-
}
|
|
1228
|
-
async function collectDatasetsFromFiles(config) {
|
|
1229
|
-
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1230
|
-
const matched = files.filter(
|
|
1231
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
1232
|
-
);
|
|
1233
|
-
const found = await Promise.all(
|
|
1234
|
-
matched.map(async (absolutePath) => {
|
|
1235
|
-
const exports = await loadModuleExports(absolutePath);
|
|
1236
|
-
const datasets = exports.filter(isDatasetLike);
|
|
1237
|
-
const relPath = path.relative(config.rootDir, absolutePath);
|
|
1238
|
-
return datasets.map((dataset) => ({
|
|
1239
|
-
id: toId("dataset", relPath, dataset.getName()),
|
|
1240
|
-
filePath: relPath,
|
|
1241
|
-
dataset
|
|
1242
|
-
}));
|
|
1243
|
-
})
|
|
1244
|
-
);
|
|
1245
|
-
return found.flat();
|
|
1246
|
-
}
|
|
1247
|
-
async function collectEvaluatorsFromFiles(config) {
|
|
1248
|
-
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1249
|
-
const matched = files.filter(
|
|
1250
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
1251
|
-
);
|
|
1252
|
-
const found = await Promise.all(
|
|
1253
|
-
matched.map(async (absolutePath) => {
|
|
1254
|
-
const exports = await loadModuleExports(absolutePath);
|
|
1255
|
-
const evaluators = exports.filter(isEvaluatorLike);
|
|
1256
|
-
const relPath = path.relative(config.rootDir, absolutePath);
|
|
1257
|
-
return evaluators.map((evaluator) => ({
|
|
1258
|
-
id: toId("evaluator", relPath, evaluator.getName()),
|
|
1259
|
-
filePath: relPath,
|
|
1260
|
-
evaluator
|
|
1261
|
-
}));
|
|
1262
|
-
})
|
|
1263
|
-
);
|
|
1264
|
-
return found.flat();
|
|
1220
|
+
return void 0;
|
|
1265
1221
|
}
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
)
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
testCase
|
|
1280
|
-
}));
|
|
1281
|
-
})
|
|
1222
|
+
function toNumericScore(value) {
|
|
1223
|
+
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1224
|
+
return value;
|
|
1225
|
+
}
|
|
1226
|
+
if (typeof value !== "object" || value === null) {
|
|
1227
|
+
return void 0;
|
|
1228
|
+
}
|
|
1229
|
+
const obj = value;
|
|
1230
|
+
if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
|
|
1231
|
+
return obj.score;
|
|
1232
|
+
}
|
|
1233
|
+
const numberValues = Object.values(value).filter(
|
|
1234
|
+
(entry) => typeof entry === "number" && Number.isFinite(entry)
|
|
1282
1235
|
);
|
|
1283
|
-
|
|
1236
|
+
if (numberValues.length === 0) {
|
|
1237
|
+
return void 0;
|
|
1238
|
+
}
|
|
1239
|
+
return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
|
|
1284
1240
|
}
|
|
1241
|
+
|
|
1242
|
+
// src/runner/execution.ts
|
|
1285
1243
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1286
1244
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1287
1245
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1952,6 +1910,240 @@ var EffectRunner = class {
|
|
|
1952
1910
|
);
|
|
1953
1911
|
}
|
|
1954
1912
|
};
|
|
1913
|
+
var LEFT_PANE_WIDTH2 = 44;
|
|
1914
|
+
var MAX_RUNS_FOR_CHART = 12;
|
|
1915
|
+
var MAX_RUNS_FOR_TREND = 20;
|
|
1916
|
+
var TREND_BATCH_SIZE = 4;
|
|
1917
|
+
/**
 * Average the numeric scores of every evaluator result across a run's test cases.
 *
 * Each entry of a test case's `evaluatorScores` is reduced to a number via
 * `toNumericScoreFromScores(entry.scores)`; entries for which that helper
 * returns `undefined` are left out of the average.
 *
 * @param {Iterable<{ evaluatorScores: Iterable<{ scores: unknown }> }>} testCases
 * @returns {number | undefined} Mean of the numeric scores, or `undefined`
 *   when no evaluator result produced a number.
 */
function extractRunAverageScore(testCases) {
  let total = 0;
  let counted = 0;
  for (const { evaluatorScores } of testCases) {
    for (const { scores } of evaluatorScores) {
      const numeric = toNumericScoreFromScores(scores);
      if (numeric === undefined) {
        continue;
      }
      total += numeric;
      counted += 1;
    }
  }
  return counted > 0 ? total / counted : undefined;
}
|
|
1931
|
+
/**
 * Load the average score of every run that has a readable artifact.
 *
 * Artifacts are independent of each other, so they are parsed concurrently
 * instead of one after another; `Promise.all` keeps the results in the same
 * order as `runs`.
 *
 * A run is left out of the result when it has no `meta.artifact`, when its
 * artifact cannot be resolved or parsed, or when no numeric score can be
 * derived from it.
 *
 * @param {Array<{ id: string, label: string, meta?: { artifact?: string } }>} runs
 * @returns {Promise<Array<{ runId: string, label: string, value: number }>>}
 *   One entry per run with a computable average, in input order.
 */
async function loadRunScores(runs) {
  const loaded = await Promise.all(
    runs.map(async (run) => {
      const artifact = run.meta?.artifact;
      if (!artifact) {
        return undefined;
      }
      try {
        const testCases = await parseArtifactFile(path.resolve(artifact));
        const avg = extractRunAverageScore(testCases);
        if (avg === undefined) {
          return undefined;
        }
        return { runId: run.id, label: run.label, value: avg };
      } catch {
        // Deliberately best-effort: a missing or corrupt artifact only removes
        // that run from the chart instead of failing the whole overview.
        return undefined;
      }
    })
  );
  return loaded.filter((entry) => entry !== undefined);
}
|
|
1953
|
+
/**
 * Split `values` into consecutive batches of `batchSize` and return the mean
 * of each batch. The last batch may be shorter than `batchSize`.
 *
 * @param {number[]} values - Numbers to average, in order.
 * @param {number} batchSize - Batch length; must be greater than zero.
 * @returns {number[]} One average per batch (empty when `values` is empty).
 * @throws {RangeError} When `batchSize` is zero, negative, or NaN. Such a
 *   step never advances the loop index, which previously caused an endless loop.
 */
function batchAverage(values, batchSize) {
  if (!(batchSize > 0)) {
    throw new RangeError(`batchSize must be greater than 0, received ${String(batchSize)}`);
  }
  const batches = [];
  for (let start = 0; start < values.length; start += batchSize) {
    const chunk = values.slice(start, start + batchSize);
    // A fractional batchSize can yield an empty slice; skip it rather than push NaN.
    if (chunk.length > 0) {
      batches.push(chunk.reduce((sum, entry) => sum + entry, 0) / chunk.length);
    }
  }
  return batches;
}
|
|
1963
|
+
// Maximum number of overview rows DatasetsView renders at once. The down-arrow
// handler in EvalsCliApp subtracts it from the total row count to cap scrolling.
var OVERVIEW_PAGE_SIZE = 15;
|
|
1964
|
+
/**
 * Datasets screen: a selectable dataset list in the left pane and a scrollable
 * overview (description, per-run score bars, batched trend graph) in the right pane.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; reads `focus`, `datasetMenuIndex`
 *   and `overviewScrollOffset`.
 * @param {Array<{ id: string, name: string }>} props.filteredDatasets - Datasets to list.
 * @param {object} [props.selectedDataset] - Dataset whose `overview` text and
 *   `runs` feed the right pane.
 * @param {{ current: number }} [props.overviewRowCountRef] - When provided,
 *   receives the total number of overview rows on every render.
 * @returns The two panes wrapped in a fragment.
 */
function DatasetsView({
  state,
  filteredDatasets,
  selectedDataset,
  overviewRowCountRef
}) {
  const leftFocused = state.focus === "left";
  const rightFocused = state.focus === "right";
  const [runScores, setRunScores] = React2.useState([]);
  const [loading, setLoading] = React2.useState(false);
  const runCount = selectedDataset?.runs?.length ?? 0;

  React2.useEffect(() => {
    if (runCount === 0) {
      setRunScores([]);
      // A previous load may have been cancelled mid-flight; make sure the
      // loading flag does not stay stuck on.
      setLoading(false);
      return void 0;
    }
    // Set by the cleanup below so that a slow load for a previously selected
    // dataset cannot overwrite the scores of the current one.
    let cancelled = false;
    setLoading(true);
    const runs = selectedDataset.runs.slice(0, MAX_RUNS_FOR_TREND);
    loadRunScores(runs).then((scores) => {
      if (!cancelled) {
        setRunScores(scores);
      }
    }).catch(() => {
      // The charts are best-effort: show no scores rather than leave an
      // unhandled rejection behind.
      if (!cancelled) {
        setRunScores([]);
      }
    }).finally(() => {
      if (!cancelled) {
        setLoading(false);
      }
    });
    return () => {
      cancelled = true;
    };
  }, [selectedDataset?.id, runCount]);

  const overviewRows = React2.useMemo(() => {
    const rows = [];
    rows.push(
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." }, "overview")
    );
    if (runCount === 0) {
      return rows;
    }
    if (loading) {
      rows.push(
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Loading run scores\u2026" }, "loading")
      );
      return rows;
    }
    if (runScores.length === 0) {
      return rows;
    }
    // Derived inside the memo: computing these during render would create new
    // arrays each time and, as dependencies, defeat the memoization.
    // Both are reversed so they read in the same order as the graph's
    // "older" -> "newer" axis labels.
    const barData = runScores.slice(0, MAX_RUNS_FOR_CHART).reverse();
    const trendValues = runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse();
    const trendBatched = batchAverage(trendValues, TREND_BATCH_SIZE);
    rows.push(
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Scores (last runs)" }, "scores-header")
    );
    for (const d of barData) {
      rows.push(
        /* @__PURE__ */ jsxRuntime.jsx(
          TextBar,
          {
            label: d.label,
            value: d.value,
            labelWidth: 14,
            barWidth: 24,
            max: 100,
            format: (v) => v.toFixed(1)
          },
          d.runId
        )
      );
    }
    if (trendBatched.length > 0) {
      rows.push(
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Avg trend (last 20, batched by 4)" }, "trend-header")
      );
      rows.push(
        /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { children: /* @__PURE__ */ jsxRuntime.jsx(
          inkChart.LineGraph,
          {
            data: [{ values: trendBatched, color: "cyan" }],
            height: 5,
            width: 45,
            showYAxis: true,
            xLabels: ["older", "newer"]
          }
        ) }, "trend-graph")
      );
    }
    return rows;
  }, [
    selectedDataset?.overview,
    runCount,
    loading,
    runScores
  ]);

  // Publish the row count so the caller can bound the overview scroll offset.
  if (overviewRowCountRef) {
    overviewRowCountRef.current = overviewRows.length;
  }
  const offset = Math.max(0, state.overviewScrollOffset);
  const visibleRows = overviewRows.slice(offset, offset + OVERVIEW_PAGE_SIZE);
  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
    /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
      // Menu index 0 is the fixed "New evaluation" entry; datasets start at 1.
      /* @__PURE__ */ jsxRuntime.jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === 0,
          label: "New evaluation",
          itemKey: "datasets-new-eval"
        }
      ),
      filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsxRuntime.jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === index + 1,
          label: dataset.name,
          itemKey: `dataset-${dataset.id}`
        },
        dataset.id
      ))
    ] }),
    /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visibleRows.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { children: row }, offset + i)) })
    ] })
  ] });
}
|
|
2076
|
+
/**
 * Runs screen: the runs sidebar next to a summary pane for the selected run.
 *
 * With no selected run the pane shows a placeholder line. Otherwise it shows
 * the run label and status, commit/branch/seed, overall pass-rate and
 * average-score bars, one bar per dimension, and a latency sparkline.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; `focus` decides whether the pane is focused.
 * @param {object} [props.dataset] - Dataset whose `runs` are handed to the sidebar.
 * @param {object} [props.selectedRun] - Run to summarise; reads `label`,
 *   `status`, `meta`, `performance` and `dimensions`.
 * @returns The sidebar and summary pane wrapped in a fragment.
 */
function RunsView({
  state,
  dataset,
  selectedRun
}) {
  const runs = dataset?.runs ?? [];
  const rightFocused = state.focus === "right";

  let paneBody;
  if (!selectedRun) {
    paneBody = /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." });
  } else {
    const { label, status, meta, performance, dimensions } = selectedRun;
    const spacer = () => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " });

    const titleLine = /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
      " ",
      label,
      " ",
      /* @__PURE__ */ jsxRuntime.jsx(StatusText, { status })
    ] });
    const metaLine = /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
      "Commit: ",
      meta.commit,
      " Branch: ",
      meta.branch,
      " ",
      "Seed: ",
      meta.seed
    ] });
    const passRateBar = /* @__PURE__ */ jsxRuntime.jsx(
      TextBar,
      {
        label: "pass rate",
        value: performance.passRate,
        format: (v) => `${v}%`
      }
    );
    // avgScore is multiplied by 100 and rounded before it is drawn.
    const avgScoreBar = /* @__PURE__ */ jsxRuntime.jsx(
      TextBar,
      {
        label: "avg score",
        value: Math.round(performance.avgScore * 100)
      }
    );
    const dimensionBars = dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
      TextBar,
      {
        label: dimension.name,
        value: dimension.score
      },
      dimension.name
    ));
    // Without a recorded latency history, a five-point series is derived from
    // the average and p95 latencies so the sparkline still has data.
    const latencySeries = performance.latencyHistoryMs ?? [
      performance.latencyAvgMs - 40,
      performance.latencyAvgMs - 10,
      performance.latencyAvgMs + 20,
      performance.latencyP95Ms - 80,
      performance.latencyP95Ms
    ];
    const latencySparkline = /* @__PURE__ */ jsxRuntime.jsx(
      Sparkline,
      {
        data: latencySeries,
        width: 24
      }
    );

    paneBody = /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", children: [
      titleLine,
      metaLine,
      spacer(),
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Overall" }),
      passRateBar,
      avgScoreBar,
      spacer(),
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
      dimensionBars,
      spacer(),
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
      latencySparkline
    ] });
  }

  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
    /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
    /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: paneBody })
  ] });
}
|
|
1955
2147
|
var DETAILS_PAGE_SIZE = 20;
|
|
1956
2148
|
function scoreColor(score) {
|
|
1957
2149
|
if (score >= 80)
|
|
@@ -1960,7 +2152,7 @@ function scoreColor(score) {
|
|
|
1960
2152
|
return "yellow";
|
|
1961
2153
|
return "red";
|
|
1962
2154
|
}
|
|
1963
|
-
function formatScorePart(item
|
|
2155
|
+
function formatScorePart(item) {
|
|
1964
2156
|
const def = getScoreById(item.id);
|
|
1965
2157
|
if (!def) {
|
|
1966
2158
|
const numeric = toNumericScore(item.data);
|
|
@@ -1990,7 +2182,7 @@ function CheckRow({
|
|
|
1990
2182
|
" ",
|
|
1991
2183
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color, bold: true, children: status }),
|
|
1992
2184
|
detail ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
1993
|
-
"
|
|
2185
|
+
" (",
|
|
1994
2186
|
detail,
|
|
1995
2187
|
")"
|
|
1996
2188
|
] }) : null
|
|
@@ -2010,21 +2202,21 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2010
2202
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2011
2203
|
"Model: ",
|
|
2012
2204
|
meta.model,
|
|
2013
|
-
"
|
|
2205
|
+
" Provider: ",
|
|
2014
2206
|
meta.provider
|
|
2015
2207
|
] }, "meta-1"),
|
|
2016
2208
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2017
2209
|
"Commit: ",
|
|
2018
2210
|
meta.commit,
|
|
2019
|
-
"
|
|
2211
|
+
" Branch: ",
|
|
2020
2212
|
meta.branch,
|
|
2021
|
-
"
|
|
2213
|
+
" Seed: ",
|
|
2022
2214
|
meta.seed
|
|
2023
2215
|
] }, "meta-2"),
|
|
2024
2216
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2025
2217
|
"Duration: ",
|
|
2026
2218
|
meta.duration,
|
|
2027
|
-
"
|
|
2219
|
+
" Concurrency: ",
|
|
2028
2220
|
meta.concurrency
|
|
2029
2221
|
] }, "meta-3"),
|
|
2030
2222
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
@@ -2036,7 +2228,15 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2036
2228
|
...dimensions.map((d) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
|
|
2037
2229
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp2"),
|
|
2038
2230
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
|
|
2039
|
-
...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2231
|
+
...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2232
|
+
CheckRow,
|
|
2233
|
+
{
|
|
2234
|
+
name: c.name,
|
|
2235
|
+
passed: c.passed,
|
|
2236
|
+
detail: c.detail
|
|
2237
|
+
},
|
|
2238
|
+
`chk-${c.name}`
|
|
2239
|
+
)),
|
|
2040
2240
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp3"),
|
|
2041
2241
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Performance" }, "perf-h"),
|
|
2042
2242
|
/* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -2049,16 +2249,16 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2049
2249
|
"perf-rate"
|
|
2050
2250
|
),
|
|
2051
2251
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2052
|
-
"latency avg
|
|
2252
|
+
"latency avg ",
|
|
2053
2253
|
performance.latencyAvgMs,
|
|
2054
|
-
"ms
|
|
2254
|
+
"ms p95 ",
|
|
2055
2255
|
performance.latencyP95Ms,
|
|
2056
2256
|
"ms"
|
|
2057
2257
|
] }, "perf-lat"),
|
|
2058
2258
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2059
|
-
"tokens avg
|
|
2259
|
+
"tokens avg ",
|
|
2060
2260
|
performance.tokensAvg,
|
|
2061
|
-
"
|
|
2261
|
+
" p95 ",
|
|
2062
2262
|
performance.tokensP95
|
|
2063
2263
|
] }, "perf-tok"),
|
|
2064
2264
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp4"),
|
|
@@ -2111,26 +2311,60 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2111
2311
|
":",
|
|
2112
2312
|
" ",
|
|
2113
2313
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
2114
|
-
|
|
2115
|
-
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
|
|
2120
|
-
|
|
2121
|
-
|
|
2122
|
-
|
|
2123
|
-
|
|
2124
|
-
|
|
2125
|
-
|
|
2126
|
-
|
|
2127
|
-
|
|
2128
|
-
|
|
2129
|
-
|
|
2130
|
-
] }, m.id);
|
|
2131
|
-
})
|
|
2314
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
2315
|
+
" ",
|
|
2316
|
+
item.metrics.map((m) => {
|
|
2317
|
+
const def = getMetricById(m.id);
|
|
2318
|
+
if (!def)
|
|
2319
|
+
return null;
|
|
2320
|
+
const formatted = def.format(m.data);
|
|
2321
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2322
|
+
"[",
|
|
2323
|
+
def.name ? `${def.name}: ` : "",
|
|
2324
|
+
formatted,
|
|
2325
|
+
"]",
|
|
2326
|
+
" "
|
|
2327
|
+
] }, m.id);
|
|
2328
|
+
})
|
|
2329
|
+
] }) : null
|
|
2132
2330
|
] }, `tc-${tc.testCaseId}-${item.evaluatorId}`)
|
|
2133
2331
|
);
|
|
2332
|
+
if (item.scores.length > 0) {
|
|
2333
|
+
for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
|
|
2334
|
+
const s = item.scores[sIdx];
|
|
2335
|
+
const def = getScoreById(s.id);
|
|
2336
|
+
const scoreLabel = def ? def.name ?? def.id : s.id;
|
|
2337
|
+
rows.push(
|
|
2338
|
+
/* @__PURE__ */ jsxRuntime.jsxs(
|
|
2339
|
+
ink.Text,
|
|
2340
|
+
{
|
|
2341
|
+
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
2342
|
+
children: [
|
|
2343
|
+
" ",
|
|
2344
|
+
scoreLabel,
|
|
2345
|
+
": ",
|
|
2346
|
+
formatScorePart(s)
|
|
2347
|
+
]
|
|
2348
|
+
},
|
|
2349
|
+
`tc-${tc.testCaseId}-${item.evaluatorId}-score-${sIdx}`
|
|
2350
|
+
)
|
|
2351
|
+
);
|
|
2352
|
+
}
|
|
2353
|
+
} else {
|
|
2354
|
+
rows.push(
|
|
2355
|
+
/* @__PURE__ */ jsxRuntime.jsxs(
|
|
2356
|
+
ink.Text,
|
|
2357
|
+
{
|
|
2358
|
+
color: "gray",
|
|
2359
|
+
children: [
|
|
2360
|
+
" ",
|
|
2361
|
+
"n/a"
|
|
2362
|
+
]
|
|
2363
|
+
},
|
|
2364
|
+
`tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
|
|
2365
|
+
)
|
|
2366
|
+
);
|
|
2367
|
+
}
|
|
2134
2368
|
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
2135
2369
|
for (let logIdx = 0; logIdx < item.logs.length; logIdx++) {
|
|
2136
2370
|
const log = item.logs[logIdx];
|
|
@@ -2168,12 +2402,12 @@ function RunDetailsView({
|
|
|
2168
2402
|
}) {
|
|
2169
2403
|
const runs = dataset?.runs ?? [];
|
|
2170
2404
|
const rightFocused = state.focus === "right";
|
|
2171
|
-
const [testCases, setTestCases] =
|
|
2172
|
-
const evaluatorNameById =
|
|
2405
|
+
const [testCases, setTestCases] = React2.useState([]);
|
|
2406
|
+
const evaluatorNameById = React2__default.default.useMemo(
|
|
2173
2407
|
() => new Map(evaluators.map((e) => [e.id, e.name])),
|
|
2174
2408
|
[evaluators]
|
|
2175
2409
|
);
|
|
2176
|
-
|
|
2410
|
+
React2.useEffect(() => {
|
|
2177
2411
|
if (!selectedRun?.meta?.artifact) {
|
|
2178
2412
|
setTestCases([]);
|
|
2179
2413
|
return;
|
|
@@ -2192,7 +2426,7 @@ function RunDetailsView({
|
|
|
2192
2426
|
const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
|
|
2193
2427
|
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
2194
2428
|
/* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
|
|
2195
|
-
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2429
|
+
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React2__default.default.Fragment, { children: row }, i)) }) })
|
|
2196
2430
|
] });
|
|
2197
2431
|
}
|
|
2198
2432
|
var LEFT_PANE_WIDTH3 = 44;
|
|
@@ -2272,16 +2506,17 @@ function EvalsCliApp({
|
|
|
2272
2506
|
}) {
|
|
2273
2507
|
const { exit } = ink.useApp();
|
|
2274
2508
|
const { width: stdoutWidth, height: stdoutHeight } = fullscreenInk.useScreenSize();
|
|
2275
|
-
const [liveData, setLiveData] =
|
|
2276
|
-
const [runtimeMessage, setRuntimeMessage] =
|
|
2277
|
-
const
|
|
2509
|
+
const [liveData, setLiveData] = React2.useState(data);
|
|
2510
|
+
const [runtimeMessage, setRuntimeMessage] = React2.useState();
|
|
2511
|
+
const overviewRowCountRef = React2.useRef(0);
|
|
2512
|
+
const [state, dispatch] = React2.useReducer(
|
|
2278
2513
|
reduceCliState,
|
|
2279
2514
|
createInitialState(data, args)
|
|
2280
2515
|
);
|
|
2281
|
-
|
|
2516
|
+
React2.useEffect(() => {
|
|
2282
2517
|
setLiveData(data);
|
|
2283
2518
|
}, [data]);
|
|
2284
|
-
|
|
2519
|
+
React2.useEffect(() => {
|
|
2285
2520
|
if (!runner) {
|
|
2286
2521
|
return void 0;
|
|
2287
2522
|
}
|
|
@@ -2300,7 +2535,7 @@ function EvalsCliApp({
|
|
|
2300
2535
|
}
|
|
2301
2536
|
});
|
|
2302
2537
|
}, [runner]);
|
|
2303
|
-
const filteredDatasets =
|
|
2538
|
+
const filteredDatasets = React2.useMemo(
|
|
2304
2539
|
() => getFilteredDatasets(liveData, state.searchQuery),
|
|
2305
2540
|
[liveData, state.searchQuery]
|
|
2306
2541
|
);
|
|
@@ -2353,7 +2588,16 @@ function EvalsCliApp({
|
|
|
2353
2588
|
return;
|
|
2354
2589
|
}
|
|
2355
2590
|
if (key.downArrow) {
|
|
2356
|
-
|
|
2591
|
+
let max;
|
|
2592
|
+
if (clampedState.level === "datasets") {
|
|
2593
|
+
max = clampedState.focus === "right" ? Math.max(0, overviewRowCountRef.current - OVERVIEW_PAGE_SIZE) : filteredDatasets.length;
|
|
2594
|
+
} else if (clampedState.level === "runs") {
|
|
2595
|
+
max = selectedDataset?.runs.length ?? 0;
|
|
2596
|
+
} else if (clampedState.level === "new-evaluation") {
|
|
2597
|
+
max = Math.max(0, visibleEvaluators.length - 1);
|
|
2598
|
+
} else {
|
|
2599
|
+
max = 100;
|
|
2600
|
+
}
|
|
2357
2601
|
dispatch({ type: "MOVE_DOWN", max });
|
|
2358
2602
|
return;
|
|
2359
2603
|
}
|
|
@@ -2371,7 +2615,7 @@ function EvalsCliApp({
|
|
|
2371
2615
|
}
|
|
2372
2616
|
return;
|
|
2373
2617
|
}
|
|
2374
|
-
if (isBackKey(key)) {
|
|
2618
|
+
if (isBackKey(key) || input === "\x7F" || input === "\b") {
|
|
2375
2619
|
dispatch({ type: "BACK" });
|
|
2376
2620
|
return;
|
|
2377
2621
|
}
|
|
@@ -2424,7 +2668,8 @@ function EvalsCliApp({
|
|
|
2424
2668
|
{
|
|
2425
2669
|
state: clampedState,
|
|
2426
2670
|
filteredDatasets,
|
|
2427
|
-
selectedDataset
|
|
2671
|
+
selectedDataset,
|
|
2672
|
+
overviewRowCountRef
|
|
2428
2673
|
}
|
|
2429
2674
|
);
|
|
2430
2675
|
}
|