@m4trix/evals 0.13.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +287 -107
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +287 -107
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +643 -398
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +634 -389
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +79 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +5 -0
- package/dist/index.js +79 -11
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
package/dist/cli.js
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { withFullScreen, useScreenSize } from 'fullscreen-ink';
|
|
3
|
-
import
|
|
3
|
+
import React2, { useState, useRef, useReducer, useEffect, useMemo } from 'react';
|
|
4
4
|
import { useApp, useInput, Box, Text } from 'ink';
|
|
5
5
|
import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
|
|
6
6
|
import { resolve, relative, join, dirname } from 'path';
|
|
7
|
-
import {
|
|
7
|
+
import { LineGraph } from '@pppp606/ink-chart';
|
|
8
8
|
import { randomUUID } from 'crypto';
|
|
9
9
|
import { Effect, PubSub, Queue, Fiber, Ref } from 'effect';
|
|
10
10
|
import { existsSync } from 'fs';
|
|
11
11
|
import * as jitiModule from 'jiti';
|
|
12
12
|
import { readdir, readFile, mkdir, appendFile } from 'fs/promises';
|
|
13
13
|
import { pathToFileURL } from 'url';
|
|
14
|
+
import { diffLines } from 'diff';
|
|
14
15
|
|
|
15
16
|
var SEP = " ";
|
|
16
17
|
var ARROW = "\u203A";
|
|
@@ -78,7 +79,7 @@ function getBreadcrumbText(state, datasetName, runLabel) {
|
|
|
78
79
|
// src/cli/components/Footer.tsx
|
|
79
80
|
function getFooterText(state) {
|
|
80
81
|
if (state.level === "datasets") {
|
|
81
|
-
return "\u2191\u2193
|
|
82
|
+
return state.focus === "right" ? "\u2191\u2193 scroll Tab focus left / search q quit" : "\u2191\u2193 move Enter open Tab focus right / search q quit";
|
|
82
83
|
}
|
|
83
84
|
if (state.level === "runs") {
|
|
84
85
|
return "\u2191\u2193 move Enter details Backspace datasets Tab focus q quit";
|
|
@@ -610,6 +611,7 @@ function createInitialState(data, args) {
|
|
|
610
611
|
datasetMenuIndex,
|
|
611
612
|
runMenuIndex,
|
|
612
613
|
detailsScrollOffset: 0,
|
|
614
|
+
overviewScrollOffset: 0,
|
|
613
615
|
selectedEvaluatorIds: data.evaluators.slice(0, 2).map((item) => item.id),
|
|
614
616
|
evaluatorMenuIndex: 0,
|
|
615
617
|
searchQuery,
|
|
@@ -625,8 +627,11 @@ function reduceCliState(state, action) {
|
|
|
625
627
|
if (state.level === "details" && state.focus === "right") {
|
|
626
628
|
return { ...state, detailsScrollOffset: Math.max(0, state.detailsScrollOffset - 1) };
|
|
627
629
|
}
|
|
630
|
+
if (state.level === "datasets" && state.focus === "right") {
|
|
631
|
+
return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
|
|
632
|
+
}
|
|
628
633
|
if (state.level === "datasets") {
|
|
629
|
-
return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1) };
|
|
634
|
+
return { ...state, datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1), overviewScrollOffset: 0 };
|
|
630
635
|
}
|
|
631
636
|
if (state.level === "runs") {
|
|
632
637
|
return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
|
|
@@ -643,8 +648,11 @@ function reduceCliState(state, action) {
|
|
|
643
648
|
if (state.level === "details" && state.focus === "right") {
|
|
644
649
|
return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
|
|
645
650
|
}
|
|
651
|
+
if (state.level === "datasets" && state.focus === "right") {
|
|
652
|
+
return { ...state, overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1) };
|
|
653
|
+
}
|
|
646
654
|
if (state.level === "datasets") {
|
|
647
|
-
return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1) };
|
|
655
|
+
return { ...state, datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1), overviewScrollOffset: 0 };
|
|
648
656
|
}
|
|
649
657
|
if (state.level === "runs") {
|
|
650
658
|
return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
|
|
@@ -720,292 +728,6 @@ function reduceCliState(state, action) {
|
|
|
720
728
|
}
|
|
721
729
|
return state;
|
|
722
730
|
}
|
|
723
|
-
var LEFT_PANE_WIDTH2 = 44;
|
|
724
|
-
function DatasetsView({
|
|
725
|
-
state,
|
|
726
|
-
filteredDatasets,
|
|
727
|
-
selectedDataset
|
|
728
|
-
}) {
|
|
729
|
-
const leftFocused = state.focus === "left";
|
|
730
|
-
const rightFocused = state.focus === "right";
|
|
731
|
-
return /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
732
|
-
/* @__PURE__ */ jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
|
|
733
|
-
/* @__PURE__ */ jsx(SectionHeader, { children: "Datasets" }),
|
|
734
|
-
/* @__PURE__ */ jsx(
|
|
735
|
-
ListItem,
|
|
736
|
-
{
|
|
737
|
-
selected: state.datasetMenuIndex === 0,
|
|
738
|
-
label: "New evaluation",
|
|
739
|
-
itemKey: "datasets-new-eval"
|
|
740
|
-
}
|
|
741
|
-
),
|
|
742
|
-
filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsx(
|
|
743
|
-
ListItem,
|
|
744
|
-
{
|
|
745
|
-
selected: state.datasetMenuIndex === index + 1,
|
|
746
|
-
label: dataset.name,
|
|
747
|
-
itemKey: `dataset-${dataset.id}`
|
|
748
|
-
},
|
|
749
|
-
dataset.id
|
|
750
|
-
))
|
|
751
|
-
] }),
|
|
752
|
-
/* @__PURE__ */ jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
|
|
753
|
-
/* @__PURE__ */ jsx(SectionHeader, { children: "Overview" }),
|
|
754
|
-
/* @__PURE__ */ jsx(Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." })
|
|
755
|
-
] })
|
|
756
|
-
] });
|
|
757
|
-
}
|
|
758
|
-
function RunsView({
|
|
759
|
-
state,
|
|
760
|
-
dataset,
|
|
761
|
-
selectedRun
|
|
762
|
-
}) {
|
|
763
|
-
const runs = dataset?.runs ?? [];
|
|
764
|
-
const rightFocused = state.focus === "right";
|
|
765
|
-
return /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
766
|
-
/* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
|
|
767
|
-
/* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: !selectedRun ? /* @__PURE__ */ jsx(Text, { color: "gray", children: "Select a run to see summary metrics." }) : /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
|
|
768
|
-
/* @__PURE__ */ jsxs(Text, { children: [
|
|
769
|
-
/* @__PURE__ */ jsx(Text, { color: "gray", children: "Run:" }),
|
|
770
|
-
" ",
|
|
771
|
-
selectedRun.label,
|
|
772
|
-
" ",
|
|
773
|
-
/* @__PURE__ */ jsx(StatusText, { status: selectedRun.status })
|
|
774
|
-
] }),
|
|
775
|
-
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
776
|
-
"Commit: ",
|
|
777
|
-
selectedRun.meta.commit,
|
|
778
|
-
" Branch: ",
|
|
779
|
-
selectedRun.meta.branch,
|
|
780
|
-
" ",
|
|
781
|
-
"Seed: ",
|
|
782
|
-
selectedRun.meta.seed
|
|
783
|
-
] }),
|
|
784
|
-
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
785
|
-
/* @__PURE__ */ jsx(SectionHeader, { children: "Overall" }),
|
|
786
|
-
/* @__PURE__ */ jsx(
|
|
787
|
-
TextBar,
|
|
788
|
-
{
|
|
789
|
-
label: "pass rate",
|
|
790
|
-
value: selectedRun.performance.passRate,
|
|
791
|
-
format: (v) => `${v}%`
|
|
792
|
-
}
|
|
793
|
-
),
|
|
794
|
-
/* @__PURE__ */ jsx(
|
|
795
|
-
TextBar,
|
|
796
|
-
{
|
|
797
|
-
label: "avg score",
|
|
798
|
-
value: Math.round(selectedRun.performance.avgScore * 100)
|
|
799
|
-
}
|
|
800
|
-
),
|
|
801
|
-
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
802
|
-
/* @__PURE__ */ jsx(SectionHeader, { children: "Dimensions" }),
|
|
803
|
-
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsx(
|
|
804
|
-
TextBar,
|
|
805
|
-
{
|
|
806
|
-
label: dimension.name,
|
|
807
|
-
value: dimension.score
|
|
808
|
-
},
|
|
809
|
-
dimension.name
|
|
810
|
-
)),
|
|
811
|
-
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
812
|
-
/* @__PURE__ */ jsx(SectionHeader, { children: "Latency trend" }),
|
|
813
|
-
/* @__PURE__ */ jsx(
|
|
814
|
-
Sparkline,
|
|
815
|
-
{
|
|
816
|
-
data: selectedRun.performance.latencyHistoryMs ?? [
|
|
817
|
-
selectedRun.performance.latencyAvgMs - 40,
|
|
818
|
-
selectedRun.performance.latencyAvgMs - 10,
|
|
819
|
-
selectedRun.performance.latencyAvgMs + 20,
|
|
820
|
-
selectedRun.performance.latencyP95Ms - 80,
|
|
821
|
-
selectedRun.performance.latencyP95Ms
|
|
822
|
-
],
|
|
823
|
-
width: 24
|
|
824
|
-
}
|
|
825
|
-
)
|
|
826
|
-
] }) })
|
|
827
|
-
] });
|
|
828
|
-
}
|
|
829
|
-
|
|
830
|
-
// src/evals/metric.ts
|
|
831
|
-
var registry = /* @__PURE__ */ new Map();
|
|
832
|
-
var Metric = {
|
|
833
|
-
of(config) {
|
|
834
|
-
const def = {
|
|
835
|
-
id: config.id,
|
|
836
|
-
name: config.name,
|
|
837
|
-
aggregate: config.aggregate,
|
|
838
|
-
format: config.format,
|
|
839
|
-
make: (data) => ({ id: config.id, data })
|
|
840
|
-
};
|
|
841
|
-
registry.set(config.id, def);
|
|
842
|
-
return def;
|
|
843
|
-
}
|
|
844
|
-
};
|
|
845
|
-
function getMetricById(id) {
|
|
846
|
-
return registry.get(id);
|
|
847
|
-
}
|
|
848
|
-
|
|
849
|
-
// src/evals/score.ts
|
|
850
|
-
var registry2 = /* @__PURE__ */ new Map();
|
|
851
|
-
var Score = {
|
|
852
|
-
of(config) {
|
|
853
|
-
const def = {
|
|
854
|
-
id: config.id,
|
|
855
|
-
name: config.name,
|
|
856
|
-
displayStrategy: config.displayStrategy,
|
|
857
|
-
aggregate: config.aggregate,
|
|
858
|
-
format: config.format,
|
|
859
|
-
make: (data, options) => {
|
|
860
|
-
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
861
|
-
return {
|
|
862
|
-
id: config.id,
|
|
863
|
-
data,
|
|
864
|
-
...passed !== void 0 && { passed }
|
|
865
|
-
};
|
|
866
|
-
}
|
|
867
|
-
};
|
|
868
|
-
registry2.set(config.id, def);
|
|
869
|
-
return def;
|
|
870
|
-
}
|
|
871
|
-
};
|
|
872
|
-
function getScoreById(id) {
|
|
873
|
-
return registry2.get(id);
|
|
874
|
-
}
|
|
875
|
-
|
|
876
|
-
// src/evals/aggregators.ts
|
|
877
|
-
function aggregateAverage(values) {
|
|
878
|
-
if (values.length === 0) {
|
|
879
|
-
return { value: 0 };
|
|
880
|
-
}
|
|
881
|
-
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
882
|
-
return { value: sum / values.length };
|
|
883
|
-
}
|
|
884
|
-
function aggregateAll(values) {
|
|
885
|
-
return { passed: values.length > 0 && values.every((v) => v.passed) };
|
|
886
|
-
}
|
|
887
|
-
function aggregateTokenCountSum(values) {
|
|
888
|
-
const initial = {
|
|
889
|
-
input: 0,
|
|
890
|
-
output: 0,
|
|
891
|
-
inputCached: 0,
|
|
892
|
-
outputCached: 0
|
|
893
|
-
};
|
|
894
|
-
return values.reduce(
|
|
895
|
-
(acc, v) => ({
|
|
896
|
-
input: acc.input + (v.input ?? 0),
|
|
897
|
-
output: acc.output + (v.output ?? 0),
|
|
898
|
-
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
899
|
-
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
900
|
-
}),
|
|
901
|
-
initial
|
|
902
|
-
);
|
|
903
|
-
}
|
|
904
|
-
function aggregateLatencyAverage(values) {
|
|
905
|
-
if (values.length === 0) {
|
|
906
|
-
return { ms: 0 };
|
|
907
|
-
}
|
|
908
|
-
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
909
|
-
return { ms: sum / values.length };
|
|
910
|
-
}
|
|
911
|
-
|
|
912
|
-
// src/evals/metrics/standard.ts
|
|
913
|
-
Metric.of({
|
|
914
|
-
id: "token-count",
|
|
915
|
-
name: "Tokens",
|
|
916
|
-
aggregate: aggregateTokenCountSum,
|
|
917
|
-
format: (data, options) => {
|
|
918
|
-
const input = data.input ?? 0;
|
|
919
|
-
const output = data.output ?? 0;
|
|
920
|
-
const inputCached = data.inputCached ?? 0;
|
|
921
|
-
const outputCached = data.outputCached ?? 0;
|
|
922
|
-
const cached = inputCached + outputCached;
|
|
923
|
-
const base = `in:${input} out:${output} cached:${cached}`;
|
|
924
|
-
return options?.isAggregated ? `Total: ${base}` : base;
|
|
925
|
-
}
|
|
926
|
-
});
|
|
927
|
-
Metric.of({
|
|
928
|
-
id: "latency",
|
|
929
|
-
name: "Latency",
|
|
930
|
-
aggregate: aggregateLatencyAverage,
|
|
931
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
932
|
-
});
|
|
933
|
-
|
|
934
|
-
// src/evals/scores/standard.ts
|
|
935
|
-
Score.of({
|
|
936
|
-
id: "percent",
|
|
937
|
-
name: "Score",
|
|
938
|
-
displayStrategy: "bar",
|
|
939
|
-
format: (data, options) => options?.isAggregated ? `Avg: ${data.value.toFixed(2)}` : data.value.toFixed(2),
|
|
940
|
-
aggregate: aggregateAverage
|
|
941
|
-
});
|
|
942
|
-
Score.of({
|
|
943
|
-
id: "binary",
|
|
944
|
-
name: "Result",
|
|
945
|
-
displayStrategy: "passFail",
|
|
946
|
-
format: (data, options) => options?.isAggregated ? data.passed ? "All: PASSED" : "Some: FAILED" : data.passed ? "PASSED" : "NOT PASSED",
|
|
947
|
-
aggregate: aggregateAll
|
|
948
|
-
});
|
|
949
|
-
function createDiffLogEntry(expected, actual, options) {
|
|
950
|
-
const diff = diffString(expected, actual, { color: false });
|
|
951
|
-
return {
|
|
952
|
-
type: "diff",
|
|
953
|
-
label: options?.label,
|
|
954
|
-
expected,
|
|
955
|
-
actual,
|
|
956
|
-
diff: diff || "(no differences)"
|
|
957
|
-
};
|
|
958
|
-
}
|
|
959
|
-
function getDiffLines(entry) {
|
|
960
|
-
const raw = diffString(entry.expected, entry.actual, { color: false }) || "(no differences)";
|
|
961
|
-
return raw.split("\n").map((line) => {
|
|
962
|
-
const trimmed = line.trimStart();
|
|
963
|
-
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
964
|
-
return { type: "remove", line };
|
|
965
|
-
}
|
|
966
|
-
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
967
|
-
return { type: "add", line };
|
|
968
|
-
}
|
|
969
|
-
return { type: "context", line };
|
|
970
|
-
});
|
|
971
|
-
}
|
|
972
|
-
|
|
973
|
-
// src/runner/score-utils.ts
|
|
974
|
-
function toNumericScoreFromScores(scores) {
|
|
975
|
-
for (const item of scores) {
|
|
976
|
-
const def = getScoreById(item.id);
|
|
977
|
-
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
978
|
-
const value = item.data.value;
|
|
979
|
-
if (typeof value === "number" && Number.isFinite(value)) {
|
|
980
|
-
return value;
|
|
981
|
-
}
|
|
982
|
-
}
|
|
983
|
-
const numeric = toNumericScore(item.data);
|
|
984
|
-
if (numeric !== void 0) {
|
|
985
|
-
return numeric;
|
|
986
|
-
}
|
|
987
|
-
}
|
|
988
|
-
return void 0;
|
|
989
|
-
}
|
|
990
|
-
function toNumericScore(value) {
|
|
991
|
-
if (typeof value === "number" && Number.isFinite(value)) {
|
|
992
|
-
return value;
|
|
993
|
-
}
|
|
994
|
-
if (typeof value !== "object" || value === null) {
|
|
995
|
-
return void 0;
|
|
996
|
-
}
|
|
997
|
-
const obj = value;
|
|
998
|
-
if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
|
|
999
|
-
return obj.score;
|
|
1000
|
-
}
|
|
1001
|
-
const numberValues = Object.values(value).filter(
|
|
1002
|
-
(entry) => typeof entry === "number" && Number.isFinite(entry)
|
|
1003
|
-
);
|
|
1004
|
-
if (numberValues.length === 0) {
|
|
1005
|
-
return void 0;
|
|
1006
|
-
}
|
|
1007
|
-
return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
|
|
1008
|
-
}
|
|
1009
731
|
|
|
1010
732
|
// src/runner/config.ts
|
|
1011
733
|
var defaultRunnerConfig = {
|
|
@@ -1187,75 +909,311 @@ async function loadModuleExports(filePath) {
|
|
|
1187
909
|
if (!createJiti2) {
|
|
1188
910
|
throw new Error("Failed to initialize jiti TypeScript loader");
|
|
1189
911
|
}
|
|
1190
|
-
jitiLoader = createJiti2(import.meta.url, {
|
|
1191
|
-
interopDefault: true,
|
|
1192
|
-
moduleCache: true
|
|
1193
|
-
});
|
|
912
|
+
jitiLoader = createJiti2(import.meta.url, {
|
|
913
|
+
interopDefault: true,
|
|
914
|
+
moduleCache: true
|
|
915
|
+
});
|
|
916
|
+
}
|
|
917
|
+
const loaded2 = jitiLoader.import ? await jitiLoader.import(filePath) : await Promise.resolve(jitiLoader(filePath));
|
|
918
|
+
return Object.values(loaded2);
|
|
919
|
+
}
|
|
920
|
+
const moduleUrl = pathToFileURL(filePath).href;
|
|
921
|
+
const loaded = await import(moduleUrl);
|
|
922
|
+
return Object.values(loaded);
|
|
923
|
+
}
|
|
924
|
+
async function collectDatasetsFromFiles(config) {
|
|
925
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
926
|
+
const matched = files.filter(
|
|
927
|
+
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
928
|
+
);
|
|
929
|
+
const found = await Promise.all(
|
|
930
|
+
matched.map(async (absolutePath) => {
|
|
931
|
+
const exports = await loadModuleExports(absolutePath);
|
|
932
|
+
const datasets = exports.filter(isDatasetLike);
|
|
933
|
+
const relPath = relative(config.rootDir, absolutePath);
|
|
934
|
+
return datasets.map((dataset) => ({
|
|
935
|
+
id: toId("dataset", relPath, dataset.getName()),
|
|
936
|
+
filePath: relPath,
|
|
937
|
+
dataset
|
|
938
|
+
}));
|
|
939
|
+
})
|
|
940
|
+
);
|
|
941
|
+
return found.flat();
|
|
942
|
+
}
|
|
943
|
+
async function collectEvaluatorsFromFiles(config) {
|
|
944
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
945
|
+
const matched = files.filter(
|
|
946
|
+
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
947
|
+
);
|
|
948
|
+
const found = await Promise.all(
|
|
949
|
+
matched.map(async (absolutePath) => {
|
|
950
|
+
const exports = await loadModuleExports(absolutePath);
|
|
951
|
+
const evaluators = exports.filter(isEvaluatorLike);
|
|
952
|
+
const relPath = relative(config.rootDir, absolutePath);
|
|
953
|
+
return evaluators.map((evaluator) => ({
|
|
954
|
+
id: toId("evaluator", relPath, evaluator.getName()),
|
|
955
|
+
filePath: relPath,
|
|
956
|
+
evaluator
|
|
957
|
+
}));
|
|
958
|
+
})
|
|
959
|
+
);
|
|
960
|
+
return found.flat();
|
|
961
|
+
}
|
|
962
|
+
async function collectTestCasesFromFiles(config) {
|
|
963
|
+
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
964
|
+
const matched = files.filter(
|
|
965
|
+
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
966
|
+
);
|
|
967
|
+
const found = await Promise.all(
|
|
968
|
+
matched.map(async (absolutePath) => {
|
|
969
|
+
const exports = await loadModuleExports(absolutePath);
|
|
970
|
+
const testCases = exports.filter(isTestCaseLike);
|
|
971
|
+
const relPath = relative(config.rootDir, absolutePath);
|
|
972
|
+
return testCases.map((testCase) => ({
|
|
973
|
+
id: toId("test-case", relPath, testCase.getName()),
|
|
974
|
+
filePath: relPath,
|
|
975
|
+
testCase
|
|
976
|
+
}));
|
|
977
|
+
})
|
|
978
|
+
);
|
|
979
|
+
return found.flat();
|
|
980
|
+
}
|
|
981
|
+
function toJsonLines(value) {
|
|
982
|
+
try {
|
|
983
|
+
return JSON.stringify(value, null, 2);
|
|
984
|
+
} catch {
|
|
985
|
+
return String(value);
|
|
986
|
+
}
|
|
987
|
+
}
|
|
988
|
+
function formatDiffString(changes) {
|
|
989
|
+
const lines = [];
|
|
990
|
+
for (const part of changes) {
|
|
991
|
+
const prefix = part.added ? "+" : part.removed ? "-" : " ";
|
|
992
|
+
const partLines = part.value.split("\n");
|
|
993
|
+
if (partLines[partLines.length - 1] === "") {
|
|
994
|
+
partLines.pop();
|
|
995
|
+
}
|
|
996
|
+
for (const line of partLines) {
|
|
997
|
+
lines.push(`${prefix} ${line}`);
|
|
998
|
+
}
|
|
999
|
+
}
|
|
1000
|
+
return lines.join("\n");
|
|
1001
|
+
}
|
|
1002
|
+
function createDiffString(expected, actual) {
|
|
1003
|
+
const expectedStr = toJsonLines(expected);
|
|
1004
|
+
const actualStr = toJsonLines(actual);
|
|
1005
|
+
const changes = diffLines(expectedStr, actualStr);
|
|
1006
|
+
return formatDiffString(changes);
|
|
1007
|
+
}
|
|
1008
|
+
function createDiffLogEntry(expected, actual, options) {
|
|
1009
|
+
const diff = createDiffString(expected, actual);
|
|
1010
|
+
return {
|
|
1011
|
+
type: "diff",
|
|
1012
|
+
label: options?.label,
|
|
1013
|
+
expected,
|
|
1014
|
+
actual,
|
|
1015
|
+
diff: diff || "(no differences)"
|
|
1016
|
+
};
|
|
1017
|
+
}
|
|
1018
|
+
function getDiffLines(entry) {
|
|
1019
|
+
const raw = createDiffString(entry.expected, entry.actual) || "(no differences)";
|
|
1020
|
+
return raw.split("\n").map((line) => {
|
|
1021
|
+
const trimmed = line.trimStart();
|
|
1022
|
+
if (trimmed.startsWith("-") && !trimmed.startsWith("---")) {
|
|
1023
|
+
return { type: "remove", line };
|
|
1024
|
+
}
|
|
1025
|
+
if (trimmed.startsWith("+") && !trimmed.startsWith("+++")) {
|
|
1026
|
+
return { type: "add", line };
|
|
1027
|
+
}
|
|
1028
|
+
return { type: "context", line };
|
|
1029
|
+
});
|
|
1030
|
+
}
|
|
1031
|
+
|
|
1032
|
+
// src/evals/metric.ts
|
|
1033
|
+
var registry = /* @__PURE__ */ new Map();
|
|
1034
|
+
var Metric = {
|
|
1035
|
+
of(config) {
|
|
1036
|
+
const def = {
|
|
1037
|
+
id: config.id,
|
|
1038
|
+
name: config.name,
|
|
1039
|
+
aggregate: config.aggregate,
|
|
1040
|
+
format: config.format,
|
|
1041
|
+
make: (data) => ({ id: config.id, data })
|
|
1042
|
+
};
|
|
1043
|
+
registry.set(config.id, def);
|
|
1044
|
+
return def;
|
|
1045
|
+
}
|
|
1046
|
+
};
|
|
1047
|
+
function getMetricById(id) {
|
|
1048
|
+
return registry.get(id);
|
|
1049
|
+
}
|
|
1050
|
+
|
|
1051
|
+
// src/evals/score.ts
|
|
1052
|
+
var registry2 = /* @__PURE__ */ new Map();
|
|
1053
|
+
var Score = {
|
|
1054
|
+
of(config) {
|
|
1055
|
+
const def = {
|
|
1056
|
+
id: config.id,
|
|
1057
|
+
name: config.name,
|
|
1058
|
+
displayStrategy: config.displayStrategy,
|
|
1059
|
+
aggregate: config.aggregate,
|
|
1060
|
+
format: config.format,
|
|
1061
|
+
make: (data, options) => {
|
|
1062
|
+
const passed = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
|
|
1063
|
+
return {
|
|
1064
|
+
id: config.id,
|
|
1065
|
+
data,
|
|
1066
|
+
...passed !== void 0 && { passed }
|
|
1067
|
+
};
|
|
1068
|
+
}
|
|
1069
|
+
};
|
|
1070
|
+
registry2.set(config.id, def);
|
|
1071
|
+
return def;
|
|
1072
|
+
}
|
|
1073
|
+
};
|
|
1074
|
+
function getScoreById(id) {
|
|
1075
|
+
return registry2.get(id);
|
|
1076
|
+
}
|
|
1077
|
+
|
|
1078
|
+
// src/evals/aggregators.ts
|
|
1079
|
+
function aggregateAverageWithVariance(values) {
|
|
1080
|
+
if (values.length === 0) {
|
|
1081
|
+
return { value: 0, count: 0 };
|
|
1082
|
+
}
|
|
1083
|
+
const sum = values.reduce((s, v) => s + v.value, 0);
|
|
1084
|
+
const sumSq = values.reduce((s, v) => s + v.value * v.value, 0);
|
|
1085
|
+
const mean = sum / values.length;
|
|
1086
|
+
let stdDev;
|
|
1087
|
+
if (values.length >= 2) {
|
|
1088
|
+
const variance = (sumSq - values.length * mean * mean) / (values.length - 1);
|
|
1089
|
+
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
1090
|
+
}
|
|
1091
|
+
return { value: mean, stdDev, count: values.length };
|
|
1092
|
+
}
|
|
1093
|
+
function aggregateAll(values) {
|
|
1094
|
+
const total = values.length;
|
|
1095
|
+
const passedCount = values.filter((v) => v.passed).length;
|
|
1096
|
+
return {
|
|
1097
|
+
passed: total > 0 && values.every((v) => v.passed),
|
|
1098
|
+
passedCount,
|
|
1099
|
+
totalCount: total
|
|
1100
|
+
};
|
|
1101
|
+
}
|
|
1102
|
+
function aggregateTokenCountSum(values) {
|
|
1103
|
+
const initial = {
|
|
1104
|
+
input: 0,
|
|
1105
|
+
output: 0,
|
|
1106
|
+
inputCached: 0,
|
|
1107
|
+
outputCached: 0
|
|
1108
|
+
};
|
|
1109
|
+
return values.reduce(
|
|
1110
|
+
(acc, v) => ({
|
|
1111
|
+
input: acc.input + (v.input ?? 0),
|
|
1112
|
+
output: acc.output + (v.output ?? 0),
|
|
1113
|
+
inputCached: acc.inputCached + (v.inputCached ?? 0),
|
|
1114
|
+
outputCached: acc.outputCached + (v.outputCached ?? 0)
|
|
1115
|
+
}),
|
|
1116
|
+
initial
|
|
1117
|
+
);
|
|
1118
|
+
}
|
|
1119
|
+
function aggregateLatencyAverage(values) {
|
|
1120
|
+
if (values.length === 0) {
|
|
1121
|
+
return { ms: 0 };
|
|
1122
|
+
}
|
|
1123
|
+
const sum = values.reduce((s, v) => s + v.ms, 0);
|
|
1124
|
+
return { ms: sum / values.length };
|
|
1125
|
+
}
|
|
1126
|
+
|
|
1127
|
+
// src/evals/metrics/standard.ts
|
|
1128
|
+
Metric.of({
|
|
1129
|
+
id: "token-count",
|
|
1130
|
+
name: "Tokens",
|
|
1131
|
+
aggregate: aggregateTokenCountSum,
|
|
1132
|
+
format: (data, options) => {
|
|
1133
|
+
const input = data.input ?? 0;
|
|
1134
|
+
const output = data.output ?? 0;
|
|
1135
|
+
const inputCached = data.inputCached ?? 0;
|
|
1136
|
+
const outputCached = data.outputCached ?? 0;
|
|
1137
|
+
const cached = inputCached + outputCached;
|
|
1138
|
+
const base = `in:${input} out:${output} cached:${cached}`;
|
|
1139
|
+
return options?.isAggregated ? `Total: ${base}` : base;
|
|
1140
|
+
}
|
|
1141
|
+
});
|
|
1142
|
+
Metric.of({
|
|
1143
|
+
id: "latency",
|
|
1144
|
+
name: "Latency",
|
|
1145
|
+
aggregate: aggregateLatencyAverage,
|
|
1146
|
+
format: (data, options) => options?.isAggregated ? `Avg: ${data.ms}ms` : `${data.ms}ms`
|
|
1147
|
+
});
|
|
1148
|
+
|
|
1149
|
+
// src/evals/scores/standard.ts
|
|
1150
|
+
Score.of({
|
|
1151
|
+
id: "percent",
|
|
1152
|
+
name: "Score",
|
|
1153
|
+
displayStrategy: "bar",
|
|
1154
|
+
format: (data, options) => {
|
|
1155
|
+
if (options?.isAggregated) {
|
|
1156
|
+
return data.stdDev != null ? `Avg: ${data.value.toFixed(2)} \xB1 ${data.stdDev.toFixed(2)}` : `Avg: ${data.value.toFixed(2)}`;
|
|
1157
|
+
}
|
|
1158
|
+
return data.value.toFixed(2);
|
|
1159
|
+
},
|
|
1160
|
+
aggregate: aggregateAverageWithVariance
|
|
1161
|
+
});
|
|
1162
|
+
Score.of({
|
|
1163
|
+
id: "binary",
|
|
1164
|
+
name: "Result",
|
|
1165
|
+
displayStrategy: "passFail",
|
|
1166
|
+
format: (data, options) => {
|
|
1167
|
+
if (options?.isAggregated) {
|
|
1168
|
+
const base = data.passed ? "All: PASSED" : "Some: FAILED";
|
|
1169
|
+
if (data.passedCount != null && data.totalCount != null && data.totalCount > 1) {
|
|
1170
|
+
return `${base} (${data.passedCount}/${data.totalCount})`;
|
|
1171
|
+
}
|
|
1172
|
+
return base;
|
|
1173
|
+
}
|
|
1174
|
+
return data.passed ? "PASSED" : "NOT PASSED";
|
|
1175
|
+
},
|
|
1176
|
+
aggregate: aggregateAll
|
|
1177
|
+
});
|
|
1178
|
+
|
|
1179
|
+
// src/runner/score-utils.ts
|
|
1180
|
+
function toNumericScoreFromScores(scores) {
|
|
1181
|
+
for (const item of scores) {
|
|
1182
|
+
const def = getScoreById(item.id);
|
|
1183
|
+
if (def && def.displayStrategy === "bar" && typeof item.data === "object" && item.data !== null && "value" in item.data) {
|
|
1184
|
+
const value = item.data.value;
|
|
1185
|
+
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1186
|
+
return value;
|
|
1187
|
+
}
|
|
1188
|
+
}
|
|
1189
|
+
const numeric = toNumericScore(item.data);
|
|
1190
|
+
if (numeric !== void 0) {
|
|
1191
|
+
return numeric;
|
|
1194
1192
|
}
|
|
1195
|
-
const loaded2 = jitiLoader.import ? await jitiLoader.import(filePath) : await Promise.resolve(jitiLoader(filePath));
|
|
1196
|
-
return Object.values(loaded2);
|
|
1197
1193
|
}
|
|
1198
|
-
|
|
1199
|
-
const loaded = await import(moduleUrl);
|
|
1200
|
-
return Object.values(loaded);
|
|
1201
|
-
}
|
|
1202
|
-
async function collectDatasetsFromFiles(config) {
|
|
1203
|
-
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1204
|
-
const matched = files.filter(
|
|
1205
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
1206
|
-
);
|
|
1207
|
-
const found = await Promise.all(
|
|
1208
|
-
matched.map(async (absolutePath) => {
|
|
1209
|
-
const exports = await loadModuleExports(absolutePath);
|
|
1210
|
-
const datasets = exports.filter(isDatasetLike);
|
|
1211
|
-
const relPath = relative(config.rootDir, absolutePath);
|
|
1212
|
-
return datasets.map((dataset) => ({
|
|
1213
|
-
id: toId("dataset", relPath, dataset.getName()),
|
|
1214
|
-
filePath: relPath,
|
|
1215
|
-
dataset
|
|
1216
|
-
}));
|
|
1217
|
-
})
|
|
1218
|
-
);
|
|
1219
|
-
return found.flat();
|
|
1220
|
-
}
|
|
1221
|
-
async function collectEvaluatorsFromFiles(config) {
|
|
1222
|
-
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
1223
|
-
const matched = files.filter(
|
|
1224
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
1225
|
-
);
|
|
1226
|
-
const found = await Promise.all(
|
|
1227
|
-
matched.map(async (absolutePath) => {
|
|
1228
|
-
const exports = await loadModuleExports(absolutePath);
|
|
1229
|
-
const evaluators = exports.filter(isEvaluatorLike);
|
|
1230
|
-
const relPath = relative(config.rootDir, absolutePath);
|
|
1231
|
-
return evaluators.map((evaluator) => ({
|
|
1232
|
-
id: toId("evaluator", relPath, evaluator.getName()),
|
|
1233
|
-
filePath: relPath,
|
|
1234
|
-
evaluator
|
|
1235
|
-
}));
|
|
1236
|
-
})
|
|
1237
|
-
);
|
|
1238
|
-
return found.flat();
|
|
1194
|
+
return void 0;
|
|
1239
1195
|
}
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
)
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
testCase
|
|
1254
|
-
}));
|
|
1255
|
-
})
|
|
1196
|
+
function toNumericScore(value) {
|
|
1197
|
+
if (typeof value === "number" && Number.isFinite(value)) {
|
|
1198
|
+
return value;
|
|
1199
|
+
}
|
|
1200
|
+
if (typeof value !== "object" || value === null) {
|
|
1201
|
+
return void 0;
|
|
1202
|
+
}
|
|
1203
|
+
const obj = value;
|
|
1204
|
+
if ("score" in obj && typeof obj.score === "number" && Number.isFinite(obj.score)) {
|
|
1205
|
+
return obj.score;
|
|
1206
|
+
}
|
|
1207
|
+
const numberValues = Object.values(value).filter(
|
|
1208
|
+
(entry) => typeof entry === "number" && Number.isFinite(entry)
|
|
1256
1209
|
);
|
|
1257
|
-
|
|
1210
|
+
if (numberValues.length === 0) {
|
|
1211
|
+
return void 0;
|
|
1212
|
+
}
|
|
1213
|
+
return numberValues.reduce((sum, entry) => sum + entry, 0) / numberValues.length;
|
|
1258
1214
|
}
|
|
1215
|
+
|
|
1216
|
+
// src/runner/execution.ts
|
|
1259
1217
|
function computeEvaluatorPassed(evaluator, result, scores) {
|
|
1260
1218
|
const scoresWithPassed = scores.filter((s) => "passed" in s && s.passed !== void 0);
|
|
1261
1219
|
if (scoresWithPassed.length > 0) {
|
|
@@ -1926,6 +1884,240 @@ var EffectRunner = class {
|
|
|
1926
1884
|
);
|
|
1927
1885
|
}
|
|
1928
1886
|
};
|
|
1887
|
+
var LEFT_PANE_WIDTH2 = 44;
|
|
1888
|
+
var MAX_RUNS_FOR_CHART = 12;
|
|
1889
|
+
var MAX_RUNS_FOR_TREND = 20;
|
|
1890
|
+
var TREND_BATCH_SIZE = 4;
|
|
1891
|
+
function extractRunAverageScore(testCases) {
|
|
1892
|
+
const scores = [];
|
|
1893
|
+
for (const tc of testCases) {
|
|
1894
|
+
for (const es of tc.evaluatorScores) {
|
|
1895
|
+
const n = toNumericScoreFromScores(es.scores);
|
|
1896
|
+
if (n !== void 0) {
|
|
1897
|
+
scores.push(n);
|
|
1898
|
+
}
|
|
1899
|
+
}
|
|
1900
|
+
}
|
|
1901
|
+
if (scores.length === 0)
|
|
1902
|
+
return void 0;
|
|
1903
|
+
return scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
1904
|
+
}
|
|
1905
|
+
async function loadRunScores(runs) {
|
|
1906
|
+
const results = [];
|
|
1907
|
+
for (const run of runs) {
|
|
1908
|
+
const artifact = run.meta?.artifact;
|
|
1909
|
+
if (!artifact)
|
|
1910
|
+
continue;
|
|
1911
|
+
try {
|
|
1912
|
+
const path = resolve(artifact);
|
|
1913
|
+
const testCases = await parseArtifactFile(path);
|
|
1914
|
+
const avg = extractRunAverageScore(testCases);
|
|
1915
|
+
if (avg !== void 0) {
|
|
1916
|
+
results.push({
|
|
1917
|
+
runId: run.id,
|
|
1918
|
+
label: run.label,
|
|
1919
|
+
value: avg
|
|
1920
|
+
});
|
|
1921
|
+
}
|
|
1922
|
+
} catch {
|
|
1923
|
+
}
|
|
1924
|
+
}
|
|
1925
|
+
return results;
|
|
1926
|
+
}
|
|
1927
|
+
/**
 * Splits `values` into consecutive batches and returns the mean of each batch.
 * The final batch may hold fewer than `batchSize` values.
 *
 * @param values Numbers to aggregate, in order.
 * @param batchSize Number of values per batch; must be greater than zero.
 * @returns One average per batch, or an empty array when `values` is empty
 *   or `batchSize` is not a positive number.
 */
function batchAverage(values, batchSize) {
  // A zero or negative step would never move the index past the end of
  // `values`, so the loop below would not terminate.
  if (!(batchSize > 0)) {
    return [];
  }
  const batches = [];
  for (let start = 0; start < values.length; start += batchSize) {
    const batch = values.slice(start, start + batchSize);
    if (batch.length > 0) {
      batches.push(batch.reduce((sum, value) => sum + value, 0) / batch.length);
    }
  }
  return batches;
}
|
|
1937
|
+
var OVERVIEW_PAGE_SIZE = 15;
|
|
1938
|
+
/**
 * Datasets screen: a left pane listing "New evaluation" plus every filtered
 * dataset, and a right pane with a scrollable overview of the selected
 * dataset (description, per-run score bars and a batched trend graph).
 *
 * @param state CLI state; reads `focus`, `datasetMenuIndex` and
 *   `overviewScrollOffset`.
 * @param filteredDatasets Datasets shown in the left pane.
 * @param selectedDataset Dataset whose overview is rendered, if any.
 * @param overviewRowCountRef Optional ref that receives the total number of
 *   overview rows on every render.
 */
function DatasetsView({
  state,
  filteredDatasets,
  selectedDataset,
  overviewRowCountRef
}) {
  const leftFocused = state.focus === "left";
  const rightFocused = state.focus === "right";
  const [runScores, setRunScores] = useState([]);
  const [loading, setLoading] = useState(false);
  useEffect(() => {
    if (!selectedDataset?.runs?.length) {
      setRunScores([]);
      setLoading(false);
      return;
    }
    // Guards against a slower load for a previously selected dataset
    // overwriting the scores of the dataset that is selected now.
    let cancelled = false;
    setLoading(true);
    const runs = selectedDataset.runs.slice(0, MAX_RUNS_FOR_TREND);
    loadRunScores(runs).then((scores) => {
      if (!cancelled) {
        setRunScores(scores);
      }
    }).catch(() => {
      if (!cancelled) {
        setRunScores([]);
      }
    }).finally(() => {
      if (!cancelled) {
        setLoading(false);
      }
    });
    return () => {
      cancelled = true;
    };
  }, [selectedDataset?.id, selectedDataset?.runs?.length]);
  // Derived arrays are memoized on `runScores` so their identity is stable
  // between renders; otherwise the overview memo below would never be reused.
  const barData = useMemo(
    () => runScores.slice(0, MAX_RUNS_FOR_CHART).reverse(),
    [runScores]
  );
  const trendBatched = useMemo(() => {
    const trendValues = runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse();
    return batchAverage(trendValues, TREND_BATCH_SIZE);
  }, [runScores]);
  const hasRuns = Boolean(selectedDataset?.runs?.length);
  const overview = selectedDataset?.overview;
  const overviewRows = useMemo(() => {
    const rows = [];
    rows.push(
      /* @__PURE__ */ jsx(Text, { color: "gray", children: overview ?? "Select a dataset to inspect prior runs." }, "overview")
    );
    if (hasRuns) {
      if (loading) {
        rows.push(
          /* @__PURE__ */ jsx(Text, { color: "gray", children: "Loading run scores\u2026" }, "loading")
        );
      } else if (runScores.length > 0) {
        rows.push(
          /* @__PURE__ */ jsx(Text, { color: "gray", children: "Scores (last runs)" }, "scores-header")
        );
        for (const d of barData) {
          rows.push(
            /* @__PURE__ */ jsx(
              TextBar,
              {
                label: d.label,
                value: d.value,
                labelWidth: 14,
                barWidth: 24,
                max: 100,
                format: (v) => v.toFixed(1)
              },
              d.runId
            )
          );
        }
        if (trendBatched.length > 0) {
          rows.push(
            /* @__PURE__ */ jsx(Text, { color: "gray", children: "Avg trend (last 20, batched by 4)" }, "trend-header")
          );
          rows.push(
            /* @__PURE__ */ jsx(Box, { children: /* @__PURE__ */ jsx(
              LineGraph,
              {
                data: [{ values: trendBatched, color: "cyan" }],
                height: 5,
                width: 45,
                showYAxis: true,
                xLabels: ["older", "newer"]
              }
            ) }, "trend-graph")
          );
        }
      }
    }
    return rows;
  }, [overview, hasRuns, loading, runScores, barData, trendBatched]);
  // Publish the row count so the owner of the ref can bound scrolling.
  if (overviewRowCountRef) {
    overviewRowCountRef.current = overviewRows.length;
  }
  // A missing offset would turn into NaN inside Math.max and hide every row.
  const offset = Math.max(0, state.overviewScrollOffset ?? 0);
  const visibleRows = overviewRows.slice(offset, offset + OVERVIEW_PAGE_SIZE);
  return /* @__PURE__ */ jsxs(Fragment, { children: [
    /* @__PURE__ */ jsxs(Pane, { width: LEFT_PANE_WIDTH2, focused: leftFocused, children: [
      /* @__PURE__ */ jsx(SectionHeader, { children: "Datasets" }),
      /* @__PURE__ */ jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === 0,
          label: "New evaluation",
          itemKey: "datasets-new-eval"
        }
      ),
      // Menu index 0 is "New evaluation", so datasets start at index 1.
      filteredDatasets.map((dataset, index) => /* @__PURE__ */ jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === index + 1,
          label: dataset.name,
          itemKey: `dataset-${dataset.id}`
        },
        dataset.id
      ))
    ] }),
    /* @__PURE__ */ jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
      /* @__PURE__ */ jsx(SectionHeader, { children: "Overview" }),
      /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visibleRows.map((row, i) => /* @__PURE__ */ jsx(Box, { children: row }, offset + i)) })
    ] })
  ] });
}
|
|
2050
|
+
/**
 * Runs screen: the runs sidebar next to a summary pane for the selected run.
 * Without a selected run the pane shows a hint; otherwise it lists the run's
 * label and status, commit/branch/seed, overall bars, per-dimension bars and
 * a latency sparkline.
 *
 * @param state CLI state; `focus` decides whether the right pane is focused.
 * @param dataset Dataset whose runs are listed (may be undefined).
 * @param selectedRun Run to summarize, if any.
 */
function RunsView({
  state,
  dataset,
  selectedRun
}) {
  const runs = dataset?.runs ?? [];
  const sidebar = /* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs });
  const paneProps = { flexGrow: 1, marginLeft: 1, focused: state.focus === "right" };
  if (!selectedRun) {
    const hint = /* @__PURE__ */ jsx(Text, { color: "gray", children: "Select a run to see summary metrics." });
    return /* @__PURE__ */ jsxs(Fragment, { children: [
      sidebar,
      /* @__PURE__ */ jsx(Pane, { ...paneProps, children: hint })
    ] });
  }
  const titleLine = /* @__PURE__ */ jsxs(Text, { children: [
    /* @__PURE__ */ jsx(Text, { color: "gray", children: "Run:" }),
    " ",
    selectedRun.label,
    " ",
    /* @__PURE__ */ jsx(StatusText, { status: selectedRun.status })
  ] });
  const metaLine = /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
    "Commit: ",
    selectedRun.meta.commit,
    " Branch: ",
    selectedRun.meta.branch,
    " ",
    "Seed: ",
    selectedRun.meta.seed
  ] });
  const passRateBar = /* @__PURE__ */ jsx(
    TextBar,
    {
      label: "pass rate",
      value: selectedRun.performance.passRate,
      format: (v) => `${v}%`
    }
  );
  const avgScoreBar = /* @__PURE__ */ jsx(
    TextBar,
    {
      label: "avg score",
      value: Math.round(selectedRun.performance.avgScore * 100)
    }
  );
  const dimensionBars = selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsx(
    TextBar,
    {
      label: dimension.name,
      value: dimension.score
    },
    dimension.name
  ));
  // When the run has no recorded latency history, the sparkline is fed a
  // series derived from the average and p95 latencies.
  const latencySeries = selectedRun.performance.latencyHistoryMs ?? [
    selectedRun.performance.latencyAvgMs - 40,
    selectedRun.performance.latencyAvgMs - 10,
    selectedRun.performance.latencyAvgMs + 20,
    selectedRun.performance.latencyP95Ms - 80,
    selectedRun.performance.latencyP95Ms
  ];
  const summary = /* @__PURE__ */ jsxs(Box, { flexDirection: "column", children: [
    titleLine,
    metaLine,
    /* @__PURE__ */ jsx(Text, { children: " " }),
    /* @__PURE__ */ jsx(SectionHeader, { children: "Overall" }),
    passRateBar,
    avgScoreBar,
    /* @__PURE__ */ jsx(Text, { children: " " }),
    /* @__PURE__ */ jsx(SectionHeader, { children: "Dimensions" }),
    dimensionBars,
    /* @__PURE__ */ jsx(Text, { children: " " }),
    /* @__PURE__ */ jsx(SectionHeader, { children: "Latency trend" }),
    /* @__PURE__ */ jsx(Sparkline, { data: latencySeries, width: 24 })
  ] });
  return /* @__PURE__ */ jsxs(Fragment, { children: [
    sidebar,
    /* @__PURE__ */ jsx(Pane, { ...paneProps, children: summary })
  ] });
}
|
|
1929
2121
|
var DETAILS_PAGE_SIZE = 20;
|
|
1930
2122
|
function scoreColor(score) {
|
|
1931
2123
|
if (score >= 80)
|
|
@@ -1934,7 +2126,7 @@ function scoreColor(score) {
|
|
|
1934
2126
|
return "yellow";
|
|
1935
2127
|
return "red";
|
|
1936
2128
|
}
|
|
1937
|
-
function formatScorePart(item
|
|
2129
|
+
function formatScorePart(item) {
|
|
1938
2130
|
const def = getScoreById(item.id);
|
|
1939
2131
|
if (!def) {
|
|
1940
2132
|
const numeric = toNumericScore(item.data);
|
|
@@ -1964,7 +2156,7 @@ function CheckRow({
|
|
|
1964
2156
|
" ",
|
|
1965
2157
|
/* @__PURE__ */ jsx(Text, { color, bold: true, children: status }),
|
|
1966
2158
|
detail ? /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1967
|
-
"
|
|
2159
|
+
" (",
|
|
1968
2160
|
detail,
|
|
1969
2161
|
")"
|
|
1970
2162
|
] }) : null
|
|
@@ -1984,21 +2176,21 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
1984
2176
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1985
2177
|
"Model: ",
|
|
1986
2178
|
meta.model,
|
|
1987
|
-
"
|
|
2179
|
+
" Provider: ",
|
|
1988
2180
|
meta.provider
|
|
1989
2181
|
] }, "meta-1"),
|
|
1990
2182
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1991
2183
|
"Commit: ",
|
|
1992
2184
|
meta.commit,
|
|
1993
|
-
"
|
|
2185
|
+
" Branch: ",
|
|
1994
2186
|
meta.branch,
|
|
1995
|
-
"
|
|
2187
|
+
" Seed: ",
|
|
1996
2188
|
meta.seed
|
|
1997
2189
|
] }, "meta-2"),
|
|
1998
2190
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
1999
2191
|
"Duration: ",
|
|
2000
2192
|
meta.duration,
|
|
2001
|
-
"
|
|
2193
|
+
" Concurrency: ",
|
|
2002
2194
|
meta.concurrency
|
|
2003
2195
|
] }, "meta-3"),
|
|
2004
2196
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
@@ -2010,7 +2202,15 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2010
2202
|
...dimensions.map((d) => /* @__PURE__ */ jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
|
|
2011
2203
|
/* @__PURE__ */ jsx(Text, { children: " " }, "sp2"),
|
|
2012
2204
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
|
|
2013
|
-
...checks.map((c) => /* @__PURE__ */ jsx(
|
|
2205
|
+
...checks.map((c) => /* @__PURE__ */ jsx(
|
|
2206
|
+
CheckRow,
|
|
2207
|
+
{
|
|
2208
|
+
name: c.name,
|
|
2209
|
+
passed: c.passed,
|
|
2210
|
+
detail: c.detail
|
|
2211
|
+
},
|
|
2212
|
+
`chk-${c.name}`
|
|
2213
|
+
)),
|
|
2014
2214
|
/* @__PURE__ */ jsx(Text, { children: " " }, "sp3"),
|
|
2015
2215
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Performance" }, "perf-h"),
|
|
2016
2216
|
/* @__PURE__ */ jsx(
|
|
@@ -2023,16 +2223,16 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2023
2223
|
"perf-rate"
|
|
2024
2224
|
),
|
|
2025
2225
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2026
|
-
"latency avg
|
|
2226
|
+
"latency avg ",
|
|
2027
2227
|
performance.latencyAvgMs,
|
|
2028
|
-
"ms
|
|
2228
|
+
"ms p95 ",
|
|
2029
2229
|
performance.latencyP95Ms,
|
|
2030
2230
|
"ms"
|
|
2031
2231
|
] }, "perf-lat"),
|
|
2032
2232
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2033
|
-
"tokens avg
|
|
2233
|
+
"tokens avg ",
|
|
2034
2234
|
performance.tokensAvg,
|
|
2035
|
-
"
|
|
2235
|
+
" p95 ",
|
|
2036
2236
|
performance.tokensP95
|
|
2037
2237
|
] }, "perf-tok"),
|
|
2038
2238
|
/* @__PURE__ */ jsx(Text, { children: " " }, "sp4"),
|
|
@@ -2085,26 +2285,60 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2085
2285
|
":",
|
|
2086
2286
|
" ",
|
|
2087
2287
|
/* @__PURE__ */ jsx(Text, { color: item.passed ? "green" : "red", bold: true, children: item.passed ? "PASS" : "FAIL" }),
|
|
2088
|
-
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
] }, m.id);
|
|
2105
|
-
})
|
|
2288
|
+
item.metrics && item.metrics.length > 0 ? /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
2289
|
+
" ",
|
|
2290
|
+
item.metrics.map((m) => {
|
|
2291
|
+
const def = getMetricById(m.id);
|
|
2292
|
+
if (!def)
|
|
2293
|
+
return null;
|
|
2294
|
+
const formatted = def.format(m.data);
|
|
2295
|
+
return /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2296
|
+
"[",
|
|
2297
|
+
def.name ? `${def.name}: ` : "",
|
|
2298
|
+
formatted,
|
|
2299
|
+
"]",
|
|
2300
|
+
" "
|
|
2301
|
+
] }, m.id);
|
|
2302
|
+
})
|
|
2303
|
+
] }) : null
|
|
2106
2304
|
] }, `tc-${tc.testCaseId}-${item.evaluatorId}`)
|
|
2107
2305
|
);
|
|
2306
|
+
if (item.scores.length > 0) {
|
|
2307
|
+
for (let sIdx = 0; sIdx < item.scores.length; sIdx++) {
|
|
2308
|
+
const s = item.scores[sIdx];
|
|
2309
|
+
const def = getScoreById(s.id);
|
|
2310
|
+
const scoreLabel = def ? def.name ?? def.id : s.id;
|
|
2311
|
+
rows.push(
|
|
2312
|
+
/* @__PURE__ */ jsxs(
|
|
2313
|
+
Text,
|
|
2314
|
+
{
|
|
2315
|
+
color: scoreColor(toNumericScore(s.data) ?? 0),
|
|
2316
|
+
children: [
|
|
2317
|
+
" ",
|
|
2318
|
+
scoreLabel,
|
|
2319
|
+
": ",
|
|
2320
|
+
formatScorePart(s)
|
|
2321
|
+
]
|
|
2322
|
+
},
|
|
2323
|
+
`tc-${tc.testCaseId}-${item.evaluatorId}-score-${sIdx}`
|
|
2324
|
+
)
|
|
2325
|
+
);
|
|
2326
|
+
}
|
|
2327
|
+
} else {
|
|
2328
|
+
rows.push(
|
|
2329
|
+
/* @__PURE__ */ jsxs(
|
|
2330
|
+
Text,
|
|
2331
|
+
{
|
|
2332
|
+
color: "gray",
|
|
2333
|
+
children: [
|
|
2334
|
+
" ",
|
|
2335
|
+
"n/a"
|
|
2336
|
+
]
|
|
2337
|
+
},
|
|
2338
|
+
`tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
|
|
2339
|
+
)
|
|
2340
|
+
);
|
|
2341
|
+
}
|
|
2108
2342
|
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
2109
2343
|
for (let logIdx = 0; logIdx < item.logs.length; logIdx++) {
|
|
2110
2344
|
const log = item.logs[logIdx];
|
|
@@ -2143,7 +2377,7 @@ function RunDetailsView({
|
|
|
2143
2377
|
const runs = dataset?.runs ?? [];
|
|
2144
2378
|
const rightFocused = state.focus === "right";
|
|
2145
2379
|
const [testCases, setTestCases] = useState([]);
|
|
2146
|
-
const evaluatorNameById =
|
|
2380
|
+
const evaluatorNameById = React2.useMemo(
|
|
2147
2381
|
() => new Map(evaluators.map((e) => [e.id, e.name])),
|
|
2148
2382
|
[evaluators]
|
|
2149
2383
|
);
|
|
@@ -2166,7 +2400,7 @@ function RunDetailsView({
|
|
|
2166
2400
|
const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
|
|
2167
2401
|
return /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
2168
2402
|
/* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
|
|
2169
|
-
/* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(
|
|
2403
|
+
/* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(React2.Fragment, { children: row }, i)) }) })
|
|
2170
2404
|
] });
|
|
2171
2405
|
}
|
|
2172
2406
|
var LEFT_PANE_WIDTH3 = 44;
|
|
@@ -2248,6 +2482,7 @@ function EvalsCliApp({
|
|
|
2248
2482
|
const { width: stdoutWidth, height: stdoutHeight } = useScreenSize();
|
|
2249
2483
|
const [liveData, setLiveData] = useState(data);
|
|
2250
2484
|
const [runtimeMessage, setRuntimeMessage] = useState();
|
|
2485
|
+
const overviewRowCountRef = useRef(0);
|
|
2251
2486
|
const [state, dispatch] = useReducer(
|
|
2252
2487
|
reduceCliState,
|
|
2253
2488
|
createInitialState(data, args)
|
|
@@ -2327,7 +2562,16 @@ function EvalsCliApp({
|
|
|
2327
2562
|
return;
|
|
2328
2563
|
}
|
|
2329
2564
|
if (key.downArrow) {
|
|
2330
|
-
|
|
2565
|
+
let max;
|
|
2566
|
+
if (clampedState.level === "datasets") {
|
|
2567
|
+
max = clampedState.focus === "right" ? Math.max(0, overviewRowCountRef.current - OVERVIEW_PAGE_SIZE) : filteredDatasets.length;
|
|
2568
|
+
} else if (clampedState.level === "runs") {
|
|
2569
|
+
max = selectedDataset?.runs.length ?? 0;
|
|
2570
|
+
} else if (clampedState.level === "new-evaluation") {
|
|
2571
|
+
max = Math.max(0, visibleEvaluators.length - 1);
|
|
2572
|
+
} else {
|
|
2573
|
+
max = 100;
|
|
2574
|
+
}
|
|
2331
2575
|
dispatch({ type: "MOVE_DOWN", max });
|
|
2332
2576
|
return;
|
|
2333
2577
|
}
|
|
@@ -2345,7 +2589,7 @@ function EvalsCliApp({
|
|
|
2345
2589
|
}
|
|
2346
2590
|
return;
|
|
2347
2591
|
}
|
|
2348
|
-
if (isBackKey(key)) {
|
|
2592
|
+
if (isBackKey(key) || input === "\x7F" || input === "\b") {
|
|
2349
2593
|
dispatch({ type: "BACK" });
|
|
2350
2594
|
return;
|
|
2351
2595
|
}
|
|
@@ -2398,7 +2642,8 @@ function EvalsCliApp({
|
|
|
2398
2642
|
{
|
|
2399
2643
|
state: clampedState,
|
|
2400
2644
|
filteredDatasets,
|
|
2401
|
-
selectedDataset
|
|
2645
|
+
selectedDataset,
|
|
2646
|
+
overviewRowCountRef
|
|
2402
2647
|
}
|
|
2403
2648
|
);
|
|
2404
2649
|
}
|