@m4trix/evals 0.24.0 → 0.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +147 -260
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +133 -246
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +143 -291
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +130 -278
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +49 -92
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +49 -92
- package/dist/index.js.map +1 -1
- package/package.json +3 -5
package/dist/cli.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import { withFullScreen, useScreenSize } from 'fullscreen-ink';
|
|
3
|
-
import
|
|
3
|
+
import React, { useState, useRef, useReducer, useEffect, useMemo } from 'react';
|
|
4
4
|
import { useApp, useInput, Box, Text } from 'ink';
|
|
5
5
|
import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
|
|
6
6
|
import { resolve, relative, join, dirname } from 'path';
|
|
@@ -90,11 +90,7 @@ function getFooterText(state) {
|
|
|
90
90
|
}
|
|
91
91
|
return "\u2191\u2193 move Enter add/remove S start run / search Esc cancel q quit";
|
|
92
92
|
}
|
|
93
|
-
function ListItem({
|
|
94
|
-
selected,
|
|
95
|
-
label,
|
|
96
|
-
itemKey
|
|
97
|
-
}) {
|
|
93
|
+
function ListItem({ selected, label, itemKey }) {
|
|
98
94
|
return /* @__PURE__ */ jsxs(Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
|
|
99
95
|
selected ? "\u25B8 " : " ",
|
|
100
96
|
label
|
|
@@ -121,9 +117,7 @@ function Pane({
|
|
|
121
117
|
}
|
|
122
118
|
);
|
|
123
119
|
}
|
|
124
|
-
function SectionHeader({
|
|
125
|
-
children
|
|
126
|
-
}) {
|
|
120
|
+
function SectionHeader({ children }) {
|
|
127
121
|
return /* @__PURE__ */ jsx(Text, { color: "cyan", bold: true, children });
|
|
128
122
|
}
|
|
129
123
|
function StatusText({ status }) {
|
|
@@ -135,10 +129,7 @@ function StatusText({ status }) {
|
|
|
135
129
|
] });
|
|
136
130
|
}
|
|
137
131
|
var LEFT_PANE_WIDTH = 44;
|
|
138
|
-
function RunsSidebar({
|
|
139
|
-
state,
|
|
140
|
-
runs
|
|
141
|
-
}) {
|
|
132
|
+
function RunsSidebar({ state, runs }) {
|
|
142
133
|
const focused = state.focus === "left";
|
|
143
134
|
return /* @__PURE__ */ jsxs(Pane, { width: LEFT_PANE_WIDTH, focused, children: [
|
|
144
135
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Runs" }),
|
|
@@ -167,11 +158,7 @@ function RunsSidebar({
|
|
|
167
158
|
] });
|
|
168
159
|
}
|
|
169
160
|
var BLOCKS = ["\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"];
|
|
170
|
-
function Sparkline({
|
|
171
|
-
data,
|
|
172
|
-
width,
|
|
173
|
-
label
|
|
174
|
-
}) {
|
|
161
|
+
function Sparkline({ data, width, label }) {
|
|
175
162
|
if (data.length === 0)
|
|
176
163
|
return null;
|
|
177
164
|
const max = Math.max(...data);
|
|
@@ -401,9 +388,7 @@ var data_mock_default = {
|
|
|
401
388
|
{ name: "contract_match", score: 100 },
|
|
402
389
|
{ name: "arg_validity", score: 100 }
|
|
403
390
|
],
|
|
404
|
-
checks: [
|
|
405
|
-
{ name: "tool_calls", passed: true, detail: "0 unexpected" }
|
|
406
|
-
],
|
|
391
|
+
checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
|
|
407
392
|
failures: [],
|
|
408
393
|
meta: {
|
|
409
394
|
model: "gpt-4o-mini",
|
|
@@ -426,9 +411,21 @@ var data_mock_default = {
|
|
|
426
411
|
}
|
|
427
412
|
],
|
|
428
413
|
evaluators: [
|
|
429
|
-
{
|
|
430
|
-
|
|
431
|
-
|
|
414
|
+
{
|
|
415
|
+
id: "json-schema-validator",
|
|
416
|
+
name: "JSON Schema Validator",
|
|
417
|
+
configPreview: "strict=true"
|
|
418
|
+
},
|
|
419
|
+
{
|
|
420
|
+
id: "tool-call-contract-checker",
|
|
421
|
+
name: "Tool-call Contract Checker",
|
|
422
|
+
configPreview: "unexpectedCalls=error"
|
|
423
|
+
},
|
|
424
|
+
{
|
|
425
|
+
id: "rubric-judge",
|
|
426
|
+
name: "Rubric Judge (LLM)",
|
|
427
|
+
configPreview: "model=gpt-4o-mini; scale=0-100"
|
|
428
|
+
},
|
|
432
429
|
{ id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
|
|
433
430
|
]
|
|
434
431
|
};
|
|
@@ -508,9 +505,7 @@ async function loadRunnerData(runner) {
|
|
|
508
505
|
const memSnapshots = runner.getAllRunSnapshots();
|
|
509
506
|
const seen = new Set(memSnapshots.map((s) => s.runId));
|
|
510
507
|
const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
|
|
511
|
-
const snapshots = [...memSnapshots, ...fromDisk].sort(
|
|
512
|
-
(a, b) => b.queuedAt - a.queuedAt
|
|
513
|
-
);
|
|
508
|
+
const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
|
|
514
509
|
if (datasets.length === 0 && evaluators.length === 0) {
|
|
515
510
|
return loadMockData();
|
|
516
511
|
}
|
|
@@ -632,7 +627,11 @@ function reduceCliState(state, action) {
|
|
|
632
627
|
return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
|
|
633
628
|
}
|
|
634
629
|
if (state.level === "datasets") {
|
|
635
|
-
return {
|
|
630
|
+
return {
|
|
631
|
+
...state,
|
|
632
|
+
datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1),
|
|
633
|
+
overviewScrollOffset: 0
|
|
634
|
+
};
|
|
636
635
|
}
|
|
637
636
|
if (state.level === "runs") {
|
|
638
637
|
return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
|
|
@@ -650,10 +649,17 @@ function reduceCliState(state, action) {
|
|
|
650
649
|
return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
|
|
651
650
|
}
|
|
652
651
|
if (state.level === "datasets" && state.focus === "right") {
|
|
653
|
-
return {
|
|
652
|
+
return {
|
|
653
|
+
...state,
|
|
654
|
+
overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1)
|
|
655
|
+
};
|
|
654
656
|
}
|
|
655
657
|
if (state.level === "datasets") {
|
|
656
|
-
return {
|
|
658
|
+
return {
|
|
659
|
+
...state,
|
|
660
|
+
datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1),
|
|
661
|
+
overviewScrollOffset: 0
|
|
662
|
+
};
|
|
657
663
|
}
|
|
658
664
|
if (state.level === "runs") {
|
|
659
665
|
return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
|
|
@@ -735,18 +741,8 @@ var defaultRunnerConfig = {
|
|
|
735
741
|
discovery: {
|
|
736
742
|
rootDir: process.cwd(),
|
|
737
743
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
738
|
-
evaluatorSuffixes: [
|
|
739
|
-
|
|
740
|
-
".evaluator.tsx",
|
|
741
|
-
".evaluator.js",
|
|
742
|
-
".evaluator.mjs"
|
|
743
|
-
],
|
|
744
|
-
testCaseSuffixes: [
|
|
745
|
-
".test-case.ts",
|
|
746
|
-
".test-case.tsx",
|
|
747
|
-
".test-case.js",
|
|
748
|
-
".test-case.mjs"
|
|
749
|
-
],
|
|
744
|
+
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
745
|
+
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
750
746
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
751
747
|
},
|
|
752
748
|
artifactDirectory: ".eval-results",
|
|
@@ -813,14 +809,15 @@ function getJitiLoader() {
|
|
|
813
809
|
}
|
|
814
810
|
const createJiti2 = jitiModule.createJiti ?? jitiModule.default;
|
|
815
811
|
if (typeof createJiti2 !== "function") {
|
|
816
|
-
throw new Error(
|
|
817
|
-
"Failed to initialize jiti for m4trix eval config loading."
|
|
818
|
-
);
|
|
812
|
+
throw new Error("Failed to initialize jiti for m4trix eval config loading.");
|
|
819
813
|
}
|
|
820
|
-
cachedLoader = createJiti2(
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
814
|
+
cachedLoader = createJiti2(
|
|
815
|
+
import.meta.url,
|
|
816
|
+
{
|
|
817
|
+
interopDefault: true,
|
|
818
|
+
moduleCache: true
|
|
819
|
+
}
|
|
820
|
+
);
|
|
824
821
|
return cachedLoader;
|
|
825
822
|
}
|
|
826
823
|
function resolveConfigModuleExport(loadedModule) {
|
|
@@ -924,9 +921,7 @@ async function loadModuleExports(filePath) {
|
|
|
924
921
|
}
|
|
925
922
|
async function collectDatasetsFromFiles(config) {
|
|
926
923
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
927
|
-
const matched = files.filter(
|
|
928
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
929
|
-
);
|
|
924
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
|
|
930
925
|
const found = await Promise.all(
|
|
931
926
|
matched.map(async (absolutePath) => {
|
|
932
927
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -943,9 +938,7 @@ async function collectDatasetsFromFiles(config) {
|
|
|
943
938
|
}
|
|
944
939
|
async function collectEvaluatorsFromFiles(config) {
|
|
945
940
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
946
|
-
const matched = files.filter(
|
|
947
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
948
|
-
);
|
|
941
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
|
|
949
942
|
const found = await Promise.all(
|
|
950
943
|
matched.map(async (absolutePath) => {
|
|
951
944
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -962,9 +955,7 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
962
955
|
}
|
|
963
956
|
async function collectTestCasesFromFiles(config) {
|
|
964
957
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
965
|
-
const matched = files.filter(
|
|
966
|
-
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
967
|
-
);
|
|
958
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
968
959
|
const found = await Promise.all(
|
|
969
960
|
matched.map(async (absolutePath) => {
|
|
970
961
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1036,16 +1027,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
1036
1027
|
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
1037
1028
|
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
1038
1029
|
if (diffOptions?.keysOnly) {
|
|
1039
|
-
const expectedKeys = JSON.stringify(
|
|
1040
|
-
|
|
1041
|
-
null,
|
|
1042
|
-
2
|
|
1043
|
-
);
|
|
1044
|
-
const actualKeys = JSON.stringify(
|
|
1045
|
-
extractKeys(actualProcessed),
|
|
1046
|
-
null,
|
|
1047
|
-
2
|
|
1048
|
-
);
|
|
1030
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
1031
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
1049
1032
|
const parts2 = diffLines(expectedKeys, actualKeys);
|
|
1050
1033
|
return formatDiffParts(parts2);
|
|
1051
1034
|
}
|
|
@@ -1056,9 +1039,7 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
1056
1039
|
}
|
|
1057
1040
|
const parts = diffLines(expectedStr, actualStr);
|
|
1058
1041
|
if (diffOptions?.outputNewOnly) {
|
|
1059
|
-
const filtered = parts.filter(
|
|
1060
|
-
(p) => p.added === true
|
|
1061
|
-
);
|
|
1042
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
1062
1043
|
return formatDiffParts(filtered);
|
|
1063
1044
|
}
|
|
1064
1045
|
return formatDiffParts(parts);
|
|
@@ -1160,10 +1141,7 @@ var ScoreAggregate = {
|
|
|
1160
1141
|
const count = values.length || 1;
|
|
1161
1142
|
const result = {};
|
|
1162
1143
|
for (const field of fields) {
|
|
1163
|
-
result[field] = values.reduce(
|
|
1164
|
-
(s, v) => s + (v[field] ?? 0),
|
|
1165
|
-
0
|
|
1166
|
-
) / count;
|
|
1144
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
1167
1145
|
}
|
|
1168
1146
|
return result;
|
|
1169
1147
|
};
|
|
@@ -1197,13 +1175,10 @@ var ScoreAggregate = {
|
|
|
1197
1175
|
(s, v) => s + (v[valueField] ?? 0),
|
|
1198
1176
|
0
|
|
1199
1177
|
);
|
|
1200
|
-
const sumSq = values.reduce(
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
},
|
|
1205
|
-
0
|
|
1206
|
-
);
|
|
1178
|
+
const sumSq = values.reduce((s, v) => {
|
|
1179
|
+
const value = v[valueField] ?? 0;
|
|
1180
|
+
return s + value * value;
|
|
1181
|
+
}, 0);
|
|
1207
1182
|
const mean = sum / count;
|
|
1208
1183
|
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
1209
1184
|
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
@@ -1434,20 +1409,14 @@ function nowIsoForFile() {
|
|
|
1434
1409
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1435
1410
|
}
|
|
1436
1411
|
function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
1437
|
-
return join(
|
|
1438
|
-
artifactDirectory,
|
|
1439
|
-
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1440
|
-
);
|
|
1412
|
+
return join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1441
1413
|
}
|
|
1442
1414
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1443
1415
|
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
1444
1416
|
return Effect.gen(function* () {
|
|
1445
1417
|
const evaluatorRunId = `run-${randomUUID()}`;
|
|
1446
1418
|
const started = Date.now();
|
|
1447
|
-
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [
|
|
1448
|
-
n + 1,
|
|
1449
|
-
n + 1
|
|
1450
|
-
]);
|
|
1419
|
+
const startedEvaluations = yield* Ref.modify(startedRef, (n) => [n + 1, n + 1]);
|
|
1451
1420
|
yield* publishEvent({
|
|
1452
1421
|
type: "TestCaseStarted",
|
|
1453
1422
|
runId: task.runId,
|
|
@@ -1480,9 +1449,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1480
1449
|
return error;
|
|
1481
1450
|
};
|
|
1482
1451
|
try {
|
|
1483
|
-
const ctx = yield* Effect.promise(
|
|
1484
|
-
() => Promise.resolve(evaluator.resolveContext())
|
|
1485
|
-
);
|
|
1452
|
+
const ctx = yield* Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
|
|
1486
1453
|
const result = yield* Effect.promise(
|
|
1487
1454
|
() => Promise.resolve().then(
|
|
1488
1455
|
() => evaluateFn({
|
|
@@ -1537,10 +1504,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1537
1504
|
}
|
|
1538
1505
|
}
|
|
1539
1506
|
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1540
|
-
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [
|
|
1541
|
-
n + 1,
|
|
1542
|
-
n + 1
|
|
1543
|
-
]);
|
|
1507
|
+
const completedEvaluations = yield* Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1544
1508
|
const progressEvent = {
|
|
1545
1509
|
type: "TestCaseProgress",
|
|
1546
1510
|
runId: task.runId,
|
|
@@ -1589,10 +1553,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1589
1553
|
} else {
|
|
1590
1554
|
yield* Ref.update(failedRef, (n) => n + 1);
|
|
1591
1555
|
}
|
|
1592
|
-
const [passed, failed] = yield* Effect.all([
|
|
1593
|
-
Ref.get(passedRef),
|
|
1594
|
-
Ref.get(failedRef)
|
|
1595
|
-
]);
|
|
1556
|
+
const [passed, failed] = yield* Effect.all([Ref.get(passedRef), Ref.get(failedRef)]);
|
|
1596
1557
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1597
1558
|
...snapshot,
|
|
1598
1559
|
passedTestCases: passed,
|
|
@@ -1942,15 +1903,11 @@ var EffectRunner = class {
|
|
|
1942
1903
|
this.persistenceQueue = Effect.runSync(
|
|
1943
1904
|
Queue.unbounded()
|
|
1944
1905
|
);
|
|
1945
|
-
this.snapshotsRef = Effect.runSync(
|
|
1946
|
-
Ref.make(/* @__PURE__ */ new Map())
|
|
1947
|
-
);
|
|
1906
|
+
this.snapshotsRef = Effect.runSync(Ref.make(/* @__PURE__ */ new Map()));
|
|
1948
1907
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1949
1908
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1950
1909
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1951
|
-
this.schedulerFiber = Effect.runFork(
|
|
1952
|
-
this.createSchedulerEffect()
|
|
1953
|
-
);
|
|
1910
|
+
this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
|
|
1954
1911
|
this.persistenceFiber = Effect.runFork(
|
|
1955
1912
|
createPersistenceWorker(this.persistenceQueue)
|
|
1956
1913
|
);
|
|
@@ -2097,9 +2054,9 @@ var EffectRunner = class {
|
|
|
2097
2054
|
return Effect.runSync(Ref.get(this.snapshotsRef)).get(runId);
|
|
2098
2055
|
}
|
|
2099
2056
|
getAllRunSnapshots() {
|
|
2100
|
-
return Array.from(
|
|
2101
|
-
|
|
2102
|
-
)
|
|
2057
|
+
return Array.from(Effect.runSync(Ref.get(this.snapshotsRef)).values()).sort(
|
|
2058
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
2059
|
+
);
|
|
2103
2060
|
}
|
|
2104
2061
|
async loadRunSnapshotsFromArtifacts() {
|
|
2105
2062
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -2315,11 +2272,7 @@ function DatasetsView({
|
|
|
2315
2272
|
] })
|
|
2316
2273
|
] });
|
|
2317
2274
|
}
|
|
2318
|
-
function RunsView({
|
|
2319
|
-
state,
|
|
2320
|
-
dataset,
|
|
2321
|
-
selectedRun
|
|
2322
|
-
}) {
|
|
2275
|
+
function RunsView({ state, dataset, selectedRun }) {
|
|
2323
2276
|
const runs = dataset?.runs ?? [];
|
|
2324
2277
|
const rightFocused = state.focus === "right";
|
|
2325
2278
|
return /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
@@ -2335,10 +2288,10 @@ function RunsView({
|
|
|
2335
2288
|
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2336
2289
|
"Commit: ",
|
|
2337
2290
|
selectedRun.meta.commit,
|
|
2338
|
-
"
|
|
2291
|
+
" Branch: ",
|
|
2339
2292
|
selectedRun.meta.branch,
|
|
2293
|
+
" Seed:",
|
|
2340
2294
|
" ",
|
|
2341
|
-
"Seed: ",
|
|
2342
2295
|
selectedRun.meta.seed
|
|
2343
2296
|
] }),
|
|
2344
2297
|
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
@@ -2351,23 +2304,10 @@ function RunsView({
|
|
|
2351
2304
|
format: (v) => `${v}%`
|
|
2352
2305
|
}
|
|
2353
2306
|
),
|
|
2354
|
-
/* @__PURE__ */ jsx(
|
|
2355
|
-
TextBar,
|
|
2356
|
-
{
|
|
2357
|
-
label: "avg score",
|
|
2358
|
-
value: Math.round(selectedRun.performance.avgScore * 100)
|
|
2359
|
-
}
|
|
2360
|
-
),
|
|
2307
|
+
/* @__PURE__ */ jsx(TextBar, { label: "avg score", value: Math.round(selectedRun.performance.avgScore * 100) }),
|
|
2361
2308
|
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
2362
2309
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Dimensions" }),
|
|
2363
|
-
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsx(
|
|
2364
|
-
TextBar,
|
|
2365
|
-
{
|
|
2366
|
-
label: dimension.name,
|
|
2367
|
-
value: dimension.score
|
|
2368
|
-
},
|
|
2369
|
-
dimension.name
|
|
2370
|
-
)),
|
|
2310
|
+
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsx(TextBar, { label: dimension.name, value: dimension.score }, dimension.name)),
|
|
2371
2311
|
/* @__PURE__ */ jsx(Text, { children: " " }),
|
|
2372
2312
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Latency trend" }),
|
|
2373
2313
|
/* @__PURE__ */ jsx(
|
|
@@ -2470,15 +2410,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2470
2410
|
...dimensions.map((d) => /* @__PURE__ */ jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
|
|
2471
2411
|
/* @__PURE__ */ jsx(Text, { children: " " }, "sp2"),
|
|
2472
2412
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
|
|
2473
|
-
...checks.map((c) => /* @__PURE__ */ jsx(
|
|
2474
|
-
CheckRow,
|
|
2475
|
-
{
|
|
2476
|
-
name: c.name,
|
|
2477
|
-
passed: c.passed,
|
|
2478
|
-
detail: c.detail
|
|
2479
|
-
},
|
|
2480
|
-
`chk-${c.name}`
|
|
2481
|
-
)),
|
|
2413
|
+
...checks.map((c) => /* @__PURE__ */ jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
|
|
2482
2414
|
/* @__PURE__ */ jsx(Text, { children: " " }, "sp3"),
|
|
2483
2415
|
/* @__PURE__ */ jsx(SectionHeader, { children: "Performance" }, "perf-h"),
|
|
2484
2416
|
/* @__PURE__ */ jsx(
|
|
@@ -2595,17 +2527,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2595
2527
|
}
|
|
2596
2528
|
} else {
|
|
2597
2529
|
rows.push(
|
|
2598
|
-
/* @__PURE__ */ jsxs(
|
|
2599
|
-
|
|
2600
|
-
|
|
2601
|
-
|
|
2602
|
-
children: [
|
|
2603
|
-
" ",
|
|
2604
|
-
"n/a"
|
|
2605
|
-
]
|
|
2606
|
-
},
|
|
2607
|
-
`tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
|
|
2608
|
-
)
|
|
2530
|
+
/* @__PURE__ */ jsxs(Text, { color: "gray", children: [
|
|
2531
|
+
" ",
|
|
2532
|
+
"n/a"
|
|
2533
|
+
] }, `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`)
|
|
2609
2534
|
);
|
|
2610
2535
|
}
|
|
2611
2536
|
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
@@ -2663,7 +2588,7 @@ function RunDetailsView({
|
|
|
2663
2588
|
const runs = dataset?.runs ?? [];
|
|
2664
2589
|
const rightFocused = state.focus === "right";
|
|
2665
2590
|
const [testCases, setTestCases] = useState([]);
|
|
2666
|
-
const evaluatorNameById =
|
|
2591
|
+
const evaluatorNameById = React.useMemo(
|
|
2667
2592
|
() => new Map(evaluators.map((e) => [e.id, e.name])),
|
|
2668
2593
|
[evaluators]
|
|
2669
2594
|
);
|
|
@@ -2686,7 +2611,7 @@ function RunDetailsView({
|
|
|
2686
2611
|
const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
|
|
2687
2612
|
return /* @__PURE__ */ jsxs(Fragment, { children: [
|
|
2688
2613
|
/* @__PURE__ */ jsx(RunsSidebar, { state, dataset, runs }),
|
|
2689
|
-
/* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(
|
|
2614
|
+
/* @__PURE__ */ jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsx(Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsx(React.Fragment, { children: row }, i)) }) })
|
|
2690
2615
|
] });
|
|
2691
2616
|
}
|
|
2692
2617
|
var LEFT_PANE_WIDTH3 = 44;
|
|
@@ -2709,19 +2634,11 @@ function NewEvaluationView({
|
|
|
2709
2634
|
visibleEvaluators.map((evaluator, index) => {
|
|
2710
2635
|
const selected = index === state.evaluatorMenuIndex;
|
|
2711
2636
|
const inSelection = state.selectedEvaluatorIds.includes(evaluator.id);
|
|
2712
|
-
return /* @__PURE__ */ jsxs(
|
|
2713
|
-
|
|
2714
|
-
|
|
2715
|
-
|
|
2716
|
-
|
|
2717
|
-
children: [
|
|
2718
|
-
selected ? "\u25B8 " : " ",
|
|
2719
|
-
inSelection ? "[x] " : "[ ] ",
|
|
2720
|
-
evaluator.name
|
|
2721
|
-
]
|
|
2722
|
-
},
|
|
2723
|
-
evaluator.id
|
|
2724
|
-
);
|
|
2637
|
+
return /* @__PURE__ */ jsxs(Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
|
|
2638
|
+
selected ? "\u25B8 " : " ",
|
|
2639
|
+
inSelection ? "[x] " : "[ ] ",
|
|
2640
|
+
evaluator.name
|
|
2641
|
+
] }, evaluator.id);
|
|
2725
2642
|
})
|
|
2726
2643
|
] }),
|
|
2727
2644
|
/* @__PURE__ */ jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
|
|
@@ -2753,26 +2670,16 @@ function clampCursor(state, filteredDatasetsLength, selectedRunCount) {
|
|
|
2753
2670
|
...state,
|
|
2754
2671
|
datasetMenuIndex: Math.max(0, Math.min(state.datasetMenuIndex, datasetMax)),
|
|
2755
2672
|
runMenuIndex: Math.max(0, Math.min(state.runMenuIndex, runMax)),
|
|
2756
|
-
evaluatorMenuIndex: Math.max(
|
|
2757
|
-
0,
|
|
2758
|
-
Math.min(state.evaluatorMenuIndex, evaluatorMax)
|
|
2759
|
-
)
|
|
2673
|
+
evaluatorMenuIndex: Math.max(0, Math.min(state.evaluatorMenuIndex, evaluatorMax))
|
|
2760
2674
|
};
|
|
2761
2675
|
}
|
|
2762
|
-
function EvalsCliApp({
|
|
2763
|
-
data,
|
|
2764
|
-
args,
|
|
2765
|
-
runner
|
|
2766
|
-
}) {
|
|
2676
|
+
function EvalsCliApp({ data, args, runner }) {
|
|
2767
2677
|
const { exit } = useApp();
|
|
2768
2678
|
const { width: stdoutWidth, height: stdoutHeight } = useScreenSize();
|
|
2769
2679
|
const [liveData, setLiveData] = useState(data);
|
|
2770
2680
|
const [runtimeMessage, setRuntimeMessage] = useState();
|
|
2771
2681
|
const overviewRowCountRef = useRef(0);
|
|
2772
|
-
const [state, dispatch] = useReducer(
|
|
2773
|
-
reduceCliState,
|
|
2774
|
-
createInitialState(data, args)
|
|
2775
|
-
);
|
|
2682
|
+
const [state, dispatch] = useReducer(reduceCliState, createInitialState(data, args));
|
|
2776
2683
|
useEffect(() => {
|
|
2777
2684
|
setLiveData(data);
|
|
2778
2685
|
}, [data]);
|
|
@@ -2804,14 +2711,8 @@ function EvalsCliApp({
|
|
|
2804
2711
|
filteredDatasets.length,
|
|
2805
2712
|
getDatasetByMenuIndex(filteredDatasets, state.datasetMenuIndex)?.runs.length ?? 0
|
|
2806
2713
|
);
|
|
2807
|
-
const selectedDataset = getDatasetByMenuIndex(
|
|
2808
|
-
|
|
2809
|
-
clampedState.datasetMenuIndex
|
|
2810
|
-
);
|
|
2811
|
-
const selectedRun = getRunByMenuIndex(
|
|
2812
|
-
selectedDataset,
|
|
2813
|
-
clampedState.runMenuIndex
|
|
2814
|
-
);
|
|
2714
|
+
const selectedDataset = getDatasetByMenuIndex(filteredDatasets, clampedState.datasetMenuIndex);
|
|
2715
|
+
const selectedRun = getRunByMenuIndex(selectedDataset, clampedState.runMenuIndex);
|
|
2815
2716
|
const visibleEvaluators = liveData.evaluators.filter(
|
|
2816
2717
|
(evaluator) => evaluator.name.toLowerCase().includes(clampedState.searchQuery.toLowerCase())
|
|
2817
2718
|
);
|
|
@@ -2905,9 +2806,7 @@ function EvalsCliApp({
|
|
|
2905
2806
|
`Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
|
|
2906
2807
|
);
|
|
2907
2808
|
}).catch((error) => {
|
|
2908
|
-
setRuntimeMessage(
|
|
2909
|
-
error instanceof Error ? error.message : "Failed to start evaluation."
|
|
2910
|
-
);
|
|
2809
|
+
setRuntimeMessage(error instanceof Error ? error.message : "Failed to start evaluation.");
|
|
2911
2810
|
});
|
|
2912
2811
|
}
|
|
2913
2812
|
});
|
|
@@ -2934,14 +2833,7 @@ function EvalsCliApp({
|
|
|
2934
2833
|
);
|
|
2935
2834
|
}
|
|
2936
2835
|
if (clampedState.level === "runs") {
|
|
2937
|
-
return /* @__PURE__ */ jsx(
|
|
2938
|
-
RunsView,
|
|
2939
|
-
{
|
|
2940
|
-
state: clampedState,
|
|
2941
|
-
dataset: selectedDataset,
|
|
2942
|
-
selectedRun
|
|
2943
|
-
}
|
|
2944
|
-
);
|
|
2836
|
+
return /* @__PURE__ */ jsx(RunsView, { state: clampedState, dataset: selectedDataset, selectedRun });
|
|
2945
2837
|
}
|
|
2946
2838
|
return /* @__PURE__ */ jsx(
|
|
2947
2839
|
RunDetailsView,
|
|
@@ -2953,82 +2845,44 @@ function EvalsCliApp({
|
|
|
2953
2845
|
}
|
|
2954
2846
|
);
|
|
2955
2847
|
};
|
|
2956
|
-
return /* @__PURE__ */ jsxs(
|
|
2957
|
-
Box,
|
|
2958
|
-
|
|
2959
|
-
|
|
2960
|
-
|
|
2961
|
-
|
|
2962
|
-
|
|
2963
|
-
|
|
2964
|
-
|
|
2965
|
-
|
|
2966
|
-
|
|
2967
|
-
|
|
2968
|
-
|
|
2969
|
-
|
|
2970
|
-
|
|
2971
|
-
|
|
2972
|
-
|
|
2973
|
-
|
|
2974
|
-
|
|
2975
|
-
|
|
2976
|
-
|
|
2977
|
-
|
|
2978
|
-
|
|
2979
|
-
|
|
2980
|
-
|
|
2981
|
-
|
|
2982
|
-
|
|
2983
|
-
|
|
2984
|
-
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
2991
|
-
|
|
2992
|
-
|
|
2993
|
-
|
|
2994
|
-
Box,
|
|
2995
|
-
{
|
|
2996
|
-
marginTop: 1,
|
|
2997
|
-
borderStyle: "round",
|
|
2998
|
-
borderColor: "magenta",
|
|
2999
|
-
paddingX: 1,
|
|
3000
|
-
width: stdoutWidth,
|
|
3001
|
-
children: [
|
|
3002
|
-
/* @__PURE__ */ jsx(Text, { color: "magenta", bold: true, children: "Search: " }),
|
|
3003
|
-
/* @__PURE__ */ jsx(Text, { color: "white", children: clampedState.searchQuery })
|
|
3004
|
-
]
|
|
3005
|
-
}
|
|
3006
|
-
),
|
|
3007
|
-
runtimeMessage && /* @__PURE__ */ jsx(
|
|
3008
|
-
Box,
|
|
3009
|
-
{
|
|
3010
|
-
marginTop: 1,
|
|
3011
|
-
borderStyle: "round",
|
|
3012
|
-
borderColor: "blue",
|
|
3013
|
-
paddingX: 1,
|
|
3014
|
-
width: stdoutWidth,
|
|
3015
|
-
children: /* @__PURE__ */ jsx(Text, { color: "blue", children: runtimeMessage })
|
|
3016
|
-
}
|
|
3017
|
-
),
|
|
3018
|
-
/* @__PURE__ */ jsx(
|
|
3019
|
-
Box,
|
|
3020
|
-
{
|
|
3021
|
-
marginTop: 1,
|
|
3022
|
-
flexGrow: 1,
|
|
3023
|
-
width: stdoutWidth,
|
|
3024
|
-
flexDirection: "row",
|
|
3025
|
-
children: renderContent()
|
|
3026
|
-
}
|
|
3027
|
-
),
|
|
3028
|
-
/* @__PURE__ */ jsx(Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: getFooterText(clampedState) }) })
|
|
3029
|
-
]
|
|
3030
|
-
}
|
|
3031
|
-
);
|
|
2848
|
+
return /* @__PURE__ */ jsxs(Box, { flexDirection: "column", flexGrow: 1, width: stdoutWidth, height: stdoutHeight, children: [
|
|
2849
|
+
/* @__PURE__ */ jsx(Box, { borderStyle: "round", borderColor: "cyan", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsx(Text, { children: getBreadcrumbText(clampedState, selectedDataset?.name, selectedRun?.label) }) }),
|
|
2850
|
+
clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxs(
|
|
2851
|
+
Box,
|
|
2852
|
+
{
|
|
2853
|
+
marginTop: 1,
|
|
2854
|
+
borderStyle: "round",
|
|
2855
|
+
borderColor: "yellow",
|
|
2856
|
+
paddingX: 1,
|
|
2857
|
+
flexDirection: "column",
|
|
2858
|
+
width: stdoutWidth,
|
|
2859
|
+
children: [
|
|
2860
|
+
/* @__PURE__ */ jsx(Text, { color: "yellow", children: "Startup warnings:" }),
|
|
2861
|
+
clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsx(Text, { children: warning }, `${warning}-${index}`))
|
|
2862
|
+
]
|
|
2863
|
+
}
|
|
2864
|
+
),
|
|
2865
|
+
clampedState.searchMode && /* @__PURE__ */ jsxs(
|
|
2866
|
+
Box,
|
|
2867
|
+
{
|
|
2868
|
+
marginTop: 1,
|
|
2869
|
+
borderStyle: "round",
|
|
2870
|
+
borderColor: "magenta",
|
|
2871
|
+
paddingX: 1,
|
|
2872
|
+
width: stdoutWidth,
|
|
2873
|
+
children: [
|
|
2874
|
+
/* @__PURE__ */ jsxs(Text, { color: "magenta", bold: true, children: [
|
|
2875
|
+
"Search:",
|
|
2876
|
+
" "
|
|
2877
|
+
] }),
|
|
2878
|
+
/* @__PURE__ */ jsx(Text, { color: "white", children: clampedState.searchQuery })
|
|
2879
|
+
]
|
|
2880
|
+
}
|
|
2881
|
+
),
|
|
2882
|
+
runtimeMessage && /* @__PURE__ */ jsx(Box, { marginTop: 1, borderStyle: "round", borderColor: "blue", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsx(Text, { color: "blue", children: runtimeMessage }) }),
|
|
2883
|
+
/* @__PURE__ */ jsx(Box, { marginTop: 1, flexGrow: 1, width: stdoutWidth, flexDirection: "row", children: renderContent() }),
|
|
2884
|
+
/* @__PURE__ */ jsx(Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: getFooterText(clampedState) }) })
|
|
2885
|
+
] });
|
|
3032
2886
|
}
|
|
3033
2887
|
async function main() {
|
|
3034
2888
|
const args = parseStartupArgs(process.argv.slice(2));
|
|
@@ -3040,9 +2894,7 @@ async function main() {
|
|
|
3040
2894
|
process.on("SIGTERM", () => {
|
|
3041
2895
|
void runner.shutdown().finally(() => process.exit(0));
|
|
3042
2896
|
});
|
|
3043
|
-
withFullScreen(
|
|
3044
|
-
/* @__PURE__ */ jsx(EvalsCliApp, { data, args, runner })
|
|
3045
|
-
).start();
|
|
2897
|
+
withFullScreen(/* @__PURE__ */ jsx(EvalsCliApp, { data, args, runner })).start();
|
|
3046
2898
|
}
|
|
3047
2899
|
void main();
|
|
3048
2900
|
//# sourceMappingURL=out.js.map
|