@m4trix/evals 0.25.0 → 0.25.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +147 -260
- package/dist/cli-simple.cjs.map +1 -1
- package/dist/cli-simple.js +133 -246
- package/dist/cli-simple.js.map +1 -1
- package/dist/cli.cjs +143 -291
- package/dist/cli.cjs.map +1 -1
- package/dist/cli.js +130 -278
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +49 -92
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +49 -92
- package/dist/index.js.map +1 -1
- package/package.json +3 -5
package/dist/cli.cjs
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
'use strict';
|
|
3
3
|
|
|
4
4
|
var fullscreenInk = require('fullscreen-ink');
|
|
5
|
-
var React2 = require('react');
|
|
5
|
+
var React = require('react');
|
|
6
6
|
var ink = require('ink');
|
|
7
7
|
var jsxRuntime = require('react/jsx-runtime');
|
|
8
8
|
var path = require('path');
|
|
@@ -37,7 +37,7 @@ function _interopNamespace(e) {
|
|
|
37
37
|
return Object.freeze(n);
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
-
var React2__default = /*#__PURE__*/_interopDefault(React2);
|
|
40
|
+
var React__default = /*#__PURE__*/_interopDefault(React);
|
|
41
41
|
var jitiModule__namespace = /*#__PURE__*/_interopNamespace(jitiModule);
|
|
42
42
|
var stringify__default = /*#__PURE__*/_interopDefault(stringify);
|
|
43
43
|
|
|
@@ -117,11 +117,7 @@ function getFooterText(state) {
|
|
|
117
117
|
}
|
|
118
118
|
return "\u2191\u2193 move Enter add/remove S start run / search Esc cancel q quit";
|
|
119
119
|
}
|
|
120
|
-
function ListItem({
|
|
121
|
-
selected,
|
|
122
|
-
label,
|
|
123
|
-
itemKey
|
|
124
|
-
}) {
|
|
120
|
+
function ListItem({ selected, label, itemKey }) {
|
|
125
121
|
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
|
|
126
122
|
selected ? "\u25B8 " : " ",
|
|
127
123
|
label
|
|
@@ -148,9 +144,7 @@ function Pane({
|
|
|
148
144
|
}
|
|
149
145
|
);
|
|
150
146
|
}
|
|
151
|
-
function SectionHeader({
|
|
152
|
-
children
|
|
153
|
-
}) {
|
|
147
|
+
function SectionHeader({ children }) {
|
|
154
148
|
return /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children });
|
|
155
149
|
}
|
|
156
150
|
function StatusText({ status }) {
|
|
@@ -162,10 +156,7 @@ function StatusText({ status }) {
|
|
|
162
156
|
] });
|
|
163
157
|
}
|
|
164
158
|
var LEFT_PANE_WIDTH = 44;
|
|
165
|
-
function RunsSidebar({
|
|
166
|
-
state,
|
|
167
|
-
runs
|
|
168
|
-
}) {
|
|
159
|
+
function RunsSidebar({ state, runs }) {
|
|
169
160
|
const focused = state.focus === "left";
|
|
170
161
|
return /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH, focused, children: [
|
|
171
162
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Runs" }),
|
|
@@ -194,11 +185,7 @@ function RunsSidebar({
|
|
|
194
185
|
] });
|
|
195
186
|
}
|
|
196
187
|
var BLOCKS = ["\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"];
|
|
197
|
-
function Sparkline({
|
|
198
|
-
data,
|
|
199
|
-
width,
|
|
200
|
-
label
|
|
201
|
-
}) {
|
|
188
|
+
function Sparkline({ data, width, label }) {
|
|
202
189
|
if (data.length === 0)
|
|
203
190
|
return null;
|
|
204
191
|
const max = Math.max(...data);
|
|
@@ -428,9 +415,7 @@ var data_mock_default = {
|
|
|
428
415
|
{ name: "contract_match", score: 100 },
|
|
429
416
|
{ name: "arg_validity", score: 100 }
|
|
430
417
|
],
|
|
431
|
-
checks: [
|
|
432
|
-
{ name: "tool_calls", passed: true, detail: "0 unexpected" }
|
|
433
|
-
],
|
|
418
|
+
checks: [{ name: "tool_calls", passed: true, detail: "0 unexpected" }],
|
|
434
419
|
failures: [],
|
|
435
420
|
meta: {
|
|
436
421
|
model: "gpt-4o-mini",
|
|
@@ -453,9 +438,21 @@ var data_mock_default = {
|
|
|
453
438
|
}
|
|
454
439
|
],
|
|
455
440
|
evaluators: [
|
|
456
|
-
{
|
|
457
|
-
|
|
458
|
-
|
|
441
|
+
{
|
|
442
|
+
id: "json-schema-validator",
|
|
443
|
+
name: "JSON Schema Validator",
|
|
444
|
+
configPreview: "strict=true"
|
|
445
|
+
},
|
|
446
|
+
{
|
|
447
|
+
id: "tool-call-contract-checker",
|
|
448
|
+
name: "Tool-call Contract Checker",
|
|
449
|
+
configPreview: "unexpectedCalls=error"
|
|
450
|
+
},
|
|
451
|
+
{
|
|
452
|
+
id: "rubric-judge",
|
|
453
|
+
name: "Rubric Judge (LLM)",
|
|
454
|
+
configPreview: "model=gpt-4o-mini; scale=0-100"
|
|
455
|
+
},
|
|
459
456
|
{ id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
|
|
460
457
|
]
|
|
461
458
|
};
|
|
@@ -535,9 +532,7 @@ async function loadRunnerData(runner) {
|
|
|
535
532
|
const memSnapshots = runner.getAllRunSnapshots();
|
|
536
533
|
const seen = new Set(memSnapshots.map((s) => s.runId));
|
|
537
534
|
const fromDisk = diskSnapshots.filter((s) => !seen.has(s.runId));
|
|
538
|
-
const snapshots = [...memSnapshots, ...fromDisk].sort(
|
|
539
|
-
(a, b) => b.queuedAt - a.queuedAt
|
|
540
|
-
);
|
|
535
|
+
const snapshots = [...memSnapshots, ...fromDisk].sort((a, b) => b.queuedAt - a.queuedAt);
|
|
541
536
|
if (datasets.length === 0 && evaluators.length === 0) {
|
|
542
537
|
return loadMockData();
|
|
543
538
|
}
|
|
@@ -659,7 +654,11 @@ function reduceCliState(state, action) {
|
|
|
659
654
|
return { ...state, overviewScrollOffset: Math.max(0, state.overviewScrollOffset - 1) };
|
|
660
655
|
}
|
|
661
656
|
if (state.level === "datasets") {
|
|
662
|
-
return {
|
|
657
|
+
return {
|
|
658
|
+
...state,
|
|
659
|
+
datasetMenuIndex: Math.max(0, state.datasetMenuIndex - 1),
|
|
660
|
+
overviewScrollOffset: 0
|
|
661
|
+
};
|
|
663
662
|
}
|
|
664
663
|
if (state.level === "runs") {
|
|
665
664
|
return { ...state, runMenuIndex: Math.max(0, state.runMenuIndex - 1) };
|
|
@@ -677,10 +676,17 @@ function reduceCliState(state, action) {
|
|
|
677
676
|
return { ...state, detailsScrollOffset: Math.min(action.max, state.detailsScrollOffset + 1) };
|
|
678
677
|
}
|
|
679
678
|
if (state.level === "datasets" && state.focus === "right") {
|
|
680
|
-
return {
|
|
679
|
+
return {
|
|
680
|
+
...state,
|
|
681
|
+
overviewScrollOffset: Math.min(action.max, state.overviewScrollOffset + 1)
|
|
682
|
+
};
|
|
681
683
|
}
|
|
682
684
|
if (state.level === "datasets") {
|
|
683
|
-
return {
|
|
685
|
+
return {
|
|
686
|
+
...state,
|
|
687
|
+
datasetMenuIndex: Math.min(action.max, state.datasetMenuIndex + 1),
|
|
688
|
+
overviewScrollOffset: 0
|
|
689
|
+
};
|
|
684
690
|
}
|
|
685
691
|
if (state.level === "runs") {
|
|
686
692
|
return { ...state, runMenuIndex: Math.min(action.max, state.runMenuIndex + 1) };
|
|
@@ -762,18 +768,8 @@ var defaultRunnerConfig = {
|
|
|
762
768
|
discovery: {
|
|
763
769
|
rootDir: process.cwd(),
|
|
764
770
|
datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
|
|
765
|
-
evaluatorSuffixes: [
|
|
766
|
-
".evaluator.ts",
|
|
767
|
-
".evaluator.tsx",
|
|
768
|
-
".evaluator.js",
|
|
769
|
-
".evaluator.mjs"
|
|
770
|
-
],
|
|
771
|
-
testCaseSuffixes: [
|
|
772
|
-
".test-case.ts",
|
|
773
|
-
".test-case.tsx",
|
|
774
|
-
".test-case.js",
|
|
775
|
-
".test-case.mjs"
|
|
776
|
-
],
|
|
771
|
+
evaluatorSuffixes: [".evaluator.ts", ".evaluator.tsx", ".evaluator.js", ".evaluator.mjs"],
|
|
772
|
+
testCaseSuffixes: [".test-case.ts", ".test-case.tsx", ".test-case.js", ".test-case.mjs"],
|
|
777
773
|
excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
|
|
778
774
|
},
|
|
779
775
|
artifactDirectory: ".eval-results",
|
|
@@ -840,14 +836,15 @@ function getJitiLoader() {
|
|
|
840
836
|
}
|
|
841
837
|
const createJiti2 = jitiModule__namespace.createJiti ?? jitiModule__namespace.default;
|
|
842
838
|
if (typeof createJiti2 !== "function") {
|
|
843
|
-
throw new Error(
|
|
844
|
-
"Failed to initialize jiti for m4trix eval config loading."
|
|
845
|
-
);
|
|
839
|
+
throw new Error("Failed to initialize jiti for m4trix eval config loading.");
|
|
846
840
|
}
|
|
847
|
-
cachedLoader = createJiti2(
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
841
|
+
cachedLoader = createJiti2(
|
|
842
|
+
(typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)),
|
|
843
|
+
{
|
|
844
|
+
interopDefault: true,
|
|
845
|
+
moduleCache: true
|
|
846
|
+
}
|
|
847
|
+
);
|
|
851
848
|
return cachedLoader;
|
|
852
849
|
}
|
|
853
850
|
function resolveConfigModuleExport(loadedModule) {
|
|
@@ -951,9 +948,7 @@ async function loadModuleExports(filePath) {
|
|
|
951
948
|
}
|
|
952
949
|
async function collectDatasetsFromFiles(config) {
|
|
953
950
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
954
|
-
const matched = files.filter(
|
|
955
|
-
(filePath) => hasOneSuffix(filePath, config.datasetSuffixes)
|
|
956
|
-
);
|
|
951
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.datasetSuffixes));
|
|
957
952
|
const found = await Promise.all(
|
|
958
953
|
matched.map(async (absolutePath) => {
|
|
959
954
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -970,9 +965,7 @@ async function collectDatasetsFromFiles(config) {
|
|
|
970
965
|
}
|
|
971
966
|
async function collectEvaluatorsFromFiles(config) {
|
|
972
967
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
973
|
-
const matched = files.filter(
|
|
974
|
-
(filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes)
|
|
975
|
-
);
|
|
968
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.evaluatorSuffixes));
|
|
976
969
|
const found = await Promise.all(
|
|
977
970
|
matched.map(async (absolutePath) => {
|
|
978
971
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -989,9 +982,7 @@ async function collectEvaluatorsFromFiles(config) {
|
|
|
989
982
|
}
|
|
990
983
|
async function collectTestCasesFromFiles(config) {
|
|
991
984
|
const files = await walkDirectory(config.rootDir, config.excludeDirectories);
|
|
992
|
-
const matched = files.filter(
|
|
993
|
-
(filePath) => hasOneSuffix(filePath, config.testCaseSuffixes)
|
|
994
|
-
);
|
|
985
|
+
const matched = files.filter((filePath) => hasOneSuffix(filePath, config.testCaseSuffixes));
|
|
995
986
|
const found = await Promise.all(
|
|
996
987
|
matched.map(async (absolutePath) => {
|
|
997
988
|
const exports = await loadModuleExports(absolutePath);
|
|
@@ -1063,16 +1054,8 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
1063
1054
|
const expectedProcessed = preprocessForDiff(expected, diffOptions);
|
|
1064
1055
|
const actualProcessed = preprocessForDiff(actual, diffOptions);
|
|
1065
1056
|
if (diffOptions?.keysOnly) {
|
|
1066
|
-
const expectedKeys = JSON.stringify(
|
|
1067
|
-
extractKeys(expectedProcessed),
|
|
1068
|
-
null,
|
|
1069
|
-
2
|
|
1070
|
-
);
|
|
1071
|
-
const actualKeys = JSON.stringify(
|
|
1072
|
-
extractKeys(actualProcessed),
|
|
1073
|
-
null,
|
|
1074
|
-
2
|
|
1075
|
-
);
|
|
1057
|
+
const expectedKeys = JSON.stringify(extractKeys(expectedProcessed), null, 2);
|
|
1058
|
+
const actualKeys = JSON.stringify(extractKeys(actualProcessed), null, 2);
|
|
1076
1059
|
const parts2 = diff.diffLines(expectedKeys, actualKeys);
|
|
1077
1060
|
return formatDiffParts(parts2);
|
|
1078
1061
|
}
|
|
@@ -1083,9 +1066,7 @@ function createDiffString(expected, actual, diffOptions) {
|
|
|
1083
1066
|
}
|
|
1084
1067
|
const parts = diff.diffLines(expectedStr, actualStr);
|
|
1085
1068
|
if (diffOptions?.outputNewOnly) {
|
|
1086
|
-
const filtered = parts.filter(
|
|
1087
|
-
(p) => p.added === true
|
|
1088
|
-
);
|
|
1069
|
+
const filtered = parts.filter((p) => p.added === true);
|
|
1089
1070
|
return formatDiffParts(filtered);
|
|
1090
1071
|
}
|
|
1091
1072
|
return formatDiffParts(parts);
|
|
@@ -1187,10 +1168,7 @@ var ScoreAggregate = {
|
|
|
1187
1168
|
const count = values.length || 1;
|
|
1188
1169
|
const result = {};
|
|
1189
1170
|
for (const field of fields) {
|
|
1190
|
-
result[field] = values.reduce(
|
|
1191
|
-
(s, v) => s + (v[field] ?? 0),
|
|
1192
|
-
0
|
|
1193
|
-
) / count;
|
|
1171
|
+
result[field] = values.reduce((s, v) => s + (v[field] ?? 0), 0) / count;
|
|
1194
1172
|
}
|
|
1195
1173
|
return result;
|
|
1196
1174
|
};
|
|
@@ -1224,13 +1202,10 @@ var ScoreAggregate = {
|
|
|
1224
1202
|
(s, v) => s + (v[valueField] ?? 0),
|
|
1225
1203
|
0
|
|
1226
1204
|
);
|
|
1227
|
-
const sumSq = values.reduce(
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
},
|
|
1232
|
-
0
|
|
1233
|
-
);
|
|
1205
|
+
const sumSq = values.reduce((s, v) => {
|
|
1206
|
+
const value = v[valueField] ?? 0;
|
|
1207
|
+
return s + value * value;
|
|
1208
|
+
}, 0);
|
|
1234
1209
|
const mean = sum / count;
|
|
1235
1210
|
const variance = (sumSq - count * mean * mean) / (count - 1);
|
|
1236
1211
|
stdDev = variance > 0 ? Math.sqrt(variance) : 0;
|
|
@@ -1461,20 +1436,14 @@ function nowIsoForFile() {
|
|
|
1461
1436
|
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
|
|
1462
1437
|
}
|
|
1463
1438
|
function createArtifactPath(artifactDirectory, datasetId, runId) {
|
|
1464
|
-
return path.join(
|
|
1465
|
-
artifactDirectory,
|
|
1466
|
-
`${datasetId}_${runId}_${nowIsoForFile()}.jsonl`
|
|
1467
|
-
);
|
|
1439
|
+
return path.join(artifactDirectory, `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`);
|
|
1468
1440
|
}
|
|
1469
1441
|
function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persistenceQueue, updateSnapshot, startedRef, completedRef, passedRef, failedRef, testCaseResultsRef) {
|
|
1470
1442
|
const { testCaseItem, rerunIndex, rerunTotal } = unit;
|
|
1471
1443
|
return effect.Effect.gen(function* () {
|
|
1472
1444
|
const evaluatorRunId = `run-${crypto.randomUUID()}`;
|
|
1473
1445
|
const started = Date.now();
|
|
1474
|
-
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [
|
|
1475
|
-
n + 1,
|
|
1476
|
-
n + 1
|
|
1477
|
-
]);
|
|
1446
|
+
const startedEvaluations = yield* effect.Ref.modify(startedRef, (n) => [n + 1, n + 1]);
|
|
1478
1447
|
yield* publishEvent({
|
|
1479
1448
|
type: "TestCaseStarted",
|
|
1480
1449
|
runId: task.runId,
|
|
@@ -1507,9 +1476,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1507
1476
|
return error;
|
|
1508
1477
|
};
|
|
1509
1478
|
try {
|
|
1510
|
-
const ctx = yield* effect.Effect.promise(
|
|
1511
|
-
() => Promise.resolve(evaluator.resolveContext())
|
|
1512
|
-
);
|
|
1479
|
+
const ctx = yield* effect.Effect.promise(() => Promise.resolve(evaluator.resolveContext()));
|
|
1513
1480
|
const result = yield* effect.Effect.promise(
|
|
1514
1481
|
() => Promise.resolve().then(
|
|
1515
1482
|
() => evaluateFn({
|
|
@@ -1564,10 +1531,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1564
1531
|
}
|
|
1565
1532
|
}
|
|
1566
1533
|
const rerunPassedThis = evaluatorScores.every((s) => s.passed);
|
|
1567
|
-
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [
|
|
1568
|
-
n + 1,
|
|
1569
|
-
n + 1
|
|
1570
|
-
]);
|
|
1534
|
+
const completedEvaluations = yield* effect.Ref.modify(completedRef, (n) => [n + 1, n + 1]);
|
|
1571
1535
|
const progressEvent = {
|
|
1572
1536
|
type: "TestCaseProgress",
|
|
1573
1537
|
runId: task.runId,
|
|
@@ -1616,10 +1580,7 @@ function processOneEvaluation(task, unit, totalEvaluations, publishEvent, persis
|
|
|
1616
1580
|
} else {
|
|
1617
1581
|
yield* effect.Ref.update(failedRef, (n) => n + 1);
|
|
1618
1582
|
}
|
|
1619
|
-
const [passed, failed] = yield* effect.Effect.all([
|
|
1620
|
-
effect.Ref.get(passedRef),
|
|
1621
|
-
effect.Ref.get(failedRef)
|
|
1622
|
-
]);
|
|
1583
|
+
const [passed, failed] = yield* effect.Effect.all([effect.Ref.get(passedRef), effect.Ref.get(failedRef)]);
|
|
1623
1584
|
yield* updateSnapshot(task.runId, (snapshot) => ({
|
|
1624
1585
|
...snapshot,
|
|
1625
1586
|
passedTestCases: passed,
|
|
@@ -1969,15 +1930,11 @@ var EffectRunner = class {
|
|
|
1969
1930
|
this.persistenceQueue = effect.Effect.runSync(
|
|
1970
1931
|
effect.Queue.unbounded()
|
|
1971
1932
|
);
|
|
1972
|
-
this.snapshotsRef = effect.Effect.runSync(
|
|
1973
|
-
effect.Ref.make(/* @__PURE__ */ new Map())
|
|
1974
|
-
);
|
|
1933
|
+
this.snapshotsRef = effect.Effect.runSync(effect.Ref.make(/* @__PURE__ */ new Map()));
|
|
1975
1934
|
this.listeners = /* @__PURE__ */ new Set();
|
|
1976
1935
|
this.datasetsById = /* @__PURE__ */ new Map();
|
|
1977
1936
|
this.evaluatorsById = /* @__PURE__ */ new Map();
|
|
1978
|
-
this.schedulerFiber = effect.Effect.runFork(
|
|
1979
|
-
this.createSchedulerEffect()
|
|
1980
|
-
);
|
|
1937
|
+
this.schedulerFiber = effect.Effect.runFork(this.createSchedulerEffect());
|
|
1981
1938
|
this.persistenceFiber = effect.Effect.runFork(
|
|
1982
1939
|
createPersistenceWorker(this.persistenceQueue)
|
|
1983
1940
|
);
|
|
@@ -2124,9 +2081,9 @@ var EffectRunner = class {
|
|
|
2124
2081
|
return effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).get(runId);
|
|
2125
2082
|
}
|
|
2126
2083
|
getAllRunSnapshots() {
|
|
2127
|
-
return Array.from(
|
|
2128
|
-
|
|
2129
|
-
)
|
|
2084
|
+
return Array.from(effect.Effect.runSync(effect.Ref.get(this.snapshotsRef)).values()).sort(
|
|
2085
|
+
(a, b) => b.queuedAt - a.queuedAt
|
|
2086
|
+
);
|
|
2130
2087
|
}
|
|
2131
2088
|
async loadRunSnapshotsFromArtifacts() {
|
|
2132
2089
|
return loadRunSnapshotsFromArtifacts(this.config);
|
|
@@ -2238,9 +2195,9 @@ function DatasetsView({
|
|
|
2238
2195
|
}) {
|
|
2239
2196
|
const leftFocused = state.focus === "left";
|
|
2240
2197
|
const rightFocused = state.focus === "right";
|
|
2241
|
-
const [runScores, setRunScores] = React2.useState([]);
|
|
2242
|
-
const [loading, setLoading] = React2.useState(false);
|
|
2243
|
-
|
|
2198
|
+
const [runScores, setRunScores] = React.useState([]);
|
|
2199
|
+
const [loading, setLoading] = React.useState(false);
|
|
2200
|
+
React.useEffect(() => {
|
|
2244
2201
|
if (!selectedDataset?.runs?.length) {
|
|
2245
2202
|
setRunScores([]);
|
|
2246
2203
|
return;
|
|
@@ -2252,7 +2209,7 @@ function DatasetsView({
|
|
|
2252
2209
|
const barData = runScores.slice(0, MAX_RUNS_FOR_CHART).reverse();
|
|
2253
2210
|
const trendValues = runScores.slice(0, MAX_RUNS_FOR_TREND).map((r) => r.value).reverse();
|
|
2254
2211
|
const trendBatched = batchAverage(trendValues, TREND_BATCH_SIZE);
|
|
2255
|
-
const overviewRows = React2.useMemo(() => {
|
|
2212
|
+
const overviewRows = React.useMemo(() => {
|
|
2256
2213
|
const rows = [];
|
|
2257
2214
|
rows.push(
|
|
2258
2215
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: selectedDataset?.overview ?? "Select a dataset to inspect prior runs." }, "overview")
|
|
@@ -2342,11 +2299,7 @@ function DatasetsView({
|
|
|
2342
2299
|
] })
|
|
2343
2300
|
] });
|
|
2344
2301
|
}
|
|
2345
|
-
function RunsView({
|
|
2346
|
-
state,
|
|
2347
|
-
dataset,
|
|
2348
|
-
selectedRun
|
|
2349
|
-
}) {
|
|
2302
|
+
function RunsView({ state, dataset, selectedRun }) {
|
|
2350
2303
|
const runs = dataset?.runs ?? [];
|
|
2351
2304
|
const rightFocused = state.focus === "right";
|
|
2352
2305
|
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
@@ -2362,10 +2315,10 @@ function RunsView({
|
|
|
2362
2315
|
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2363
2316
|
"Commit: ",
|
|
2364
2317
|
selectedRun.meta.commit,
|
|
2365
|
-
"
|
|
2318
|
+
" Branch: ",
|
|
2366
2319
|
selectedRun.meta.branch,
|
|
2320
|
+
" Seed:",
|
|
2367
2321
|
" ",
|
|
2368
|
-
"Seed: ",
|
|
2369
2322
|
selectedRun.meta.seed
|
|
2370
2323
|
] }),
|
|
2371
2324
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
@@ -2378,23 +2331,10 @@ function RunsView({
|
|
|
2378
2331
|
format: (v) => `${v}%`
|
|
2379
2332
|
}
|
|
2380
2333
|
),
|
|
2381
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
2382
|
-
TextBar,
|
|
2383
|
-
{
|
|
2384
|
-
label: "avg score",
|
|
2385
|
-
value: Math.round(selectedRun.performance.avgScore * 100)
|
|
2386
|
-
}
|
|
2387
|
-
),
|
|
2334
|
+
/* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: "avg score", value: Math.round(selectedRun.performance.avgScore * 100) }),
|
|
2388
2335
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
2389
2336
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Dimensions" }),
|
|
2390
|
-
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2391
|
-
TextBar,
|
|
2392
|
-
{
|
|
2393
|
-
label: dimension.name,
|
|
2394
|
-
value: dimension.score
|
|
2395
|
-
},
|
|
2396
|
-
dimension.name
|
|
2397
|
-
)),
|
|
2337
|
+
selectedRun.dimensions.map((dimension) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: dimension.name, value: dimension.score }, dimension.name)),
|
|
2398
2338
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }),
|
|
2399
2339
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Latency trend" }),
|
|
2400
2340
|
/* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -2497,15 +2437,7 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2497
2437
|
...dimensions.map((d) => /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`)),
|
|
2498
2438
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp2"),
|
|
2499
2439
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Checks (boolean)" }, "checks-h"),
|
|
2500
|
-
...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2501
|
-
CheckRow,
|
|
2502
|
-
{
|
|
2503
|
-
name: c.name,
|
|
2504
|
-
passed: c.passed,
|
|
2505
|
-
detail: c.detail
|
|
2506
|
-
},
|
|
2507
|
-
`chk-${c.name}`
|
|
2508
|
-
)),
|
|
2440
|
+
...checks.map((c) => /* @__PURE__ */ jsxRuntime.jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`)),
|
|
2509
2441
|
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, "sp3"),
|
|
2510
2442
|
/* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Performance" }, "perf-h"),
|
|
2511
2443
|
/* @__PURE__ */ jsxRuntime.jsx(
|
|
@@ -2622,17 +2554,10 @@ function buildDetailRows(run, testCases, evaluatorNameById) {
|
|
|
2622
2554
|
}
|
|
2623
2555
|
} else {
|
|
2624
2556
|
rows.push(
|
|
2625
|
-
/* @__PURE__ */ jsxRuntime.jsxs(
|
|
2626
|
-
|
|
2627
|
-
|
|
2628
|
-
|
|
2629
|
-
children: [
|
|
2630
|
-
" ",
|
|
2631
|
-
"n/a"
|
|
2632
|
-
]
|
|
2633
|
-
},
|
|
2634
|
-
`tc-${tc.testCaseId}-${item.evaluatorId}-n/a`
|
|
2635
|
-
)
|
|
2557
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
|
|
2558
|
+
" ",
|
|
2559
|
+
"n/a"
|
|
2560
|
+
] }, `tc-${tc.testCaseId}-${item.evaluatorId}-n/a`)
|
|
2636
2561
|
);
|
|
2637
2562
|
}
|
|
2638
2563
|
if (!item.passed && item.logs && item.logs.length > 0) {
|
|
@@ -2689,12 +2614,12 @@ function RunDetailsView({
|
|
|
2689
2614
|
}) {
|
|
2690
2615
|
const runs = dataset?.runs ?? [];
|
|
2691
2616
|
const rightFocused = state.focus === "right";
|
|
2692
|
-
const [testCases, setTestCases] = React2.useState([]);
|
|
2693
|
-
const evaluatorNameById =
|
|
2617
|
+
const [testCases, setTestCases] = React.useState([]);
|
|
2618
|
+
const evaluatorNameById = React__default.default.useMemo(
|
|
2694
2619
|
() => new Map(evaluators.map((e) => [e.id, e.name])),
|
|
2695
2620
|
[evaluators]
|
|
2696
2621
|
);
|
|
2697
|
-
|
|
2622
|
+
React.useEffect(() => {
|
|
2698
2623
|
if (!selectedRun?.meta?.artifact) {
|
|
2699
2624
|
setTestCases([]);
|
|
2700
2625
|
return;
|
|
@@ -2713,7 +2638,7 @@ function RunDetailsView({
|
|
|
2713
2638
|
const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
|
|
2714
2639
|
return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
|
|
2715
2640
|
/* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
|
|
2716
|
-
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(
|
|
2641
|
+
/* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React__default.default.Fragment, { children: row }, i)) }) })
|
|
2717
2642
|
] });
|
|
2718
2643
|
}
|
|
2719
2644
|
var LEFT_PANE_WIDTH3 = 44;
|
|
@@ -2736,19 +2661,11 @@ function NewEvaluationView({
|
|
|
2736
2661
|
visibleEvaluators.map((evaluator, index) => {
|
|
2737
2662
|
const selected = index === state.evaluatorMenuIndex;
|
|
2738
2663
|
const inSelection = state.selectedEvaluatorIds.includes(evaluator.id);
|
|
2739
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2740
|
-
|
|
2741
|
-
|
|
2742
|
-
|
|
2743
|
-
|
|
2744
|
-
children: [
|
|
2745
|
-
selected ? "\u25B8 " : " ",
|
|
2746
|
-
inSelection ? "[x] " : "[ ] ",
|
|
2747
|
-
evaluator.name
|
|
2748
|
-
]
|
|
2749
|
-
},
|
|
2750
|
-
evaluator.id
|
|
2751
|
-
);
|
|
2664
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: selected ? "cyan" : "gray", bold: selected, children: [
|
|
2665
|
+
selected ? "\u25B8 " : " ",
|
|
2666
|
+
inSelection ? "[x] " : "[ ] ",
|
|
2667
|
+
evaluator.name
|
|
2668
|
+
] }, evaluator.id);
|
|
2752
2669
|
})
|
|
2753
2670
|
] }),
|
|
2754
2671
|
/* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: [
|
|
@@ -2780,30 +2697,20 @@ function clampCursor(state, filteredDatasetsLength, selectedRunCount) {
|
|
|
2780
2697
|
...state,
|
|
2781
2698
|
datasetMenuIndex: Math.max(0, Math.min(state.datasetMenuIndex, datasetMax)),
|
|
2782
2699
|
runMenuIndex: Math.max(0, Math.min(state.runMenuIndex, runMax)),
|
|
2783
|
-
evaluatorMenuIndex: Math.max(
|
|
2784
|
-
0,
|
|
2785
|
-
Math.min(state.evaluatorMenuIndex, evaluatorMax)
|
|
2786
|
-
)
|
|
2700
|
+
evaluatorMenuIndex: Math.max(0, Math.min(state.evaluatorMenuIndex, evaluatorMax))
|
|
2787
2701
|
};
|
|
2788
2702
|
}
|
|
2789
|
-
function EvalsCliApp({
|
|
2790
|
-
data,
|
|
2791
|
-
args,
|
|
2792
|
-
runner
|
|
2793
|
-
}) {
|
|
2703
|
+
function EvalsCliApp({ data, args, runner }) {
|
|
2794
2704
|
const { exit } = ink.useApp();
|
|
2795
2705
|
const { width: stdoutWidth, height: stdoutHeight } = fullscreenInk.useScreenSize();
|
|
2796
|
-
const [liveData, setLiveData] = React2.useState(data);
|
|
2797
|
-
const [runtimeMessage, setRuntimeMessage] = React2.useState();
|
|
2798
|
-
const overviewRowCountRef = React2.useRef(0);
|
|
2799
|
-
const [state, dispatch] =
|
|
2800
|
-
|
|
2801
|
-
createInitialState(data, args)
|
|
2802
|
-
);
|
|
2803
|
-
React2.useEffect(() => {
|
|
2706
|
+
const [liveData, setLiveData] = React.useState(data);
|
|
2707
|
+
const [runtimeMessage, setRuntimeMessage] = React.useState();
|
|
2708
|
+
const overviewRowCountRef = React.useRef(0);
|
|
2709
|
+
const [state, dispatch] = React.useReducer(reduceCliState, createInitialState(data, args));
|
|
2710
|
+
React.useEffect(() => {
|
|
2804
2711
|
setLiveData(data);
|
|
2805
2712
|
}, [data]);
|
|
2806
|
-
|
|
2713
|
+
React.useEffect(() => {
|
|
2807
2714
|
if (!runner) {
|
|
2808
2715
|
return void 0;
|
|
2809
2716
|
}
|
|
@@ -2822,7 +2729,7 @@ function EvalsCliApp({
|
|
|
2822
2729
|
}
|
|
2823
2730
|
});
|
|
2824
2731
|
}, [runner]);
|
|
2825
|
-
const filteredDatasets = React2.useMemo(
|
|
2732
|
+
const filteredDatasets = React.useMemo(
|
|
2826
2733
|
() => getFilteredDatasets(liveData, state.searchQuery),
|
|
2827
2734
|
[liveData, state.searchQuery]
|
|
2828
2735
|
);
|
|
@@ -2831,14 +2738,8 @@ function EvalsCliApp({
|
|
|
2831
2738
|
filteredDatasets.length,
|
|
2832
2739
|
getDatasetByMenuIndex(filteredDatasets, state.datasetMenuIndex)?.runs.length ?? 0
|
|
2833
2740
|
);
|
|
2834
|
-
const selectedDataset = getDatasetByMenuIndex(
|
|
2835
|
-
|
|
2836
|
-
clampedState.datasetMenuIndex
|
|
2837
|
-
);
|
|
2838
|
-
const selectedRun = getRunByMenuIndex(
|
|
2839
|
-
selectedDataset,
|
|
2840
|
-
clampedState.runMenuIndex
|
|
2841
|
-
);
|
|
2741
|
+
const selectedDataset = getDatasetByMenuIndex(filteredDatasets, clampedState.datasetMenuIndex);
|
|
2742
|
+
const selectedRun = getRunByMenuIndex(selectedDataset, clampedState.runMenuIndex);
|
|
2842
2743
|
const visibleEvaluators = liveData.evaluators.filter(
|
|
2843
2744
|
(evaluator) => evaluator.name.toLowerCase().includes(clampedState.searchQuery.toLowerCase())
|
|
2844
2745
|
);
|
|
@@ -2932,9 +2833,7 @@ function EvalsCliApp({
|
|
|
2932
2833
|
`Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
|
|
2933
2834
|
);
|
|
2934
2835
|
}).catch((error) => {
|
|
2935
|
-
setRuntimeMessage(
|
|
2936
|
-
error instanceof Error ? error.message : "Failed to start evaluation."
|
|
2937
|
-
);
|
|
2836
|
+
setRuntimeMessage(error instanceof Error ? error.message : "Failed to start evaluation.");
|
|
2938
2837
|
});
|
|
2939
2838
|
}
|
|
2940
2839
|
});
|
|
@@ -2961,14 +2860,7 @@ function EvalsCliApp({
|
|
|
2961
2860
|
);
|
|
2962
2861
|
}
|
|
2963
2862
|
if (clampedState.level === "runs") {
|
|
2964
|
-
return /* @__PURE__ */ jsxRuntime.jsx(
|
|
2965
|
-
RunsView,
|
|
2966
|
-
{
|
|
2967
|
-
state: clampedState,
|
|
2968
|
-
dataset: selectedDataset,
|
|
2969
|
-
selectedRun
|
|
2970
|
-
}
|
|
2971
|
-
);
|
|
2863
|
+
return /* @__PURE__ */ jsxRuntime.jsx(RunsView, { state: clampedState, dataset: selectedDataset, selectedRun });
|
|
2972
2864
|
}
|
|
2973
2865
|
return /* @__PURE__ */ jsxRuntime.jsx(
|
|
2974
2866
|
RunDetailsView,
|
|
@@ -2980,82 +2872,44 @@ function EvalsCliApp({
|
|
|
2980
2872
|
}
|
|
2981
2873
|
);
|
|
2982
2874
|
};
|
|
2983
|
-
return /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2984
|
-
ink.Box,
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
2991
|
-
|
|
2992
|
-
|
|
2993
|
-
|
|
2994
|
-
|
|
2995
|
-
|
|
2996
|
-
|
|
2997
|
-
|
|
2998
|
-
|
|
2999
|
-
|
|
3000
|
-
|
|
3001
|
-
|
|
3002
|
-
|
|
3003
|
-
|
|
3004
|
-
|
|
3005
|
-
|
|
3006
|
-
|
|
3007
|
-
|
|
3008
|
-
|
|
3009
|
-
|
|
3010
|
-
|
|
3011
|
-
|
|
3012
|
-
|
|
3013
|
-
|
|
3014
|
-
|
|
3015
|
-
|
|
3016
|
-
|
|
3017
|
-
|
|
3018
|
-
|
|
3019
|
-
|
|
3020
|
-
|
|
3021
|
-
ink.Box,
|
|
3022
|
-
{
|
|
3023
|
-
marginTop: 1,
|
|
3024
|
-
borderStyle: "round",
|
|
3025
|
-
borderColor: "magenta",
|
|
3026
|
-
paddingX: 1,
|
|
3027
|
-
width: stdoutWidth,
|
|
3028
|
-
children: [
|
|
3029
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", bold: true, children: "Search: " }),
|
|
3030
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "white", children: clampedState.searchQuery })
|
|
3031
|
-
]
|
|
3032
|
-
}
|
|
3033
|
-
),
|
|
3034
|
-
runtimeMessage && /* @__PURE__ */ jsxRuntime.jsx(
|
|
3035
|
-
ink.Box,
|
|
3036
|
-
{
|
|
3037
|
-
marginTop: 1,
|
|
3038
|
-
borderStyle: "round",
|
|
3039
|
-
borderColor: "blue",
|
|
3040
|
-
paddingX: 1,
|
|
3041
|
-
width: stdoutWidth,
|
|
3042
|
-
children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "blue", children: runtimeMessage })
|
|
3043
|
-
}
|
|
3044
|
-
),
|
|
3045
|
-
/* @__PURE__ */ jsxRuntime.jsx(
|
|
3046
|
-
ink.Box,
|
|
3047
|
-
{
|
|
3048
|
-
marginTop: 1,
|
|
3049
|
-
flexGrow: 1,
|
|
3050
|
-
width: stdoutWidth,
|
|
3051
|
-
flexDirection: "row",
|
|
3052
|
-
children: renderContent()
|
|
3053
|
-
}
|
|
3054
|
-
),
|
|
3055
|
-
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: getFooterText(clampedState) }) })
|
|
3056
|
-
]
|
|
3057
|
-
}
|
|
3058
|
-
);
|
|
2875
|
+
return /* @__PURE__ */ jsxRuntime.jsxs(ink.Box, { flexDirection: "column", flexGrow: 1, width: stdoutWidth, height: stdoutHeight, children: [
|
|
2876
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { borderStyle: "round", borderColor: "cyan", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: getBreadcrumbText(clampedState, selectedDataset?.name, selectedRun?.label) }) }),
|
|
2877
|
+
clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2878
|
+
ink.Box,
|
|
2879
|
+
{
|
|
2880
|
+
marginTop: 1,
|
|
2881
|
+
borderStyle: "round",
|
|
2882
|
+
borderColor: "yellow",
|
|
2883
|
+
paddingX: 1,
|
|
2884
|
+
flexDirection: "column",
|
|
2885
|
+
width: stdoutWidth,
|
|
2886
|
+
children: [
|
|
2887
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "yellow", children: "Startup warnings:" }),
|
|
2888
|
+
clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: warning }, `${warning}-${index}`))
|
|
2889
|
+
]
|
|
2890
|
+
}
|
|
2891
|
+
),
|
|
2892
|
+
clampedState.searchMode && /* @__PURE__ */ jsxRuntime.jsxs(
|
|
2893
|
+
ink.Box,
|
|
2894
|
+
{
|
|
2895
|
+
marginTop: 1,
|
|
2896
|
+
borderStyle: "round",
|
|
2897
|
+
borderColor: "magenta",
|
|
2898
|
+
paddingX: 1,
|
|
2899
|
+
width: stdoutWidth,
|
|
2900
|
+
children: [
|
|
2901
|
+
/* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "magenta", bold: true, children: [
|
|
2902
|
+
"Search:",
|
|
2903
|
+
" "
|
|
2904
|
+
] }),
|
|
2905
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "white", children: clampedState.searchQuery })
|
|
2906
|
+
]
|
|
2907
|
+
}
|
|
2908
|
+
),
|
|
2909
|
+
runtimeMessage && /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, borderStyle: "round", borderColor: "blue", paddingX: 1, width: stdoutWidth, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "blue", children: runtimeMessage }) }),
|
|
2910
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, flexGrow: 1, width: stdoutWidth, flexDirection: "row", children: renderContent() }),
|
|
2911
|
+
/* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: getFooterText(clampedState) }) })
|
|
2912
|
+
] });
|
|
3059
2913
|
}
|
|
3060
2914
|
async function main() {
|
|
3061
2915
|
const args = parseStartupArgs(process.argv.slice(2));
|
|
@@ -3067,9 +2921,7 @@ async function main() {
|
|
|
3067
2921
|
process.on("SIGTERM", () => {
|
|
3068
2922
|
void runner.shutdown().finally(() => process.exit(0));
|
|
3069
2923
|
});
|
|
3070
|
-
fullscreenInk.withFullScreen(
|
|
3071
|
-
/* @__PURE__ */ jsxRuntime.jsx(EvalsCliApp, { data, args, runner })
|
|
3072
|
-
).start();
|
|
2924
|
+
fullscreenInk.withFullScreen(/* @__PURE__ */ jsxRuntime.jsx(EvalsCliApp, { data, args, runner })).start();
|
|
3073
2925
|
}
|
|
3074
2926
|
void main();
|
|
3075
2927
|
//# sourceMappingURL=out.js.map
|