@m4trix/evals 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli-simple.cjs +1075 -0
- package/dist/cli-simple.cjs.map +1 -0
- package/dist/cli-simple.d.cts +1 -0
- package/dist/cli-simple.d.ts +1 -0
- package/dist/cli-simple.js +1072 -0
- package/dist/cli-simple.js.map +1 -0
- package/dist/cli.cjs +1981 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +1974 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +1184 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +347 -0
- package/dist/index.d.ts +347 -0
- package/dist/index.js +1165 -0
- package/dist/index.js.map +1 -0
- package/package.json +53 -0
package/dist/cli.cjs
ADDED
|
@@ -0,0 +1,1981 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
var fullscreenInk = require('fullscreen-ink');
|
|
5
|
+
var React = require('react');
|
|
6
|
+
var ink = require('ink');
|
|
7
|
+
var jsxRuntime = require('react/jsx-runtime');
|
|
8
|
+
var crypto = require('crypto');
|
|
9
|
+
var effect = require('effect');
|
|
10
|
+
var promises = require('fs/promises');
|
|
11
|
+
var path = require('path');
|
|
12
|
+
var url = require('url');
|
|
13
|
+
|
|
14
|
+
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
15
|
+
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
|
|
16
|
+
|
|
17
|
+
var React__default = /*#__PURE__*/_interopDefault(React);
|
|
18
|
+
|
|
19
|
+
var SEP = " ";
|
|
20
|
+
var ARROW = "\u203A";
|
|
21
|
+
/**
 * Builds the breadcrumb trail for the current navigation level.
 *
 * Every trail starts with a dimmed "Evaluations" segment. The rest depends on
 * `state.level`: "datasets", "runs" and "details" have dedicated trails, and
 * any other level renders the "New evaluation" trail.
 *
 * @param {object} state - CLI state; only `state.level` is read.
 * @param {string} [datasetName] - Shown after "Dataset:"; "-" when nullish.
 * @param {string} [runLabel] - Shown after "Run:"; "-" when nullish.
 * @returns A fragment element whose children are the trail segments.
 */
function getBreadcrumbText(state, datasetName, runLabel) {
  // Segment factories. The React key defaults to the text itself, so repeated
  // texts (the arrow) must be given an explicit key.
  const dim = (text, key) => jsxRuntime.jsx(ink.Text, { color: "gray", children: text }, key ?? text);
  const accent = (text) => jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: text }, text);
  const plain = (text, key) => jsxRuntime.jsx(ink.Text, { color: "white", children: text ?? "-" }, key);

  // Prefix shared by every level.
  const crumbs = [dim("Evaluations"), SEP, dim(ARROW, "a1"), SEP];
  // "Dataset: <name> ›" section shared by the runs and details levels.
  const datasetCrumbs = () => [dim("Dataset:"), " ", plain(datasetName, "ds"), SEP, dim(ARROW, "a2"), SEP];

  switch (state.level) {
    case "datasets":
      crumbs.push(accent("Datasets"));
      break;
    case "runs":
      crumbs.push(...datasetCrumbs(), accent("Runs"));
      break;
    case "details":
      crumbs.push(
        ...datasetCrumbs(),
        dim("Run:"),
        " ",
        plain(runLabel, "rl"),
        SEP,
        dim(ARROW, "a3"),
        SEP,
        accent("Details")
      );
      break;
    default:
      crumbs.push(
        accent("New evaluation"),
        SEP,
        dim(ARROW, "a2"),
        SEP,
        dim("Select evaluators", "sel")
      );
  }
  return jsxRuntime.jsxs(jsxRuntime.Fragment, { children: crumbs });
}
|
|
81
|
+
|
|
82
|
+
// src/cli/components/Footer.tsx
|
|
83
|
+
/**
 * Returns the key-binding hint line for the current navigation level.
 *
 * @param {object} state - CLI state; only `state.level` is read.
 * @returns {string} Hint text for "datasets", "runs" or "details"; any other
 *   level gets the new-evaluation hints.
 */
function getFooterText(state) {
  switch (state.level) {
    case "datasets":
      return "\u2191\u2193 move Enter open / search Tab focus q quit";
    case "runs":
      return "\u2191\u2193 move Enter details Backspace datasets Tab focus q quit";
    case "details":
      return "\u2191\u2193 scroll Backspace runs Tab focus q quit";
    default:
      return "\u2191\u2193 move Enter add/remove S start run / search Esc cancel q quit";
  }
}
|
|
95
|
+
/**
 * One row of a selectable menu.
 *
 * A selected row is cyan, bold and prefixed with a "▸ " marker; other rows
 * are gray and prefixed with a blank.
 *
 * @param {object} props
 * @param {boolean} props.selected - Whether this row is the current selection.
 * @param {*} props.label - Content rendered after the marker.
 * @param {string} props.itemKey - React key for the row element.
 */
function ListItem({ selected, label, itemKey }) {
  const marker = selected ? "\u25B8 " : " ";
  const color = selected ? "cyan" : "gray";
  return jsxRuntime.jsxs(ink.Text, { color, bold: selected, children: [marker, label] }, itemKey);
}
|
|
105
|
+
/**
 * Bordered column container used for the left and right panes.
 *
 * A focused pane gets a cyan "single" border; an unfocused one gets a gray
 * "round" border.
 *
 * @param {object} props
 * @param {*} props.children - Pane content.
 * @param {number} [props.width] - Forwarded to the Ink box.
 * @param {number} [props.flexGrow] - Forwarded to the Ink box.
 * @param {number} [props.marginLeft] - Forwarded to the Ink box.
 * @param {boolean} [props.focused=false] - Selects the border style.
 */
function Pane({ children, width, flexGrow, marginLeft, focused = false }) {
  const border = focused
    ? { borderStyle: "single", borderColor: "cyan" }
    : { borderStyle: "round", borderColor: "gray" };
  return jsxRuntime.jsx(ink.Box, {
    flexDirection: "column",
    width,
    flexGrow,
    marginLeft,
    ...border,
    paddingX: 1,
    children
  });
}
|
|
126
|
+
/**
 * Section title rendered as bold cyan text.
 *
 * @param {object} props
 * @param {*} props.children - Title content.
 */
function SectionHeader({ children }) {
  const headingProps = { color: "cyan", bold: true, children };
  return jsxRuntime.jsx(ink.Text, headingProps);
}
|
|
131
|
+
/**
 * Renders a run status in parentheses, coloured by outcome.
 *
 * "PASS" is green, "RUNNING" is yellow and every other status is red.
 *
 * @param {object} props
 * @param {string} props.status - Status label to display.
 */
function StatusText({ status }) {
  let color = "red";
  if (status === "PASS") {
    color = "green";
  } else if (status === "RUNNING") {
    color = "yellow";
  }
  return jsxRuntime.jsxs(ink.Text, { color, children: ["(", status, ")"] });
}
|
|
139
|
+
var LEFT_PANE_WIDTH = 44;
|
|
140
|
+
/**
 * Left pane of the runs level: a "New evaluation" entry followed by one row
 * per run.
 *
 * Menu index 0 is the "New evaluation" entry, so the run at array position
 * `i` is selected when `state.runMenuIndex === i + 1`.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; reads `focus` and `runMenuIndex`.
 * @param {Array<object>} props.runs - Runs to list; each row uses `id`,
 *   `label` and `status`.
 */
function RunsSidebar({ state, runs }) {
  const isFocused = state.focus === "left";

  const newEvaluationRow = jsxRuntime.jsx(ListItem, {
    selected: state.runMenuIndex === 0,
    label: "New evaluation",
    itemKey: "runs-new-eval"
  });

  const runRows = runs.map((run, position) => {
    const isSelected = state.runMenuIndex === position + 1;
    return jsxRuntime.jsxs(
      ink.Text,
      {
        color: isSelected ? "cyan" : "gray",
        bold: isSelected,
        children: [
          isSelected ? "\u25B8 " : " ",
          run.label,
          " ",
          jsxRuntime.jsx(StatusText, { status: run.status })
        ]
      },
      run.id
    );
  });

  return jsxRuntime.jsxs(Pane, {
    width: LEFT_PANE_WIDTH,
    focused: isFocused,
    children: [jsxRuntime.jsx(SectionHeader, { children: "Runs" }), newEvaluationRow, runRows]
  });
}
|
|
171
|
+
var BLOCKS = ["\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"];
|
|
172
|
+
/**
 * Renders a numeric series as a one-line sparkline of block characters.
 *
 * The series is scaled between its own minimum and maximum. When it has more
 * points than the target width it is downsampled by averaging consecutive
 * chunks.
 *
 * @param {object} props
 * @param {number[]} props.data - Series to plot; an empty series renders nothing.
 * @param {number} [props.width] - Target width in characters; defaults to
 *   `min(data.length, 24)`.
 * @param {string} [props.label] - Optional gray label, padded to 14 characters.
 * @returns The sparkline element, or `null` for an empty series.
 */
function Sparkline({ data, width, label }) {
  if (data.length === 0) {
    return null;
  }

  const highest = Math.max(...data);
  const lowest = Math.min(...data);
  // A flat series has a zero spread; fall back to 1 to avoid dividing by zero.
  const spread = highest - lowest || 1;
  const columns = width ?? Math.min(data.length, 24);

  let samples = data;
  if (data.length > columns) {
    const chunkSize = data.length / columns;
    samples = Array.from({ length: columns }, (_, column) => {
      const chunk = data.slice(Math.floor(column * chunkSize), Math.floor((column + 1) * chunkSize));
      let sum = 0;
      for (const point of chunk) {
        sum += point;
      }
      return sum / chunk.length;
    });
  }

  const glyphs = samples.map((sample) => {
    const fraction = (sample - lowest) / spread;
    // fraction === 1 would index past the last glyph, hence the cap at 7.
    return BLOCKS[Math.min(7, Math.floor(fraction * 8))];
  });

  const hasLabel = label !== void 0 && label !== "";
  const labelElement = hasLabel
    ? jsxRuntime.jsxs(ink.Text, { color: "gray", children: [label.padEnd(14), " "] })
    : null;

  return jsxRuntime.jsxs(ink.Text, {
    children: [labelElement, jsxRuntime.jsx(ink.Text, { color: "cyan", children: glyphs.join("") })]
  });
}
|
|
208
|
+
/**
 * Maps a percentage to a traffic-light colour name.
 *
 * @param {number} pct - Percentage to classify.
 * @returns {string} "green" at 70 or above, "yellow" at 40 or above,
 *   otherwise "red".
 */
function barColor(pct) {
  // Ordered from the highest threshold down; the first one reached wins.
  const tiers = [
    [70, "green"],
    [40, "yellow"]
  ];
  const tier = tiers.find(([floor]) => pct >= floor);
  return tier ? tier[1] : "red";
}
|
|
215
|
+
/**
 * Renders a labelled horizontal bar: `label [████░░░░] value`.
 *
 * `value` is clamped to `[0, max]` before the fill is computed; the text after
 * the bar shows `format(value)` for the unclamped value.
 *
 * @param {object} props
 * @param {string} props.label - Gray label, padded to `labelWidth`.
 * @param {number} props.value - Value to plot.
 * @param {number} [props.max=100] - Value that corresponds to a full bar.
 * @param {number} [props.labelWidth=14] - Minimum label width in characters.
 * @param {number} [props.barWidth=20] - Bar width in characters.
 * @param {(v: number) => string} [props.format] - Formats the trailing value.
 * @param {boolean} [props.colorByValue=true] - Colour the bar by percentage
 *   (see `barColor`); when false the bar is uncoloured and the value is white.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  // Share of the bar to fill. Guarded so that max <= 0 or a NaN value gives an
  // empty bar: an unguarded division yields NaN, and String.prototype.repeat(NaN)
  // returns "", which would drop the empty cells as well.
  const ratio = max > 0 && !Number.isNaN(clamped) ? clamped / max : 0;
  const pct = ratio * 100;
  const filled = Math.round(ratio * barWidth);
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;

  const bar = color
    ? jsxRuntime.jsxs(jsxRuntime.Fragment, {
        children: [
          jsxRuntime.jsx(ink.Text, { color, children: filledBar }),
          jsxRuntime.jsx(ink.Text, { color: "gray", children: emptyBar })
        ]
      })
    : filledBar + emptyBar;

  return jsxRuntime.jsxs(ink.Text, {
    children: [
      jsxRuntime.jsx(ink.Text, { color: "gray", children: label.padEnd(labelWidth) }),
      " [",
      bar,
      "] ",
      jsxRuntime.jsx(ink.Text, { color: color ?? "white", bold: true, children: format(value) })
    ]
  });
}
|
|
241
|
+
|
|
242
|
+
// src/cli/keys.ts
|
|
243
|
+
/**
 * Tells whether the typed input is the quit key ("q", case-insensitive).
 *
 * @param {string} input - Raw input string.
 * @returns {boolean}
 */
function isQuitInput(input) {
  const normalized = input.toLowerCase();
  return normalized === "q";
}
|
|
246
|
+
/**
 * Tells whether the typed input is the search key ("/").
 *
 * @param {string} input - Raw input string.
 * @returns {boolean}
 */
function isSearchInput(input) {
  const searchKey = "/";
  return input === searchKey;
}
|
|
249
|
+
/**
 * Tells whether the input is a single printable character: exactly one UTF-16
 * code unit that sorts at or after the space character and is not DEL.
 *
 * @param {string} input - Raw input string.
 * @returns {boolean}
 */
function isPrintableCharacter(input) {
  if (input.length !== 1) {
    return false;
  }
  const isDelete = input === "\x7F";
  return input >= " " && !isDelete;
}
|
|
252
|
+
/**
 * Tells whether a key event is a "go back" key (backspace or delete).
 *
 * @param {object} key - Key descriptor with `backspace` and `delete` flags.
 * @returns The `backspace` flag when truthy, otherwise the `delete` flag
 *   (returned as-is, not coerced to a boolean).
 */
function isBackKey(key) {
  const viaBackspace = key.backspace;
  return viaBackspace ? viaBackspace : key.delete;
}
|
|
255
|
+
|
|
256
|
+
// src/cli/data.mock.json
|
|
257
|
+
var data_mock_default = {
|
|
258
|
+
datasets: [
|
|
259
|
+
{
|
|
260
|
+
id: "onboarding-flows",
|
|
261
|
+
name: "onboarding-flows",
|
|
262
|
+
overview: "Evaluate first-user journeys and schema compliance for generated onboarding payloads.",
|
|
263
|
+
runs: [
|
|
264
|
+
{
|
|
265
|
+
id: "run_2026-02-17_2044",
|
|
266
|
+
label: "2026-02-17 20:44",
|
|
267
|
+
status: "FAILED",
|
|
268
|
+
performance: {
|
|
269
|
+
passRate: 96,
|
|
270
|
+
avgScore: 0.91,
|
|
271
|
+
latencyP95Ms: 710,
|
|
272
|
+
latencyAvgMs: 502,
|
|
273
|
+
tokensAvg: 171,
|
|
274
|
+
tokensP95: 230,
|
|
275
|
+
costUsd: 24e-4,
|
|
276
|
+
latencyHistoryMs: [380, 420, 510, 480, 550, 620, 590, 710, 520, 480, 530, 600]
|
|
277
|
+
},
|
|
278
|
+
dimensions: [
|
|
279
|
+
{ name: "correctness", score: 82 },
|
|
280
|
+
{ name: "faithfulness", score: 79 },
|
|
281
|
+
{ name: "brevity", score: 68 },
|
|
282
|
+
{ name: "style", score: 90 }
|
|
283
|
+
],
|
|
284
|
+
checks: [
|
|
285
|
+
{ name: "json_schema", passed: false, detail: "3 violations" },
|
|
286
|
+
{ name: "tool_calls", passed: true, detail: "0 unexpected" },
|
|
287
|
+
{ name: "pii_leak", passed: true },
|
|
288
|
+
{ name: "jailbreak", passed: true }
|
|
289
|
+
],
|
|
290
|
+
failures: [
|
|
291
|
+
{ title: "product_parser \u203A conforms to schema (price: string)" },
|
|
292
|
+
{ title: "checkout \u203A tool-call count mismatch" }
|
|
293
|
+
],
|
|
294
|
+
meta: {
|
|
295
|
+
model: "gpt-4o-mini",
|
|
296
|
+
provider: "OpenAI",
|
|
297
|
+
commit: "2f3c1a9",
|
|
298
|
+
branch: "main",
|
|
299
|
+
seed: 42,
|
|
300
|
+
concurrency: 4,
|
|
301
|
+
duration: "00:01:12",
|
|
302
|
+
artifact: "./eval-results/run_2026-02-17.jsonl"
|
|
303
|
+
}
|
|
304
|
+
},
|
|
305
|
+
{
|
|
306
|
+
id: "run_2026-02-16_1112",
|
|
307
|
+
label: "2026-02-16 11:12",
|
|
308
|
+
status: "PASS",
|
|
309
|
+
performance: {
|
|
310
|
+
passRate: 99,
|
|
311
|
+
avgScore: 0.95,
|
|
312
|
+
latencyP95Ms: 650,
|
|
313
|
+
latencyAvgMs: 488,
|
|
314
|
+
tokensAvg: 168,
|
|
315
|
+
tokensP95: 220,
|
|
316
|
+
costUsd: 2e-3,
|
|
317
|
+
latencyHistoryMs: [420, 450, 480, 460, 520, 490, 510, 650, 440, 470, 500, 480]
|
|
318
|
+
},
|
|
319
|
+
dimensions: [
|
|
320
|
+
{ name: "correctness", score: 89 },
|
|
321
|
+
{ name: "faithfulness", score: 88 },
|
|
322
|
+
{ name: "brevity", score: 72 },
|
|
323
|
+
{ name: "style", score: 93 }
|
|
324
|
+
],
|
|
325
|
+
checks: [
|
|
326
|
+
{ name: "json_schema", passed: true, detail: "0 violations" },
|
|
327
|
+
{ name: "tool_calls", passed: true, detail: "0 unexpected" },
|
|
328
|
+
{ name: "pii_leak", passed: true },
|
|
329
|
+
{ name: "jailbreak", passed: true }
|
|
330
|
+
],
|
|
331
|
+
failures: [],
|
|
332
|
+
meta: {
|
|
333
|
+
model: "gpt-4o-mini",
|
|
334
|
+
provider: "OpenAI",
|
|
335
|
+
commit: "0d24f8f",
|
|
336
|
+
branch: "main",
|
|
337
|
+
seed: 42,
|
|
338
|
+
concurrency: 4,
|
|
339
|
+
duration: "00:01:06",
|
|
340
|
+
artifact: "./eval-results/run_2026-02-16.jsonl"
|
|
341
|
+
}
|
|
342
|
+
},
|
|
343
|
+
{
|
|
344
|
+
id: "run_2026-02-15_0921",
|
|
345
|
+
label: "2026-02-15 09:21",
|
|
346
|
+
status: "PASS",
|
|
347
|
+
performance: {
|
|
348
|
+
passRate: 98,
|
|
349
|
+
avgScore: 0.93,
|
|
350
|
+
latencyP95Ms: 680,
|
|
351
|
+
latencyAvgMs: 495,
|
|
352
|
+
tokensAvg: 175,
|
|
353
|
+
tokensP95: 235,
|
|
354
|
+
costUsd: 22e-4,
|
|
355
|
+
latencyHistoryMs: [450, 480, 520, 490, 550, 580, 620, 680, 510, 470, 530, 560]
|
|
356
|
+
},
|
|
357
|
+
dimensions: [
|
|
358
|
+
{ name: "correctness", score: 86 },
|
|
359
|
+
{ name: "faithfulness", score: 84 },
|
|
360
|
+
{ name: "brevity", score: 70 },
|
|
361
|
+
{ name: "style", score: 91 }
|
|
362
|
+
],
|
|
363
|
+
checks: [
|
|
364
|
+
{ name: "json_schema", passed: true, detail: "0 violations" },
|
|
365
|
+
{ name: "tool_calls", passed: true, detail: "0 unexpected" },
|
|
366
|
+
{ name: "pii_leak", passed: true },
|
|
367
|
+
{ name: "jailbreak", passed: true }
|
|
368
|
+
],
|
|
369
|
+
failures: [],
|
|
370
|
+
meta: {
|
|
371
|
+
model: "gpt-4o-mini",
|
|
372
|
+
provider: "OpenAI",
|
|
373
|
+
commit: "a1b2c3d",
|
|
374
|
+
branch: "main",
|
|
375
|
+
seed: 42,
|
|
376
|
+
concurrency: 4,
|
|
377
|
+
duration: "00:01:08",
|
|
378
|
+
artifact: "./eval-results/run_2026-02-15.jsonl"
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
]
|
|
382
|
+
},
|
|
383
|
+
{
|
|
384
|
+
id: "tool-calls",
|
|
385
|
+
name: "tool-calls",
|
|
386
|
+
overview: "Validate function-call conformance and unexpected tool invocation behavior.",
|
|
387
|
+
runs: [
|
|
388
|
+
{
|
|
389
|
+
id: "run_2026-02-14_1530",
|
|
390
|
+
label: "2026-02-14 15:30",
|
|
391
|
+
status: "PASS",
|
|
392
|
+
performance: {
|
|
393
|
+
passRate: 100,
|
|
394
|
+
avgScore: 1,
|
|
395
|
+
latencyP95Ms: 320,
|
|
396
|
+
latencyAvgMs: 280,
|
|
397
|
+
tokensAvg: 45,
|
|
398
|
+
tokensP95: 62,
|
|
399
|
+
costUsd: 8e-4,
|
|
400
|
+
latencyHistoryMs: [250, 270, 290, 280, 310, 320, 265, 290, 300, 275]
|
|
401
|
+
},
|
|
402
|
+
dimensions: [
|
|
403
|
+
{ name: "contract_match", score: 100 },
|
|
404
|
+
{ name: "arg_validity", score: 100 }
|
|
405
|
+
],
|
|
406
|
+
checks: [
|
|
407
|
+
{ name: "tool_calls", passed: true, detail: "0 unexpected" }
|
|
408
|
+
],
|
|
409
|
+
failures: [],
|
|
410
|
+
meta: {
|
|
411
|
+
model: "gpt-4o-mini",
|
|
412
|
+
provider: "OpenAI",
|
|
413
|
+
commit: "e4f5g6h",
|
|
414
|
+
branch: "feat/tools",
|
|
415
|
+
seed: 42,
|
|
416
|
+
concurrency: 8,
|
|
417
|
+
duration: "00:00:45",
|
|
418
|
+
artifact: "./eval-results/tool-calls_2026-02-14.jsonl"
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
]
|
|
422
|
+
},
|
|
423
|
+
{
|
|
424
|
+
id: "json-schema",
|
|
425
|
+
name: "json-schema",
|
|
426
|
+
overview: "Stress-test schema fidelity across generated extraction payloads.",
|
|
427
|
+
runs: []
|
|
428
|
+
}
|
|
429
|
+
],
|
|
430
|
+
evaluators: [
|
|
431
|
+
{ id: "json-schema-validator", name: "JSON Schema Validator", configPreview: "strict=true" },
|
|
432
|
+
{ id: "tool-call-contract-checker", name: "Tool-call Contract Checker", configPreview: "unexpectedCalls=error" },
|
|
433
|
+
{ id: "rubric-judge", name: "Rubric Judge (LLM)", configPreview: "model=gpt-4o-mini; scale=0-100" },
|
|
434
|
+
{ id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
|
|
435
|
+
]
|
|
436
|
+
};
|
|
437
|
+
|
|
438
|
+
// src/cli/state.ts
|
|
439
|
+
/**
 * Returns the bundled mock data set (datasets plus evaluators).
 *
 * The same module-level object is returned on every call, not a copy.
 */
function loadMockData() {
  const mockData = data_mock_default;
  return mockData;
}
|
|
442
|
+
/**
 * Converts text to a lowercase slug: runs of characters outside `a-z0-9` are
 * replaced by a single "-" and no dash is left at either end.
 *
 * @param {string} input - Text to slugify.
 * @returns {string} The slug; empty when the input has no `a-z0-9` characters.
 */
function toSlug(input) {
  // Splitting on the separator runs and dropping the empty edge pieces gives
  // the same result as replacing the runs and trimming the dashes.
  const words = input.toLowerCase().split(/[^a-z0-9]+/);
  return words.filter((word) => word !== "").join("-");
}
|
|
445
|
+
/**
 * Converts a runner snapshot into the run record shape rendered by the CLI.
 *
 * Derived values:
 * - the divisor for rates is `totalTestCases`, or 1 when that is 0;
 * - duration runs from `startedAt` (falling back to `queuedAt`) to
 *   `finishedAt`, or to the current time while `finishedAt` is unset;
 * - both latency figures are the duration divided by the test-case count,
 *   with a floor of 1;
 * - status "completed" maps to "PASS", "failed" to "FAILED", anything else
 *   to "RUNNING".
 *
 * Token, cost and most meta fields are fixed placeholder values.
 *
 * @param {object} snapshot - Runner snapshot. Reads `runId`, `status`,
 *   `totalTestCases`, `passedTestCases`, `failedTestCases`, `queuedAt`,
 *   `startedAt`, `finishedAt`, `errorMessage` and `artifactPath`.
 * @returns {object} Run record with `id`, `label`, `status`, `performance`,
 *   `dimensions`, `checks`, `failures` and `meta`.
 */
function toEvalRun(snapshot) {
  const { runId, status, passedTestCases, failedTestCases, errorMessage } = snapshot;

  const total = snapshot.totalTestCases === 0 ? 1 : snapshot.totalTestCases;
  const passedShare = passedTestCases / total;
  const passRate = Math.round(passedShare * 100);
  const failRate = Math.round(failedTestCases / total * 100);

  const baseline = snapshot.startedAt ?? snapshot.queuedAt;
  const endedAt = snapshot.finishedAt ? snapshot.finishedAt : Date.now();
  const durationMs = endedAt - baseline;
  const perCaseLatencyMs = Math.max(1, Math.floor(durationMs / Math.max(1, total)));

  let displayStatus = "RUNNING";
  if (status === "completed") {
    displayStatus = "PASS";
  } else if (status === "failed") {
    displayStatus = "FAILED";
  }

  const hasError = errorMessage && errorMessage.length > 0;

  return {
    id: runId,
    label: runId.slice(0, 12),
    status: displayStatus,
    performance: {
      passRate,
      avgScore: passedShare,
      latencyP95Ms: perCaseLatencyMs,
      latencyAvgMs: perCaseLatencyMs,
      tokensAvg: 0,
      tokensP95: 0,
      costUsd: 0,
      latencyHistoryMs: [durationMs]
    },
    dimensions: [
      { name: "passed", score: passRate },
      { name: "failed", score: failRate }
    ],
    checks: [{ name: "run_status", passed: status === "completed", detail: status }],
    failures: hasError ? [{ title: errorMessage }] : [],
    meta: {
      model: "n/a",
      provider: "runner",
      commit: "local",
      branch: "local",
      seed: 0,
      concurrency: 1,
      duration: `${durationMs}ms`,
      artifact: snapshot.artifactPath
    }
  };
}
|
|
488
|
+
/**
 * Builds the CLI dataset record for a discovered dataset, attaching its runs.
 *
 * @param {object} item - Discovered dataset; reads `id`, `filePath` and
 *   `dataset.getName()`.
 * @param {Array<object>} snapshots - Run snapshots; only those whose
 *   `datasetId` equals `item.id` are used.
 * @returns {object} `{ id, name, overview, runs }`, with runs ordered by
 *   descending `queuedAt`.
 */
function toEvalDataset(item, snapshots) {
  // filter() returns a fresh array, so the in-place sort leaves `snapshots` untouched.
  const ownSnapshots = snapshots.filter((snapshot) => snapshot.datasetId === item.id);
  ownSnapshots.sort((left, right) => right.queuedAt - left.queuedAt);
  const runs = ownSnapshots.map(toEvalRun);

  const name = item.dataset.getName();
  const overview = `Discovered from ${item.filePath}`;
  return { id: item.id, name, overview, runs };
}
|
|
497
|
+
/**
 * Builds the selectable evaluator entry for a discovered evaluator.
 *
 * @param {object} item - Discovered evaluator; reads `id`, `filePath` and
 *   `evaluator.getName()`.
 * @returns {object} `{ id, name, configPreview }`. The name falls back to a
 *   slug of the id when `getName()` returns null or undefined.
 */
function toEvaluatorOption(item) {
  const { id, filePath, evaluator } = item;
  const name = evaluator.getName() ?? toSlug(id);
  return { id, name, configPreview: `Source: ${filePath}` };
}
|
|
504
|
+
/**
 * Loads datasets, evaluators and run snapshots from the runner and converts
 * them to the shapes the CLI renders.
 *
 * When the runner discovers neither datasets nor evaluators, the bundled mock
 * data is returned instead.
 *
 * @param {object} runner - Must provide `collectDatasets()`,
 *   `collectEvaluators()` and `getAllRunSnapshots()`.
 * @returns {Promise<object>} `{ datasets, evaluators }`.
 */
async function loadRunnerData(runner) {
  // Start both discovery calls before awaiting so they run concurrently.
  const pendingDatasets = runner.collectDatasets();
  const pendingEvaluators = runner.collectEvaluators();
  const [datasets, evaluators] = await Promise.all([pendingDatasets, pendingEvaluators]);
  const snapshots = runner.getAllRunSnapshots();

  const nothingDiscovered = datasets.length === 0 && evaluators.length === 0;
  if (nothingDiscovered) {
    return loadMockData();
  }

  const evalDatasets = datasets.map((dataset) => toEvalDataset(dataset, snapshots));
  const evaluatorOptions = evaluators.map(toEvaluatorOption);
  return { datasets: evalDatasets, evaluators: evaluatorOptions };
}
|
|
518
|
+
/**
 * Folds a runner event into the CLI data by refreshing the affected run.
 *
 * The current snapshot for `event.runId` is fetched from the runner and
 * converted to a run record. That record replaces the run with the same id in
 * its dataset, or is prepended when the dataset does not list it yet.
 *
 * @param {object} data - Current CLI data (`datasets`, ...). Not mutated.
 * @param {object} event - Runner event; only `runId` is read.
 * @param {object} runner - Must provide `getRunSnapshot(runId)`.
 * @returns {object} New data object, or `data` itself when the snapshot or
 *   its dataset cannot be found.
 */
function applyRunnerEvent(data, event, runner) {
  const snapshot = runner.getRunSnapshot(event.runId);
  const dataset = snapshot
    ? data.datasets.find((item) => item.id === snapshot.datasetId)
    : void 0;
  if (!dataset) {
    return data;
  }

  const run = toEvalRun(snapshot);
  const alreadyListed = dataset.runs.some((item) => item.id === run.id);
  let nextRuns;
  if (alreadyListed) {
    nextRuns = dataset.runs.map((item) => (item.id === run.id ? run : item));
  } else {
    nextRuns = [run, ...dataset.runs];
  }

  const datasets = data.datasets.map((item) => {
    if (item.id !== dataset.id) {
      return item;
    }
    return { ...item, runs: nextRuns };
  });
  return { ...data, datasets };
}
|
|
537
|
+
/**
 * Parses the CLI startup arguments.
 *
 * Recognised flags take the next token as their value: `--dataset`
 * (`datasetId`), `--run` (`runId`) and `--search` (`search`). Every other
 * token, including a recognised flag that has no non-empty value after it, is
 * collected in `unknownArgs`.
 *
 * @param {string[]} argv - Argument tokens.
 * @returns {object} `{ unknownArgs, datasetId?, runId?, search? }`.
 */
function parseStartupArgs(argv) {
  const valueFlags = new Map([
    ["--dataset", "datasetId"],
    ["--run", "runId"],
    ["--search", "search"]
  ]);
  const args = { unknownArgs: [] };

  let cursor = 0;
  while (cursor < argv.length) {
    const token = argv[cursor];
    const value = argv[cursor + 1];
    const field = valueFlags.get(token);
    if (field !== void 0 && value) {
      args[field] = value;
      cursor += 2; // the value token is consumed together with the flag
    } else {
      args.unknownArgs.push(token);
      cursor += 1;
    }
  }
  return args;
}
|
|
560
|
+
/**
 * Returns the datasets whose name contains the search query.
 *
 * Matching is case-insensitive and ignores whitespace around the query.
 *
 * @param {object} data - CLI data; reads `data.datasets`.
 * @param {string} searchQuery - Raw search text.
 * @returns {Array<object>} `data.datasets` itself when the trimmed query is
 *   empty, otherwise a new filtered array.
 */
function getFilteredDatasets(data, searchQuery) {
  const needle = searchQuery.trim().toLowerCase();
  const matchesNeedle = ({ name }) => name.toLowerCase().includes(needle);
  return needle ? data.datasets.filter(matchesNeedle) : data.datasets;
}
|
|
567
|
+
/**
 * Resolves a datasets-menu index to a dataset.
 *
 * Menu index 0 (or lower) does not refer to a dataset; index `n` refers to
 * `datasets[n - 1]`.
 *
 * @param {Array<object>} datasets - Datasets in menu order.
 * @param {number} menuIndex - Selected menu index.
 * @returns {object|undefined} The dataset, or `undefined` when there is none.
 */
function getDatasetByMenuIndex(datasets, menuIndex) {
  return menuIndex <= 0 ? void 0 : datasets[menuIndex - 1];
}
|
|
573
|
+
/**
 * Resolves a runs-menu index to a run of the given dataset.
 *
 * Menu index 0 (or lower) does not refer to a run; index `n` refers to
 * `dataset.runs[n - 1]`.
 *
 * @param {object|undefined} dataset - Dataset whose `runs` are listed.
 * @param {number} menuIndex - Selected menu index.
 * @returns {object|undefined} The run, or `undefined` when the dataset is
 *   missing or the index does not refer to a run.
 */
function getRunByMenuIndex(dataset, menuIndex) {
  const refersToRun = dataset && !(menuIndex <= 0);
  if (refersToRun) {
    return dataset.runs[menuIndex - 1];
  }
  return void 0;
}
|
|
579
|
+
/**
 * Builds the initial CLI state from the loaded data and the parsed startup
 * arguments.
 *
 * - `--search` seeds the search query, which filters the dataset list.
 * - `--dataset` opens the runs level of that dataset if it is in the filtered
 *   list.
 * - `--run`, together with a resolved dataset, opens the details level of
 *   that run.
 * - Unknown arguments and ids that cannot be resolved are reported through
 *   `startupWarnings`.
 *
 * The first two evaluators are preselected.
 *
 * @param {object} data - CLI data (`datasets`, `evaluators`).
 * @param {object} args - Result of `parseStartupArgs`.
 * @returns {object} Initial state for `reduceCliState`.
 */
function createInitialState(data, args) {
  const startupWarnings = [];
  if (args.unknownArgs.length > 0) {
    startupWarnings.push(
      `Unknown args: ${args.unknownArgs.join(", ")}`,
      "Supported: --dataset <id>, --run <id>, --search <term>"
    );
  }

  const searchQuery = args.search ?? "";
  const visibleDatasets = getFilteredDatasets(data, searchQuery);
  const datasetPosition = visibleDatasets.findIndex((dataset) => dataset.id === args.datasetId);
  const datasetByArg = datasetPosition >= 0 ? visibleDatasets[datasetPosition] : void 0;

  let level = "datasets";
  let runMenuIndex = 0;
  if (datasetByArg) {
    level = "runs";
    if (args.runId) {
      const runPosition = datasetByArg.runs.findIndex((run) => run.id === args.runId);
      if (runPosition >= 0) {
        runMenuIndex = runPosition + 1;
        level = "details";
      } else {
        startupWarnings.push(`Run "${args.runId}" not found in dataset "${datasetByArg.id}".`);
      }
    }
  } else if (args.datasetId) {
    startupWarnings.push(`Dataset "${args.datasetId}" not found.`);
  }

  return {
    level,
    focus: "left",
    // Menu index 0 is the "New evaluation" entry, so datasets start at 1;
    // a dataset that was not found (-1) lands on 0.
    datasetMenuIndex: datasetPosition + 1,
    runMenuIndex,
    detailsScrollOffset: 0,
    selectedEvaluatorIds: data.evaluators.slice(0, 2).map((item) => item.id),
    evaluatorMenuIndex: 0,
    searchQuery,
    searchMode: false,
    startupWarnings
  };
}
|
|
618
|
+
/**
 * Pure reducer for the CLI navigation state.
 *
 * Supported action types:
 * - "MOVE_UP" / "MOVE_DOWN": move the cursor of the current level (or scroll
 *   the details pane when it is focused). "MOVE_DOWN" is capped at
 *   `action.max`. Both are ignored in search mode.
 * - "ENTER": leaves search mode; otherwise opens the selected entry
 *   (`action.hasDataset` / `action.hasRun` say whether the selection resolves).
 * - "BACK": leaves search mode; otherwise goes one level up.
 * - "TOGGLE_FOCUS": switches focus between the left and right pane.
 * - "START_SEARCH" / "END_SEARCH": enter or leave search mode.
 * - "APPEND_SEARCH": appends `action.value` to the search query.
 * - "REMOVE_SEARCH_CHAR": removes the last character of the search query.
 * - "TOGGLE_EVALUATOR": adds or removes `action.evaluatorId` from the selection.
 * - "CLEAR_WARNINGS": empties `startupWarnings`.
 *
 * @param {object} state - Current state; never mutated.
 * @param {object} action - Action with a `type` and type-specific fields.
 * @returns {object} The next state, or `state` itself when nothing changes.
 */
function reduceCliState(state, action) {
  // Name of the state field the arrow keys move at the current level, or
  // undefined when they have no effect there.
  const cursorField = () => {
    switch (state.level) {
      case "details":
        return state.focus === "right" ? "detailsScrollOffset" : void 0;
      case "datasets":
        return "datasetMenuIndex";
      case "runs":
        return "runMenuIndex";
      case "new-evaluation":
        return "evaluatorMenuIndex";
      default:
        return void 0;
    }
  };

  switch (action.type) {
    case "MOVE_UP":
    case "MOVE_DOWN": {
      if (state.searchMode) {
        return state;
      }
      const field = cursorField();
      if (field === void 0) {
        return state;
      }
      const moved = action.type === "MOVE_UP"
        ? Math.max(0, state[field] - 1)
        : Math.min(action.max, state[field] + 1);
      return { ...state, [field]: moved };
    }

    case "ENTER": {
      if (state.searchMode) {
        return { ...state, searchMode: false };
      }
      if (state.level === "datasets") {
        // Index 0 is the "New evaluation" entry.
        if (state.datasetMenuIndex === 0) {
          return { ...state, level: "new-evaluation" };
        }
        return action.hasDataset ? { ...state, level: "runs", runMenuIndex: 0 } : state;
      }
      if (state.level === "runs") {
        if (state.runMenuIndex === 0) {
          return { ...state, level: "new-evaluation" };
        }
        return action.hasRun ? { ...state, level: "details", detailsScrollOffset: 0 } : state;
      }
      return state;
    }

    case "BACK": {
      if (state.searchMode) {
        return { ...state, searchMode: false };
      }
      if (state.level === "details") {
        return { ...state, level: "runs" };
      }
      if (state.level === "runs" || state.level === "new-evaluation") {
        return { ...state, level: "datasets" };
      }
      return state;
    }

    case "TOGGLE_FOCUS":
      return { ...state, focus: state.focus === "left" ? "right" : "left" };

    case "START_SEARCH":
      return { ...state, searchMode: true };

    case "END_SEARCH":
      return { ...state, searchMode: false };

    case "APPEND_SEARCH":
      return { ...state, searchQuery: `${state.searchQuery}${action.value}` };

    case "REMOVE_SEARCH_CHAR": {
      // Drop the last code point, not the last UTF-16 unit: the query may come
      // from --search and end in an astral character (e.g. an emoji), and
      // slice(0, -1) would leave half of its surrogate pair behind.
      const codePoints = Array.from(state.searchQuery);
      return { ...state, searchQuery: codePoints.slice(0, -1).join("") };
    }

    case "TOGGLE_EVALUATOR": {
      const { evaluatorId } = action;
      const isSelected = state.selectedEvaluatorIds.includes(evaluatorId);
      const selectedEvaluatorIds = isSelected
        ? state.selectedEvaluatorIds.filter((id) => id !== evaluatorId)
        : [...state.selectedEvaluatorIds, evaluatorId];
      return { ...state, selectedEvaluatorIds };
    }

    case "CLEAR_WARNINGS":
      return { ...state, startupWarnings: [] };

    default:
      return state;
  }
}
|
|
721
|
+
var LEFT_PANE_WIDTH2 = 44;
|
|
722
|
+
/**
 * Datasets level: a dataset menu on the left and the selected dataset's
 * overview on the right.
 *
 * Menu index 0 is the "New evaluation" entry, so the dataset at array
 * position `i` is selected when `state.datasetMenuIndex === i + 1`.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; reads `focus` and `datasetMenuIndex`.
 * @param {Array<object>} props.filteredDatasets - Datasets to list.
 * @param {object} [props.selectedDataset] - Dataset whose `overview` is
 *   shown; a hint is shown when it is missing.
 */
function DatasetsView({ state, filteredDatasets, selectedDataset }) {
  const datasetRows = (datasets) =>
    datasets.map((dataset, position) =>
      jsxRuntime.jsx(
        ListItem,
        {
          selected: state.datasetMenuIndex === position + 1,
          label: dataset.name,
          itemKey: `dataset-${dataset.id}`
        },
        dataset.id
      )
    );

  const menuPane = jsxRuntime.jsxs(Pane, {
    width: LEFT_PANE_WIDTH2,
    focused: state.focus === "left",
    children: [
      jsxRuntime.jsx(SectionHeader, { children: "Datasets" }),
      jsxRuntime.jsx(ListItem, {
        selected: state.datasetMenuIndex === 0,
        label: "New evaluation",
        itemKey: "datasets-new-eval"
      }),
      datasetRows(filteredDatasets)
    ]
  });

  const overviewText = selectedDataset?.overview ?? "Select a dataset to inspect prior runs.";
  const overviewPane = jsxRuntime.jsxs(Pane, {
    flexGrow: 1,
    marginLeft: 1,
    focused: state.focus === "right",
    children: [
      jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
      jsxRuntime.jsx(ink.Text, { color: "gray", children: overviewText })
    ]
  });

  return jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [menuPane, overviewPane] });
}
|
|
756
|
+
/**
 * Runs level: the runs sidebar on the left and a summary of the selected run
 * on the right (header, overall bars, per-dimension bars, latency sparkline).
 *
 * When the run has no `latencyHistoryMs`, the sparkline is drawn from five
 * points derived from the average and P95 latency.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; reads `focus` here and is passed
 *   on to the sidebar.
 * @param {object} [props.dataset] - Dataset whose runs are listed.
 * @param {object} [props.selectedRun] - Run to summarise; a hint is shown
 *   when it is missing.
 */
function RunsView({ state, dataset, selectedRun }) {
  const runs = dataset?.runs ?? [];
  const sidebar = jsxRuntime.jsx(RunsSidebar, { state, dataset, runs });
  const paneProps = { flexGrow: 1, marginLeft: 1, focused: state.focus === "right" };

  if (!selectedRun) {
    const hint = jsxRuntime.jsx(ink.Text, {
      color: "gray",
      children: "Select a run to see summary metrics."
    });
    return jsxRuntime.jsxs(jsxRuntime.Fragment, {
      children: [sidebar, jsxRuntime.jsx(Pane, { ...paneProps, children: hint })]
    });
  }

  const { performance, meta, dimensions } = selectedRun;
  const spacer = () => jsxRuntime.jsx(ink.Text, { children: " " });
  const heading = (title) => jsxRuntime.jsx(SectionHeader, { children: title });

  const latencySeries = performance.latencyHistoryMs ?? [
    performance.latencyAvgMs - 40,
    performance.latencyAvgMs - 10,
    performance.latencyAvgMs + 20,
    performance.latencyP95Ms - 80,
    performance.latencyP95Ms
  ];

  const summary = jsxRuntime.jsxs(ink.Box, {
    flexDirection: "column",
    children: [
      jsxRuntime.jsxs(ink.Text, {
        children: [
          jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
          " ",
          selectedRun.label,
          " ",
          jsxRuntime.jsx(StatusText, { status: selectedRun.status })
        ]
      }),
      jsxRuntime.jsxs(ink.Text, {
        color: "gray",
        children: ["Commit: ", meta.commit, " Branch: ", meta.branch, " ", "Seed: ", meta.seed]
      }),
      spacer(),
      heading("Overall"),
      jsxRuntime.jsx(TextBar, {
        label: "pass rate",
        value: performance.passRate,
        format: (v) => `${v}%`
      }),
      jsxRuntime.jsx(TextBar, {
        label: "avg score",
        value: Math.round(performance.avgScore * 100)
      }),
      spacer(),
      heading("Dimensions"),
      dimensions.map((dimension) =>
        jsxRuntime.jsx(TextBar, { label: dimension.name, value: dimension.score }, dimension.name)
      ),
      spacer(),
      heading("Latency trend"),
      jsxRuntime.jsx(Sparkline, { data: latencySeries, width: 24 })
    ]
  });

  return jsxRuntime.jsxs(jsxRuntime.Fragment, {
    children: [sidebar, jsxRuntime.jsx(Pane, { ...paneProps, children: summary })]
  });
}
|
|
827
|
+
// Maximum number of detail rows RunDetailsView slices out of the full row list per render.
var DETAILS_PAGE_SIZE = 20;
|
|
828
|
+
/**
 * Renders one boolean check as a single text line: the check name padded to
 * 14 columns, a bold green "PASSED" / red "FAILED" label and, when present,
 * the detail string in parentheses.
 *
 * @param {{ name: string, passed: boolean, detail?: string }} props
 * @returns the ink Text element for the row.
 */
function CheckRow({ name, passed, detail }) {
  const [statusLabel, statusColor] = passed ? ["PASSED", "green"] : ["FAILED", "red"];
  const paddedName = name.padEnd(14);

  // The parenthesised detail is only rendered for truthy detail values.
  let detailNode = null;
  if (detail) {
    detailNode = /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
      " (",
      detail,
      ")"
    ] });
  }

  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
    /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: paddedName }),
    " ",
    /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: statusColor, bold: true, children: statusLabel }),
    detailNode
  ] });
}
|
|
846
|
+
/**
 * Builds the flat, keyed list of row elements shown in the run details pane:
 * meta lines, one bar per dimension, one CheckRow per check, performance
 * figures, a latency sparkline and - only when there are failures - a
 * numbered failure list.
 *
 * @param run Run object; reads `performance`, `dimensions`, `checks`,
 *   `failures` and `meta`.
 * @returns {Array} keyed elements, in display order.
 */
function buildDetailRows(run) {
  const { performance: perf, dimensions, checks, failures, meta: info } = run;

  // Small element factories shared by the sections below.
  const spacer = (key) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: " " }, key);
  const heading = (title, key) => /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: title }, key);
  const grayLine = (parts, key) => /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: parts }, key);

  // When no latency history is recorded, a five-point series is derived
  // from the average and p95 latency instead.
  let latencySeries = perf.latencyHistoryMs;
  if (latencySeries === null || latencySeries === void 0) {
    latencySeries = [
      perf.latencyAvgMs - 40,
      perf.latencyAvgMs - 10,
      perf.latencyAvgMs + 20,
      perf.latencyP95Ms - 80,
      perf.latencyP95Ms
    ];
  }

  const rows = [];

  rows.push(heading("Meta", "meta-h"));
  rows.push(grayLine(["Model: ", info.model, " Provider: ", info.provider], "meta-1"));
  rows.push(grayLine(["Commit: ", info.commit, " Branch: ", info.branch, " Seed: ", info.seed], "meta-2"));
  rows.push(grayLine(["Duration: ", info.duration, " Concurrency: ", info.concurrency], "meta-3"));
  rows.push(grayLine(["Artifact: ", info.artifact], "meta-4"));

  rows.push(spacer("sp1"));
  rows.push(heading("Scores (0\u2013100)", "scores-h"));
  for (const dimension of dimensions) {
    rows.push(
      /* @__PURE__ */ jsxRuntime.jsx(TextBar, { label: dimension.name, value: dimension.score }, `dim-${dimension.name}`)
    );
  }

  rows.push(spacer("sp2"));
  rows.push(heading("Checks (boolean)", "checks-h"));
  for (const check of checks) {
    rows.push(
      /* @__PURE__ */ jsxRuntime.jsx(CheckRow, { name: check.name, passed: check.passed, detail: check.detail }, `chk-${check.name}`)
    );
  }

  rows.push(spacer("sp3"));
  rows.push(heading("Performance", "perf-h"));
  rows.push(
    /* @__PURE__ */ jsxRuntime.jsx(
      TextBar,
      {
        label: "pass rate",
        value: perf.passRate,
        format: (v) => `${v}%`
      },
      "perf-rate"
    )
  );
  rows.push(grayLine(["latency avg ", perf.latencyAvgMs, "ms p95 ", perf.latencyP95Ms, "ms"], "perf-lat"));
  rows.push(grayLine(["tokens avg ", perf.tokensAvg, " p95 ", perf.tokensP95], "perf-tok"));

  rows.push(spacer("sp4"));
  rows.push(heading("Latency trend", "spark-h"));
  rows.push(/* @__PURE__ */ jsxRuntime.jsx(Sparkline, { data: latencySeries, width: 20 }, "spark"));

  if (failures.length > 0) {
    rows.push(spacer("sp5"));
    rows.push(heading("Failures (top)", "fail-h"));
    for (const [position, failure] of failures.entries()) {
      rows.push(
        /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "red", children: [
          position + 1,
          ") ",
          failure.title
        ] }, `fail-${position}`)
      );
    }
  }

  return rows;
}
|
|
930
|
+
/**
 * Two-pane "run details" screen: the runs sidebar on the left and a
 * scrollable window of detail rows for the selected run on the right.
 *
 * The scroll offset from `state.detailsScrollOffset` is clamped to
 * `[0, rows.length - 1]`, so scrolling past the end keeps the last row
 * visible instead of rendering an empty pane. A missing or non-finite
 * offset is treated as 0.
 *
 * @param {{ state: object, dataset?: object, selectedRun?: object }} props
 *   `state.focus` selects which pane is highlighted; `dataset.runs` feeds
 *   the sidebar (empty list when there is no dataset).
 * @returns the fragment containing both panes.
 */
function RunDetailsView({
  state,
  dataset,
  selectedRun
}) {
  const runs = dataset?.runs ?? [];
  const rightFocused = state.focus === "right";

  let paneContent;
  if (!selectedRun) {
    paneContent = /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to inspect details." });
  } else {
    const rows = buildDetailRows(selectedRun);
    const requestedOffset = Number.isFinite(state.detailsScrollOffset) ? state.detailsScrollOffset : 0;
    // Never start the window beyond the final row.
    const lastRowIndex = Math.max(0, rows.length - 1);
    const offset = Math.min(Math.max(0, requestedOffset), lastRowIndex);
    const visible = rows.slice(offset, offset + DETAILS_PAGE_SIZE);
    paneContent = /* @__PURE__ */ jsxRuntime.jsx(ink.Box, {
      flexDirection: "column",
      children: visible.map((row, i) => /* @__PURE__ */ jsxRuntime.jsx(React__default.default.Fragment, { children: row }, i))
    });
  }

  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
    /* @__PURE__ */ jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
    /* @__PURE__ */ jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: paneContent })
  ] });
}
|
|
951
|
+
// Width passed to the left ("Available Evaluators") pane of NewEvaluationView.
var LEFT_PANE_WIDTH3 = 44;
|
|
952
|
+
/**
 * "New evaluation" screen. The left pane lists the evaluators that match the
 * current search with a cursor marker and a selection checkbox; the right
 * pane lists the selected evaluators (numbered by their position in the
 * selection) and shows the config preview of the evaluator under the cursor.
 *
 * @param {{ state: object, data: object, visibleEvaluators: Array }} props
 *   Reads `state.selectedEvaluatorIds`, `state.evaluatorMenuIndex`,
 *   `state.focus`, `state.searchQuery` and `data.evaluators`.
 * @returns the fragment containing both panes.
 */
function NewEvaluationView({
  state,
  data,
  visibleEvaluators
}) {
  const selectedIds = state.selectedEvaluatorIds;
  const selectedCount = selectedIds.length;
  const cursorIndex = state.evaluatorMenuIndex;
  const evaluatorUnderCursor = visibleEvaluators[cursorIndex];

  const evaluatorLines = visibleEvaluators.map((evaluator, index) => {
    const underCursor = index === cursorIndex;
    const checkbox = selectedIds.includes(evaluator.id) ? "[x] " : "[ ] ";
    return /* @__PURE__ */ jsxRuntime.jsxs(
      ink.Text,
      {
        color: underCursor ? "cyan" : "gray",
        bold: underCursor,
        children: [
          underCursor ? "\u25B8 " : " ",
          checkbox,
          evaluator.name
        ]
      },
      evaluator.id
    );
  });

  // Ids that no longer resolve to an evaluator render nothing but still
  // occupy a position in the numbering.
  const selectionLines = selectedIds.map((id, index) => {
    const match = data.evaluators.find((item) => item.id === id);
    if (!match) {
      return null;
    }
    return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
      index + 1,
      ") ",
      match.name
    ] }, id);
  });

  const leftPane = /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH3, focused: state.focus === "left", children: [
    /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Available Evaluators" }),
    /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
      "Search: ",
      state.searchQuery || "(none)"
    ] }),
    evaluatorLines
  ] });

  const rightPane = /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: state.focus === "right", children: [
    /* @__PURE__ */ jsxRuntime.jsxs(SectionHeader, { children: [
      "Selected (",
      selectedCount,
      ")"
    ] }),
    selectionLines,
    /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Config preview" }),
    /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: evaluatorUnderCursor?.configPreview ?? "Select an evaluator to inspect config." })
  ] });

  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [leftPane, rightPane] });
}
|
|
1007
|
+
/**
 * Returns a copy of `state` whose three menu cursors are forced into range.
 * Each index is clamped to `[0, max]` (inclusive upper bound).
 *
 * @param {object} state CLI state carrying `datasetMenuIndex`,
 *   `runMenuIndex` and `evaluatorMenuIndex`.
 * @param {number} filteredDatasetsLength Upper bound for `datasetMenuIndex`.
 * @param {number} selectedRunCount Upper bound for `runMenuIndex`.
 * @param {number} [evaluatorMax=3] Upper bound for `evaluatorMenuIndex`.
 *   Previously hard-coded to 3, which made any evaluator past the fourth
 *   unreachable; callers can now pass the real last index. The default
 *   keeps the old behaviour for existing three-argument calls.
 * @returns {object} new state object; the input is not mutated.
 */
function clampCursor(state, filteredDatasetsLength, selectedRunCount, evaluatorMax = 3) {
  const clampIndex = (index, max) => Math.max(0, Math.min(index, max));
  return {
    ...state,
    datasetMenuIndex: clampIndex(state.datasetMenuIndex, filteredDatasetsLength),
    runMenuIndex: clampIndex(state.runMenuIndex, selectedRunCount),
    evaluatorMenuIndex: clampIndex(state.evaluatorMenuIndex, evaluatorMax)
  };
}
|
|
1021
|
+
/**
 * Root component of the interactive CLI. It owns the navigation state
 * (via `reduceCliState`), mirrors runner events into a live copy of the
 * data, maps key presses to reducer actions, and renders the breadcrumb,
 * optional notice boxes, the view for the current level and the footer.
 *
 * Changes compared to the previous implementation:
 * - the evaluator cursor bound passed to `clampCursor` is the last index of
 *   the visible evaluator list instead of relying on a fixed bound;
 * - the run cursor bound is taken from the dataset at the clamped dataset
 *   index, i.e. the dataset that is actually displayed;
 * - the up-arrow bound for the evaluator list can no longer be -1.
 *
 * @param {{ data: object, args: object, runner?: object }} props
 *   `data` seeds the view, `args` is forwarded to `createInitialState`, and
 *   `runner` (optional) provides `subscribeRunEvents` and `runDatasetWith`.
 * @returns the full-screen ink element tree.
 */
function EvalsCliApp({
  data,
  args,
  runner
}) {
  const { exit } = ink.useApp();
  const { width: stdoutWidth, height: stdoutHeight } = fullscreenInk.useScreenSize();
  const [liveData, setLiveData] = React.useState(data);
  const [runtimeMessage, setRuntimeMessage] = React.useState();
  const [state, dispatch] = React.useReducer(
    reduceCliState,
    createInitialState(data, args)
  );

  // Reset the live copy whenever the parent hands in a new data object.
  React.useEffect(() => {
    setLiveData(data);
  }, [data]);

  // Fold runner events into the live data and surface a status message.
  // The value returned by subscribeRunEvents is handed back to React as the
  // effect cleanup.
  React.useEffect(() => {
    if (!runner) {
      return void 0;
    }
    return runner.subscribeRunEvents((event) => {
      setLiveData((current) => applyRunnerEvent(current, event, runner));
      if (event.type === "RunQueued") {
        setRuntimeMessage(`Queued ${event.runId} with ${event.totalTestCases} test cases.`);
      }
      if (event.type === "RunCompleted") {
        setRuntimeMessage(
          `Completed ${event.runId}: ${event.passedTestCases}/${event.totalTestCases} passed.`
        );
      }
      if (event.type === "RunFailed") {
        setRuntimeMessage(`Run failed: ${event.errorMessage}`);
      }
    });
  }, [runner]);

  const filteredDatasets = React.useMemo(
    () => getFilteredDatasets(liveData, state.searchQuery),
    [liveData, state.searchQuery]
  );

  // Evaluators matching the search (case-insensitive substring on the name).
  const normalizedQuery = state.searchQuery.toLowerCase();
  const visibleEvaluators = liveData.evaluators.filter(
    (evaluator) => evaluator.name.toLowerCase().includes(normalizedQuery)
  );
  const lastEvaluatorIndex = Math.max(0, visibleEvaluators.length - 1);

  // Same range clampCursor applies to the dataset cursor; used so that the
  // run bound is computed from the dataset that will actually be selected.
  const datasetIndexInRange = Math.max(
    0,
    Math.min(state.datasetMenuIndex, filteredDatasets.length)
  );
  const clampedState = clampCursor(
    state,
    filteredDatasets.length,
    getDatasetByMenuIndex(filteredDatasets, datasetIndexInRange)?.runs.length ?? 0,
    lastEvaluatorIndex
  );
  const selectedDataset = getDatasetByMenuIndex(
    filteredDatasets,
    clampedState.datasetMenuIndex
  );
  const selectedRun = getRunByMenuIndex(
    selectedDataset,
    clampedState.runMenuIndex
  );

  // Upper cursor bound handed to MOVE_DOWN for the current level.
  const getMoveDownMax = () => {
    if (clampedState.level === "datasets") {
      return filteredDatasets.length;
    }
    if (clampedState.level === "runs") {
      return selectedDataset?.runs.length ?? 0;
    }
    if (clampedState.level === "new-evaluation") {
      return lastEvaluatorIndex;
    }
    return 100;
  };

  // Validates the preconditions and asks the runner to start a run; every
  // outcome is reported through the runtime message box.
  const startEvaluation = () => {
    if (!runner) {
      setRuntimeMessage("Runner unavailable: cannot start evaluation.");
      return;
    }
    if (!selectedDataset) {
      setRuntimeMessage("Select a dataset before starting a new evaluation.");
      return;
    }
    if (clampedState.selectedEvaluatorIds.length === 0) {
      setRuntimeMessage("Select at least one evaluator before starting.");
      return;
    }
    // Fire-and-forget: both settle paths are handled below.
    void runner.runDatasetWith({
      datasetId: selectedDataset.id,
      evaluatorIds: clampedState.selectedEvaluatorIds
    }).then((snapshot) => {
      setRuntimeMessage(
        `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
      );
    }).catch((error) => {
      setRuntimeMessage(
        error instanceof Error ? error.message : "Failed to start evaluation."
      );
    });
  };

  ink.useInput((input, key) => {
    // NOTE(review): the quit and search-start checks run before the
    // search-mode branch, so those inputs are never appended to an active
    // search query - confirm this is intended.
    if (isQuitInput(input) || key.escape) {
      exit();
      return;
    }
    if (key.tab) {
      dispatch({ type: "TOGGLE_FOCUS" });
      return;
    }
    if (isSearchInput(input)) {
      dispatch({ type: "START_SEARCH" });
      return;
    }
    if (clampedState.searchMode) {
      if (key.return) {
        dispatch({ type: "END_SEARCH" });
        return;
      }
      if (isBackKey(key)) {
        dispatch({ type: "REMOVE_SEARCH_CHAR" });
        return;
      }
      if (isPrintableCharacter(input)) {
        dispatch({ type: "APPEND_SEARCH", value: input });
      }
      // Every other key is swallowed while searching.
      return;
    }
    if (key.upArrow) {
      const max = clampedState.level === "new-evaluation" ? lastEvaluatorIndex : 100;
      dispatch({ type: "MOVE_UP", max });
      return;
    }
    if (key.downArrow) {
      dispatch({ type: "MOVE_DOWN", max: getMoveDownMax() });
      return;
    }
    if (key.return) {
      dispatch({
        type: "ENTER",
        hasDataset: Boolean(selectedDataset),
        hasRun: Boolean(selectedRun)
      });
      // On the new-evaluation screen Enter also toggles the evaluator under
      // the cursor.
      if (clampedState.level === "new-evaluation") {
        const evaluator = visibleEvaluators[clampedState.evaluatorMenuIndex];
        if (evaluator) {
          dispatch({ type: "TOGGLE_EVALUATOR", evaluatorId: evaluator.id });
        }
      }
      return;
    }
    if (isBackKey(key)) {
      dispatch({ type: "BACK" });
      return;
    }
    if (input.toLowerCase() === "c") {
      dispatch({ type: "CLEAR_WARNINGS" });
      setRuntimeMessage(void 0);
      return;
    }
    if (input.toLowerCase() === "s" && clampedState.level === "new-evaluation") {
      startEvaluation();
    }
  });

  // Picks the view for the current navigation level; any level other than
  // the three named ones renders the run details view.
  const renderContent = () => {
    switch (clampedState.level) {
      case "new-evaluation":
        return /* @__PURE__ */ jsxRuntime.jsx(
          NewEvaluationView,
          {
            state: clampedState,
            data: liveData,
            visibleEvaluators
          }
        );
      case "datasets":
        return /* @__PURE__ */ jsxRuntime.jsx(
          DatasetsView,
          {
            state: clampedState,
            filteredDatasets,
            selectedDataset
          }
        );
      case "runs":
        return /* @__PURE__ */ jsxRuntime.jsx(
          RunsView,
          {
            state: clampedState,
            dataset: selectedDataset,
            selectedRun
          }
        );
      default:
        return /* @__PURE__ */ jsxRuntime.jsx(
          RunDetailsView,
          {
            state: clampedState,
            dataset: selectedDataset,
            selectedRun
          }
        );
    }
  };

  const breadcrumbBox = /* @__PURE__ */ jsxRuntime.jsx(
    ink.Box,
    {
      borderStyle: "round",
      borderColor: "cyan",
      paddingX: 1,
      width: stdoutWidth,
      children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: getBreadcrumbText(
        clampedState,
        selectedDataset?.name,
        selectedRun?.label
      ) })
    }
  );

  const warningsBox = clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxRuntime.jsxs(
    ink.Box,
    {
      marginTop: 1,
      borderStyle: "round",
      borderColor: "yellow",
      paddingX: 1,
      flexDirection: "column",
      width: stdoutWidth,
      children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "yellow", children: "Startup warnings:" }),
        clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: warning }, `${warning}-${index}`))
      ]
    }
  );

  const searchBox = clampedState.searchMode && /* @__PURE__ */ jsxRuntime.jsxs(
    ink.Box,
    {
      marginTop: 1,
      borderStyle: "round",
      borderColor: "magenta",
      paddingX: 1,
      width: stdoutWidth,
      children: [
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", bold: true, children: "Search: " }),
        /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "white", children: clampedState.searchQuery })
      ]
    }
  );

  const runtimeMessageBox = runtimeMessage && /* @__PURE__ */ jsxRuntime.jsx(
    ink.Box,
    {
      marginTop: 1,
      borderStyle: "round",
      borderColor: "blue",
      paddingX: 1,
      width: stdoutWidth,
      children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "blue", children: runtimeMessage })
    }
  );

  const contentBox = /* @__PURE__ */ jsxRuntime.jsx(
    ink.Box,
    {
      marginTop: 1,
      flexGrow: 1,
      width: stdoutWidth,
      flexDirection: "row",
      children: renderContent()
    }
  );

  const footerBox = /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: getFooterText(clampedState) }) });

  return /* @__PURE__ */ jsxRuntime.jsxs(
    ink.Box,
    {
      flexDirection: "column",
      flexGrow: 1,
      width: stdoutWidth,
      height: stdoutHeight,
      children: [
        breadcrumbBox,
        warningsBox,
        searchBox,
        runtimeMessageBox,
        contentBox,
        footerBox
      ]
    }
  );
}
|
|
1280
|
+
|
|
1281
|
+
// src/runner/config.ts
|
|
1282
|
+
// Default runner configuration: file-discovery settings (root directory,
// filename suffixes per artefact kind, directory names to skip) plus the
// directory where run artifacts are written. `rootDir` is captured from
// process.cwd() when this module is evaluated.
var defaultRunnerConfig = (() => {
  const sourceExtensions = [".ts", ".tsx", ".js", ".mjs"];
  // ".dataset" -> [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"]
  const suffixesFor = (kind) => sourceExtensions.map((extension) => `.${kind}${extension}`);
  return {
    discovery: {
      rootDir: process.cwd(),
      datasetSuffixes: suffixesFor("dataset"),
      evaluatorSuffixes: suffixesFor("evaluator"),
      testCaseSuffixes: suffixesFor("test-case"),
      excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
    },
    artifactDirectory: ".eval-results"
  };
})();
|
|
1302
|
+
/**
 * Resolves the effective runner configuration.
 *
 * The previous body ignored `overrides` entirely. Overrides are now merged
 * on top of `defaultRunnerConfig`; `discovery` is merged one level deep so a
 * partial discovery override keeps the remaining defaults.
 *
 * @param {object} [overrides] Partial configuration; may be omitted.
 * @returns {object} `defaultRunnerConfig` itself when no overrides are
 *   given (unchanged behaviour), otherwise a new merged object. The default
 *   object is never mutated.
 */
function withRunnerConfig(overrides) {
  if (overrides == null) {
    return defaultRunnerConfig;
  }
  return {
    ...defaultRunnerConfig,
    ...overrides,
    discovery: {
      ...defaultRunnerConfig.discovery,
      ...overrides.discovery
    }
  };
}
|
|
1307
|
+
// Cached jiti loader instance; created by loadModuleExports the first time a .ts/.tsx file is loaded.
var jitiLoader;
|
|
1308
|
+
/**
 * Builds a slug id of the form `<prefix>-<name-or-path>`.
 *
 * The name is used when it contains non-whitespace characters, otherwise the
 * file path. The combined string is lower-cased, every run of characters
 * outside `a-z0-9` becomes a single "-", and leading/trailing dashes are
 * stripped.
 *
 * @param {string} prefix Kind prefix, e.g. "dataset".
 * @param {string} filePath Fallback source for the id.
 * @param {string} [name] Preferred source for the id.
 * @returns {string} the slug.
 */
function toId(prefix, filePath, name) {
  const hasUsableName = name && name.trim().length > 0;
  const source = hasUsableName ? name : filePath;
  const dashed = `${prefix}:${source}`.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  return dashed.replace(/^-+|-+$/g, "");
}
|
|
1312
|
+
/**
 * Duck-typing helper: true when `value` is a non-null object that exposes
 * `methodName` (own or inherited) as a function.
 *
 * @param {unknown} value Candidate object.
 * @param {string} methodName Property name to look for.
 * @returns {boolean}
 */
function hasMethod(value, methodName) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  if (!(methodName in value)) {
    return false;
  }
  return typeof value[methodName] === "function";
}
|
|
1315
|
+
/**
 * Duck-type check for dataset exports: the value must provide both
 * `getName` and `matchesTestCase` as methods.
 *
 * @param {unknown} value Candidate module export.
 * @returns {boolean}
 */
function isDatasetLike(value) {
  const requiredMethods = ["getName", "matchesTestCase"];
  return requiredMethods.every((methodName) => hasMethod(value, methodName));
}
|
|
1318
|
+
/**
 * Duck-type check for evaluator exports: the value must provide
 * `getName`, `resolveContext` and `getEvaluateFn` as methods.
 *
 * @param {unknown} value Candidate module export.
 * @returns {boolean}
 */
function isEvaluatorLike(value) {
  const requiredMethods = ["getName", "resolveContext", "getEvaluateFn"];
  return requiredMethods.every((methodName) => hasMethod(value, methodName));
}
|
|
1321
|
+
/**
 * Duck-type check for test-case exports: the value must provide
 * `getName`, `getTags` and `getInput` as methods.
 *
 * @param {unknown} value Candidate module export.
 * @returns {boolean}
 */
function isTestCaseLike(value) {
  const requiredMethods = ["getName", "getTags", "getInput"];
  return requiredMethods.every((methodName) => hasMethod(value, methodName));
}
|
|
1324
|
+
/**
 * Recursively lists the regular files below `rootDir`.
 *
 * Directories whose name appears in `excludeDirectories` are not entered.
 * Entries that are neither directories nor regular files are ignored.
 * Directories that cannot be read are skipped (best effort), so a missing
 * root yields an empty list.
 *
 * The result is sorted. The previous implementation pushed into a shared
 * array from concurrently running reads, so the order depended on I/O
 * completion timing and could differ between runs.
 *
 * @param {string} rootDir Directory to start from.
 * @param {Iterable<string>} excludeDirectories Directory names to skip.
 * @returns {Promise<string[]>} absolute file paths in sorted order.
 */
async function walkDirectory(rootDir, excludeDirectories) {
  const excludedNames = new Set(excludeDirectories);

  async function walk(currentDir) {
    let entries;
    try {
      entries = await promises.readdir(currentDir, { withFileTypes: true });
    } catch {
      // Unreadable or missing directory: contribute nothing.
      return [];
    }
    const nested = await Promise.all(
      entries.map(async (entry) => {
        const absolute = path.resolve(currentDir, entry.name);
        if (entry.isDirectory()) {
          return excludedNames.has(entry.name) ? [] : walk(absolute);
        }
        return entry.isFile() ? [absolute] : [];
      })
    );
    return nested.flat();
  }

  const files = await walk(rootDir);
  return files.sort();
}
|
|
1352
|
+
/**
 * Tells whether `filePath` ends with at least one of the given suffixes.
 *
 * @param {string} filePath Path to test.
 * @param {string[]} suffixes Candidate suffixes.
 * @returns {boolean} false for an empty suffix list.
 */
function hasOneSuffix(filePath, suffixes) {
  for (const suffix of suffixes) {
    if (filePath.endsWith(suffix)) {
      return true;
    }
  }
  return false;
}
|
|
1355
|
+
/**
 * Loads a module from disk and returns the values of all its exports.
 *
 * Files ending in `.ts` / `.tsx` are loaded through jiti; the loader is
 * created on first use and kept in the module-level `jitiLoader`. Every
 * other file is loaded with a native dynamic `import()` of its file URL.
 *
 * @param {string} filePath Path of the module to load.
 * @returns {Promise<unknown[]>} the export values of the loaded module.
 * @throws {Error} when the jiti module exposes neither `createJiti` nor a
 *   default export.
 */
async function loadModuleExports(filePath) {
  const isTypeScript = filePath.endsWith(".ts") || filePath.endsWith(".tsx");

  if (!isTypeScript) {
    const moduleUrl = url.pathToFileURL(filePath).href;
    const nativeModule = await import(moduleUrl);
    return Object.values(nativeModule);
  }

  if (!jitiLoader) {
    const jitiModule = await import('jiti');
    const createJiti = jitiModule.createJiti ?? jitiModule.default;
    if (!createJiti) {
      throw new Error("Failed to initialize jiti TypeScript loader");
    }
    // Bundler-generated expression yielding this module's own URL.
    const selfUrl = (typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href));
    jitiLoader = createJiti(selfUrl, {
      interopDefault: true,
      moduleCache: true
    });
  }

  // Use the loader's `import` method when it has one, otherwise call the
  // loader itself as a function.
  let typeScriptModule;
  if (jitiLoader.import) {
    typeScriptModule = await jitiLoader.import(filePath);
  } else {
    typeScriptModule = await Promise.resolve(jitiLoader(filePath));
  }
  return Object.values(typeScriptModule);
}
|
|
1375
|
+
/**
 * Shared discovery routine behind the three `collect*FromFiles` functions,
 * which previously each carried an identical copy of this logic.
 *
 * Walks `config.rootDir`, keeps the files ending in one of `suffixes`,
 * loads each file's exports, keeps the exports accepted by `isMatch`, and
 * wraps each one in a record.
 *
 * @param {object} config Discovery config (`rootDir`, `excludeDirectories`).
 * @param {string[]} suffixes Filename suffixes that qualify a file.
 * @param {(value: unknown) => boolean} isMatch Duck-type predicate.
 * @param {string} idPrefix Prefix handed to `toId`.
 * @param {string} exportKey Property name under which the export is stored.
 * @returns {Promise<object[]>} records `{ id, filePath, [exportKey] }` where
 *   `filePath` is relative to `config.rootDir`.
 */
async function collectDiscoveredExports(config, suffixes, isMatch, idPrefix, exportKey) {
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
  const matched = files.filter((filePath) => hasOneSuffix(filePath, suffixes));
  const found = await Promise.all(
    matched.map(async (absolutePath) => {
      const moduleExports = await loadModuleExports(absolutePath);
      const relPath = path.relative(config.rootDir, absolutePath);
      return moduleExports.filter(isMatch).map((item) => ({
        id: toId(idPrefix, relPath, item.getName()),
        filePath: relPath,
        [exportKey]: item
      }));
    })
  );
  return found.flat();
}

/**
 * Discovers dataset exports in files matching `config.datasetSuffixes`.
 *
 * @param {object} config Discovery config.
 * @returns {Promise<Array<{ id: string, filePath: string, dataset: object }>>}
 */
async function collectDatasetsFromFiles(config) {
  return collectDiscoveredExports(config, config.datasetSuffixes, isDatasetLike, "dataset", "dataset");
}

/**
 * Discovers evaluator exports in files matching `config.evaluatorSuffixes`.
 *
 * @param {object} config Discovery config.
 * @returns {Promise<Array<{ id: string, filePath: string, evaluator: object }>>}
 */
async function collectEvaluatorsFromFiles(config) {
  return collectDiscoveredExports(config, config.evaluatorSuffixes, isEvaluatorLike, "evaluator", "evaluator");
}

/**
 * Discovers test-case exports in files matching `config.testCaseSuffixes`.
 *
 * @param {object} config Discovery config.
 * @returns {Promise<Array<{ id: string, filePath: string, testCase: object }>>}
 */
async function collectTestCasesFromFiles(config) {
  return collectDiscoveredExports(config, config.testCaseSuffixes, isTestCaseLike, "test-case", "testCase");
}
|
|
1432
|
+
|
|
1433
|
+
// src/evals/metric.ts
|
|
1434
|
+
// Metric definitions created through Metric.of, keyed by id.
var registry = /* @__PURE__ */ new Map();

var Metric = {
  /**
   * Creates a metric definition and stores it in the registry under
   * `config.id` (an existing entry with the same id is replaced).
   *
   * @param {{ id: string, name: string, format: Function }} config
   * @returns {{ id: string, name: string, format: Function, make: Function }}
   *   the definition; `make(data)` produces `{ id, data }` items.
   */
  of(config) {
    const { id, name, format } = config;
    const make = (data) => ({ id: config.id, data });
    const definition = { id, name, format, make };
    registry.set(id, definition);
    return definition;
  }
};
|
|
1447
|
+
|
|
1448
|
+
// src/evals/score.ts
|
|
1449
|
+
// Score definitions created through Score.of, keyed by id.
var registry2 = /* @__PURE__ */ new Map();

var Score = {
  /**
   * Creates a score definition and stores it in the registry under
   * `config.id` (an existing entry with the same id is replaced).
   *
   * `make(data, options)` produces `{ id, data }`. When
   * `options.definePassed` is supplied it is called with `data`, and a
   * result other than `undefined` is attached as `passed`.
   *
   * @param {{ id: string, name: string, displayStrategy: string, format: Function }} config
   * @returns {object} the definition (`id`, `name`, `displayStrategy`,
   *   `format`, `make`).
   */
  of(config) {
    const { id, name, displayStrategy, format } = config;

    const make = (data, options) => {
      let passed = void 0;
      if (options?.definePassed !== void 0) {
        passed = options.definePassed(data);
      }
      const item = { id: config.id, data };
      if (passed !== void 0) {
        item.passed = passed;
      }
      return item;
    };

    const definition = { id, name, displayStrategy, format, make };
    registry2.set(id, definition);
    return definition;
  }
};
|
|
1470
|
+
/**
 * Looks up a score definition in the score registry.
 *
 * @param {string} id Score id.
 * @returns the stored definition, or `undefined` when the id is unknown.
 */
function getScoreById(id) {
  const definition = registry2.get(id);
  return definition;
}
|
|
1473
|
+
|
|
1474
|
+
// src/evals/metrics/standard.ts
|
|
1475
|
+
// Standard metric: token usage. Missing counters are treated as 0 and the
// two cached counters are shown as one combined figure.
Metric.of({
  id: "token-count",
  name: "Tokens",
  format: (data) => {
    const input = data.input ?? 0;
    const output = data.output ?? 0;
    const cached = (data.inputCached ?? 0) + (data.outputCached ?? 0);
    return `in:${input} out:${output} cached:${cached}`;
  }
});

// Standard metric: latency, displayed in milliseconds.
Metric.of({
  id: "latency",
  name: "Latency",
  format: ({ ms }) => `${ms}ms`
});
|
|
1492
|
+
|
|
1493
|
+
// src/evals/scores/standard.ts
|
|
1494
|
+
// Standard score: a numeric value, displayed as a bar and formatted with
// two decimals.
Score.of({
  id: "percent",
  name: "Score",
  displayStrategy: "bar",
  format: ({ value }) => value.toFixed(2)
});

// Standard score: a pass/fail verdict.
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: ({ passed }) => (passed ? "PASSED" : "NOT PASSED")
});
|
|
1506
|
+
|
|
1507
|
+
// src/runner/score-utils.ts
|
|
1508
|
+
/**
 * Extracts a single number from a list of score items; the first item that
 * yields a number wins.
 *
 * For each item, in order:
 * 1. if its registered definition uses the "bar" display strategy and its
 *    data object has a finite numeric `value`, that value is returned;
 * 2. otherwise the generic `toNumericScore` conversion of its data is
 *    returned when that produces a number.
 *
 * @param {Array<{ id: string, data: unknown }>} scores Score items.
 * @returns {number | undefined} `undefined` when no item yields a number.
 */
function toNumericScoreFromScores(scores) {
  const isFiniteNumber = (candidate) => typeof candidate === "number" && Number.isFinite(candidate);

  for (const { id, data } of scores) {
    const definition = getScoreById(id);
    const isBarWithValue = Boolean(definition) && definition.displayStrategy === "bar" && typeof data === "object" && data !== null && "value" in data;
    if (isBarWithValue) {
      const barValue = data.value;
      if (isFiniteNumber(barValue)) {
        return barValue;
      }
    }
    const generic = toNumericScore(data);
    if (generic !== void 0) {
      return generic;
    }
  }
  return void 0;
}
|
|
1524
|
+
/**
 * Best-effort conversion of an arbitrary score payload to one number.
 *
 * - a finite number is returned unchanged;
 * - anything else that is not a non-null object yields `undefined`;
 * - an object with a finite numeric `score` property yields that property;
 * - otherwise the mean of all finite numeric property values is returned,
 *   or `undefined` when there are none.
 *
 * @param {unknown} value Score payload.
 * @returns {number | undefined}
 */
function toNumericScore(value) {
  const isFiniteNumber = (candidate) => typeof candidate === "number" && Number.isFinite(candidate);

  if (isFiniteNumber(value)) {
    return value;
  }
  if (value === null || typeof value !== "object") {
    return void 0;
  }
  if ("score" in value && isFiniteNumber(value.score)) {
    return value.score;
  }

  let total = 0;
  let count = 0;
  for (const entry of Object.values(value)) {
    if (isFiniteNumber(entry)) {
      total += entry;
      count += 1;
    }
  }
  return count === 0 ? void 0 : total / count;
}
|
|
1543
|
+
|
|
1544
|
+
// src/runner/execution.ts
|
|
1545
|
+
/**
 * Decides whether one evaluator passed for one test case. The first rule
 * that applies determines the outcome:
 *
 * 1. if any score item carries a defined `passed` flag, every such flag
 *    must be exactly `true`;
 * 2. else, if the evaluator has a pass criterion, its result for `result`
 *    is returned as-is;
 * 3. else, if the evaluator has a pass threshold, the numeric score derived
 *    from `scores` must exist and be at least the threshold;
 * 4. else the evaluator passes.
 *
 * @param evaluator Object providing `getPassCriterion()` and
 *   `getPassThreshold()`.
 * @param result Raw evaluator result, forwarded to the pass criterion.
 * @param {Array<object>} scores Score items of that result.
 * @returns the pass verdict.
 */
function computeEvaluatorPassed(evaluator, result, scores) {
  let verdictCount = 0;
  let everyVerdictTrue = true;
  for (const score of scores) {
    if (!("passed" in score) || score.passed === void 0) {
      continue;
    }
    verdictCount += 1;
    if (score.passed !== true) {
      everyVerdictTrue = false;
    }
  }
  if (verdictCount > 0) {
    return everyVerdictTrue;
  }

  const passCriterion = evaluator.getPassCriterion();
  if (passCriterion) {
    return passCriterion(result);
  }

  const passThreshold = evaluator.getPassThreshold();
  if (passThreshold === void 0) {
    return true;
  }
  const numeric = toNumericScoreFromScores(scores);
  return numeric !== void 0 && numeric >= passThreshold;
}
|
|
1561
|
+
/**
 * Normalises a raw evaluator result into `{ scores, metrics }`.
 *
 * @param {unknown} result Raw evaluator return value.
 * @returns {{ scores: Array, metrics?: Array }} For non-object (or null)
 *   input only `{ scores: [] }` is returned. Otherwise `scores` is the
 *   result's `scores` when that is an array (else `[]`), and `metrics` is
 *   the result's `metrics` when that is an array (else `undefined`).
 */
function normalizeResult(result) {
  if (result === null || typeof result !== "object") {
    return { scores: [] };
  }
  const { scores: rawScores, metrics: rawMetrics } = result;
  return {
    scores: Array.isArray(rawScores) ? rawScores : [],
    metrics: Array.isArray(rawMetrics) ? rawMetrics : void 0
  };
}
|
|
1570
|
+
/**
 * Current time as an ISO-8601 string with every ":" and "." replaced by
 * "-", e.g. `2024-01-31T12-30-05-123Z`.
 *
 * @returns {string}
 */
function nowIsoForFile() {
  const isoTimestamp = (/* @__PURE__ */ new Date()).toISOString();
  return isoTimestamp.replace(/[:.]/g, "-");
}
|
|
1573
|
+
/**
 * Builds the path of a run's artifact file:
 * `<artifactDirectory>/<datasetId>_<runId>_<timestamp>.jsonl`, where the
 * timestamp comes from `nowIsoForFile()`.
 *
 * @param {string} artifactDirectory Directory that holds run artifacts.
 * @param {string} datasetId Id of the dataset being run.
 * @param {string} runId Id of the run.
 * @returns {string} the joined path.
 */
function createArtifactPath(artifactDirectory, datasetId, runId) {
  const timestamp = nowIsoForFile();
  const fileName = `${datasetId}_${runId}_${timestamp}.jsonl`;
  return path.join(artifactDirectory, fileName);
}
|
|
1579
|
+
/**
 * Builds the Effect that executes one queued run.
 *
 * Flow:
 *  1. Marks the snapshot "running" and publishes `RunStarted`.
 *  2. For every test case, runs each evaluator that exposes an evaluate
 *     function (evaluators without one are skipped), collects the per-evaluator
 *     scores, publishes a `TestCaseProgress` event and enqueues the same event
 *     on the persistence queue.
 *  3. Marks the snapshot "completed", publishes and enqueues `RunCompleted`,
 *     then publishes `ArtifactFlushed`. Note that `ArtifactFlushed` is emitted
 *     right after the completion record is *enqueued*; this function does not
 *     wait for the persistence worker to write it.
 *
 * A test case counts as passed when every recorded evaluator entry passed.
 *
 * Evaluator failures (a throwing/rejecting `resolveContext`, `getInput` or
 * evaluate function, or a throw while normalizing the result) are recorded as
 * a failed evaluator entry with `errorMessage` set on the progress event.
 * The try/catch lives inside the promise thunk on purpose: failures of an
 * effect yielded inside `Effect.gen` are not thrown into the generator, so a
 * surrounding `try { yield* ... } catch` would never observe a rejection and
 * the whole run fiber would die instead.
 *
 * @param task - Queued run: `runId`, `testCases`, `evaluators`, `snapshot`.
 * @param publishEvent - Function mapping an event to an Effect that publishes it.
 * @param persistenceQueue - Queue receiving `{ runId, artifactPath, payload }` records.
 * @param updateSnapshot - `(runId, updater)` callback used to update run state.
 * @returns An Effect performing the run.
 */
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
  const startedAt = Date.now();
  updateSnapshot(task.runId, (snapshot) => ({
    ...snapshot,
    status: "running",
    startedAt
  }));
  yield* publishEvent({
    type: "RunStarted",
    runId: task.runId,
    startedAt
  });
  let completedTestCases = 0;
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const testCaseItem of task.testCases) {
    const started = Date.now();
    const evaluatorScores = [];
    let testCaseError;
    for (const { id: evaluatorId, evaluator } of task.evaluators) {
      const evaluateFn = evaluator.getEvaluateFn();
      if (!evaluateFn) {
        continue;
      }
      // The thunk never rejects: every failure is converted into an outcome
      // record so one bad evaluator cannot take down the run.
      const outcome = yield* effect.Effect.promise(async () => {
        try {
          const ctx = await evaluator.resolveContext();
          const result = await evaluateFn(testCaseItem.testCase.getInput(), ctx);
          const { scores, metrics } = normalizeResult(result);
          const passed = computeEvaluatorPassed(evaluator, result, scores);
          return { ok: true, entry: { evaluatorId, scores, passed, metrics } };
        } catch (error) {
          return { ok: false, error };
        }
      });
      if (outcome.ok) {
        evaluatorScores.push(outcome.entry);
      } else {
        // The last failing evaluator's message wins, as before.
        testCaseError = outcome.error instanceof Error ? outcome.error.message : "Evaluator execution failed";
        evaluatorScores.push({
          evaluatorId,
          scores: [],
          passed: false
        });
      }
    }
    const testCasePassed = evaluatorScores.every((s) => s.passed);
    completedTestCases += 1;
    if (testCasePassed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
    const progressEvent = {
      type: "TestCaseProgress",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      completedTestCases,
      totalTestCases: task.testCases.length,
      passed: testCasePassed,
      durationMs: Date.now() - started,
      evaluatorScores,
      errorMessage: testCaseError
    };
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      completedTestCases,
      passedTestCases,
      failedTestCases
    }));
    yield* publishEvent(progressEvent);
    yield* effect.Queue.offer(persistenceQueue, {
      runId: task.runId,
      artifactPath: task.snapshot.artifactPath,
      payload: progressEvent
    });
  }
  const finishedAt = Date.now();
  const completedEvent = {
    type: "RunCompleted",
    runId: task.runId,
    finishedAt,
    passedTestCases,
    failedTestCases,
    totalTestCases: task.testCases.length,
    artifactPath: task.snapshot.artifactPath
  };
  updateSnapshot(task.runId, (snapshot) => ({
    ...snapshot,
    status: "completed",
    completedTestCases,
    passedTestCases,
    failedTestCases,
    finishedAt
  }));
  yield* publishEvent(completedEvent);
  yield* effect.Queue.offer(persistenceQueue, {
    runId: task.runId,
    artifactPath: task.snapshot.artifactPath,
    payload: completedEvent
  });
  yield* publishEvent({
    type: "ArtifactFlushed",
    runId: task.runId,
    artifactPath: task.snapshot.artifactPath
  });
});
|
|
1684
|
+
/**
 * Appends `payload` as one JSON line (JSON text followed by `\n`) to the file
 * at `artifactPath`, creating the parent directory first if needed.
 *
 * @param {string} artifactPath - Target file path.
 * @param {unknown} payload - Value serialized with `JSON.stringify`.
 * @returns {Promise<void>} Resolves once the line has been appended.
 */
async function appendJsonLine(artifactPath, payload) {
  const parentDirectory = path.dirname(artifactPath);
  await promises.mkdir(parentDirectory, { recursive: true });
  const line = `${JSON.stringify(payload)}\n`;
  await promises.appendFile(artifactPath, line, "utf8");
}
|
|
1689
|
+
/**
 * Builds an Effect that loops forever: it takes the next message from
 * `queue` and appends it to the message's artifact file as a JSON line.
 *
 * Each written record is `{ runId, ts, ...payload }`, so keys present in the
 * payload override `runId` and `ts`.
 *
 * @param queue - Queue of `{ runId, artifactPath, payload }` messages.
 * @returns A never-completing Effect that drains the queue.
 */
var createPersistenceWorker = (queue) => {
  const persistNextMessage = effect.Effect.gen(function* () {
    const message = yield* effect.Queue.take(queue);
    const writeRecord = () => {
      const record = {
        runId: message.runId,
        ts: Date.now(),
        ...message.payload
      };
      return appendJsonLine(message.artifactPath, record);
    };
    yield* effect.Effect.promise(writeRecord);
  });
  return effect.Effect.forever(persistNextMessage);
};
|
|
1701
|
+
|
|
1702
|
+
// src/runner/search.ts
|
|
1703
|
+
/**
 * Tells whether `value` satisfies at least one matcher.
 *
 * String matchers must equal `value` exactly; RegExp matchers are tested
 * against it. A missing or empty matcher list matches everything.
 *
 * @param {string} value - Value to check (e.g. a tag).
 * @param {Array<string | RegExp> | undefined} matchers - Accepted strings/patterns.
 * @returns {boolean} `true` when any matcher accepts `value`.
 */
function matchesAny(value, matchers) {
  if (!matchers || matchers.length === 0) {
    return true;
  }
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return matcher === value;
    }
    // `test` on a global/sticky regex resumes from `lastIndex`; reset it so
    // the same matcher gives the same answer on every call.
    matcher.lastIndex = 0;
    return matcher.test(value);
  });
}
|
|
1711
|
+
/**
 * Tells whether a file path satisfies at least one matcher.
 *
 * String matchers match when they occur anywhere in `value` (substring);
 * RegExp matchers are tested against it. A missing or empty matcher list
 * matches everything.
 *
 * @param {string} value - File path to check.
 * @param {Array<string | RegExp> | undefined} matchers - Accepted fragments/patterns.
 * @returns {boolean} `true` when any matcher accepts `value`.
 */
function matchesPath(value, matchers) {
  if (!matchers || matchers.length === 0) {
    return true;
  }
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value.includes(matcher);
    }
    // Reset the cursor so global/sticky patterns are not stateful across calls.
    matcher.lastIndex = 0;
    return matcher.test(value);
  });
}
|
|
1722
|
+
/**
 * Filters collected test cases by the tag/path rules in `query`.
 *
 * An item is dropped when any of its tags matches `excludedTags` or its file
 * path matches `excludedPaths`. Otherwise it is kept when it satisfies both
 * `includedTags` (at least one tag matches) and `includedPaths`. Any list that
 * is missing or empty imposes no constraint; in particular an empty exclusion
 * list excludes nothing (the matcher helpers treat an empty list as
 * "match everything", so the emptiness check has to happen here).
 *
 * @param {Array<{ testCase: { getTags(): string[] }, filePath: string }>} all - Collected items.
 * @param {{ includedTags?, excludedTags?, includedPaths?, excludedPaths? } | undefined} query - Filter.
 * @returns {Array} `all` itself when `query` is falsy, otherwise a filtered copy.
 */
function searchCollectedTestCases(all, query) {
  if (!query) {
    return all;
  }
  const hasEntries = (list) => Boolean(list) && list.length > 0;
  return all.filter((item) => {
    const tags = item.testCase.getTags();
    if (hasEntries(query.excludedTags) && tags.some((tag) => matchesAny(tag, query.excludedTags))) {
      return false;
    }
    if (hasEntries(query.excludedPaths) && matchesPath(item.filePath, query.excludedPaths)) {
      return false;
    }
    const includedTagsMatch = !hasEntries(query.includedTags) || tags.some((tag) => matchesAny(tag, query.includedTags));
    const includedPathsMatch = !hasEntries(query.includedPaths) || matchesPath(item.filePath, query.includedPaths);
    return includedTagsMatch && includedPathsMatch;
  });
}
|
|
1739
|
+
|
|
1740
|
+
// src/runner/api.ts
|
|
1741
|
+
/**
 * Parses a `/source/flags` style string into its parts.
 *
 * Returns `undefined` when the text is not a regex literal: it does not start
 * with `/`, has no closing `/`, or the text after the last `/` is not a valid
 * RegExp flag string. The flag check keeps slash-containing plain names such
 * as `/api/v1` from being mistaken for a regex with flags `v1`.
 *
 * @param {string} pattern - Candidate pattern text.
 * @returns {{ source: string, flags: string } | undefined} Parsed parts, if any.
 */
function parseRegexLiteral(pattern) {
  if (!pattern.startsWith("/")) {
    return void 0;
  }
  const lastSlash = pattern.lastIndexOf("/");
  if (lastSlash <= 0) {
    return void 0;
  }
  const flags = pattern.slice(lastSlash + 1);
  try {
    // Let the engine decide which flag strings are valid (unknown letters,
    // duplicates and illegal combinations all throw here).
    new RegExp("", flags);
  } catch {
    return void 0;
  }
  return {
    source: pattern.slice(1, lastSlash),
    flags
  };
}
|
|
1754
|
+
/**
 * Turns a user-supplied name pattern into a predicate over names.
 *
 * The trimmed pattern is interpreted, in order, as:
 *  1. a regex literal (`/source/flags`) when `parseRegexLiteral` accepts it;
 *  2. a case-insensitive glob when it contains `*` (`*` matches any run of
 *     characters, every other character — including `?` — is literal and the
 *     whole name must match);
 *  3. otherwise a case-insensitive exact match.
 *
 * @param {string} pattern - Pattern text.
 * @returns {(value: string) => boolean} Predicate that is safe to call repeatedly.
 * @throws {SyntaxError} When a regex-literal pattern has an invalid source or flags.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    const regex = new RegExp(regexLiteral.source, regexLiteral.flags);
    return (value) => {
      // With the g/y flags `test` resumes from `lastIndex`; reset it so the
      // predicate does not flip between calls.
      regex.lastIndex = 0;
      return regex.test(value);
    };
  }
  if (normalizedPattern.includes("*")) {
    // Escape every regex metacharacter except `*`, which becomes `.*`.
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    const regex = new RegExp(`^${escaped}$`, "i");
    return (value) => regex.test(value);
  }
  const loweredPattern = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === loweredPattern;
}
|
|
1768
|
+
/**
 * Creates an `EffectRunner` from the runner configuration.
 *
 * `overrides` is forwarded to `withRunnerConfig` instead of being silently
 * dropped; calling without arguments behaves exactly as before.
 * NOTE(review): assumes `withRunnerConfig` accepts an optional overrides
 * argument — its definition is outside this section, please confirm.
 *
 * @param {object} [overrides] - Optional configuration overrides.
 * @returns {EffectRunner} A new runner instance.
 */
function createRunner(overrides) {
  return new EffectRunner(withRunnerConfig(overrides));
}
|
|
1771
|
+
/**
 * Runner implementation built on `effect` queues, fibers and a PubSub.
 *
 * Runs requested through `runDatasetWith` are placed on `runQueue`; a scheduler
 * fiber takes them off and forks `executeRunTask` for each. Records destined
 * for artifact files go through `persistenceQueue`, which a second fiber
 * drains. Run state is kept in `snapshots` (keyed by run id) and events are
 * delivered both to registered listeners and to `eventBus`.
 */
var EffectRunner = class {
  /**
   * @param config - Runner configuration; this class reads `discovery` and
   *   `artifactDirectory` from it.
   */
  constructor(config) {
    const { Effect, PubSub, Queue } = effect;
    this.eventBus = Effect.runSync(PubSub.unbounded());
    this.runQueue = Effect.runSync(Queue.unbounded());
    this.persistenceQueue = Effect.runSync(Queue.unbounded());
    this.snapshots = /* @__PURE__ */ new Map();
    this.listeners = /* @__PURE__ */ new Set();
    this.datasetsById = /* @__PURE__ */ new Map();
    this.evaluatorsById = /* @__PURE__ */ new Map();
    // Both background fibers start immediately; they only touch the queues
    // created above.
    this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
    this.persistenceFiber = Effect.runFork(
      createPersistenceWorker(this.persistenceQueue)
    );
    this.config = config;
  }

  /** Re-discovers datasets, replaces the id index and returns the list. */
  async collectDatasets() {
    const discovered = await collectDatasetsFromFiles(this.config.discovery);
    this.datasetsById.clear();
    for (const entry of discovered) {
      this.datasetsById.set(entry.id, entry);
    }
    return discovered;
  }

  /** Re-discovers evaluators, replaces the id index and returns the list. */
  async collectEvaluators() {
    const discovered = await collectEvaluatorsFromFiles(this.config.discovery);
    this.evaluatorsById.clear();
    for (const entry of discovered) {
      this.evaluatorsById.set(entry.id, entry);
    }
    return discovered;
  }

  /**
   * Finds the first indexed dataset whose name equals `name`, ignoring case
   * and surrounding whitespace in `name`. Collects datasets first when the
   * index is empty. Returns `undefined` when nothing matches.
   */
  async resolveDatasetByName(name) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    const wanted = name.trim().toLowerCase();
    for (const candidate of this.datasetsById.values()) {
      if (candidate.dataset.getName().toLowerCase() === wanted) {
        return candidate;
      }
    }
    return void 0;
  }

  /**
   * Returns every indexed evaluator whose name (or `""` when it has none)
   * satisfies the matcher built from `pattern`. Collects evaluators first
   * when the index is empty.
   */
  async resolveEvaluatorsByNamePattern(pattern) {
    if (this.evaluatorsById.size === 0) {
      await this.collectEvaluators();
    }
    const matchesName = createNameMatcher(pattern);
    return [...this.evaluatorsById.values()].filter(
      (candidate) => matchesName(candidate.evaluator.getName() ?? "")
    );
  }

  /** Collects all test cases and filters them with `query`. */
  async searchTestCases(query) {
    const collected = await collectTestCasesFromFiles(this.config.discovery);
    return searchCollectedTestCases(collected, query);
  }

  /**
   * Returns the collected test cases that the given dataset accepts.
   * @throws {Error} When `datasetId` is not in the dataset index.
   */
  async collectDatasetTestCases(datasetId) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    const entry = this.datasetsById.get(datasetId);
    if (!entry) {
      throw new Error(`Unknown dataset: ${datasetId}`);
    }
    const everyTestCase = await collectTestCasesFromFiles(this.config.discovery);
    return everyTestCase.filter(
      (collected) => entry.dataset.matchesTestCase(collected.testCase, collected.filePath)
    );
  }

  /**
   * Queues a run of `request.datasetId` against `request.evaluatorIds`.
   *
   * Unknown evaluator ids are ignored. Stores a "queued" snapshot, publishes
   * and persists a `RunQueued` event, enqueues the task and returns the
   * snapshot.
   *
   * @throws {Error} When the dataset is unknown or no requested evaluator exists.
   */
  async runDatasetWith(request) {
    const { Effect, Queue } = effect;
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    if (this.evaluatorsById.size === 0) {
      await this.collectEvaluators();
    }
    const datasetEntry = this.datasetsById.get(request.datasetId);
    if (!datasetEntry) {
      throw new Error(`Unknown dataset: ${request.datasetId}`);
    }
    const selectedEvaluators = [];
    for (const requestedId of request.evaluatorIds) {
      const found = this.evaluatorsById.get(requestedId);
      if (found) {
        selectedEvaluators.push({ id: found.id, evaluator: found.evaluator });
      }
    }
    if (selectedEvaluators.length === 0) {
      throw new Error("No evaluators selected for run");
    }
    const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
    const runId = `run-${crypto.randomUUID()}`;
    const artifactPath = createArtifactPath(
      this.config.artifactDirectory,
      request.datasetId,
      runId
    );
    // Snapshot and event each get their own id array.
    const listEvaluatorIds = () => selectedEvaluators.map(({ id }) => id);
    const snapshot = {
      runId,
      datasetId: request.datasetId,
      datasetName: datasetEntry.dataset.getName(),
      evaluatorIds: listEvaluatorIds(),
      queuedAt: Date.now(),
      totalTestCases: selectedTestCases.length,
      completedTestCases: 0,
      passedTestCases: 0,
      failedTestCases: 0,
      status: "queued",
      artifactPath
    };
    this.snapshots.set(runId, snapshot);
    const queuedEvent = {
      type: "RunQueued",
      runId,
      datasetId: request.datasetId,
      datasetName: datasetEntry.dataset.getName(),
      evaluatorIds: listEvaluatorIds(),
      totalTestCases: selectedTestCases.length,
      artifactPath
    };
    await Effect.runPromise(this.publishEvent(queuedEvent));
    const persistenceRecord = { runId, artifactPath, payload: queuedEvent };
    await Effect.runPromise(Queue.offer(this.persistenceQueue, persistenceRecord));
    const runTask = {
      runId,
      datasetId: request.datasetId,
      dataset: datasetEntry.dataset,
      evaluators: selectedEvaluators,
      testCases: selectedTestCases,
      snapshot
    };
    await Effect.runPromise(Queue.offer(this.runQueue, runTask));
    return snapshot;
  }

  /**
   * Registers `listener` for run events, optionally limited to
   * `options.runId`. Returns a function that removes the registration.
   */
  subscribeRunEvents(listener, options) {
    const subscription = { runId: options?.runId, listener };
    this.listeners.add(subscription);
    return () => {
      this.listeners.delete(subscription);
    };
  }

  /** Returns the snapshot stored for `runId`, or `undefined`. */
  getRunSnapshot(runId) {
    return this.snapshots.get(runId);
  }

  /** Returns all snapshots, most recently queued first. */
  getAllRunSnapshots() {
    const byNewestFirst = (left, right) => right.queuedAt - left.queuedAt;
    return [...this.snapshots.values()].sort(byNewestFirst);
  }

  /**
   * Interrupts the scheduler and persistence fibers, then shuts down both
   * queues and the event bus, one step at a time in that order.
   */
  async shutdown() {
    const { Effect, Fiber, PubSub, Queue } = effect;
    const teardownSteps = [
      Fiber.interrupt(this.schedulerFiber),
      Fiber.interrupt(this.persistenceFiber),
      Queue.shutdown(this.runQueue),
      Queue.shutdown(this.persistenceQueue),
      PubSub.shutdown(this.eventBus)
    ];
    for (const step of teardownSteps) {
      await Effect.runPromise(step);
    }
  }

  /**
   * Builds the never-ending scheduler Effect: take the next task from
   * `runQueue` and fork `executeRunTask` for it.
   */
  createSchedulerEffect() {
    const runner = this;
    const dispatchNextTask = effect.Effect.gen(function* () {
      const nextTask = yield* effect.Queue.take(runner.runQueue);
      const runEffect = executeRunTask(
        nextTask,
        runner.publishEvent.bind(runner),
        runner.persistenceQueue,
        runner.updateSnapshot.bind(runner)
      );
      yield* effect.Effect.fork(runEffect);
    });
    return effect.Effect.forever(dispatchNextTask);
  }

  /**
   * Replaces the snapshot for `runId` with `updater(current)`. Does nothing
   * when no snapshot exists for that id.
   */
  updateSnapshot(runId, updater) {
    const current = this.snapshots.get(runId);
    if (current) {
      this.snapshots.set(runId, updater(current));
    }
  }

  /**
   * Returns an Effect that first calls every matching listener synchronously
   * (listeners registered with a run id only receive that run's events) and
   * then publishes the event on `eventBus`. The Effect's result is void.
   */
  publishEvent(event) {
    const notifyListeners = effect.Effect.sync(() => {
      for (const entry of this.listeners) {
        const isForAnotherRun = Boolean(entry.runId) && entry.runId !== event.runId;
        if (!isForAnotherRun) {
          entry.listener(event);
        }
      }
    });
    return notifyListeners.pipe(
      effect.Effect.flatMap(() => effect.PubSub.publish(this.eventBus, event)),
      effect.Effect.asVoid
    );
  }
};
|
|
1965
|
+
/**
 * CLI entry point: parses startup arguments, creates the runner, loads its
 * data (falling back to mock data when loading rejects), installs
 * SIGINT/SIGTERM handlers that shut the runner down and exit with code 0,
 * and starts the fullscreen Ink application.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseStartupArgs(process.argv.slice(2));
  const runner = createRunner();
  const data = await loadRunnerData(runner).catch(() => loadMockData());
  // Exit only after shutdown settles, whether it succeeded or not.
  const shutdownThenExit = () => {
    void runner.shutdown().finally(() => process.exit(0));
  };
  process.on("SIGINT", shutdownThenExit);
  process.on("SIGTERM", shutdownThenExit);
  const app = /* @__PURE__ */ jsxRuntime.jsx(EvalsCliApp, { data, args, runner });
  fullscreenInk.withFullScreen(app).start();
}
|
|
1979
|
+
// Start the CLI. The returned promise is deliberately discarded (`void`), so
// nothing here handles a rejection from `main`.
void main();
|
|
1980
|
+
//# sourceMappingURL=out.js.map
|
|
1981
|
+
//# sourceMappingURL=cli.cjs.map
|