@m4trix/evals 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs ADDED
@@ -0,0 +1,1981 @@
1
+ #!/usr/bin/env node
2
+ 'use strict';
3
+
4
+ var fullscreenInk = require('fullscreen-ink');
5
+ var React = require('react');
6
+ var ink = require('ink');
7
+ var jsxRuntime = require('react/jsx-runtime');
8
+ var crypto = require('crypto');
9
+ var effect = require('effect');
10
+ var promises = require('fs/promises');
11
+ var path = require('path');
12
+ var url = require('url');
13
+
14
+ var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
15
/**
 * Normalizes a required module so its main export is reachable via `.default`.
 *
 * @param {*} e - Value returned by `require`.
 * @returns {*} `e` itself when it is truthy and flagged `__esModule`,
 *   otherwise a fresh `{ default: e }` wrapper.
 */
function _interopDefault(e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
16
+
17
+ var React__default = /*#__PURE__*/_interopDefault(React);
18
+
19
var SEP = " ";
var ARROW = "\u203A";

/**
 * Builds the breadcrumb trail for the current navigation level.
 *
 * Every trail starts with a dimmed "Evaluations" crumb followed by an arrow;
 * the rest depends on `state.level` ("datasets", "runs", "details", or any
 * other value, which is rendered as the "New evaluation" trail).
 *
 * @param {{level: string}} state - CLI state; only `level` is read.
 * @param {string} [datasetName] - Shown after "Dataset:"; "-" when nullish.
 * @param {string} [runLabel] - Shown after "Run:"; "-" when nullish.
 * @returns A fragment element whose children are text elements and separators.
 */
function getBreadcrumbText(state, datasetName, runLabel) {
  // Gray crumb; the key defaults to the text itself when none is given.
  const dim = (s, k) => jsxRuntime.jsx(ink.Text, { color: "gray", children: s }, k ?? s);
  // Highlighted crumb marking the current location.
  const accent = (s) => jsxRuntime.jsx(ink.Text, { color: "cyan", bold: true, children: s }, s);
  // White value crumb with a "-" placeholder for missing text.
  const value = (text, key) => jsxRuntime.jsx(ink.Text, { color: "white", children: text ?? "-" }, key);

  const trail = [dim("Evaluations"), SEP, dim(ARROW, "a1"), SEP];

  switch (state.level) {
    case "datasets":
      trail.push(accent("Datasets"));
      break;
    case "runs":
      trail.push(
        dim("Dataset:"),
        " ",
        value(datasetName, "ds"),
        SEP,
        dim(ARROW, "a2"),
        SEP,
        accent("Runs")
      );
      break;
    case "details":
      trail.push(
        dim("Dataset:"),
        " ",
        value(datasetName, "ds"),
        SEP,
        dim(ARROW, "a2"),
        SEP,
        dim("Run:"),
        " ",
        value(runLabel, "rl"),
        SEP,
        dim(ARROW, "a3"),
        SEP,
        accent("Details")
      );
      break;
    default:
      trail.push(
        accent("New evaluation"),
        SEP,
        dim(ARROW, "a2"),
        SEP,
        dim("Select evaluators", "sel")
      );
  }

  return jsxRuntime.jsxs(jsxRuntime.Fragment, { children: trail });
}
81
+
82
+ // src/cli/components/Footer.tsx
83
/**
 * Returns the keyboard-hint line for the current navigation level.
 *
 * @param {{level: string}} state - CLI state; only `level` is read.
 * @returns {string} Hint text for "datasets", "runs" or "details"; any other
 *   level gets the evaluator-selection hints.
 */
function getFooterText(state) {
  switch (state.level) {
    case "datasets":
      return "\u2191\u2193 move Enter open / search Tab focus q quit";
    case "runs":
      return "\u2191\u2193 move Enter details Backspace datasets Tab focus q quit";
    case "details":
      return "\u2191\u2193 scroll Backspace runs Tab focus q quit";
    default:
      return "\u2191\u2193 move Enter add/remove S start run / search Esc cancel q quit";
  }
}
95
/**
 * Renders one menu row as a text element.
 *
 * @param {object} props
 * @param {boolean} props.selected - Selected rows are cyan, bold and prefixed
 *   with a triangle marker; other rows are gray with a blank prefix.
 * @param {string} props.label - Row text.
 * @param {string} props.itemKey - Passed as the element key.
 */
function ListItem({ selected, label, itemKey }) {
  const marker = selected ? "\u25B8 " : " ";
  const color = selected ? "cyan" : "gray";
  return jsxRuntime.jsxs(ink.Text, { color, bold: selected, children: [marker, label] }, itemKey);
}
105
/**
 * Bordered column container used for the left and right panels.
 *
 * @param {object} props
 * @param {*} props.children - Content placed inside the box.
 * @param {number} [props.width] - Forwarded to the box unchanged.
 * @param {number} [props.flexGrow] - Forwarded to the box unchanged.
 * @param {number} [props.marginLeft] - Forwarded to the box unchanged.
 * @param {boolean} [props.focused=false] - Focused panes use a cyan "single"
 *   border; unfocused panes use a gray "round" border.
 */
function Pane({ children, width, flexGrow, marginLeft, focused = false }) {
  const boxProps = {
    flexDirection: "column",
    width,
    flexGrow,
    marginLeft,
    borderStyle: focused ? "single" : "round",
    borderColor: focused ? "cyan" : "gray",
    paddingX: 1,
    children
  };
  return jsxRuntime.jsx(ink.Box, boxProps);
}
126
/**
 * Renders a section title as bold cyan text.
 *
 * @param {object} props
 * @param {*} props.children - Title content.
 */
function SectionHeader({ children }) {
  const headerProps = { color: "cyan", bold: true, children };
  return jsxRuntime.jsx(ink.Text, headerProps);
}
131
/**
 * Renders a run status wrapped in parentheses with a status color.
 *
 * @param {object} props
 * @param {string} props.status - "PASS" is green, "RUNNING" is yellow, and
 *   every other value is red.
 */
function StatusText({ status }) {
  let color = "red";
  if (status === "PASS") {
    color = "green";
  } else if (status === "RUNNING") {
    color = "yellow";
  }
  return jsxRuntime.jsxs(ink.Text, { color, children: ["(", status, ")"] });
}
139
var LEFT_PANE_WIDTH = 44;

/**
 * Left-hand pane listing "New evaluation" followed by one row per run.
 *
 * Menu index 0 is the "New evaluation" entry, so the run at array position
 * `i` is selected when `state.runMenuIndex === i + 1`.
 *
 * @param {object} props
 * @param {{focus: string, runMenuIndex: number}} props.state - CLI state.
 * @param {Array<{id: string, label: string, status: string}>} props.runs
 */
function RunsSidebar({ state, runs }) {
  const focused = state.focus === "left";

  const renderRun = (run, index) => {
    const isSelected = state.runMenuIndex === index + 1;
    const rowProps = {
      color: isSelected ? "cyan" : "gray",
      bold: isSelected,
      children: [
        isSelected ? "\u25B8 " : " ",
        run.label,
        " ",
        jsxRuntime.jsx(StatusText, { status: run.status })
      ]
    };
    return jsxRuntime.jsxs(ink.Text, rowProps, run.id);
  };

  const header = jsxRuntime.jsx(SectionHeader, { children: "Runs" });
  const newEvaluationRow = jsxRuntime.jsx(ListItem, {
    selected: state.runMenuIndex === 0,
    label: "New evaluation",
    itemKey: "runs-new-eval"
  });
  const runRows = runs.map(renderRun);

  return jsxRuntime.jsxs(Pane, {
    width: LEFT_PANE_WIDTH,
    focused,
    children: [header, newEvaluationRow, runRows]
  });
}
171
var BLOCKS = ["\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"];

/**
 * Renders a numeric series as a row of block characters.
 *
 * Values are scaled between the series minimum and maximum and mapped onto
 * the eight entries of `BLOCKS`. When the series is longer than the target
 * width it is reduced to that many buckets by averaging consecutive slices.
 *
 * @param {object} props
 * @param {number[]} props.data - Series to plot; an empty array renders nothing.
 * @param {number} [props.width] - Target width; defaults to the series length
 *   capped at 24.
 * @param {string} [props.label] - Optional gray label, padded to 14 characters.
 * @returns A text element, or `null` for an empty series.
 */
function Sparkline({ data, width, label }) {
  if (data.length === 0) {
    return null;
  }

  const highest = Math.max(...data);
  const lowest = Math.min(...data);
  // A flat series would give a zero span; fall back to 1 to avoid dividing by zero.
  const span = highest - lowest || 1;
  const targetWidth = width ?? Math.min(data.length, 24);

  let points = data;
  if (data.length > targetWidth) {
    const bucketSize = data.length / targetWidth;
    points = Array.from({ length: targetWidth }, (_, bucket) => {
      const from = Math.floor(bucket * bucketSize);
      const to = Math.floor((bucket + 1) * bucketSize);
      const members = data.slice(from, to);
      let sum = 0;
      for (const member of members) {
        sum += member;
      }
      return sum / members.length;
    });
  }

  let spark = "";
  for (const point of points) {
    const level = Math.floor(((point - lowest) / span) * 8);
    const glyph = BLOCKS[Math.min(7, level)];
    // Mirror Array#join, which renders missing entries as an empty string.
    spark += glyph === undefined || glyph === null ? "" : glyph;
  }

  const hasLabel = label !== void 0 && label !== "";
  const labelNode = hasLabel
    ? jsxRuntime.jsxs(ink.Text, { color: "gray", children: [label.padEnd(14), " "] })
    : null;

  return jsxRuntime.jsxs(ink.Text, {
    children: [labelNode, jsxRuntime.jsx(ink.Text, { color: "cyan", children: spark })]
  });
}
208
/**
 * Picks the bar color for a percentage.
 *
 * @param {number} pct - Percentage value.
 * @returns {string} "green" at 70 or above, "yellow" at 40 or above,
 *   otherwise "red".
 */
function barColor(pct) {
  return pct >= 70 ? "green" : pct >= 40 ? "yellow" : "red";
}
215
/**
 * Renders a labelled horizontal bar: `label [####....] value`.
 *
 * @param {object} props
 * @param {string} props.label - Label text, padded to `labelWidth`.
 * @param {number} props.value - Value to plot; clamped to `[0, max]` for the
 *   bar, but passed to `format` unclamped.
 * @param {number} [props.max=100] - Value that fills the whole bar.
 * @param {number} [props.labelWidth=14] - Width the label is padded to.
 * @param {number} [props.barWidth=20] - Number of cells in the bar.
 * @param {(v: number) => string} [props.format] - Formats the trailing value.
 * @param {boolean} [props.colorByValue=true] - When true the filled part and
 *   the value are colored via `barColor(percentage)`; otherwise the bar is
 *   plain text and the value is white.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  // Guard the division: with max === 0 the unguarded 0 / 0 is NaN, which made
  // both segments repeat zero times and collapsed the bar to "[]". A
  // non-positive max now yields an empty bar of the full width.
  const ratio = max > 0 ? clamped / max : 0;
  const pct = ratio * 100;
  const filled = Math.round(ratio * barWidth);
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
    /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color, children: filledBar }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
241
+
242
+ // src/cli/keys.ts
243
/**
 * Tells whether the typed input is the quit key.
 *
 * @param {string} input - Raw input string.
 * @returns {boolean} True for "q" in either case.
 */
function isQuitInput(input) {
  const normalized = input.toLowerCase();
  return normalized === "q";
}
246
/**
 * Tells whether the typed input is the search key.
 *
 * @param {string} input - Raw input string.
 * @returns {boolean} True only for exactly "/".
 */
function isSearchInput(input) {
  const searchKey = "/";
  return input === searchKey;
}
249
/**
 * Tells whether the input is a single printable character.
 *
 * @param {string} input - Raw input string.
 * @returns {boolean} True when the input has length 1, compares at or above
 *   a space, and is not the DEL character (0x7F).
 */
function isPrintableCharacter(input) {
  if (input.length !== 1) {
    return false;
  }
  const isDelete = input === "\x7F";
  return input >= " " && !isDelete;
}
252
/**
 * Tells whether a key event should be treated as "go back".
 *
 * @param {{backspace: *, delete: *}} key - Key descriptor.
 * @returns {*} `key.backspace` when it is truthy, otherwise `key.delete`
 *   (the raw values are returned, not coerced to boolean).
 */
function isBackKey(key) {
  const { backspace } = key;
  return backspace ? backspace : key.delete;
}
255
+
256
+ // src/cli/data.mock.json
257
/**
 * Built-in sample data (three datasets and four evaluators).
 *
 * The records are assembled through small factories so that every run gets
 * its own fresh objects and arrays; nothing is shared between runs.
 */
var data_mock_default = (() => {
  // { name: score, ... } -> [{ name, score }, ...] in insertion order.
  const scores = (byName) => Object.entries(byName).map(([name, score]) => ({ name, score }));
  // A check record; `detail` is omitted entirely when not supplied.
  const check = (name, passed, detail) => (detail === undefined ? { name, passed } : { name, passed, detail });
  const safetyChecks = () => [check("pii_leak", true), check("jailbreak", true)];
  const cleanChecks = () => [
    check("json_schema", true, "0 violations"),
    check("tool_calls", true, "0 unexpected"),
    ...safetyChecks()
  ];
  const runMeta = ({ commit, branch = "main", concurrency = 4, duration, artifact }) => ({
    model: "gpt-4o-mini",
    provider: "OpenAI",
    commit,
    branch,
    seed: 42,
    concurrency,
    duration,
    artifact
  });

  const onboardingRuns = [
    {
      id: "run_2026-02-17_2044",
      label: "2026-02-17 20:44",
      status: "FAILED",
      performance: {
        passRate: 96,
        avgScore: 0.91,
        latencyP95Ms: 710,
        latencyAvgMs: 502,
        tokensAvg: 171,
        tokensP95: 230,
        costUsd: 0.0024,
        latencyHistoryMs: [380, 420, 510, 480, 550, 620, 590, 710, 520, 480, 530, 600]
      },
      dimensions: scores({ correctness: 82, faithfulness: 79, brevity: 68, style: 90 }),
      checks: [
        check("json_schema", false, "3 violations"),
        check("tool_calls", true, "0 unexpected"),
        ...safetyChecks()
      ],
      failures: [
        { title: "product_parser \u203A conforms to schema (price: string)" },
        { title: "checkout \u203A tool-call count mismatch" }
      ],
      meta: runMeta({
        commit: "2f3c1a9",
        duration: "00:01:12",
        artifact: "./eval-results/run_2026-02-17.jsonl"
      })
    },
    {
      id: "run_2026-02-16_1112",
      label: "2026-02-16 11:12",
      status: "PASS",
      performance: {
        passRate: 99,
        avgScore: 0.95,
        latencyP95Ms: 650,
        latencyAvgMs: 488,
        tokensAvg: 168,
        tokensP95: 220,
        costUsd: 0.002,
        latencyHistoryMs: [420, 450, 480, 460, 520, 490, 510, 650, 440, 470, 500, 480]
      },
      dimensions: scores({ correctness: 89, faithfulness: 88, brevity: 72, style: 93 }),
      checks: cleanChecks(),
      failures: [],
      meta: runMeta({
        commit: "0d24f8f",
        duration: "00:01:06",
        artifact: "./eval-results/run_2026-02-16.jsonl"
      })
    },
    {
      id: "run_2026-02-15_0921",
      label: "2026-02-15 09:21",
      status: "PASS",
      performance: {
        passRate: 98,
        avgScore: 0.93,
        latencyP95Ms: 680,
        latencyAvgMs: 495,
        tokensAvg: 175,
        tokensP95: 235,
        costUsd: 0.0022,
        latencyHistoryMs: [450, 480, 520, 490, 550, 580, 620, 680, 510, 470, 530, 560]
      },
      dimensions: scores({ correctness: 86, faithfulness: 84, brevity: 70, style: 91 }),
      checks: cleanChecks(),
      failures: [],
      meta: runMeta({
        commit: "a1b2c3d",
        duration: "00:01:08",
        artifact: "./eval-results/run_2026-02-15.jsonl"
      })
    }
  ];

  const toolCallRuns = [
    {
      id: "run_2026-02-14_1530",
      label: "2026-02-14 15:30",
      status: "PASS",
      performance: {
        passRate: 100,
        avgScore: 1,
        latencyP95Ms: 320,
        latencyAvgMs: 280,
        tokensAvg: 45,
        tokensP95: 62,
        costUsd: 0.0008,
        latencyHistoryMs: [250, 270, 290, 280, 310, 320, 265, 290, 300, 275]
      },
      dimensions: scores({ contract_match: 100, arg_validity: 100 }),
      checks: [check("tool_calls", true, "0 unexpected")],
      failures: [],
      meta: runMeta({
        commit: "e4f5g6h",
        branch: "feat/tools",
        concurrency: 8,
        duration: "00:00:45",
        artifact: "./eval-results/tool-calls_2026-02-14.jsonl"
      })
    }
  ];

  return {
    datasets: [
      {
        id: "onboarding-flows",
        name: "onboarding-flows",
        overview: "Evaluate first-user journeys and schema compliance for generated onboarding payloads.",
        runs: onboardingRuns
      },
      {
        id: "tool-calls",
        name: "tool-calls",
        overview: "Validate function-call conformance and unexpected tool invocation behavior.",
        runs: toolCallRuns
      },
      {
        id: "json-schema",
        name: "json-schema",
        overview: "Stress-test schema fidelity across generated extraction payloads.",
        runs: []
      }
    ],
    evaluators: [
      { id: "json-schema-validator", name: "JSON Schema Validator", configPreview: "strict=true" },
      { id: "tool-call-contract-checker", name: "Tool-call Contract Checker", configPreview: "unexpectedCalls=error" },
      { id: "rubric-judge", name: "Rubric Judge (LLM)", configPreview: "model=gpt-4o-mini; scale=0-100" },
      { id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
    ]
  };
})();
437
+
438
+ // src/cli/state.ts
439
/**
 * Returns the bundled mock data set.
 *
 * @returns {object} The `data_mock_default` object itself (not a copy).
 */
function loadMockData() {
  const mockData = data_mock_default;
  return mockData;
}
442
/**
 * Converts text into a lowercase, dash-separated slug.
 *
 * @param {string} input - Text to convert.
 * @returns {string} Lowercased text where each run of characters outside
 *   `a-z0-9` becomes a single "-", with leading and trailing dashes removed.
 */
function toSlug(input) {
  const lowered = input.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9]+/g, "-");
  return dashed.replace(/^-+|-+$/g, "");
}
445
/**
 * Converts a runner snapshot into the run record shown by the CLI views.
 *
 * A snapshot with zero test cases is treated as having one, so the ratios
 * never divide by zero. The duration runs from `startedAt` (or `queuedAt`
 * when that is nullish) to `finishedAt`, or to the current time while
 * `finishedAt` is falsy. Both latency fields carry the same per-test-case
 * average, floored and never below 1. Token, cost and most meta fields are
 * fixed placeholder values.
 *
 * @param {object} snapshot - Reads `runId`, `status`, `totalTestCases`,
 *   `passedTestCases`, `failedTestCases`, `queuedAt`, `startedAt`,
 *   `finishedAt`, `errorMessage` and `artifactPath`.
 * @returns {object} Run record with `id`, `label`, `status`, `performance`,
 *   `dimensions`, `checks`, `failures` and `meta`.
 */
function toEvalRun(snapshot) {
  const total = snapshot.totalTestCases === 0 ? 1 : snapshot.totalTestCases;
  const asPercent = (count) => Math.round(count / total * 100);
  const passRate = asPercent(snapshot.passedTestCases);

  const beganAt = snapshot.startedAt ?? snapshot.queuedAt;
  const endedAt = snapshot.finishedAt ? snapshot.finishedAt : Date.now();
  const durationMs = endedAt - beganAt;
  const perCaseMs = Math.max(1, Math.floor(durationMs / Math.max(1, total)));

  const completed = snapshot.status === "completed";
  let status = "RUNNING";
  if (completed) {
    status = "PASS";
  } else if (snapshot.status === "failed") {
    status = "FAILED";
  }

  const hasError = snapshot.errorMessage && snapshot.errorMessage.length > 0;

  return {
    id: snapshot.runId,
    label: snapshot.runId.slice(0, 12),
    status,
    performance: {
      passRate,
      avgScore: snapshot.passedTestCases / total,
      latencyP95Ms: perCaseMs,
      latencyAvgMs: perCaseMs,
      tokensAvg: 0,
      tokensP95: 0,
      costUsd: 0,
      latencyHistoryMs: [durationMs]
    },
    dimensions: [
      { name: "passed", score: passRate },
      { name: "failed", score: asPercent(snapshot.failedTestCases) }
    ],
    checks: [{ name: "run_status", passed: completed, detail: snapshot.status }],
    failures: hasError ? [{ title: snapshot.errorMessage }] : [],
    meta: {
      model: "n/a",
      provider: "runner",
      commit: "local",
      branch: "local",
      seed: 0,
      concurrency: 1,
      duration: `${durationMs}ms`,
      artifact: snapshot.artifactPath
    }
  };
}
488
/**
 * Builds the dataset record shown in the CLI from a discovered dataset.
 *
 * @param {object} item - Discovered dataset; reads `id`, `filePath` and
 *   `dataset.getName()`.
 * @param {object[]} snapshots - Run snapshots; only those whose `datasetId`
 *   equals `item.id` are kept, ordered by `queuedAt` descending and converted
 *   with `toEvalRun`.
 * @returns {{id: *, name: *, overview: string, runs: object[]}}
 */
function toEvalDataset(item, snapshots) {
  const ownSnapshots = snapshots.filter((snapshot) => snapshot.datasetId === item.id);
  // `filter` produced a fresh array, so sorting in place leaves the input untouched.
  ownSnapshots.sort((a, b) => b.queuedAt - a.queuedAt);
  const runs = ownSnapshots.map(toEvalRun);
  const name = item.dataset.getName();
  return { id: item.id, name, overview: `Discovered from ${item.filePath}`, runs };
}
497
/**
 * Builds the evaluator entry shown in the evaluator picker.
 *
 * @param {object} item - Discovered evaluator; reads `id`, `filePath` and
 *   `evaluator.getName()`.
 * @returns {{id: *, name: *, configPreview: string}} The name falls back to
 *   `toSlug(item.id)` when `getName()` returns null or undefined.
 */
function toEvaluatorOption(item) {
  const { id, evaluator, filePath } = item;
  const name = evaluator.getName() ?? toSlug(id);
  return { id, name, configPreview: `Source: ${filePath}` };
}
504
/**
 * Loads datasets, evaluators and run snapshots from the runner.
 *
 * Datasets and evaluators are collected concurrently. When both collections
 * come back empty the bundled mock data is returned instead.
 *
 * @param {object} runner - Must provide `collectDatasets()`,
 *   `collectEvaluators()` and `getAllRunSnapshots()`.
 * @returns {Promise<{datasets: object[], evaluators: object[]}>}
 */
async function loadRunnerData(runner) {
  const pending = [runner.collectDatasets(), runner.collectEvaluators()];
  const [datasets, evaluators] = await Promise.all(pending);
  const snapshots = runner.getAllRunSnapshots();

  const nothingDiscovered = datasets.length === 0 && evaluators.length === 0;
  if (nothingDiscovered) {
    return loadMockData();
  }

  const datasetViews = datasets.map((dataset) => toEvalDataset(dataset, snapshots));
  const evaluatorViews = evaluators.map(toEvaluatorOption);
  return { datasets: datasetViews, evaluators: evaluatorViews };
}
518
/**
 * Folds a runner event into the CLI data without mutating it.
 *
 * The run's current snapshot is looked up by `event.runId`. If there is no
 * snapshot, or no dataset with the snapshot's `datasetId`, `data` is
 * returned unchanged. Otherwise the converted run replaces every existing
 * run with the same id, or is placed at the front of the list when new.
 *
 * @param {{datasets: object[]}} data - Current CLI data.
 * @param {{runId: *}} event - Runner event.
 * @param {object} runner - Must provide `getRunSnapshot(runId)`.
 * @returns {object} `data` itself, or a shallow copy with updated runs.
 */
function applyRunnerEvent(data, event, runner) {
  const snapshot = runner.getRunSnapshot(event.runId);
  const dataset = snapshot ? data.datasets.find((item) => item.id === snapshot.datasetId) : undefined;
  if (!snapshot || !dataset) {
    return data;
  }

  const run = toEvalRun(snapshot);
  let replaced = false;
  const swapped = dataset.runs.map((item) => {
    if (item.id !== run.id) {
      return item;
    }
    replaced = true;
    return run;
  });
  const nextRuns = replaced ? swapped : [run, ...dataset.runs];

  const datasets = data.datasets.map((item) => {
    if (item.id !== dataset.id) {
      return item;
    }
    return { ...item, runs: nextRuns };
  });
  return { ...data, datasets };
}
537
/**
 * Parses the CLI startup arguments.
 *
 * Recognizes `--dataset <id>`, `--run <id>` and `--search <term>`. A flag is
 * only consumed when it is followed by a truthy value; any other token is
 * collected in `unknownArgs`.
 *
 * @param {string[]} argv - Argument tokens.
 * @returns {{unknownArgs: string[], datasetId?: string, runId?: string, search?: string}}
 */
function parseStartupArgs(argv) {
  const fieldByFlag = new Map([
    ["--dataset", "datasetId"],
    ["--run", "runId"],
    ["--search", "search"]
  ]);
  const args = { unknownArgs: [] };

  let cursor = 0;
  while (cursor < argv.length) {
    const token = argv[cursor];
    const field = fieldByFlag.get(token);
    const value = argv[cursor + 1];
    if (field !== undefined && value) {
      args[field] = value;
      cursor += 2; // skip the flag and its value
    } else {
      args.unknownArgs.push(token);
      cursor += 1;
    }
  }
  return args;
}
560
/**
 * Filters datasets by a case-insensitive substring match on their name.
 *
 * @param {{datasets: Array<{name: string}>}} data - CLI data.
 * @param {string} searchQuery - Raw query; surrounding whitespace is ignored.
 * @returns {object[]} `data.datasets` itself for a blank query, otherwise a
 *   new array with the matching datasets.
 */
function getFilteredDatasets(data, searchQuery) {
  const needle = searchQuery.trim().toLowerCase();
  if (needle === "") {
    return data.datasets;
  }
  const matchesName = ({ name }) => name.toLowerCase().includes(needle);
  return data.datasets.filter(matchesName);
}
567
/**
 * Resolves a dataset from its position in the datasets menu.
 *
 * Menu positions are offset by one relative to the array: position 0 (and
 * anything below) maps to no dataset.
 *
 * @param {object[]} datasets - Datasets in menu order.
 * @param {number} menuIndex - Selected menu position.
 * @returns {object|undefined} `datasets[menuIndex - 1]`, or undefined.
 */
function getDatasetByMenuIndex(datasets, menuIndex) {
  return menuIndex <= 0 ? void 0 : datasets[menuIndex - 1];
}
573
/**
 * Resolves a run from its position in the runs menu.
 *
 * Menu positions are offset by one relative to `dataset.runs`: position 0
 * (and anything below) maps to no run.
 *
 * @param {{runs: object[]}|undefined} dataset - Dataset owning the runs.
 * @param {number} menuIndex - Selected menu position.
 * @returns {object|undefined} `dataset.runs[menuIndex - 1]`, or undefined
 *   when there is no dataset or the position is 0 or below.
 */
function getRunByMenuIndex(dataset, menuIndex) {
  if (dataset && !(menuIndex <= 0)) {
    return dataset.runs[menuIndex - 1];
  }
  return void 0;
}
579
/**
 * Builds the initial CLI state from the loaded data and startup arguments.
 *
 * - Unknown arguments produce two startup warnings (the list and a usage hint).
 * - `args.datasetId` is looked up among the datasets matching `args.search`;
 *   a match opens the "runs" level, a miss adds a warning.
 * - With a matched dataset, `args.runId` opens the "details" level when the
 *   run exists, and adds a warning otherwise.
 * - The first two evaluators start out selected.
 *
 * @param {{evaluators: Array<{id: string}>}} data - Loaded CLI data.
 * @param {{unknownArgs: string[], datasetId?: string, runId?: string, search?: string}} args
 * @returns {object} Initial state for `reduceCliState`.
 */
function createInitialState(data, args) {
  const startupWarnings = [];
  if (args.unknownArgs.length > 0) {
    startupWarnings.push(
      `Unknown args: ${args.unknownArgs.join(", ")}`,
      "Supported: --dataset <id>, --run <id>, --search <term>"
    );
  }

  const searchQuery = args.search ?? "";
  const candidates = getFilteredDatasets(data, searchQuery);
  const position = candidates.findIndex((dataset) => dataset.id === args.datasetId);
  const datasetByArg = position >= 0 ? candidates[position] : void 0;
  // Menu slot 0 is "New evaluation", so datasets start at 1.
  const datasetMenuIndex = datasetByArg ? position + 1 : 0;

  let level = "datasets";
  let runMenuIndex = 0;
  if (!datasetByArg) {
    if (args.datasetId) {
      startupWarnings.push(`Dataset "${args.datasetId}" not found.`);
    }
  } else {
    level = "runs";
    if (args.runId) {
      const runPosition = datasetByArg.runs.findIndex((run) => run.id === args.runId);
      if (runPosition < 0) {
        startupWarnings.push(`Run "${args.runId}" not found in dataset "${datasetByArg.id}".`);
      } else {
        runMenuIndex = runPosition + 1;
        level = "details";
      }
    }
  }

  const selectedEvaluatorIds = data.evaluators.slice(0, 2).map((item) => item.id);

  return {
    level,
    focus: "left",
    datasetMenuIndex,
    runMenuIndex,
    detailsScrollOffset: 0,
    selectedEvaluatorIds,
    evaluatorMenuIndex: 0,
    searchQuery,
    searchMode: false,
    startupWarnings
  };
}
618
/**
 * Pure state reducer for the CLI.
 *
 * Handled action types: MOVE_UP, MOVE_DOWN (reads `action.max`), ENTER
 * (reads `action.hasDataset` / `action.hasRun`), BACK, TOGGLE_FOCUS,
 * START_SEARCH, END_SEARCH, APPEND_SEARCH (reads `action.value`),
 * REMOVE_SEARCH_CHAR, TOGGLE_EVALUATOR (reads `action.evaluatorId`) and
 * CLEAR_WARNINGS. When nothing changes the same `state` object is returned;
 * otherwise a shallow copy is returned.
 *
 * @param {object} state - Current CLI state.
 * @param {{type: string}} action - Action to apply.
 * @returns {object} Next CLI state.
 */
function reduceCliState(state, action) {
  switch (action.type) {
    case "MOVE_UP":
    case "MOVE_DOWN": {
      // Navigation is frozen while the search box is being edited.
      if (state.searchMode) {
        return state;
      }
      const movingUp = action.type === "MOVE_UP";
      const step = (current) => movingUp ? Math.max(0, current - 1) : Math.min(action.max, current + 1);
      if (state.level === "details" && state.focus === "right") {
        return { ...state, detailsScrollOffset: step(state.detailsScrollOffset) };
      }
      if (state.level === "datasets") {
        return { ...state, datasetMenuIndex: step(state.datasetMenuIndex) };
      }
      if (state.level === "runs") {
        return { ...state, runMenuIndex: step(state.runMenuIndex) };
      }
      if (state.level === "new-evaluation") {
        return { ...state, evaluatorMenuIndex: step(state.evaluatorMenuIndex) };
      }
      return state;
    }

    case "ENTER": {
      if (state.searchMode) {
        return { ...state, searchMode: false };
      }
      if (state.level === "datasets") {
        // Slot 0 of the datasets menu is the "New evaluation" entry.
        if (state.datasetMenuIndex === 0) {
          return { ...state, level: "new-evaluation" };
        }
        return action.hasDataset ? { ...state, level: "runs", runMenuIndex: 0 } : state;
      }
      if (state.level === "runs") {
        // Slot 0 of the runs menu is the "New evaluation" entry.
        if (state.runMenuIndex === 0) {
          return { ...state, level: "new-evaluation" };
        }
        return action.hasRun ? { ...state, level: "details", detailsScrollOffset: 0 } : state;
      }
      return state;
    }

    case "BACK": {
      if (state.searchMode) {
        return { ...state, searchMode: false };
      }
      switch (state.level) {
        case "details":
          return { ...state, level: "runs" };
        case "runs":
        case "new-evaluation":
          return { ...state, level: "datasets" };
        default:
          return state;
      }
    }

    case "TOGGLE_FOCUS":
      return { ...state, focus: state.focus === "left" ? "right" : "left" };

    case "START_SEARCH":
      return { ...state, searchMode: true };

    case "END_SEARCH":
      return { ...state, searchMode: false };

    case "APPEND_SEARCH":
      return { ...state, searchQuery: `${state.searchQuery}${action.value}` };

    case "REMOVE_SEARCH_CHAR":
      return { ...state, searchQuery: state.searchQuery.slice(0, -1) };

    case "TOGGLE_EVALUATOR": {
      const { evaluatorId } = action;
      const current = state.selectedEvaluatorIds;
      const selectedEvaluatorIds = current.includes(evaluatorId)
        ? current.filter((id) => id !== evaluatorId)
        : [...current, evaluatorId];
      return { ...state, selectedEvaluatorIds };
    }

    case "CLEAR_WARNINGS":
      return { ...state, startupWarnings: [] };

    default:
      return state;
  }
}
721
var LEFT_PANE_WIDTH2 = 44;

/**
 * Two-pane datasets screen: dataset menu on the left, overview on the right.
 *
 * Menu slot 0 is "New evaluation", so the dataset at array position `i` is
 * selected when `state.datasetMenuIndex === i + 1`.
 *
 * @param {object} props
 * @param {{focus: string, datasetMenuIndex: number}} props.state - CLI state.
 * @param {Array<{id: string, name: string}>} props.filteredDatasets - Menu rows.
 * @param {{overview: string}} [props.selectedDataset] - Its overview fills the
 *   right pane; a placeholder sentence is shown when it is missing.
 */
function DatasetsView({ state, filteredDatasets, selectedDataset }) {
  const { focus, datasetMenuIndex } = state;

  const menuHeader = jsxRuntime.jsx(SectionHeader, { children: "Datasets" });
  const newEvaluationRow = jsxRuntime.jsx(ListItem, {
    selected: datasetMenuIndex === 0,
    label: "New evaluation",
    itemKey: "datasets-new-eval"
  });
  const datasetRows = filteredDatasets.map((dataset, index) => {
    const rowProps = {
      selected: datasetMenuIndex === index + 1,
      label: dataset.name,
      itemKey: `dataset-${dataset.id}`
    };
    return jsxRuntime.jsx(ListItem, rowProps, dataset.id);
  });
  const menuPane = jsxRuntime.jsxs(Pane, {
    width: LEFT_PANE_WIDTH2,
    focused: focus === "left",
    children: [menuHeader, newEvaluationRow, datasetRows]
  });

  const overviewText = selectedDataset?.overview ?? "Select a dataset to inspect prior runs.";
  const overviewPane = jsxRuntime.jsxs(Pane, {
    flexGrow: 1,
    marginLeft: 1,
    focused: focus === "right",
    children: [
      jsxRuntime.jsx(SectionHeader, { children: "Overview" }),
      jsxRuntime.jsx(ink.Text, { color: "gray", children: overviewText })
    ]
  });

  return jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [menuPane, overviewPane] });
}
756
/**
 * Runs screen: the runs sidebar plus a summary pane for the selected run.
 *
 * Without a selected run the summary pane only shows a placeholder sentence.
 * With one it shows the run header, commit/branch/seed, overall bars (pass
 * rate and average score scaled to 0-100), one bar per dimension and a
 * latency sparkline. When the run has no latency history, five points
 * derived from the average and p95 latency are plotted instead.
 *
 * @param {object} props
 * @param {{focus: string}} props.state - CLI state, forwarded to the sidebar.
 * @param {{runs: object[]}} [props.dataset] - Dataset whose runs are listed.
 * @param {object} [props.selectedRun] - Run to summarize.
 */
function RunsView({ state, dataset, selectedRun }) {
  const runs = dataset?.runs ?? [];
  const rightFocused = state.focus === "right";
  const sidebar = jsxRuntime.jsx(RunsSidebar, { state, dataset, runs });

  let summary;
  if (!selectedRun) {
    summary = jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to see summary metrics." });
  } else {
    const { performance, meta } = selectedRun;
    const spacer = () => jsxRuntime.jsx(ink.Text, { children: " " });
    const heading = (title) => jsxRuntime.jsx(SectionHeader, { children: title });

    const runLine = jsxRuntime.jsxs(ink.Text, {
      children: [
        jsxRuntime.jsx(ink.Text, { color: "gray", children: "Run:" }),
        " ",
        selectedRun.label,
        " ",
        jsxRuntime.jsx(StatusText, { status: selectedRun.status })
      ]
    });
    const metaLine = jsxRuntime.jsxs(ink.Text, {
      color: "gray",
      children: ["Commit: ", meta.commit, " Branch: ", meta.branch, " ", "Seed: ", meta.seed]
    });
    const passRateBar = jsxRuntime.jsx(TextBar, {
      label: "pass rate",
      value: performance.passRate,
      format: (v) => `${v}%`
    });
    const avgScoreBar = jsxRuntime.jsx(TextBar, {
      label: "avg score",
      value: Math.round(performance.avgScore * 100)
    });
    const dimensionBars = selectedRun.dimensions.map((dimension) =>
      jsxRuntime.jsx(TextBar, { label: dimension.name, value: dimension.score }, dimension.name)
    );
    const latencySeries = performance.latencyHistoryMs ?? [
      performance.latencyAvgMs - 40,
      performance.latencyAvgMs - 10,
      performance.latencyAvgMs + 20,
      performance.latencyP95Ms - 80,
      performance.latencyP95Ms
    ];

    summary = jsxRuntime.jsxs(ink.Box, {
      flexDirection: "column",
      children: [
        runLine,
        metaLine,
        spacer(),
        heading("Overall"),
        passRateBar,
        avgScoreBar,
        spacer(),
        heading("Dimensions"),
        dimensionBars,
        spacer(),
        heading("Latency trend"),
        jsxRuntime.jsx(Sparkline, { data: latencySeries, width: 24 })
      ]
    });
  }

  const summaryPane = jsxRuntime.jsx(Pane, {
    flexGrow: 1,
    marginLeft: 1,
    focused: rightFocused,
    children: summary
  });
  return jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [sidebar, summaryPane] });
}
827
var DETAILS_PAGE_SIZE = 20;

/**
 * Renders one boolean check as `name  PASSED|FAILED (detail)`.
 *
 * @param {object} props
 * @param {string} props.name - Check name, padded to 14 characters.
 * @param {boolean} props.passed - Truthy renders a green "PASSED", falsy a
 *   red "FAILED".
 * @param {string} [props.detail] - Appended in gray parentheses when truthy.
 */
function CheckRow({ name, passed, detail }) {
  const [status, color] = passed ? ["PASSED", "green"] : ["FAILED", "red"];
  const nameCell = jsxRuntime.jsx(ink.Text, { color: "gray", children: name.padEnd(14) });
  const statusCell = jsxRuntime.jsx(ink.Text, { color, bold: true, children: status });
  const detailCell = detail
    ? jsxRuntime.jsxs(ink.Text, { color: "gray", children: [" (", detail, ")"] })
    : null;
  return jsxRuntime.jsxs(ink.Text, { children: [nameCell, " ", statusCell, detailCell] });
}
846
/**
 * Builds the flat, keyed list of rows shown in the run details pane.
 *
 * Sections appear in this order: Meta, Scores, Checks, Performance, Latency
 * trend and, only when the run has failures, Failures. When the run has no
 * latency history, five points derived from the average and p95 latency are
 * plotted instead.
 *
 * @param {object} run - Reads `performance`, `dimensions`, `checks`,
 *   `failures` and `meta`.
 * @returns {Array} Row elements, each carrying a key.
 */
function buildDetailRows(run) {
  const { performance, dimensions, checks, failures, meta } = run;

  const heading = (title, key) => jsxRuntime.jsx(SectionHeader, { children: title }, key);
  const spacer = (key) => jsxRuntime.jsx(ink.Text, { children: " " }, key);
  const grayLine = (parts, key) => jsxRuntime.jsxs(ink.Text, { color: "gray", children: parts }, key);

  const latencyHistory = performance.latencyHistoryMs ?? [
    performance.latencyAvgMs - 40,
    performance.latencyAvgMs - 10,
    performance.latencyAvgMs + 20,
    performance.latencyP95Ms - 80,
    performance.latencyP95Ms
  ];

  const rows = [];

  rows.push(
    heading("Meta", "meta-h"),
    grayLine(["Model: ", meta.model, " Provider: ", meta.provider], "meta-1"),
    grayLine(["Commit: ", meta.commit, " Branch: ", meta.branch, " Seed: ", meta.seed], "meta-2"),
    grayLine(["Duration: ", meta.duration, " Concurrency: ", meta.concurrency], "meta-3"),
    grayLine(["Artifact: ", meta.artifact], "meta-4"),
    spacer("sp1")
  );

  rows.push(heading("Scores (0\u2013100)", "scores-h"));
  for (const d of dimensions) {
    rows.push(jsxRuntime.jsx(TextBar, { label: d.name, value: d.score }, `dim-${d.name}`));
  }
  rows.push(spacer("sp2"));

  rows.push(heading("Checks (boolean)", "checks-h"));
  for (const c of checks) {
    rows.push(jsxRuntime.jsx(CheckRow, { name: c.name, passed: c.passed, detail: c.detail }, `chk-${c.name}`));
  }
  rows.push(spacer("sp3"));

  rows.push(
    heading("Performance", "perf-h"),
    jsxRuntime.jsx(
      TextBar,
      { label: "pass rate", value: performance.passRate, format: (v) => `${v}%` },
      "perf-rate"
    ),
    grayLine(["latency avg ", performance.latencyAvgMs, "ms p95 ", performance.latencyP95Ms, "ms"], "perf-lat"),
    grayLine(["tokens avg ", performance.tokensAvg, " p95 ", performance.tokensP95], "perf-tok"),
    spacer("sp4")
  );

  rows.push(
    heading("Latency trend", "spark-h"),
    jsxRuntime.jsx(Sparkline, { data: latencyHistory, width: 20 }, "spark")
  );

  if (failures.length > 0) {
    rows.push(spacer("sp5"), heading("Failures (top)", "fail-h"));
    failures.forEach((f, i) => {
      rows.push(jsxRuntime.jsxs(ink.Text, { color: "red", children: [i + 1, ") ", f.title] }, `fail-${i}`));
    });
  }

  return rows;
}
930
/**
 * Details screen: the runs sidebar plus a scrollable pane of detail rows.
 *
 * Without a selected run the pane only shows a placeholder sentence.
 * Otherwise the rows from `buildDetailRows` are windowed: starting at
 * `state.detailsScrollOffset` (negative offsets count as 0), at most
 * `DETAILS_PAGE_SIZE` rows are shown.
 *
 * @param {object} props
 * @param {{focus: string, detailsScrollOffset: number}} props.state - CLI state.
 * @param {{runs: object[]}} [props.dataset] - Dataset whose runs are listed.
 * @param {object} [props.selectedRun] - Run to inspect.
 */
function RunDetailsView({ state, dataset, selectedRun }) {
  const runs = dataset?.runs ?? [];
  const rightFocused = state.focus === "right";

  // Shared frame: sidebar on the left, the given content in the right pane.
  const frame = (content) => jsxRuntime.jsxs(jsxRuntime.Fragment, {
    children: [
      jsxRuntime.jsx(RunsSidebar, { state, dataset, runs }),
      jsxRuntime.jsx(Pane, { flexGrow: 1, marginLeft: 1, focused: rightFocused, children: content })
    ]
  });

  if (!selectedRun) {
    return frame(jsxRuntime.jsx(ink.Text, { color: "gray", children: "Select a run to inspect details." }));
  }

  const rows = buildDetailRows(selectedRun);
  const firstVisible = Math.max(0, state.detailsScrollOffset);
  const windowed = rows.slice(firstVisible, firstVisible + DETAILS_PAGE_SIZE);
  const wrapped = windowed.map((row, i) =>
    jsxRuntime.jsx(React__default.default.Fragment, { children: row }, i)
  );
  return frame(jsxRuntime.jsx(ink.Box, { flexDirection: "column", children: wrapped }));
}
951
// Fixed width of the left ("Available Evaluators") pane.
var LEFT_PANE_WIDTH3 = 44;

/**
 * "New evaluation" screen: a selectable list of evaluators on the left, and
 * on the right the ordered selection plus the config preview of the evaluator
 * under the cursor.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; reads `focus`, `searchQuery`,
 *   `evaluatorMenuIndex` and `selectedEvaluatorIds`.
 * @param {object} props.data - Source of `evaluators` used to resolve selected ids.
 * @param {Array<object>} props.visibleEvaluators - Evaluators listed on the left.
 */
function NewEvaluationView({
  state,
  data,
  visibleEvaluators
}) {
  const highlighted = visibleEvaluators[state.evaluatorMenuIndex];

  const availableRows = visibleEvaluators.map((evaluator, index) => {
    const underCursor = index === state.evaluatorMenuIndex;
    const cursorMark = underCursor ? "\u25B8 " : " ";
    const checkbox = state.selectedEvaluatorIds.includes(evaluator.id) ? "[x] " : "[ ] ";
    return /* @__PURE__ */ jsxRuntime.jsxs(
      ink.Text,
      {
        color: underCursor ? "cyan" : "gray",
        bold: underCursor,
        children: [cursorMark, checkbox, evaluator.name]
      },
      evaluator.id
    );
  });

  const selectedRows = state.selectedEvaluatorIds.map((id, index) => {
    const match = data.evaluators.find((item) => item.id === id);
    // Ids that no longer resolve to an evaluator render nothing.
    return match ? /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { children: [
      index + 1,
      ") ",
      match.name
    ] }, id) : null;
  });

  return /* @__PURE__ */ jsxRuntime.jsxs(jsxRuntime.Fragment, { children: [
    /* @__PURE__ */ jsxRuntime.jsxs(Pane, { width: LEFT_PANE_WIDTH3, focused: state.focus === "left", children: [
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Available Evaluators" }),
      /* @__PURE__ */ jsxRuntime.jsxs(ink.Text, { color: "gray", children: [
        "Search: ",
        state.searchQuery || "(none)"
      ] }),
      availableRows
    ] }),
    /* @__PURE__ */ jsxRuntime.jsxs(Pane, { flexGrow: 1, marginLeft: 1, focused: state.focus === "right", children: [
      /* @__PURE__ */ jsxRuntime.jsxs(SectionHeader, { children: [
        "Selected (",
        state.selectedEvaluatorIds.length,
        ")"
      ] }),
      selectedRows,
      /* @__PURE__ */ jsxRuntime.jsx(SectionHeader, { children: "Config preview" }),
      /* @__PURE__ */ jsxRuntime.jsx(ink.Text, {
        color: "gray",
        children: highlighted?.configPreview ?? "Select an evaluator to inspect config."
      })
    ] })
  ] });
}
1007
/**
 * Returns a copy of `state` whose three menu cursors are forced into range.
 *
 * Each cursor is clamped to `[0, max]` (inclusive). Negative maxima collapse
 * to 0 because the lower bound is applied last.
 *
 * @param {object} state - CLI state; all other fields are copied unchanged.
 * @param {number} filteredDatasetsLength - Upper bound for `datasetMenuIndex`.
 * @param {number} selectedRunCount - Upper bound for `runMenuIndex`.
 * @param {number} [evaluatorMax=3] - Upper bound for `evaluatorMenuIndex`.
 *   Previously a fixed constant; the default keeps existing three-argument
 *   callers behaving exactly as before.
 * @returns {object} New state object with clamped cursor indices.
 */
function clampCursor(state, filteredDatasetsLength, selectedRunCount, evaluatorMax = 3) {
  const clamp = (index, max) => Math.max(0, Math.min(index, max));
  return {
    ...state,
    datasetMenuIndex: clamp(state.datasetMenuIndex, filteredDatasetsLength),
    runMenuIndex: clamp(state.runMenuIndex, selectedRunCount),
    evaluatorMenuIndex: clamp(state.evaluatorMenuIndex, evaluatorMax)
  };
}
1021
/**
 * Root Ink component of the evals CLI.
 *
 * Owns the reducer-driven navigation state, mirrors runner events into the
 * displayed data, translates key presses into reducer actions, and renders the
 * breadcrumb, optional warning/search/message boxes, the active view and the
 * footer.
 *
 * @param {object} props
 * @param {object} props.data - Initial view data; re-synced whenever the prop changes.
 * @param {object} props.args - CLI arguments forwarded to `createInitialState`.
 * @param {object} [props.runner] - Optional runner used for live events and to start runs.
 */
function EvalsCliApp({
  data,
  args,
  runner
}) {
  const { exit } = ink.useApp();
  const { width: stdoutWidth, height: stdoutHeight } = fullscreenInk.useScreenSize();
  const [liveData, setLiveData] = React.useState(data);
  const [runtimeMessage, setRuntimeMessage] = React.useState();
  const [state, dispatch] = React.useReducer(
    reduceCliState,
    createInitialState(data, args)
  );
  React.useEffect(() => {
    setLiveData(data);
  }, [data]);
  React.useEffect(() => {
    if (!runner) {
      return void 0;
    }
    // The value returned by subscribeRunEvents is handed to React as the effect cleanup.
    return runner.subscribeRunEvents((event) => {
      setLiveData((current) => applyRunnerEvent(current, event, runner));
      if (event.type === "RunQueued") {
        setRuntimeMessage(`Queued ${event.runId} with ${event.totalTestCases} test cases.`);
      }
      if (event.type === "RunCompleted") {
        setRuntimeMessage(
          `Completed ${event.runId}: ${event.passedTestCases}/${event.totalTestCases} passed.`
        );
      }
      if (event.type === "RunFailed") {
        setRuntimeMessage(`Run failed: ${event.errorMessage}`);
      }
    });
  }, [runner]);
  const filteredDatasets = React.useMemo(
    () => getFilteredDatasets(liveData, state.searchQuery),
    [liveData, state.searchQuery]
  );
  const visibleEvaluators = liveData.evaluators.filter(
    (evaluator) => evaluator.name.toLowerCase().includes(state.searchQuery.toLowerCase())
  );
  const evaluatorMaxIndex = Math.max(0, visibleEvaluators.length - 1);
  const clampedState = {
    ...clampCursor(
      state,
      filteredDatasets.length,
      getDatasetByMenuIndex(filteredDatasets, state.datasetMenuIndex)?.runs.length ?? 0
    ),
    // clampCursor bounds the evaluator cursor with a fixed limit, which pinned
    // the highlight (and the Enter toggle) to the 4th row on longer lists.
    // Re-clamp it against the list that is actually displayed.
    evaluatorMenuIndex: Math.max(0, Math.min(state.evaluatorMenuIndex, evaluatorMaxIndex))
  };
  const selectedDataset = getDatasetByMenuIndex(
    filteredDatasets,
    clampedState.datasetMenuIndex
  );
  const selectedRun = getRunByMenuIndex(
    selectedDataset,
    clampedState.runMenuIndex
  );
  ink.useInput((input, key) => {
    // NOTE(review): quit and search-start are checked before search mode, so
    // those inputs never reach the search query — confirm this is intended.
    if (isQuitInput(input) || key.escape) {
      exit();
      return;
    }
    if (key.tab) {
      dispatch({ type: "TOGGLE_FOCUS" });
      return;
    }
    if (isSearchInput(input)) {
      dispatch({ type: "START_SEARCH" });
      return;
    }
    if (clampedState.searchMode) {
      if (key.return) {
        dispatch({ type: "END_SEARCH" });
        return;
      }
      if (isBackKey(key)) {
        dispatch({ type: "REMOVE_SEARCH_CHAR" });
        return;
      }
      if (isPrintableCharacter(input)) {
        dispatch({ type: "APPEND_SEARCH", value: input });
      }
      return;
    }
    if (key.upArrow) {
      // Floored at 0 like the MOVE_DOWN bound; it used to be -1 for an empty list.
      const max = clampedState.level === "new-evaluation" ? evaluatorMaxIndex : 100;
      dispatch({ type: "MOVE_UP", max });
      return;
    }
    if (key.downArrow) {
      let max = 100;
      if (clampedState.level === "datasets") {
        max = filteredDatasets.length;
      } else if (clampedState.level === "runs") {
        max = selectedDataset?.runs.length ?? 0;
      } else if (clampedState.level === "new-evaluation") {
        max = evaluatorMaxIndex;
      }
      dispatch({ type: "MOVE_DOWN", max });
      return;
    }
    if (key.return) {
      dispatch({
        type: "ENTER",
        hasDataset: Boolean(selectedDataset),
        hasRun: Boolean(selectedRun)
      });
      if (clampedState.level === "new-evaluation") {
        const evaluator = visibleEvaluators[clampedState.evaluatorMenuIndex];
        if (evaluator) {
          dispatch({ type: "TOGGLE_EVALUATOR", evaluatorId: evaluator.id });
        }
      }
      return;
    }
    if (isBackKey(key)) {
      dispatch({ type: "BACK" });
      return;
    }
    if (input.toLowerCase() === "c") {
      dispatch({ type: "CLEAR_WARNINGS" });
      setRuntimeMessage(void 0);
      return;
    }
    if (input.toLowerCase() === "s" && clampedState.level === "new-evaluation") {
      if (!runner) {
        setRuntimeMessage("Runner unavailable: cannot start evaluation.");
        return;
      }
      if (!selectedDataset) {
        setRuntimeMessage("Select a dataset before starting a new evaluation.");
        return;
      }
      if (clampedState.selectedEvaluatorIds.length === 0) {
        setRuntimeMessage("Select at least one evaluator before starting.");
        return;
      }
      // Fire-and-forget: the outcome is reported through the runtime message box.
      void runner.runDatasetWith({
        datasetId: selectedDataset.id,
        evaluatorIds: clampedState.selectedEvaluatorIds
      }).then((snapshot) => {
        setRuntimeMessage(
          `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
        );
      }).catch((error) => {
        setRuntimeMessage(
          error instanceof Error ? error.message : "Failed to start evaluation."
        );
      });
    }
  });
  const renderContent = () => {
    if (clampedState.level === "new-evaluation") {
      return /* @__PURE__ */ jsxRuntime.jsx(
        NewEvaluationView,
        {
          state: clampedState,
          data: liveData,
          visibleEvaluators
        }
      );
    }
    if (clampedState.level === "datasets") {
      return /* @__PURE__ */ jsxRuntime.jsx(
        DatasetsView,
        {
          state: clampedState,
          filteredDatasets,
          selectedDataset
        }
      );
    }
    if (clampedState.level === "runs") {
      return /* @__PURE__ */ jsxRuntime.jsx(
        RunsView,
        {
          state: clampedState,
          dataset: selectedDataset,
          selectedRun
        }
      );
    }
    return /* @__PURE__ */ jsxRuntime.jsx(
      RunDetailsView,
      {
        state: clampedState,
        dataset: selectedDataset,
        selectedRun
      }
    );
  };
  return /* @__PURE__ */ jsxRuntime.jsxs(
    ink.Box,
    {
      flexDirection: "column",
      flexGrow: 1,
      width: stdoutWidth,
      height: stdoutHeight,
      children: [
        /* @__PURE__ */ jsxRuntime.jsx(
          ink.Box,
          {
            borderStyle: "round",
            borderColor: "cyan",
            paddingX: 1,
            width: stdoutWidth,
            children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: getBreadcrumbText(
              clampedState,
              selectedDataset?.name,
              selectedRun?.label
            ) })
          }
        ),
        clampedState.startupWarnings.length > 0 && /* @__PURE__ */ jsxRuntime.jsxs(
          ink.Box,
          {
            marginTop: 1,
            borderStyle: "round",
            borderColor: "yellow",
            paddingX: 1,
            flexDirection: "column",
            width: stdoutWidth,
            children: [
              /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "yellow", children: "Startup warnings:" }),
              clampedState.startupWarnings.map((warning, index) => /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { children: warning }, `${warning}-${index}`))
            ]
          }
        ),
        clampedState.searchMode && /* @__PURE__ */ jsxRuntime.jsxs(
          ink.Box,
          {
            marginTop: 1,
            borderStyle: "round",
            borderColor: "magenta",
            paddingX: 1,
            width: stdoutWidth,
            children: [
              /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "magenta", bold: true, children: "Search: " }),
              /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "white", children: clampedState.searchQuery })
            ]
          }
        ),
        runtimeMessage && /* @__PURE__ */ jsxRuntime.jsx(
          ink.Box,
          {
            marginTop: 1,
            borderStyle: "round",
            borderColor: "blue",
            paddingX: 1,
            width: stdoutWidth,
            children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "blue", children: runtimeMessage })
          }
        ),
        /* @__PURE__ */ jsxRuntime.jsx(
          ink.Box,
          {
            marginTop: 1,
            flexGrow: 1,
            width: stdoutWidth,
            flexDirection: "row",
            children: renderContent()
          }
        ),
        /* @__PURE__ */ jsxRuntime.jsx(ink.Box, { marginTop: 1, paddingX: 1, children: /* @__PURE__ */ jsxRuntime.jsx(ink.Text, { color: "gray", children: getFooterText(clampedState) }) })
      ]
    }
  );
}
1280
+
1281
+ // src/runner/config.ts
1282
// Baseline runner configuration. `rootDir` is captured once, when this module loads.
var defaultRunnerConfig = {
  discovery: {
    rootDir: process.cwd(),
    datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
    evaluatorSuffixes: [
      ".evaluator.ts",
      ".evaluator.tsx",
      ".evaluator.js",
      ".evaluator.mjs"
    ],
    testCaseSuffixes: [
      ".test-case.ts",
      ".test-case.tsx",
      ".test-case.js",
      ".test-case.mjs"
    ],
    excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
  },
  artifactDirectory: ".eval-results"
};

/**
 * Resolves the effective runner configuration.
 *
 * Previously the `overrides` argument was accepted but ignored. It is now
 * merged over the defaults: top-level keys replace the default ones, and
 * `discovery` is merged one level deep so a partial `discovery` object keeps
 * the remaining default discovery settings.
 *
 * @param {object} [overrides] - Partial configuration; omit to get the defaults.
 * @returns {object} `defaultRunnerConfig` itself when no overrides are given,
 *   otherwise a new merged object (the defaults are never mutated).
 */
function withRunnerConfig(overrides) {
  if (!overrides) {
    return defaultRunnerConfig;
  }
  const discovery = overrides.discovery ? { ...defaultRunnerConfig.discovery, ...overrides.discovery } : defaultRunnerConfig.discovery;
  return {
    ...defaultRunnerConfig,
    ...overrides,
    discovery
  };
}
1307
// Lazily created TypeScript loader; assigned on first use by loadModuleExports.
var jitiLoader;

/**
 * Builds a slug-style identifier from a prefix and either the item's name or,
 * when the name is missing or blank, its file path.
 *
 * The combined `prefix:value` string is lower-cased, every run of characters
 * outside `a-z0-9` becomes a single `-`, and leading/trailing dashes are removed.
 *
 * @param {string} prefix - Kind marker placed in front of the value.
 * @param {string} filePath - Fallback used when `name` is empty or whitespace.
 * @param {string} [name] - Preferred source of the identifier.
 * @returns {string} The slug.
 */
function toId(prefix, filePath, name) {
  const hasUsableName = Boolean(name) && name.trim().length > 0;
  const combined = `${prefix}:${hasUsableName ? name : filePath}`;
  const dashed = combined.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  return dashed.replace(/^-+|-+$/g, "");
}
1312
/**
 * Tells whether `value` is a non-null object exposing a callable member named
 * `methodName` (own or inherited).
 *
 * @param {unknown} value - Candidate object.
 * @param {string} methodName - Member name to look for.
 * @returns {boolean}
 */
function hasMethod(value, methodName) {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  if (!(methodName in value)) {
    return false;
  }
  return typeof value[methodName] === "function";
}

/** Duck-type check: has `getName` and `matchesTestCase` methods. */
function isDatasetLike(value) {
  return ["getName", "matchesTestCase"].every((method) => hasMethod(value, method));
}

/** Duck-type check: has `getName`, `resolveContext` and `getEvaluateFn` methods. */
function isEvaluatorLike(value) {
  return ["getName", "resolveContext", "getEvaluateFn"].every((method) => hasMethod(value, method));
}

/** Duck-type check: has `getName`, `getTags` and `getInput` methods. */
function isTestCaseLike(value) {
  return ["getName", "getTags", "getInput"].every((method) => hasMethod(value, method));
}
1324
/**
 * Recursively lists every regular file below `rootDir`.
 *
 * Sub-directories are traversed concurrently, so the order in which files are
 * discovered depends on I/O timing. The result is sorted before returning so
 * that callers get the same order on every run.
 *
 * @param {string} rootDir - Directory to start from.
 * @param {string[]} excludeDirectories - Directory names (not paths) to skip.
 * @returns {Promise<string[]>} Absolute file paths, sorted by code-unit order.
 */
async function walkDirectory(rootDir, excludeDirectories) {
  const out = [];
  async function walk(currentDir) {
    let entries;
    try {
      entries = await promises.readdir(currentDir, { withFileTypes: true });
    } catch {
      // Best effort: a directory that cannot be read contributes no files.
      return;
    }
    await Promise.all(
      entries.map(async (entry) => {
        const absolute = path.resolve(currentDir, entry.name);
        if (entry.isDirectory()) {
          if (!excludeDirectories.includes(entry.name)) {
            await walk(absolute);
          }
          return;
        }
        if (entry.isFile()) {
          out.push(absolute);
        }
      })
    );
  }
  await walk(rootDir);
  return out.sort();
}
1352
/**
 * Checks whether `filePath` ends with at least one of the given suffixes.
 *
 * @param {string} filePath - Path to test.
 * @param {string[]} suffixes - Candidate endings.
 * @returns {boolean} `false` for an empty suffix list.
 */
function hasOneSuffix(filePath, suffixes) {
  for (const suffix of suffixes) {
    if (filePath.endsWith(suffix)) {
      return true;
    }
  }
  return false;
}
1355
/**
 * Loads a module from disk and returns the values of all its exports.
 *
 * `.ts` / `.tsx` files go through a jiti loader that is created on first use
 * and cached in the module-level `jitiLoader`; every other file is loaded with
 * a native dynamic `import()` of its file URL.
 *
 * @param {string} filePath - Path of the module to load.
 * @returns {Promise<unknown[]>} Export values of the loaded module.
 * @throws {Error} When the jiti package exposes no usable factory.
 */
async function loadModuleExports(filePath) {
  const isTypeScript = filePath.endsWith(".ts") || filePath.endsWith(".tsx");
  if (!isTypeScript) {
    const nativeModule = await import(url.pathToFileURL(filePath).href);
    return Object.values(nativeModule);
  }
  if (!jitiLoader) {
    const jitiModule = await import('jiti');
    const createJiti = jitiModule.createJiti ?? jitiModule.default;
    if (!createJiti) {
      throw new Error("Failed to initialize jiti TypeScript loader");
    }
    // Bundler-generated stand-in for `import.meta.url`; kept verbatim.
    jitiLoader = createJiti((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)), {
      interopDefault: true,
      moduleCache: true
    });
  }
  // Use the loader's `import` method when it has one, otherwise call the loader itself.
  let transpiledModule;
  if (jitiLoader.import) {
    transpiledModule = await jitiLoader.import(filePath);
  } else {
    transpiledModule = await Promise.resolve(jitiLoader(filePath));
  }
  return Object.values(transpiledModule);
}
1375
/**
 * Shared discovery pipeline behind the three `collect*FromFiles` functions,
 * which were previously three copies of the same code.
 *
 * Walks `config.rootDir`, keeps files ending in one of `suffixes`, loads each
 * file's exports, keeps the exports accepted by `isMatch`, and wraps each one
 * as `{ id, filePath, [key]: item }` with `filePath` relative to the root.
 *
 * @param {object} config - Discovery settings; reads `rootDir` and `excludeDirectories`.
 * @param {string[]} suffixes - File-name endings that select candidate files.
 * @param {(value: unknown) => boolean} isMatch - Duck-type predicate for exports.
 * @param {string} idPrefix - Prefix passed to `toId`.
 * @param {string} key - Property name under which the matched export is stored.
 * @returns {Promise<object[]>} Flattened list of wrapped exports.
 */
async function collectFromFiles(config, suffixes, isMatch, idPrefix, key) {
  const files = await walkDirectory(config.rootDir, config.excludeDirectories);
  const matched = files.filter((filePath) => hasOneSuffix(filePath, suffixes));
  const found = await Promise.all(
    matched.map(async (absolutePath) => {
      const exported = await loadModuleExports(absolutePath);
      const relPath = path.relative(config.rootDir, absolutePath);
      return exported.filter(isMatch).map((item) => ({
        id: toId(idPrefix, relPath, item.getName()),
        filePath: relPath,
        [key]: item
      }));
    })
  );
  return found.flat();
}

/**
 * Discovers dataset exports in files matching `config.datasetSuffixes`.
 *
 * @param {object} config - Discovery settings.
 * @returns {Promise<Array<{id: string, filePath: string, dataset: object}>>}
 */
async function collectDatasetsFromFiles(config) {
  return collectFromFiles(config, config.datasetSuffixes, isDatasetLike, "dataset", "dataset");
}

/**
 * Discovers evaluator exports in files matching `config.evaluatorSuffixes`.
 *
 * @param {object} config - Discovery settings.
 * @returns {Promise<Array<{id: string, filePath: string, evaluator: object}>>}
 */
async function collectEvaluatorsFromFiles(config) {
  return collectFromFiles(config, config.evaluatorSuffixes, isEvaluatorLike, "evaluator", "evaluator");
}

/**
 * Discovers test-case exports in files matching `config.testCaseSuffixes`.
 *
 * @param {object} config - Discovery settings.
 * @returns {Promise<Array<{id: string, filePath: string, testCase: object}>>}
 */
async function collectTestCasesFromFiles(config) {
  return collectFromFiles(config, config.testCaseSuffixes, isTestCaseLike, "test-case", "testCase");
}
1432
+
1433
+ // src/evals/metric.ts
1434
// Metric definitions registered so far, keyed by id.
var registry = /* @__PURE__ */ new Map();

var Metric = {
  /**
   * Creates a metric definition and registers it under `config.id`, replacing
   * any definition previously registered with that id.
   *
   * @param {{id: string, name: string, format: Function}} config
   * @returns {object} Definition with `id`, `name`, `format` and a `make(data)`
   *   factory producing `{ id, data }` items.
   */
  of(config) {
    const make = (data) => {
      return { id: config.id, data };
    };
    const definition = {
      id: config.id,
      name: config.name,
      format: config.format,
      make
    };
    registry.set(config.id, definition);
    return definition;
  }
};
1447
+
1448
+ // src/evals/score.ts
1449
// Score definitions registered so far, keyed by id.
var registry2 = /* @__PURE__ */ new Map();

var Score = {
  /**
   * Creates a score definition and registers it under `config.id`.
   *
   * The returned `make(data, options)` factory produces `{ id, data }` and adds
   * a `passed` field only when `options.definePassed` is supplied and returns
   * something other than `undefined`.
   *
   * @param {{id: string, name: string, displayStrategy: string, format: Function}} config
   * @returns {object} The registered definition.
   */
  of(config) {
    const make = (data, options) => {
      const verdict = options?.definePassed !== void 0 ? options.definePassed(data) : void 0;
      const item = { id: config.id, data };
      if (verdict !== void 0) {
        item.passed = verdict;
      }
      return item;
    };
    const definition = {
      id: config.id,
      name: config.name,
      displayStrategy: config.displayStrategy,
      format: config.format,
      make
    };
    registry2.set(config.id, definition);
    return definition;
  }
};

/**
 * Looks up a registered score definition.
 *
 * @param {string} id - Score id.
 * @returns {object|undefined} The definition, or `undefined` when unknown.
 */
function getScoreById(id) {
  return registry2.get(id);
}
1473
+
1474
+ // src/evals/metrics/standard.ts
1475
// Built-in metric: token usage, rendered as "in:<n> out:<n> cached:<n>".
// Missing (null/undefined) counters count as 0; both cached counters are summed.
Metric.of({
  id: "token-count",
  name: "Tokens",
  format: (data) => {
    const orZero = (count) => count ?? 0;
    const input = orZero(data.input);
    const output = orZero(data.output);
    const cached = orZero(data.inputCached) + orZero(data.outputCached);
    return `in:${input} out:${output} cached:${cached}`;
  }
});

// Built-in metric: latency, rendered as "<ms>ms".
Metric.of({
  id: "latency",
  name: "Latency",
  format: (data) => {
    return `${data.ms}ms`;
  }
});
1492
+
1493
+ // src/evals/scores/standard.ts
1494
// Built-in score: numeric value shown as a bar, formatted with two decimals.
Score.of({
  id: "percent",
  name: "Score",
  displayStrategy: "bar",
  format: (data) => {
    return data.value.toFixed(2);
  }
});

// Built-in score: pass/fail verdict.
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data) => {
    if (data.passed) {
      return "PASSED";
    }
    return "NOT PASSED";
  }
});
1506
+
1507
+ // src/runner/score-utils.ts
1508
/**
 * Extracts a single number from a list of score items.
 *
 * Items are visited in order. For each one, a finite `data.value` is returned
 * when the item's registered definition uses the "bar" display strategy;
 * otherwise whatever `toNumericScore(item.data)` yields is returned if defined.
 *
 * @param {Array<{id: string, data: unknown}>} scores - Score items to inspect.
 * @returns {number|undefined} First number found, or `undefined` if none.
 */
function toNumericScoreFromScores(scores) {
  for (const item of scores) {
    const definition = getScoreById(item.id);
    const isBarScore = Boolean(definition) && definition.displayStrategy === "bar";
    const payload = item.data;
    if (isBarScore && typeof payload === "object" && payload !== null && "value" in payload) {
      const barValue = payload.value;
      if (typeof barValue === "number" && Number.isFinite(barValue)) {
        return barValue;
      }
    }
    const fallback = toNumericScore(payload);
    if (fallback !== void 0) {
      return fallback;
    }
  }
  return void 0;
}
1524
/**
 * Reduces an arbitrary score payload to one number.
 *
 * - A finite number is returned as is.
 * - An object with a finite numeric `score` property yields that property.
 * - Any other object yields the mean of its finite numeric property values.
 * - Everything else (including objects without numeric values) yields `undefined`.
 *
 * @param {unknown} value - Score payload.
 * @returns {number|undefined}
 */
function toNumericScore(value) {
  const isFiniteNumber = (candidate) => typeof candidate === "number" && Number.isFinite(candidate);
  if (isFiniteNumber(value)) {
    return value;
  }
  if (typeof value !== "object" || value === null) {
    return void 0;
  }
  if ("score" in value && isFiniteNumber(value.score)) {
    return value.score;
  }
  let total = 0;
  let count = 0;
  for (const entry of Object.values(value)) {
    if (isFiniteNumber(entry)) {
      total += entry;
      count += 1;
    }
  }
  return count === 0 ? void 0 : total / count;
}
1543
+
1544
+ // src/runner/execution.ts
1545
/**
 * Decides whether an evaluator passed for one test case.
 *
 * Precedence:
 * 1. If any score carries a defined `passed` flag, all such flags must be `true`.
 * 2. Otherwise the evaluator's pass criterion, if it has one, is applied to `result`.
 * 3. Otherwise, if a pass threshold is set, the numeric score derived from
 *    `scores` must exist and be at least the threshold.
 * 4. Otherwise the evaluator passes.
 *
 * @param {object} evaluator - Provides `getPassCriterion()` and `getPassThreshold()`.
 * @param {unknown} result - Raw evaluator result handed to the pass criterion.
 * @param {Array<object>} scores - Score items of the result.
 * @returns {*} `true`/`false`, or whatever the pass criterion returns.
 */
function computeEvaluatorPassed(evaluator, result, scores) {
  let sawExplicitVerdict = false;
  let everyVerdictPassed = true;
  for (const score of scores) {
    if (!("passed" in score) || score.passed === void 0) {
      continue;
    }
    sawExplicitVerdict = true;
    if (score.passed !== true) {
      everyVerdictPassed = false;
    }
  }
  if (sawExplicitVerdict) {
    return everyVerdictPassed;
  }
  const criterion = evaluator.getPassCriterion();
  if (criterion) {
    return criterion(result);
  }
  const threshold = evaluator.getPassThreshold();
  if (threshold === void 0) {
    return true;
  }
  const numeric = toNumericScoreFromScores(scores);
  return numeric !== void 0 && numeric >= threshold;
}
1561
/**
 * Coerces a raw evaluator result into `{ scores, metrics }`.
 *
 * Non-object results produce `{ scores: [] }` with no `metrics` key. For
 * objects, `scores` falls back to `[]` and `metrics` to `undefined` whenever
 * the respective property is not an array.
 *
 * @param {unknown} result - Value returned by an evaluate function.
 * @returns {{scores: Array, metrics?: Array}}
 */
function normalizeResult(result) {
  const isObject = typeof result === "object" && result !== null;
  if (!isObject) {
    return { scores: [] };
  }
  let scores = [];
  if (Array.isArray(result.scores)) {
    scores = result.scores;
  }
  let metrics = void 0;
  if (Array.isArray(result.metrics)) {
    metrics = result.metrics;
  }
  return { scores, metrics };
}
1570
/**
 * Current time as an ISO-8601 string with `:` and `.` replaced by `-`.
 *
 * @returns {string} e.g. `2024-01-02T03-04-05-678Z`.
 */
function nowIsoForFile() {
  const iso = (/* @__PURE__ */ new Date()).toISOString();
  return iso.replace(/[:.]/g, "-");
}

/**
 * Builds the artifact file path for a run:
 * `<artifactDirectory>/<datasetId>_<runId>_<timestamp>.jsonl`.
 *
 * @param {string} artifactDirectory - Directory that holds run artifacts.
 * @param {string} datasetId - Dataset identifier.
 * @param {string} runId - Run identifier.
 * @returns {string} Joined path.
 */
function createArtifactPath(artifactDirectory, datasetId, runId) {
  const fileName = `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`;
  return path.join(artifactDirectory, fileName);
}
1579
/**
 * Builds the Effect that executes one queued run.
 *
 * For every test case, every evaluator with an evaluate function is run in
 * order; the results are folded into pass/fail counters, the run snapshot is
 * updated, and a `TestCaseProgress` event is published and queued for
 * persistence. After the last test case a `RunCompleted` event is published
 * and queued, followed by an `ArtifactFlushed` event.
 *
 * Evaluator failures are captured with `Effect.tryPromise` + `Effect.either`.
 * The previous `try/catch` around `yield* Effect.promise(...)` could not see a
 * rejected promise (a rejection there is a defect and the generator is never
 * resumed), so one throwing evaluator aborted the whole run instead of being
 * recorded as a failed evaluator for that test case.
 *
 * @param {object} task - Run task; reads `runId`, `testCases`, `evaluators`
 *   and `snapshot.artifactPath`.
 * @param {Function} publishEvent - Returns an Effect that publishes a run event.
 * @param {object} persistenceQueue - Effect queue receiving artifact messages.
 * @param {Function} updateSnapshot - `(runId, updater) => void` snapshot mutator.
 * @returns {object} Effect performing the run.
 */
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => effect.Effect.gen(function* () {
  const startedAt = Date.now();
  updateSnapshot(task.runId, (snapshot) => ({
    ...snapshot,
    status: "running",
    startedAt
  }));
  yield* publishEvent({
    type: "RunStarted",
    runId: task.runId,
    startedAt
  });
  let completedTestCases = 0;
  let passedTestCases = 0;
  let failedTestCases = 0;
  for (const testCaseItem of task.testCases) {
    const started = Date.now();
    const evaluatorScores = [];
    // Holds the message of the most recent evaluator failure for this test case.
    let testCaseError;
    for (const { id: evaluatorId, evaluator } of task.evaluators) {
      const evaluateFn = evaluator.getEvaluateFn();
      if (!evaluateFn) {
        continue;
      }
      const outcome = yield* effect.Effect.either(
        effect.Effect.tryPromise({
          // The async wrapper turns synchronous throws into rejections as well.
          try: async () => {
            const ctx = await evaluator.resolveContext();
            const result = await evaluateFn(testCaseItem.testCase.getInput(), ctx);
            const { scores, metrics } = normalizeResult(result);
            const passed = computeEvaluatorPassed(evaluator, result, scores);
            return { scores, passed, metrics };
          },
          catch: (error) => error
        })
      );
      if (effect.Either.isRight(outcome)) {
        const { scores, passed, metrics } = outcome.right;
        evaluatorScores.push({ evaluatorId, scores, passed, metrics });
      } else {
        const error = outcome.left;
        testCaseError = error instanceof Error ? error.message : "Evaluator execution failed";
        evaluatorScores.push({
          evaluatorId,
          scores: [],
          passed: false
        });
      }
    }
    const testCasePassed = evaluatorScores.every((s) => s.passed);
    completedTestCases += 1;
    if (testCasePassed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }
    const progressEvent = {
      type: "TestCaseProgress",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      completedTestCases,
      totalTestCases: task.testCases.length,
      passed: testCasePassed,
      durationMs: Date.now() - started,
      evaluatorScores,
      errorMessage: testCaseError
    };
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      completedTestCases,
      passedTestCases,
      failedTestCases
    }));
    yield* publishEvent(progressEvent);
    yield* effect.Queue.offer(persistenceQueue, {
      runId: task.runId,
      artifactPath: task.snapshot.artifactPath,
      payload: progressEvent
    });
  }
  const finishedAt = Date.now();
  const completedEvent = {
    type: "RunCompleted",
    runId: task.runId,
    finishedAt,
    passedTestCases,
    failedTestCases,
    totalTestCases: task.testCases.length,
    artifactPath: task.snapshot.artifactPath
  };
  updateSnapshot(task.runId, (snapshot) => ({
    ...snapshot,
    status: "completed",
    completedTestCases,
    passedTestCases,
    failedTestCases,
    finishedAt
  }));
  yield* publishEvent(completedEvent);
  yield* effect.Queue.offer(persistenceQueue, {
    runId: task.runId,
    artifactPath: task.snapshot.artifactPath,
    payload: completedEvent
  });
  yield* publishEvent({
    type: "ArtifactFlushed",
    runId: task.runId,
    artifactPath: task.snapshot.artifactPath
  });
});
1684
/**
 * Appends `payload` as one JSON line (UTF-8, newline-terminated) to
 * `artifactPath`, creating the parent directory first if needed.
 *
 * @param {string} artifactPath - Target file.
 * @param {unknown} payload - Value serialized with `JSON.stringify`.
 * @returns {Promise<void>}
 */
async function appendJsonLine(artifactPath, payload) {
  const parentDir = path.dirname(artifactPath);
  await promises.mkdir(parentDir, { recursive: true });
  const line = JSON.stringify(payload) + "\n";
  await promises.appendFile(artifactPath, line, "utf8");
}
1689
/**
 * Builds an endlessly repeating Effect that takes one message from `queue`
 * and appends it to the message's artifact file as a JSON line. Each line
 * carries the run id, a `ts` timestamp taken when the line is written, and
 * the fields of the message payload.
 *
 * @param {object} queue - Effect queue of `{ runId, artifactPath, payload }` messages.
 * @returns {object} Effect that repeats the take-and-append step forever.
 */
var createPersistenceWorker = (queue) => {
  const persistOne = effect.Effect.flatMap(
    effect.Queue.take(queue),
    (message) => effect.Effect.promise(() => {
      const line = {
        runId: message.runId,
        ts: Date.now(),
        ...message.payload
      };
      return appendJsonLine(message.artifactPath, line);
    })
  );
  return effect.Effect.forever(persistOne);
};
1701
+
1702
+ // src/runner/search.ts
1703
/**
 * Tests `value` against a regular expression without depending on the
 * expression's previous state: `lastIndex` is reset first, so matchers created
 * with the `g` or `y` flag give the same answer on every call.
 */
function testPattern(pattern, value) {
  pattern.lastIndex = 0;
  return pattern.test(value);
}

/**
 * Checks `value` against a list of matchers; strings must be equal to the
 * value, regular expressions must match it.
 *
 * @param {string} value - Value to test.
 * @param {Array<string|RegExp>} [matchers] - Matchers; missing or empty means "match".
 * @returns {boolean}
 */
function matchesAny(value, matchers) {
  if (!matchers || matchers.length === 0) {
    return true;
  }
  return matchers.some(
    (matcher) => typeof matcher === "string" ? matcher === value : testPattern(matcher, value)
  );
}

/**
 * Checks a path against a list of matchers; strings match as substrings,
 * regular expressions must match the path.
 *
 * @param {string} value - Path to test.
 * @param {Array<string|RegExp>} [matchers] - Matchers; missing or empty means "match".
 * @returns {boolean}
 */
function matchesPath(value, matchers) {
  if (!matchers || matchers.length === 0) {
    return true;
  }
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value.includes(matcher);
    }
    return testPattern(matcher, value);
  });
}

/**
 * Filters collected test cases by tag and path.
 *
 * An item is dropped when one of its tags matches `excludedTags` or its file
 * path matches `excludedPaths`. Of the remaining items, those satisfying both
 * the `includedTags` and `includedPaths` filters are kept; a missing or empty
 * filter imposes no restriction.
 *
 * Empty exclusion lists are now ignored. Before, `excludedTags: []` removed
 * every tagged item and `excludedPaths: []` removed everything, because the
 * matcher helpers treat an empty list as "matches".
 *
 * @param {Array<{filePath: string, testCase: object}>} all - Collected test cases.
 * @param {object} [query] - Optional `includedTags`, `excludedTags`,
 *   `includedPaths`, `excludedPaths` lists of strings or regular expressions.
 * @returns {Array<object>} `all` itself when no query is given, else the filtered list.
 */
function searchCollectedTestCases(all, query) {
  if (!query) {
    return all;
  }
  const isActive = (matchers) => Boolean(matchers) && matchers.length > 0;
  return all.filter((item) => {
    const tags = item.testCase.getTags();
    if (isActive(query.excludedTags) && tags.some((tag) => matchesAny(tag, query.excludedTags))) {
      return false;
    }
    if (isActive(query.excludedPaths) && matchesPath(item.filePath, query.excludedPaths)) {
      return false;
    }
    const includedTagsMatch = !isActive(query.includedTags) || tags.some((tag) => matchesAny(tag, query.includedTags));
    const includedPathsMatch = !isActive(query.includedPaths) || matchesPath(item.filePath, query.includedPaths);
    return includedTagsMatch && includedPathsMatch;
  });
}
1739
+
1740
+ // src/runner/api.ts
1741
/**
 * Splits a `/source/flags` string into its parts.
 *
 * @param {string} pattern - Candidate regex literal.
 * @returns {{source: string, flags: string}|undefined} `undefined` when the
 *   string does not start with `/` or has no second `/`.
 */
function parseRegexLiteral(pattern) {
  if (!pattern.startsWith("/")) {
    return void 0;
  }
  const lastSlash = pattern.lastIndexOf("/");
  if (lastSlash <= 0) {
    return void 0;
  }
  return {
    source: pattern.slice(1, lastSlash),
    flags: pattern.slice(lastSlash + 1)
  };
}

/**
 * Turns a user-supplied name pattern into a predicate.
 *
 * Three forms are recognised after trimming:
 * - `/source/flags`: used as a regular expression with the given flags;
 * - a pattern containing `*`: case-insensitive whole-string glob where `*`
 *   matches any run of characters and everything else is literal;
 * - anything else: case-insensitive exact comparison.
 *
 * Two defects are fixed here: `?` is now escaped in glob patterns (it used to
 * act as a regex quantifier, and a pattern such as `?*` threw), and regex
 * literals with the `g` or `y` flag no longer alternate between results,
 * because `lastIndex` is reset before every test.
 *
 * @param {string} pattern - Name pattern.
 * @returns {(value: string) => boolean} Predicate over names.
 * @throws {SyntaxError} When a `/source/flags` pattern is not a valid regular expression.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const statelessTest = (regex) => (value) => {
    regex.lastIndex = 0;
    return regex.test(value);
  };
  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    return statelessTest(new RegExp(regexLiteral.source, regexLiteral.flags));
  }
  if (normalizedPattern.includes("*")) {
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    return statelessTest(new RegExp(`^${escaped}$`, "i"));
  }
  const expected = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === expected;
}
1768
/**
 * Creates a runner instance.
 *
 * The `overrides` argument is now forwarded to `withRunnerConfig`; previously
 * it was accepted but dropped before the configuration was resolved.
 *
 * @param {object} [overrides] - Optional configuration overrides.
 * @returns {EffectRunner} Runner built from the resolved configuration.
 */
function createRunner(overrides) {
  return new EffectRunner(withRunnerConfig(overrides));
}
1771
+ var EffectRunner = class {
1772
+ constructor(config) {
1773
+ this.eventBus = effect.Effect.runSync(effect.PubSub.unbounded());
1774
+ this.runQueue = effect.Effect.runSync(effect.Queue.unbounded());
1775
+ this.persistenceQueue = effect.Effect.runSync(
1776
+ effect.Queue.unbounded()
1777
+ );
1778
+ this.snapshots = /* @__PURE__ */ new Map();
1779
+ this.listeners = /* @__PURE__ */ new Set();
1780
+ this.datasetsById = /* @__PURE__ */ new Map();
1781
+ this.evaluatorsById = /* @__PURE__ */ new Map();
1782
+ this.schedulerFiber = effect.Effect.runFork(
1783
+ this.createSchedulerEffect()
1784
+ );
1785
+ this.persistenceFiber = effect.Effect.runFork(
1786
+ createPersistenceWorker(this.persistenceQueue)
1787
+ );
1788
+ this.config = config;
1789
+ }
1790
+ async collectDatasets() {
1791
+ const datasets = await collectDatasetsFromFiles(this.config.discovery);
1792
+ this.datasetsById.clear();
1793
+ for (const dataset of datasets) {
1794
+ this.datasetsById.set(dataset.id, dataset);
1795
+ }
1796
+ return datasets;
1797
+ }
1798
+ async collectEvaluators() {
1799
+ const evaluators = await collectEvaluatorsFromFiles(this.config.discovery);
1800
+ this.evaluatorsById.clear();
1801
+ for (const evaluator of evaluators) {
1802
+ this.evaluatorsById.set(evaluator.id, evaluator);
1803
+ }
1804
+ return evaluators;
1805
+ }
1806
+ async resolveDatasetByName(name) {
1807
+ if (this.datasetsById.size === 0) {
1808
+ await this.collectDatasets();
1809
+ }
1810
+ const normalized = name.trim().toLowerCase();
1811
+ return Array.from(this.datasetsById.values()).find(
1812
+ (item) => item.dataset.getName().toLowerCase() === normalized
1813
+ );
1814
+ }
1815
+ async resolveEvaluatorsByNamePattern(pattern) {
1816
+ if (this.evaluatorsById.size === 0) {
1817
+ await this.collectEvaluators();
1818
+ }
1819
+ const matcher = createNameMatcher(pattern);
1820
+ return Array.from(this.evaluatorsById.values()).filter(
1821
+ (item) => matcher(item.evaluator.getName() ?? "")
1822
+ );
1823
+ }
1824
+ async searchTestCases(query) {
1825
+ const testCases = await collectTestCasesFromFiles(this.config.discovery);
1826
+ return searchCollectedTestCases(testCases, query);
1827
+ }
1828
+ async collectDatasetTestCases(datasetId) {
1829
+ if (this.datasetsById.size === 0) {
1830
+ await this.collectDatasets();
1831
+ }
1832
+ const dataset = this.datasetsById.get(datasetId);
1833
+ if (!dataset) {
1834
+ throw new Error(`Unknown dataset: ${datasetId}`);
1835
+ }
1836
+ const allTestCases = await collectTestCasesFromFiles(this.config.discovery);
1837
+ return allTestCases.filter(
1838
+ (testCase) => dataset.dataset.matchesTestCase(testCase.testCase, testCase.filePath)
1839
+ );
1840
+ }
1841
+ async runDatasetWith(request) {
1842
+ if (this.datasetsById.size === 0) {
1843
+ await this.collectDatasets();
1844
+ }
1845
+ if (this.evaluatorsById.size === 0) {
1846
+ await this.collectEvaluators();
1847
+ }
1848
+ const dataset = this.datasetsById.get(request.datasetId);
1849
+ if (!dataset) {
1850
+ throw new Error(`Unknown dataset: ${request.datasetId}`);
1851
+ }
1852
+ const selectedEvaluators = request.evaluatorIds.map((id) => this.evaluatorsById.get(id)).filter((value) => Boolean(value)).map((value) => ({ id: value.id, evaluator: value.evaluator }));
1853
+ if (selectedEvaluators.length === 0) {
1854
+ throw new Error("No evaluators selected for run");
1855
+ }
1856
+ const selectedTestCases = await this.collectDatasetTestCases(request.datasetId);
1857
+ const runId = `run-${crypto.randomUUID()}`;
1858
+ const artifactPath = createArtifactPath(
1859
+ this.config.artifactDirectory,
1860
+ request.datasetId,
1861
+ runId
1862
+ );
1863
+ const snapshot = {
1864
+ runId,
1865
+ datasetId: request.datasetId,
1866
+ datasetName: dataset.dataset.getName(),
1867
+ evaluatorIds: selectedEvaluators.map((item) => item.id),
1868
+ queuedAt: Date.now(),
1869
+ totalTestCases: selectedTestCases.length,
1870
+ completedTestCases: 0,
1871
+ passedTestCases: 0,
1872
+ failedTestCases: 0,
1873
+ status: "queued",
1874
+ artifactPath
1875
+ };
1876
+ this.snapshots.set(runId, snapshot);
1877
+ const queuedEvent = {
1878
+ type: "RunQueued",
1879
+ runId,
1880
+ datasetId: request.datasetId,
1881
+ datasetName: dataset.dataset.getName(),
1882
+ evaluatorIds: selectedEvaluators.map((item) => item.id),
1883
+ totalTestCases: selectedTestCases.length,
1884
+ artifactPath
1885
+ };
1886
+ await effect.Effect.runPromise(this.publishEvent(queuedEvent));
1887
+ await effect.Effect.runPromise(
1888
+ effect.Queue.offer(this.persistenceQueue, {
1889
+ runId,
1890
+ artifactPath,
1891
+ payload: queuedEvent
1892
+ })
1893
+ );
1894
+ await effect.Effect.runPromise(
1895
+ effect.Queue.offer(this.runQueue, {
1896
+ runId,
1897
+ datasetId: request.datasetId,
1898
+ dataset: dataset.dataset,
1899
+ evaluators: selectedEvaluators,
1900
+ testCases: selectedTestCases,
1901
+ snapshot
1902
+ })
1903
+ );
1904
+ return snapshot;
1905
+ }
1906
+ subscribeRunEvents(listener, options) {
1907
+ const entry = { runId: options?.runId, listener };
1908
+ this.listeners.add(entry);
1909
+ return () => {
1910
+ this.listeners.delete(entry);
1911
+ };
1912
+ }
1913
+ getRunSnapshot(runId) {
1914
+ return this.snapshots.get(runId);
1915
+ }
1916
+ getAllRunSnapshots() {
1917
+ return Array.from(this.snapshots.values()).sort(
1918
+ (a, b) => b.queuedAt - a.queuedAt
1919
+ );
1920
+ }
1921
+ async shutdown() {
1922
+ await effect.Effect.runPromise(effect.Fiber.interrupt(this.schedulerFiber));
1923
+ await effect.Effect.runPromise(effect.Fiber.interrupt(this.persistenceFiber));
1924
+ await effect.Effect.runPromise(effect.Queue.shutdown(this.runQueue));
1925
+ await effect.Effect.runPromise(effect.Queue.shutdown(this.persistenceQueue));
1926
+ await effect.Effect.runPromise(effect.PubSub.shutdown(this.eventBus));
1927
+ }
1928
+ createSchedulerEffect() {
1929
+ const self = this;
1930
+ return effect.Effect.forever(
1931
+ effect.Effect.gen(function* () {
1932
+ const task = yield* effect.Queue.take(self.runQueue);
1933
+ yield* effect.Effect.fork(
1934
+ executeRunTask(
1935
+ task,
1936
+ self.publishEvent.bind(self),
1937
+ self.persistenceQueue,
1938
+ self.updateSnapshot.bind(self)
1939
+ )
1940
+ );
1941
+ })
1942
+ );
1943
+ }
1944
+ updateSnapshot(runId, updater) {
1945
+ const existing = this.snapshots.get(runId);
1946
+ if (!existing) {
1947
+ return;
1948
+ }
1949
+ this.snapshots.set(runId, updater(existing));
1950
+ }
1951
+ publishEvent(event) {
1952
+ return effect.Effect.sync(() => {
1953
+ for (const entry of this.listeners) {
1954
+ if (entry.runId && entry.runId !== event.runId) {
1955
+ continue;
1956
+ }
1957
+ entry.listener(event);
1958
+ }
1959
+ }).pipe(
1960
+ effect.Effect.flatMap(() => effect.PubSub.publish(this.eventBus, event)),
1961
+ effect.Effect.asVoid
1962
+ );
1963
+ }
1964
+ };
1965
/**
 * CLI entry point.
 *
 * Parses the startup arguments, creates the runner, loads the data shown by
 * the UI, installs signal handlers and starts the fullscreen Ink app.
 *
 * @returns {Promise<void>} Resolves once the fullscreen app has been started.
 */
async function main() {
  const args = parseStartupArgs(process.argv.slice(2));
  const runner = createRunner();
  // Deliberate best effort: when the runner data cannot be loaded, the UI
  // falls back to mock data instead of failing to start.
  const data = await loadRunnerData(runner).catch(() => loadMockData());

  // One shared handler for both termination signals: shut the runner down,
  // then exit with status 0 whether or not the shutdown succeeded.
  const shutdownAndExit = () => {
    void runner.shutdown().finally(() => process.exit(0));
  };
  for (const signal of ["SIGINT", "SIGTERM"]) {
    process.on(signal, shutdownAndExit);
  }

  // Await the start so a failure rejects main() instead of being left on a
  // floating promise.
  await fullscreenInk.withFullScreen(
    /* @__PURE__ */ jsxRuntime.jsx(EvalsCliApp, { data, args, runner })
  ).start();
}

// Report startup failures explicitly and exit non-zero rather than relying
// on the runtime's unhandled-rejection behavior.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});
1980
+ //# sourceMappingURL=out.js.map
1981
+ //# sourceMappingURL=cli.cjs.map