@m4trix/evals 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js ADDED
@@ -0,0 +1,1974 @@
1
+ #!/usr/bin/env node
2
+ import { withFullScreen, useScreenSize } from 'fullscreen-ink';
3
+ import React, { useState, useReducer, useEffect, useMemo } from 'react';
4
+ import { useApp, useInput, Box, Text } from 'ink';
5
+ import { jsx, jsxs, Fragment } from 'react/jsx-runtime';
6
+ import { randomUUID } from 'crypto';
7
+ import { Effect, PubSub, Queue, Fiber } from 'effect';
8
+ import { mkdir, appendFile, readdir } from 'fs/promises';
9
+ import { relative, join, dirname, resolve } from 'path';
10
+ import { pathToFileURL } from 'url';
11
+
12
const SEP = " ";
const ARROW = "\u203A";

/**
 * Builds the breadcrumb trail shown for the current navigation level.
 *
 * @param {{ level: string }} state - CLI state; only `level` is read.
 * @param {string} [datasetName] - Name shown after "Dataset:" ("-" when nullish).
 * @param {string} [runLabel] - Label shown after "Run:" ("-" when nullish).
 * @returns A Fragment element whose children alternate Text crumbs and separators.
 */
function getBreadcrumbText(state, datasetName, runLabel) {
  // Gray crumb; keyed by the explicit key, or by its own text when none is given.
  const muted = (text, key) => jsx(Text, { color: "gray", children: text }, key ?? text);
  // Highlighted crumb for the level the user is currently on.
  const current = (text) => jsx(Text, { color: "cyan", bold: true, children: text }, text);
  // White value crumb with a "-" placeholder for missing values.
  const value = (text, key) => jsx(Text, { color: "white", children: text ?? "-" }, key);
  // Separator, arrow, separator.
  const arrow = (key) => [SEP, muted(ARROW, key), SEP];

  const root = [muted("Evaluations"), ...arrow("a1")];
  const datasetCrumb = () => [muted("Dataset:"), " ", value(datasetName, "ds")];

  let trail;
  switch (state.level) {
    case "datasets":
      trail = [...root, current("Datasets")];
      break;
    case "runs":
      trail = [...root, ...datasetCrumb(), ...arrow("a2"), current("Runs")];
      break;
    case "details":
      trail = [
        ...root,
        ...datasetCrumb(),
        ...arrow("a2"),
        muted("Run:"),
        " ",
        value(runLabel, "rl"),
        ...arrow("a3"),
        current("Details")
      ];
      break;
    default:
      // Any other level is rendered as the "new evaluation" flow.
      trail = [
        ...root,
        current("New evaluation"),
        ...arrow("a2"),
        muted("Select evaluators", "sel")
      ];
  }
  return jsxs(Fragment, { children: trail });
}
74
+
75
+ // src/cli/components/Footer.tsx
76
/**
 * Returns the key-binding hint line for the current navigation level.
 *
 * @param {{ level: string }} state - CLI state; only `level` is read.
 * @returns {string} Hint text; unrecognized levels get the new-evaluation hints.
 */
function getFooterText(state) {
  switch (state.level) {
    case "datasets":
      return "\u2191\u2193 move Enter open / search Tab focus q quit";
    case "runs":
      return "\u2191\u2193 move Enter details Backspace datasets Tab focus q quit";
    case "details":
      return "\u2191\u2193 scroll Backspace runs Tab focus q quit";
    default:
      return "\u2191\u2193 move Enter add/remove S start run / search Esc cancel q quit";
  }
}
88
/**
 * One menu row: a marker prefix followed by the label.
 *
 * @param {object} props
 * @param {boolean} props.selected - Selected rows are cyan, bold and prefixed with a triangle.
 * @param {string} props.label - Row text.
 * @param {string} props.itemKey - Key passed to the rendered Text element.
 */
function ListItem({ selected, label, itemKey }) {
  const marker = selected ? "\u25B8 " : " ";
  const color = selected ? "cyan" : "gray";
  return jsxs(Text, { color, bold: selected, children: [marker, label] }, itemKey);
}
98
/**
 * Bordered column container used for the left and right panes.
 *
 * @param {object} props
 * @param {*} props.children - Pane content.
 * @param {number} [props.width] - Forwarded to Box.
 * @param {number} [props.flexGrow] - Forwarded to Box.
 * @param {number} [props.marginLeft] - Forwarded to Box.
 * @param {boolean} [props.focused=false] - Focused panes get a single cyan border,
 *   unfocused panes a round gray one.
 */
function Pane({ children, width, flexGrow, marginLeft, focused = false }) {
  const border = focused
    ? { borderStyle: "single", borderColor: "cyan" }
    : { borderStyle: "round", borderColor: "gray" };
  return jsx(Box, {
    flexDirection: "column",
    width,
    flexGrow,
    marginLeft,
    ...border,
    paddingX: 1,
    children
  });
}
119
/**
 * Section title rendered as bold cyan text.
 *
 * @param {object} props
 * @param {*} props.children - Title content.
 */
function SectionHeader(props) {
  const { children } = props;
  return jsx(Text, { color: "cyan", bold: true, children });
}
124
/**
 * Renders a run status in parentheses, colored by outcome.
 *
 * @param {object} props
 * @param {string} props.status - "PASS" is green, "RUNNING" yellow, anything else red.
 */
function StatusText({ status }) {
  let color = "red";
  if (status === "PASS") {
    color = "green";
  } else if (status === "RUNNING") {
    color = "yellow";
  }
  return jsxs(Text, { color, children: ["(", status, ")"] });
}
132
const LEFT_PANE_WIDTH = 44;

/**
 * Left pane listing "New evaluation" followed by one row per run.
 *
 * Menu index 0 is the "New evaluation" entry, so run `i` is selected when
 * `state.runMenuIndex === i + 1`.
 *
 * @param {object} props
 * @param {{ focus: string, runMenuIndex: number }} props.state - CLI state.
 * @param {Array<{ id: string, label: string, status: string }>} props.runs - Runs to list.
 */
function RunsSidebar({ state, runs }) {
  const { runMenuIndex } = state;

  const runRows = runs.map((run, position) => {
    const isSelected = runMenuIndex === position + 1;
    return jsxs(
      Text,
      {
        color: isSelected ? "cyan" : "gray",
        bold: isSelected,
        children: [
          isSelected ? "\u25B8 " : " ",
          run.label,
          " ",
          jsx(StatusText, { status: run.status })
        ]
      },
      run.id
    );
  });

  return jsxs(Pane, {
    width: LEFT_PANE_WIDTH,
    focused: state.focus === "left",
    children: [
      jsx(SectionHeader, { children: "Runs" }),
      jsx(ListItem, {
        selected: runMenuIndex === 0,
        label: "New evaluation",
        itemKey: "runs-new-eval"
      }),
      runRows
    ]
  });
}
164
const BLOCKS = ["\u2581", "\u2582", "\u2583", "\u2584", "\u2585", "\u2586", "\u2587", "\u2588"];

/**
 * Renders a numeric series as a row of block characters, scaled between the
 * series minimum and maximum.
 *
 * @param {object} props
 * @param {number[]} props.data - Series to plot; an empty series renders nothing.
 * @param {number} [props.width] - Maximum number of columns; defaults to
 *   `min(data.length, 24)`. Longer series are averaged into that many buckets.
 * @param {string} [props.label] - Optional gray label, padded to 14 characters.
 * @returns A Text element, or null when `data` is empty.
 */
function Sparkline({ data, width, label }) {
  if (data.length === 0) {
    return null;
  }

  const lowest = Math.min(...data);
  const highest = Math.max(...data);
  // A flat series has zero spread; fall back to 1 to avoid dividing by zero.
  const spread = highest - lowest || 1;
  const columns = width ?? Math.min(data.length, 24);

  let samples = data;
  if (data.length > columns) {
    const bucketSize = data.length / columns;
    samples = Array.from({ length: columns }, (_, column) => {
      const from = Math.floor(column * bucketSize);
      const to = Math.floor((column + 1) * bucketSize);
      const bucket = data.slice(from, to);
      let sum = 0;
      for (const sample of bucket) {
        sum += sample;
      }
      return sum / bucket.length;
    });
  }

  const glyphs = samples.map((sample) => {
    const level = Math.floor(((sample - lowest) / spread) * 8);
    // The maximum maps to 8, so cap at the last glyph.
    return BLOCKS[Math.min(7, level)];
  });
  const spark = glyphs.join("");

  const hasLabel = label !== void 0 && label !== "";
  const labelNode = hasLabel
    ? jsxs(Text, { color: "gray", children: [label.padEnd(14), " "] })
    : null;

  return jsxs(Text, {
    children: [labelNode, jsx(Text, { color: "cyan", children: spark })]
  });
}
201
/**
 * Maps a percentage to a traffic-light color.
 *
 * @param {number} pct - Percentage value.
 * @returns {"green"|"yellow"|"red"} green at 70 and above, yellow at 40 and above, otherwise red.
 */
function barColor(pct) {
  if (pct >= 70) {
    return "green";
  }
  return pct >= 40 ? "yellow" : "red";
}
208
/**
 * Horizontal text bar: `label [████░░░░] value`.
 *
 * @param {object} props
 * @param {string} props.label - Row label, padded to `labelWidth`.
 * @param {number} props.value - Value to plot; clamped to `[0, max]` for the bar only.
 * @param {number} [props.max=100] - Value that fills the whole bar.
 * @param {number} [props.labelWidth=14] - Width the label is padded to.
 * @param {number} [props.barWidth=20] - Number of cells in the bar.
 * @param {(v: number) => string} [props.format] - Formats the unclamped value shown after the bar.
 * @param {boolean} [props.colorByValue=true] - Color the bar and value via `barColor`.
 */
function TextBar({
  label,
  value,
  max = 100,
  labelWidth = 14,
  barWidth = 20,
  format = (v) => String(v),
  colorByValue = true
}) {
  const clamped = Math.max(0, Math.min(max, value));
  // One guarded ratio feeds both the percentage and the fill count. The
  // original guarded only `pct`, so max <= 0 (or a NaN value) made the fill
  // count NaN and rendered an empty "[]" instead of an all-empty bar.
  const ratio = max > 0 && !Number.isNaN(clamped) ? clamped / max : 0;
  const pct = ratio * 100;
  // Never negative: String.prototype.repeat throws a RangeError on negative counts.
  const filled = Math.max(0, Math.round(ratio * barWidth));
  const filledBar = "\u2588".repeat(filled);
  const emptyBar = "\u2591".repeat(Math.max(0, barWidth - filled));
  const color = colorByValue ? barColor(pct) : void 0;
  return /* @__PURE__ */ jsxs(Text, { children: [
    /* @__PURE__ */ jsx(Text, { color: "gray", children: label.padEnd(labelWidth) }),
    " [",
    color ? /* @__PURE__ */ jsxs(Fragment, { children: [
      /* @__PURE__ */ jsx(Text, { color, children: filledBar }),
      /* @__PURE__ */ jsx(Text, { color: "gray", children: emptyBar })
    ] }) : filledBar + emptyBar,
    "] ",
    /* @__PURE__ */ jsx(Text, { color: color ?? "white", bold: true, children: format(value) })
  ] });
}
234
+
235
+ // src/cli/keys.ts
236
/**
 * Tells whether the typed input is the quit key ("q", case-insensitive).
 *
 * @param {string} input - Raw input string.
 * @returns {boolean}
 */
function isQuitInput(input) {
  const normalized = input.toLowerCase();
  return normalized === "q";
}
239
/**
 * Tells whether the typed input is the search key ("/").
 *
 * @param {string} input - Raw input string.
 * @returns {boolean}
 */
function isSearchInput(input) {
  const SEARCH_KEY = "/";
  return SEARCH_KEY === input;
}
242
/**
 * Tells whether the input is exactly one character at or above space and
 * not DEL (0x7F).
 *
 * @param {string} input - Raw input string.
 * @returns {boolean}
 */
function isPrintableCharacter(input) {
  if (input.length !== 1) {
    return false;
  }
  if (!(input >= " ")) {
    return false;
  }
  return input !== "\x7F";
}
245
/**
 * Tells whether the key event is a "go back" key (backspace or delete).
 *
 * @param {{ backspace?: boolean, delete?: boolean }} key - Key flags object.
 * @returns The first truthy flag, otherwise the value of `key.delete`.
 */
function isBackKey(key) {
  const { backspace, delete: del } = key;
  return backspace || del;
}
248
+
249
+ // src/cli/data.mock.json
250
+ var data_mock_default = { // Inlined fixture (src/cli/data.mock.json); returned as-is by loadMockData().
251
+ datasets: [ // Each dataset: id, name, overview text and its list of prior runs.
252
+ {
253
+ id: "onboarding-flows",
254
+ name: "onboarding-flows",
255
+ overview: "Evaluate first-user journeys and schema compliance for generated onboarding payloads.",
256
+ runs: [
257
+ {
258
+ id: "run_2026-02-17_2044",
259
+ label: "2026-02-17 20:44",
260
+ status: "FAILED", // StatusText colors "PASS" green, "RUNNING" yellow, anything else red.
261
+ performance: {
262
+ passRate: 96,
263
+ avgScore: 0.91, // Views multiply this by 100 before plotting it on a bar.
264
+ latencyP95Ms: 710,
265
+ latencyAvgMs: 502,
266
+ tokensAvg: 171,
267
+ tokensP95: 230,
268
+ costUsd: 24e-4, // 0.0024 written in exponent notation.
269
+ latencyHistoryMs: [380, 420, 510, 480, 550, 620, 590, 710, 520, 480, 530, 600] // Series drawn by Sparkline.
270
+ },
271
+ dimensions: [ // Each entry is plotted as a TextBar with the default max of 100.
272
+ { name: "correctness", score: 82 },
273
+ { name: "faithfulness", score: 79 },
274
+ { name: "brevity", score: 68 },
275
+ { name: "style", score: 90 }
276
+ ],
277
+ checks: [ // Boolean checks; `detail` is optional.
278
+ { name: "json_schema", passed: false, detail: "3 violations" },
279
+ { name: "tool_calls", passed: true, detail: "0 unexpected" },
280
+ { name: "pii_leak", passed: true },
281
+ { name: "jailbreak", passed: true }
282
+ ],
283
+ failures: [
284
+ { title: "product_parser \u203A conforms to schema (price: string)" },
285
+ { title: "checkout \u203A tool-call count mismatch" }
286
+ ],
287
+ meta: {
288
+ model: "gpt-4o-mini",
289
+ provider: "OpenAI",
290
+ commit: "2f3c1a9",
291
+ branch: "main",
292
+ seed: 42,
293
+ concurrency: 4,
294
+ duration: "00:01:12",
295
+ artifact: "./eval-results/run_2026-02-17.jsonl"
296
+ }
297
+ },
298
+ {
299
+ id: "run_2026-02-16_1112",
300
+ label: "2026-02-16 11:12",
301
+ status: "PASS",
302
+ performance: {
303
+ passRate: 99,
304
+ avgScore: 0.95,
305
+ latencyP95Ms: 650,
306
+ latencyAvgMs: 488,
307
+ tokensAvg: 168,
308
+ tokensP95: 220,
309
+ costUsd: 2e-3,
310
+ latencyHistoryMs: [420, 450, 480, 460, 520, 490, 510, 650, 440, 470, 500, 480]
311
+ },
312
+ dimensions: [
313
+ { name: "correctness", score: 89 },
314
+ { name: "faithfulness", score: 88 },
315
+ { name: "brevity", score: 72 },
316
+ { name: "style", score: 93 }
317
+ ],
318
+ checks: [
319
+ { name: "json_schema", passed: true, detail: "0 violations" },
320
+ { name: "tool_calls", passed: true, detail: "0 unexpected" },
321
+ { name: "pii_leak", passed: true },
322
+ { name: "jailbreak", passed: true }
323
+ ],
324
+ failures: [],
325
+ meta: {
326
+ model: "gpt-4o-mini",
327
+ provider: "OpenAI",
328
+ commit: "0d24f8f",
329
+ branch: "main",
330
+ seed: 42,
331
+ concurrency: 4,
332
+ duration: "00:01:06",
333
+ artifact: "./eval-results/run_2026-02-16.jsonl"
334
+ }
335
+ },
336
+ {
337
+ id: "run_2026-02-15_0921",
338
+ label: "2026-02-15 09:21",
339
+ status: "PASS",
340
+ performance: {
341
+ passRate: 98,
342
+ avgScore: 0.93,
343
+ latencyP95Ms: 680,
344
+ latencyAvgMs: 495,
345
+ tokensAvg: 175,
346
+ tokensP95: 235,
347
+ costUsd: 22e-4,
348
+ latencyHistoryMs: [450, 480, 520, 490, 550, 580, 620, 680, 510, 470, 530, 560]
349
+ },
350
+ dimensions: [
351
+ { name: "correctness", score: 86 },
352
+ { name: "faithfulness", score: 84 },
353
+ { name: "brevity", score: 70 },
354
+ { name: "style", score: 91 }
355
+ ],
356
+ checks: [
357
+ { name: "json_schema", passed: true, detail: "0 violations" },
358
+ { name: "tool_calls", passed: true, detail: "0 unexpected" },
359
+ { name: "pii_leak", passed: true },
360
+ { name: "jailbreak", passed: true }
361
+ ],
362
+ failures: [],
363
+ meta: {
364
+ model: "gpt-4o-mini",
365
+ provider: "OpenAI",
366
+ commit: "a1b2c3d",
367
+ branch: "main",
368
+ seed: 42,
369
+ concurrency: 4,
370
+ duration: "00:01:08",
371
+ artifact: "./eval-results/run_2026-02-15.jsonl"
372
+ }
373
+ }
374
+ ]
375
+ },
376
+ {
377
+ id: "tool-calls",
378
+ name: "tool-calls",
379
+ overview: "Validate function-call conformance and unexpected tool invocation behavior.",
380
+ runs: [
381
+ {
382
+ id: "run_2026-02-14_1530",
383
+ label: "2026-02-14 15:30",
384
+ status: "PASS",
385
+ performance: {
386
+ passRate: 100,
387
+ avgScore: 1,
388
+ latencyP95Ms: 320,
389
+ latencyAvgMs: 280,
390
+ tokensAvg: 45,
391
+ tokensP95: 62,
392
+ costUsd: 8e-4,
393
+ latencyHistoryMs: [250, 270, 290, 280, 310, 320, 265, 290, 300, 275]
394
+ },
395
+ dimensions: [
396
+ { name: "contract_match", score: 100 },
397
+ { name: "arg_validity", score: 100 }
398
+ ],
399
+ checks: [
400
+ { name: "tool_calls", passed: true, detail: "0 unexpected" }
401
+ ],
402
+ failures: [],
403
+ meta: {
404
+ model: "gpt-4o-mini",
405
+ provider: "OpenAI",
406
+ commit: "e4f5g6h",
407
+ branch: "feat/tools",
408
+ seed: 42,
409
+ concurrency: 8,
410
+ duration: "00:00:45",
411
+ artifact: "./eval-results/tool-calls_2026-02-14.jsonl"
412
+ }
413
+ }
414
+ ]
415
+ },
416
+ {
417
+ id: "json-schema",
418
+ name: "json-schema",
419
+ overview: "Stress-test schema fidelity across generated extraction payloads.",
420
+ runs: [] // Dataset with no prior runs.
421
+ }
422
+ ],
423
+ evaluators: [ // Evaluator options: id, display name and a config preview string.
424
+ { id: "json-schema-validator", name: "JSON Schema Validator", configPreview: "strict=true" },
425
+ { id: "tool-call-contract-checker", name: "Tool-call Contract Checker", configPreview: "unexpectedCalls=error" },
426
+ { id: "rubric-judge", name: "Rubric Judge (LLM)", configPreview: "model=gpt-4o-mini; scale=0-100" },
427
+ { id: "pii-leak-detector", name: "PII Leak Detector", configPreview: "redact=false" }
428
+ ]
429
+ };
430
+
431
+ // src/cli/state.ts
432
/**
 * Returns the bundled mock data set (the shared object, not a copy).
 *
 * @returns {{ datasets: object[], evaluators: object[] }}
 */
function loadMockData() {
  const fixture = data_mock_default;
  return fixture;
}
435
/**
 * Converts text to a lowercase, dash-separated slug.
 *
 * @param {string} input - Text to slugify.
 * @returns {string} Lowercased text with each run of characters outside a-z and
 *   0-9 collapsed to one "-", and leading/trailing dashes removed.
 */
function toSlug(input) {
  const lowered = input.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9]+/g, "-");
  const trimmed = dashed.replace(/^-+|-+$/g, "");
  return trimmed;
}
438
/**
 * Converts a runner snapshot into the run shape the views render.
 *
 * Token, cost, model and seed fields are filled with fixed placeholder values;
 * both latency figures are the run duration divided by the test-case count.
 *
 * @param {object} snapshot - Runner snapshot. Reads runId, status,
 *   totalTestCases, passedTestCases, failedTestCases, queuedAt, startedAt,
 *   finishedAt, errorMessage and artifactPath.
 * @returns {object} Run with id, label, status, performance, dimensions,
 *   checks, failures and meta.
 */
function toEvalRun(snapshot) {
  const { runId, status, passedTestCases, failedTestCases, errorMessage } = snapshot;

  // Treat an empty run as one case so the ratios below never divide by zero.
  const total = snapshot.totalTestCases === 0 ? 1 : snapshot.totalTestCases;
  const percentOf = (count) => Math.round(count / total * 100);

  // Runs that never started are timed from when they were queued; runs that
  // have not finished are timed up to now.
  const beganAt = snapshot.startedAt ?? snapshot.queuedAt;
  const endedAt = snapshot.finishedAt ? snapshot.finishedAt : Date.now();
  const durationMs = endedAt - beganAt;
  const perCaseMs = Math.max(1, Math.floor(durationMs / Math.max(1, total)));

  let displayStatus = "RUNNING";
  if (status === "completed") {
    displayStatus = "PASS";
  } else if (status === "failed") {
    displayStatus = "FAILED";
  }

  const hasError = errorMessage && errorMessage.length > 0;

  return {
    id: runId,
    label: runId.slice(0, 12),
    status: displayStatus,
    performance: {
      passRate: percentOf(passedTestCases),
      avgScore: passedTestCases / total,
      latencyP95Ms: perCaseMs,
      latencyAvgMs: perCaseMs,
      tokensAvg: 0,
      tokensP95: 0,
      costUsd: 0,
      latencyHistoryMs: [durationMs]
    },
    dimensions: [
      { name: "passed", score: percentOf(passedTestCases) },
      { name: "failed", score: percentOf(failedTestCases) }
    ],
    checks: [
      { name: "run_status", passed: status === "completed", detail: status }
    ],
    failures: hasError ? [{ title: errorMessage }] : [],
    meta: {
      model: "n/a",
      provider: "runner",
      commit: "local",
      branch: "local",
      seed: 0,
      concurrency: 1,
      duration: `${durationMs}ms`,
      artifact: snapshot.artifactPath
    }
  };
}
481
/**
 * Builds a dataset entry for the views from a discovered dataset and the
 * runner's snapshots.
 *
 * @param {{ id: string, filePath: string, dataset: { getName(): string } }} item - Discovered dataset.
 * @param {object[]} snapshots - All run snapshots; only those whose `datasetId`
 *   equals `item.id` are used, ordered by `queuedAt` descending.
 * @returns {{ id: string, name: string, overview: string, runs: object[] }}
 */
function toEvalDataset(item, snapshots) {
  const ownSnapshots = snapshots.filter((candidate) => candidate.datasetId === item.id);
  // filter() returned a fresh array, so sorting in place leaves `snapshots` untouched.
  ownSnapshots.sort((first, second) => second.queuedAt - first.queuedAt);
  const runs = ownSnapshots.map(toEvalRun);
  return {
    id: item.id,
    name: item.dataset.getName(),
    overview: `Discovered from ${item.filePath}`,
    runs
  };
}
490
/**
 * Builds an evaluator menu option from a discovered evaluator.
 *
 * @param {{ id: string, filePath: string, evaluator: { getName(): (string|null|undefined) } }} item
 * @returns {{ id: string, name: string, configPreview: string }} `name` is the
 *   evaluator's own name, or a slug of its id when the name is null/undefined.
 */
function toEvaluatorOption(item) {
  const { id, filePath, evaluator } = item;
  const declaredName = evaluator.getName();
  const name = declaredName ?? toSlug(id);
  return { id, name, configPreview: `Source: ${filePath}` };
}
497
/**
 * Loads datasets and evaluators from the runner and shapes them for the views.
 *
 * Falls back to the bundled mock data when the runner discovers neither
 * datasets nor evaluators.
 *
 * @param {object} runner - Must provide collectDatasets(), collectEvaluators()
 *   and getAllRunSnapshots().
 * @returns {Promise<{ datasets: object[], evaluators: object[] }>}
 */
async function loadRunnerData(runner) {
  const discovery = [runner.collectDatasets(), runner.collectEvaluators()];
  const [datasets, evaluators] = await Promise.all(discovery);
  const snapshots = runner.getAllRunSnapshots();

  const nothingDiscovered = datasets.length === 0 && evaluators.length === 0;
  if (nothingDiscovered) {
    return loadMockData();
  }

  return {
    datasets: datasets.map((item) => toEvalDataset(item, snapshots)),
    evaluators: evaluators.map((item) => toEvaluatorOption(item))
  };
}
511
/**
 * Folds a runner event into the view data without mutating it.
 *
 * The event's run is looked up on the runner, converted with toEvalRun and
 * either replaces the existing run with the same id or is prepended to its
 * dataset's runs.
 *
 * @param {{ datasets: object[] }} data - Current view data.
 * @param {{ runId: string }} event - Runner event.
 * @param {{ getRunSnapshot(runId: string): (object|undefined) }} runner
 * @returns {object} New data object, or `data` itself when the snapshot or its
 *   dataset cannot be found.
 */
function applyRunnerEvent(data, event, runner) {
  const snapshot = runner.getRunSnapshot(event.runId);
  const target = snapshot
    ? data.datasets.find((candidate) => candidate.id === snapshot.datasetId)
    : void 0;
  if (!target) {
    return data;
  }

  const updated = toEvalRun(snapshot);
  const alreadyListed = target.runs.some((existing) => existing.id === updated.id);
  let runs;
  if (alreadyListed) {
    runs = target.runs.map((existing) => (existing.id === updated.id ? updated : existing));
  } else {
    runs = [updated, ...target.runs];
  }

  const datasets = data.datasets.map((candidate) =>
    candidate.id === target.id ? { ...candidate, runs } : candidate
  );
  return { ...data, datasets };
}
530
/**
 * Parses CLI startup arguments.
 *
 * Recognized options are `--dataset`, `--run` and `--search`, each taking a
 * value either as the next token (`--dataset foo`) or inline
 * (`--dataset=foo`). Everything else, including a recognized flag given as the
 * last token with no value, is collected in `unknownArgs`.
 *
 * @param {string[]} argv - Argument tokens (without the node/script prefix).
 * @returns {{ unknownArgs: string[], datasetId?: string, runId?: string, search?: string }}
 */
function parseStartupArgs(argv) {
  // Flag name -> result field; one table replaces three copy-pasted branches.
  const valueFlags = new Map([
    ["--dataset", "datasetId"],
    ["--run", "runId"],
    ["--search", "search"]
  ]);
  const args = { unknownArgs: [] };

  for (let index = 0; index < argv.length; index += 1) {
    const token = argv[index];
    const equalsAt = typeof token === "string" ? token.indexOf("=") : -1;
    const flag = equalsAt > 0 ? token.slice(0, equalsAt) : token;
    const field = valueFlags.get(flag);

    if (field === void 0) {
      args.unknownArgs.push(token);
      continue;
    }
    if (equalsAt > 0) {
      // Inline form: --flag=value (the value may be empty).
      args[field] = token.slice(equalsAt + 1);
      continue;
    }
    // Check for presence, not truthiness, so an explicit empty value
    // (`--search ""`) is accepted instead of being reported as unknown.
    if (index + 1 < argv.length) {
      args[field] = argv[index + 1];
      index += 1;
      continue;
    }
    args.unknownArgs.push(token);
  }
  return args;
}
553
/**
 * Filters datasets by a case-insensitive substring match on their name.
 *
 * @param {{ datasets: Array<{ name: string }> }} data - View data.
 * @param {string} searchQuery - Raw query; surrounding whitespace is ignored.
 * @returns {object[]} `data.datasets` itself when the query is blank,
 *   otherwise a new array with the matching datasets.
 */
function getFilteredDatasets(data, searchQuery) {
  const needle = searchQuery.trim().toLowerCase();
  if (needle === "") {
    return data.datasets;
  }
  return data.datasets.filter(({ name }) => name.toLowerCase().includes(needle));
}
560
/**
 * Resolves a dataset menu index to a dataset.
 *
 * Menu index 0 (or below) is not a dataset row, so dataset `i` sits at menu
 * index `i + 1`.
 *
 * @param {object[]} datasets - Datasets in menu order.
 * @param {number} menuIndex - Current menu index.
 * @returns {object|undefined}
 */
function getDatasetByMenuIndex(datasets, menuIndex) {
  const isNonDatasetRow = menuIndex <= 0;
  return isNonDatasetRow ? void 0 : datasets[menuIndex - 1];
}
566
/**
 * Resolves a run menu index to a run of the given dataset.
 *
 * Menu index 0 (or below) is not a run row, so run `i` sits at menu index `i + 1`.
 *
 * @param {{ runs: object[] }|undefined} dataset - Selected dataset, if any.
 * @param {number} menuIndex - Current menu index.
 * @returns {object|undefined}
 */
function getRunByMenuIndex(dataset, menuIndex) {
  const nothingToResolve = !dataset || menuIndex <= 0;
  return nothingToResolve ? void 0 : dataset.runs[menuIndex - 1];
}
572
/**
 * Builds the initial CLI state from the loaded data and the startup arguments.
 *
 * A `--dataset` that matches a visible dataset (after the search filter) opens
 * at the "runs" level; a matching `--run` inside it opens at "details".
 * Anything that cannot be resolved is reported through `startupWarnings`.
 *
 * @param {{ datasets: object[], evaluators: object[] }} data - View data.
 * @param {{ unknownArgs: string[], datasetId?: string, runId?: string, search?: string }} args
 * @returns {object} Initial state for reduceCliState.
 */
function createInitialState(data, args) {
  const { unknownArgs, datasetId, runId } = args;
  const startupWarnings = [];
  if (unknownArgs.length > 0) {
    startupWarnings.push(
      `Unknown args: ${unknownArgs.join(", ")}`,
      "Supported: --dataset <id>, --run <id>, --search <term>"
    );
  }

  const searchQuery = args.search ?? "";
  const visibleDatasets = getFilteredDatasets(data, searchQuery);
  const datasetPosition = visibleDatasets.findIndex((dataset) => dataset.id === datasetId);
  const datasetByArg = datasetPosition === -1 ? void 0 : visibleDatasets[datasetPosition];

  let level = "datasets";
  let runMenuIndex = 0;

  if (datasetByArg) {
    level = "runs";
    if (runId) {
      const runPosition = datasetByArg.runs.findIndex((run) => run.id === runId);
      if (runPosition >= 0) {
        // Menu index 0 is "New evaluation", hence the +1.
        runMenuIndex = runPosition + 1;
        level = "details";
      } else {
        startupWarnings.push(`Run "${runId}" not found in dataset "${datasetByArg.id}".`);
      }
    }
  } else if (datasetId) {
    startupWarnings.push(`Dataset "${datasetId}" not found.`);
  }

  return {
    level,
    focus: "left",
    datasetMenuIndex: datasetPosition + 1,
    runMenuIndex,
    detailsScrollOffset: 0,
    // The first two evaluators start out selected.
    selectedEvaluatorIds: data.evaluators.slice(0, 2).map((evaluator) => evaluator.id),
    evaluatorMenuIndex: 0,
    searchQuery,
    searchMode: false,
    startupWarnings
  };
}
611
/**
 * Pure reducer for the CLI navigation state.
 *
 * Handled action types: MOVE_UP, MOVE_DOWN (needs `action.max`), ENTER (reads
 * `action.hasDataset` / `action.hasRun`), BACK, TOGGLE_FOCUS, START_SEARCH,
 * END_SEARCH, APPEND_SEARCH (`action.value`), REMOVE_SEARCH_CHAR,
 * TOGGLE_EVALUATOR (`action.evaluatorId`) and CLEAR_WARNINGS. Unknown actions
 * and no-op transitions return the same state object.
 *
 * @param {object} state - Current state.
 * @param {{ type: string }} action - Action to apply.
 * @returns {object} Next state.
 */
function reduceCliState(state, action) {
  switch (action.type) {
    case "MOVE_UP":
    case "MOVE_DOWN": {
      // Cursor movement is disabled while typing a search query.
      if (state.searchMode) {
        return state;
      }
      const step = action.type === "MOVE_UP"
        ? (current) => Math.max(0, current - 1)
        : (current) => Math.min(action.max, current + 1);
      if (state.level === "details" && state.focus === "right") {
        return { ...state, detailsScrollOffset: step(state.detailsScrollOffset) };
      }
      switch (state.level) {
        case "datasets":
          return { ...state, datasetMenuIndex: step(state.datasetMenuIndex) };
        case "runs":
          return { ...state, runMenuIndex: step(state.runMenuIndex) };
        case "new-evaluation":
          return { ...state, evaluatorMenuIndex: step(state.evaluatorMenuIndex) };
        default:
          return state;
      }
    }

    case "ENTER": {
      if (state.searchMode) {
        return { ...state, searchMode: false };
      }
      if (state.level === "datasets") {
        // Row 0 is the "New evaluation" entry.
        if (state.datasetMenuIndex === 0) {
          return { ...state, level: "new-evaluation" };
        }
        return action.hasDataset ? { ...state, level: "runs", runMenuIndex: 0 } : state;
      }
      if (state.level === "runs") {
        if (state.runMenuIndex === 0) {
          return { ...state, level: "new-evaluation" };
        }
        return action.hasRun ? { ...state, level: "details", detailsScrollOffset: 0 } : state;
      }
      return state;
    }

    case "BACK": {
      if (state.searchMode) {
        return { ...state, searchMode: false };
      }
      switch (state.level) {
        case "details":
          return { ...state, level: "runs" };
        case "runs":
        case "new-evaluation":
          return { ...state, level: "datasets" };
        default:
          return state;
      }
    }

    case "TOGGLE_FOCUS":
      return { ...state, focus: state.focus === "left" ? "right" : "left" };

    case "START_SEARCH":
      return { ...state, searchMode: true };

    case "END_SEARCH":
      return { ...state, searchMode: false };

    case "APPEND_SEARCH":
      return { ...state, searchQuery: `${state.searchQuery}${action.value}` };

    case "REMOVE_SEARCH_CHAR":
      return { ...state, searchQuery: state.searchQuery.slice(0, -1) };

    case "TOGGLE_EVALUATOR": {
      const { evaluatorId } = action;
      const wasSelected = state.selectedEvaluatorIds.includes(evaluatorId);
      const selectedEvaluatorIds = wasSelected
        ? state.selectedEvaluatorIds.filter((id) => id !== evaluatorId)
        : [...state.selectedEvaluatorIds, evaluatorId];
      return { ...state, selectedEvaluatorIds };
    }

    case "CLEAR_WARNINGS":
      return { ...state, startupWarnings: [] };

    default:
      return state;
  }
}
714
const LEFT_PANE_WIDTH2 = 44;

/**
 * Datasets screen: a dataset list on the left, the selected dataset's
 * overview on the right.
 *
 * Menu index 0 is the "New evaluation" entry, so dataset `i` is selected when
 * `state.datasetMenuIndex === i + 1`.
 *
 * @param {object} props
 * @param {{ focus: string, datasetMenuIndex: number }} props.state - CLI state.
 * @param {Array<{ id: string, name: string }>} props.filteredDatasets - Datasets to list.
 * @param {{ overview: string }} [props.selectedDataset] - Dataset under the cursor, if any.
 */
function DatasetsView({ state, filteredDatasets, selectedDataset }) {
  const { focus, datasetMenuIndex } = state;

  const datasetRows = filteredDatasets.map((dataset, position) =>
    jsx(
      ListItem,
      {
        selected: datasetMenuIndex === position + 1,
        label: dataset.name,
        itemKey: `dataset-${dataset.id}`
      },
      dataset.id
    )
  );

  const listPane = jsxs(Pane, {
    width: LEFT_PANE_WIDTH2,
    focused: focus === "left",
    children: [
      jsx(SectionHeader, { children: "Datasets" }),
      jsx(ListItem, {
        selected: datasetMenuIndex === 0,
        label: "New evaluation",
        itemKey: "datasets-new-eval"
      }),
      datasetRows
    ]
  });

  const overviewText = selectedDataset?.overview ?? "Select a dataset to inspect prior runs.";
  const overviewPane = jsxs(Pane, {
    flexGrow: 1,
    marginLeft: 1,
    focused: focus === "right",
    children: [
      jsx(SectionHeader, { children: "Overview" }),
      jsx(Text, { color: "gray", children: overviewText })
    ]
  });

  return jsxs(Fragment, { children: [listPane, overviewPane] });
}
749
/**
 * Runs screen: the runs sidebar on the left and a summary of the selected run
 * (overall bars, per-dimension bars, latency sparkline) on the right.
 *
 * @param {object} props
 * @param {{ focus: string }} props.state - CLI state.
 * @param {{ runs: object[] }} [props.dataset] - Selected dataset; no runs are listed when absent.
 * @param {object} [props.selectedRun] - Run under the cursor; a hint is shown when absent.
 */
function RunsView({ state, dataset, selectedRun }) {
  const runs = dataset?.runs ?? [];
  const sidebar = jsx(RunsSidebar, { state, dataset, runs });
  const blankLine = () => jsx(Text, { children: " " });

  let summary;
  if (!selectedRun) {
    summary = jsx(Text, { color: "gray", children: "Select a run to see summary metrics." });
  } else {
    const { label, status, meta, performance, dimensions } = selectedRun;
    // Without a recorded history, synthesize a short series around avg/p95.
    const latencySeries = performance.latencyHistoryMs ?? [
      performance.latencyAvgMs - 40,
      performance.latencyAvgMs - 10,
      performance.latencyAvgMs + 20,
      performance.latencyP95Ms - 80,
      performance.latencyP95Ms
    ];

    summary = jsxs(Box, {
      flexDirection: "column",
      children: [
        jsxs(Text, {
          children: [
            jsx(Text, { color: "gray", children: "Run:" }),
            " ",
            label,
            " ",
            jsx(StatusText, { status })
          ]
        }),
        jsxs(Text, {
          color: "gray",
          children: ["Commit: ", meta.commit, " Branch: ", meta.branch, " ", "Seed: ", meta.seed]
        }),
        blankLine(),
        jsx(SectionHeader, { children: "Overall" }),
        jsx(TextBar, {
          label: "pass rate",
          value: performance.passRate,
          format: (v) => `${v}%`
        }),
        jsx(TextBar, {
          label: "avg score",
          value: Math.round(performance.avgScore * 100)
        }),
        blankLine(),
        jsx(SectionHeader, { children: "Dimensions" }),
        dimensions.map((dimension) =>
          jsx(TextBar, { label: dimension.name, value: dimension.score }, dimension.name)
        ),
        blankLine(),
        jsx(SectionHeader, { children: "Latency trend" }),
        jsx(Sparkline, { data: latencySeries, width: 24 })
      ]
    });
  }

  const summaryPane = jsx(Pane, {
    flexGrow: 1,
    marginLeft: 1,
    focused: state.focus === "right",
    children: summary
  });
  return jsxs(Fragment, { children: [sidebar, summaryPane] });
}
820
// Number of detail rows shown at once in the run details pane.
const DETAILS_PAGE_SIZE = 20;
821
/**
 * One boolean-check row: padded name, PASSED/FAILED verdict, optional detail.
 *
 * @param {object} props
 * @param {string} props.name - Check name, padded to 14 characters.
 * @param {boolean} props.passed - Verdict; PASSED is green, FAILED is red.
 * @param {string} [props.detail] - Shown in gray parentheses when truthy.
 */
function CheckRow({ name, passed, detail }) {
  const [verdict, tone] = passed ? ["PASSED", "green"] : ["FAILED", "red"];
  const nameCell = jsx(Text, { color: "gray", children: name.padEnd(14) });
  const verdictCell = jsx(Text, { color: tone, bold: true, children: verdict });
  const detailCell = detail
    ? jsxs(Text, { color: "gray", children: [" (", detail, ")"] })
    : null;
  return jsxs(Text, { children: [nameCell, " ", verdictCell, detailCell] });
}
839
/**
 * Builds the flat, keyed list of row elements shown in the run details pane:
 * meta, dimension scores, checks, performance, latency trend and, when
 * present, failures.
 *
 * @param {object} run - Run with performance, dimensions, checks, failures and meta.
 * @returns {Array} One element per row, in display order.
 */
function buildDetailRows(run) {
  const { performance: perf, dimensions, checks, failures, meta } = run;

  // Without a recorded history, synthesize a short series around avg/p95.
  const latencySeries = perf.latencyHistoryMs ?? [
    perf.latencyAvgMs - 40,
    perf.latencyAvgMs - 10,
    perf.latencyAvgMs + 20,
    perf.latencyP95Ms - 80,
    perf.latencyP95Ms
  ];

  const spacer = (key) => jsx(Text, { children: " " }, key);
  const heading = (title, key) => jsx(SectionHeader, { children: title }, key);
  const grayLine = (parts, key) => jsxs(Text, { color: "gray", children: parts }, key);

  const metaRows = [
    heading("Meta", "meta-h"),
    grayLine(["Model: ", meta.model, " Provider: ", meta.provider], "meta-1"),
    grayLine(["Commit: ", meta.commit, " Branch: ", meta.branch, " Seed: ", meta.seed], "meta-2"),
    grayLine(["Duration: ", meta.duration, " Concurrency: ", meta.concurrency], "meta-3"),
    grayLine(["Artifact: ", meta.artifact], "meta-4")
  ];

  const scoreRows = [
    heading("Scores (0\u2013100)", "scores-h"),
    ...dimensions.map((dimension) =>
      jsx(TextBar, { label: dimension.name, value: dimension.score }, `dim-${dimension.name}`)
    )
  ];

  const checkRows = [
    heading("Checks (boolean)", "checks-h"),
    ...checks.map((check) =>
      jsx(CheckRow, { name: check.name, passed: check.passed, detail: check.detail }, `chk-${check.name}`)
    )
  ];

  const performanceRows = [
    heading("Performance", "perf-h"),
    jsx(
      TextBar,
      { label: "pass rate", value: perf.passRate, format: (v) => `${v}%` },
      "perf-rate"
    ),
    grayLine(["latency avg ", perf.latencyAvgMs, "ms p95 ", perf.latencyP95Ms, "ms"], "perf-lat"),
    grayLine(["tokens avg ", perf.tokensAvg, " p95 ", perf.tokensP95], "perf-tok")
  ];

  const trendRows = [
    heading("Latency trend", "spark-h"),
    jsx(Sparkline, { data: latencySeries, width: 20 }, "spark")
  ];

  const rows = [
    ...metaRows,
    spacer("sp1"),
    ...scoreRows,
    spacer("sp2"),
    ...checkRows,
    spacer("sp3"),
    ...performanceRows,
    spacer("sp4"),
    ...trendRows
  ];

  if (failures.length === 0) {
    return rows;
  }

  rows.push(spacer("sp5"), heading("Failures (top)", "fail-h"));
  failures.forEach((failure, position) => {
    rows.push(
      jsxs(Text, { color: "red", children: [position + 1, ") ", failure.title] }, `fail-${position}`)
    );
  });
  return rows;
}
923
/**
 * Run details screen: the runs sidebar on the left and one page of the
 * selected run's detail rows on the right.
 *
 * @param {object} props
 * @param {{ focus: string, detailsScrollOffset: number }} props.state - CLI state;
 *   the scroll offset selects the first visible row (negative values count as 0).
 * @param {{ runs: object[] }} [props.dataset] - Selected dataset.
 * @param {object} [props.selectedRun] - Run to show; a hint is shown when absent.
 */
function RunDetailsView({ state, dataset, selectedRun }) {
  const sidebar = jsx(RunsSidebar, { state, dataset, runs: dataset?.runs ?? [] });
  const detailPane = (content) =>
    jsx(Pane, {
      flexGrow: 1,
      marginLeft: 1,
      focused: state.focus === "right",
      children: content
    });

  if (!selectedRun) {
    const hint = jsx(Text, { color: "gray", children: "Select a run to inspect details." });
    return jsxs(Fragment, { children: [sidebar, detailPane(hint)] });
  }

  const firstRow = Math.max(0, state.detailsScrollOffset);
  const page = buildDetailRows(selectedRun)
    .slice(firstRow, firstRow + DETAILS_PAGE_SIZE)
    .map((row, position) => jsx(React.Fragment, { children: row }, position));

  const body = jsx(Box, { flexDirection: "column", children: page });
  return jsxs(Fragment, { children: [sidebar, detailPane(body)] });
}
944
const LEFT_PANE_WIDTH3 = 44;

/**
 * New-evaluation screen: selectable evaluators on the left; the current
 * selection and the highlighted evaluator's config preview on the right.
 *
 * @param {object} props
 * @param {object} props.state - CLI state; reads selectedEvaluatorIds,
 *   evaluatorMenuIndex, searchQuery and focus.
 * @param {{ evaluators: Array<{ id: string, name: string }> }} props.data - View
 *   data, used to resolve selected ids to names.
 * @param {Array<{ id: string, name: string, configPreview: string }>} props.visibleEvaluators -
 *   Evaluators to list.
 */
function NewEvaluationView({ state, data, visibleEvaluators }) {
  const { selectedEvaluatorIds, evaluatorMenuIndex, searchQuery, focus } = state;
  const highlighted = visibleEvaluators[evaluatorMenuIndex];

  const evaluatorRows = visibleEvaluators.map((evaluator, position) => {
    const underCursor = position === evaluatorMenuIndex;
    const checkbox = selectedEvaluatorIds.includes(evaluator.id) ? "[x] " : "[ ] ";
    return jsxs(
      Text,
      {
        color: underCursor ? "cyan" : "gray",
        bold: underCursor,
        children: [underCursor ? "\u25B8 " : " ", checkbox, evaluator.name]
      },
      evaluator.id
    );
  });

  const selectedRows = selectedEvaluatorIds.map((id, position) => {
    const match = data.evaluators.find((item) => item.id === id);
    // Selected ids that no longer resolve to an evaluator are skipped.
    return match ? jsxs(Text, { children: [position + 1, ") ", match.name] }, id) : null;
  });

  const listPane = jsxs(Pane, {
    width: LEFT_PANE_WIDTH3,
    focused: focus === "left",
    children: [
      jsx(SectionHeader, { children: "Available Evaluators" }),
      jsxs(Text, { color: "gray", children: ["Search: ", searchQuery || "(none)"] }),
      evaluatorRows
    ]
  });

  const selectionPane = jsxs(Pane, {
    flexGrow: 1,
    marginLeft: 1,
    focused: focus === "right",
    children: [
      jsxs(SectionHeader, { children: ["Selected (", selectedEvaluatorIds.length, ")"] }),
      selectedRows,
      jsx(SectionHeader, { children: "Config preview" }),
      jsx(Text, {
        color: "gray",
        children: highlighted?.configPreview ?? "Select an evaluator to inspect config."
      })
    ]
  });

  return jsxs(Fragment, { children: [listPane, selectionPane] });
}
1000
/**
 * Returns a copy of `state` whose three menu cursors are forced into range.
 *
 * The dataset and run cursors are clamped to `[0, count]` (the count itself is
 * a valid index). NOTE(review): presumably menu index 0 is an extra
 * non-dataset/non-run entry; confirm against the menu lookup helpers.
 *
 * The evaluator cursor is clamped to `[0, visibleEvaluatorCount - 1]`. The
 * bound used to be a hard-coded 3, so lists with more than four evaluators were
 * truncated and filtered lists could point past their end.
 *
 * @param {object} state - Current CLI state (not mutated).
 * @param {number} filteredDatasetsLength - Number of datasets after filtering.
 * @param {number} selectedRunCount - Number of runs in the selected dataset.
 * @param {number} [visibleEvaluatorCount=4] - Number of evaluators currently
 *   listed; the default reproduces the previous fixed bound of 3.
 * @returns {object} New state object with clamped cursor indices.
 */
function clampCursor(state, filteredDatasetsLength, selectedRunCount, visibleEvaluatorCount = 4) {
  const clamp = (value, max) => Math.max(0, Math.min(value, max));
  // An empty evaluator list still yields a valid (zero) cursor.
  const evaluatorMax = Math.max(0, visibleEvaluatorCount - 1);
  return {
    ...state,
    datasetMenuIndex: clamp(state.datasetMenuIndex, filteredDatasetsLength),
    runMenuIndex: clamp(state.runMenuIndex, selectedRunCount),
    evaluatorMenuIndex: clamp(state.evaluatorMenuIndex, evaluatorMax)
  };
}
1014
/**
 * Root Ink component of the evaluations TUI.
 *
 * Keeps a live copy of the loaded data (updated from runner events), drives the
 * navigation reducer from keyboard input, and renders the breadcrumb, optional
 * warning/search/message banners, the view for the current level and a footer.
 *
 * @param {object} props
 * @param {object} props.data - Initially loaded datasets/evaluators.
 * @param {object} props.args - Parsed startup arguments (seed the initial state).
 * @param {object} [props.runner] - Runner used to subscribe to events and start runs.
 */
function EvalsCliApp({ data, args, runner }) {
  const { exit } = useApp();
  const { width: stdoutWidth, height: stdoutHeight } = useScreenSize();
  const [liveData, setLiveData] = useState(data);
  const [runtimeMessage, setRuntimeMessage] = useState();
  const [state, dispatch] = useReducer(reduceCliState, createInitialState(data, args));

  // Re-seed the live copy whenever the parent supplies fresh data.
  useEffect(() => {
    setLiveData(data);
  }, [data]);

  // Mirror runner events into the live data and the status banner; the value
  // returned by subscribeRunEvents is used as the effect cleanup.
  useEffect(() => {
    if (!runner) {
      return void 0;
    }
    return runner.subscribeRunEvents((event) => {
      setLiveData((current) => applyRunnerEvent(current, event, runner));
      if (event.type === "RunQueued") {
        setRuntimeMessage(`Queued ${event.runId} with ${event.totalTestCases} test cases.`);
      }
      if (event.type === "RunCompleted") {
        setRuntimeMessage(
          `Completed ${event.runId}: ${event.passedTestCases}/${event.totalTestCases} passed.`
        );
      }
      if (event.type === "RunFailed") {
        setRuntimeMessage(`Run failed: ${event.errorMessage}`);
      }
    });
  }, [runner]);

  const filteredDatasets = useMemo(
    () => getFilteredDatasets(liveData, state.searchQuery),
    [liveData, state.searchQuery]
  );
  // Computed before clamping so the evaluator cursor can be bounded by the
  // number of evaluators that are actually listed.
  const visibleEvaluators = liveData.evaluators.filter(
    (evaluator) => evaluator.name.toLowerCase().includes(state.searchQuery.toLowerCase())
  );
  const clampedState = clampCursor(
    state,
    filteredDatasets.length,
    getDatasetByMenuIndex(filteredDatasets, state.datasetMenuIndex)?.runs.length ?? 0,
    visibleEvaluators.length
  );
  const selectedDataset = getDatasetByMenuIndex(filteredDatasets, clampedState.datasetMenuIndex);
  const selectedRun = getRunByMenuIndex(selectedDataset, clampedState.runMenuIndex);
  // Highest valid evaluator index; never negative, even for an empty list.
  const lastEvaluatorIndex = Math.max(0, visibleEvaluators.length - 1);

  // NOTE(review): the quit/tab/search-start checks run before the search-mode
  // branch, so those keys are never appended to the query - confirm intended.
  useInput((input, key) => {
    if (isQuitInput(input) || key.escape) {
      exit();
      return;
    }
    if (key.tab) {
      dispatch({ type: "TOGGLE_FOCUS" });
      return;
    }
    if (isSearchInput(input)) {
      dispatch({ type: "START_SEARCH" });
      return;
    }
    if (clampedState.searchMode) {
      if (key.return) {
        dispatch({ type: "END_SEARCH" });
        return;
      }
      if (isBackKey(key)) {
        dispatch({ type: "REMOVE_SEARCH_CHAR" });
        return;
      }
      if (isPrintableCharacter(input)) {
        dispatch({ type: "APPEND_SEARCH", value: input });
      }
      return;
    }
    if (key.upArrow) {
      const max = clampedState.level === "new-evaluation" ? lastEvaluatorIndex : 100;
      dispatch({ type: "MOVE_UP", max });
      return;
    }
    if (key.downArrow) {
      const max = clampedState.level === "datasets" ? filteredDatasets.length : clampedState.level === "runs" ? selectedDataset?.runs.length ?? 0 : clampedState.level === "new-evaluation" ? lastEvaluatorIndex : 100;
      dispatch({ type: "MOVE_DOWN", max });
      return;
    }
    if (key.return) {
      dispatch({
        type: "ENTER",
        hasDataset: Boolean(selectedDataset),
        hasRun: Boolean(selectedRun)
      });
      if (clampedState.level === "new-evaluation") {
        const evaluator = visibleEvaluators[clampedState.evaluatorMenuIndex];
        if (evaluator) {
          dispatch({ type: "TOGGLE_EVALUATOR", evaluatorId: evaluator.id });
        }
      }
      return;
    }
    if (isBackKey(key)) {
      dispatch({ type: "BACK" });
      return;
    }
    if (input.toLowerCase() === "c") {
      dispatch({ type: "CLEAR_WARNINGS" });
      setRuntimeMessage(void 0);
      return;
    }
    if (input.toLowerCase() === "s" && clampedState.level === "new-evaluation") {
      if (!runner) {
        setRuntimeMessage("Runner unavailable: cannot start evaluation.");
        return;
      }
      if (!selectedDataset) {
        setRuntimeMessage("Select a dataset before starting a new evaluation.");
        return;
      }
      if (clampedState.selectedEvaluatorIds.length === 0) {
        setRuntimeMessage("Select at least one evaluator before starting.");
        return;
      }
      // Fire-and-forget: the outcome is reported through the status banner.
      void runner.runDatasetWith({
        datasetId: selectedDataset.id,
        evaluatorIds: clampedState.selectedEvaluatorIds
      }).then((snapshot) => {
        setRuntimeMessage(
          `Started ${snapshot.runId} on ${selectedDataset.name} (${snapshot.totalTestCases} cases).`
        );
      }).catch((error) => {
        setRuntimeMessage(
          error instanceof Error ? error.message : "Failed to start evaluation."
        );
      });
    }
  });

  // Picks the main view for the current navigation level; anything that is
  // not new-evaluation/datasets/runs falls through to the run details view.
  const renderContent = () => {
    if (clampedState.level === "new-evaluation") {
      return jsx(NewEvaluationView, {
        state: clampedState,
        data: liveData,
        visibleEvaluators
      });
    }
    if (clampedState.level === "datasets") {
      return jsx(DatasetsView, {
        state: clampedState,
        filteredDatasets,
        selectedDataset
      });
    }
    if (clampedState.level === "runs") {
      return jsx(RunsView, {
        state: clampedState,
        dataset: selectedDataset,
        selectedRun
      });
    }
    return jsx(RunDetailsView, {
      state: clampedState,
      dataset: selectedDataset,
      selectedRun
    });
  };

  return jsxs(Box, {
    flexDirection: "column",
    flexGrow: 1,
    width: stdoutWidth,
    height: stdoutHeight,
    children: [
      jsx(Box, {
        borderStyle: "round",
        borderColor: "cyan",
        paddingX: 1,
        width: stdoutWidth,
        children: jsx(Text, {
          children: getBreadcrumbText(clampedState, selectedDataset?.name, selectedRun?.label)
        })
      }),
      clampedState.startupWarnings.length > 0 && jsxs(Box, {
        marginTop: 1,
        borderStyle: "round",
        borderColor: "yellow",
        paddingX: 1,
        flexDirection: "column",
        width: stdoutWidth,
        children: [
          jsx(Text, { color: "yellow", children: "Startup warnings:" }),
          clampedState.startupWarnings.map((warning, index) => jsx(Text, { children: warning }, `${warning}-${index}`))
        ]
      }),
      clampedState.searchMode && jsxs(Box, {
        marginTop: 1,
        borderStyle: "round",
        borderColor: "magenta",
        paddingX: 1,
        width: stdoutWidth,
        children: [
          jsx(Text, { color: "magenta", bold: true, children: "Search: " }),
          jsx(Text, { color: "white", children: clampedState.searchQuery })
        ]
      }),
      runtimeMessage && jsx(Box, {
        marginTop: 1,
        borderStyle: "round",
        borderColor: "blue",
        paddingX: 1,
        width: stdoutWidth,
        children: jsx(Text, { color: "blue", children: runtimeMessage })
      }),
      jsx(Box, {
        marginTop: 1,
        flexGrow: 1,
        width: stdoutWidth,
        flexDirection: "row",
        children: renderContent()
      }),
      jsx(Box, {
        marginTop: 1,
        paddingX: 1,
        children: jsx(Text, { color: "gray", children: getFooterText(clampedState) })
      })
    ]
  });
}
1273
+
1274
+ // src/runner/config.ts
1275
// Built-in runner settings: where to look for dataset/evaluator/test-case
// modules, which directories to skip, and where run artifacts are written.
var defaultRunnerConfig = {
  discovery: {
    rootDir: process.cwd(),
    datasetSuffixes: [".dataset.ts", ".dataset.tsx", ".dataset.js", ".dataset.mjs"],
    evaluatorSuffixes: [
      ".evaluator.ts",
      ".evaluator.tsx",
      ".evaluator.js",
      ".evaluator.mjs"
    ],
    testCaseSuffixes: [
      ".test-case.ts",
      ".test-case.tsx",
      ".test-case.js",
      ".test-case.mjs"
    ],
    excludeDirectories: ["node_modules", "dist", ".next", ".git", ".pnpm-store"]
  },
  artifactDirectory: ".eval-results"
};

/**
 * Resolves the effective runner configuration.
 *
 * Previously the `overrides` argument was accepted but ignored. It is now
 * shallow-merged over the defaults, with `discovery` merged one level deeper so
 * a partial discovery override keeps the remaining default fields.
 *
 * @param {object} [overrides] - Partial configuration; omitted means defaults.
 * @returns {object} `defaultRunnerConfig` itself when there are no overrides,
 *   otherwise a new merged object.
 */
function withRunnerConfig(overrides) {
  if (!overrides) {
    return defaultRunnerConfig;
  }
  return {
    ...defaultRunnerConfig,
    ...overrides,
    discovery: {
      ...defaultRunnerConfig.discovery,
      ...overrides.discovery
    }
  };
}
1300
// Module-level cache for the jiti loader instance; it starts undefined and is
// assigned by loadModuleExports the first time a .ts/.tsx module is loaded.
var jitiLoader;
1301
/**
 * Builds a slug-style identifier of the form `<prefix>-<name-or-path>`.
 *
 * The name is used when it contains non-whitespace characters, otherwise the
 * file path. The result is lower-cased, every run of characters outside
 * `a-z0-9` becomes a single dash, and leading/trailing dashes are removed.
 *
 * @param {string} prefix - Kind of entity (e.g. "dataset").
 * @param {string} filePath - Fallback source of the id.
 * @param {string} [name] - Preferred source of the id.
 * @returns {string} The normalized identifier.
 */
function toId(prefix, filePath, name) {
  let base = filePath;
  if (name && name.trim().length > 0) {
    base = name;
  }
  const dashed = `${prefix}:${base}`.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  return dashed.replace(/^-+|-+$/g, "");
}
1305
/**
 * True when `value` is a non-null object exposing a function under `methodName`
 * (own or inherited). Primitives and functions themselves never qualify.
 */
function hasMethod(value, methodName) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  if (!(methodName in value)) {
    return false;
  }
  return typeof value[methodName] === "function";
}

/** Duck-type check: exposes `getName` and `matchesTestCase`. */
function isDatasetLike(value) {
  return ["getName", "matchesTestCase"].every((method) => hasMethod(value, method));
}

/** Duck-type check: exposes `getName`, `resolveContext` and `getEvaluateFn`. */
function isEvaluatorLike(value) {
  return ["getName", "resolveContext", "getEvaluateFn"].every(
    (method) => hasMethod(value, method)
  );
}

/** Duck-type check: exposes `getName`, `getTags` and `getInput`. */
function isTestCaseLike(value) {
  return ["getName", "getTags", "getInput"].every((method) => hasMethod(value, method));
}
1317
/**
 * Recursively lists the absolute paths of all regular files below `rootDir`.
 *
 * Sub-directories whose base name appears in `excludeDirectories` are not
 * entered. Directories that cannot be read are skipped (best effort). The
 * result is sorted, so discovery order does not depend on the completion order
 * of the parallel `readdir` calls.
 *
 * @param {string} rootDir - Directory to start from.
 * @param {string[]} [excludeDirectories] - Directory names to skip.
 * @returns {Promise<string[]>} Sorted absolute file paths.
 */
async function walkDirectory(rootDir, excludeDirectories) {
  const excluded = new Set(excludeDirectories);

  async function walk(currentDir) {
    let entries;
    try {
      entries = await readdir(currentDir, { withFileTypes: true });
    } catch {
      // Unreadable or missing directory: contribute nothing rather than abort.
      return [];
    }
    const nested = await Promise.all(
      entries.map(async (entry) => {
        const absolute = resolve(currentDir, entry.name);
        if (entry.isDirectory()) {
          return excluded.has(entry.name) ? [] : walk(absolute);
        }
        return entry.isFile() ? [absolute] : [];
      })
    );
    return nested.flat();
  }

  const files = await walk(rootDir);
  return files.sort();
}
1345
/**
 * Reports whether `filePath` ends with at least one of the given suffixes.
 *
 * @param {string} filePath - Path to test.
 * @param {string[]} suffixes - Candidate endings.
 * @returns {boolean} True on the first matching suffix, false if none match.
 */
function hasOneSuffix(filePath, suffixes) {
  for (const suffix of suffixes) {
    if (filePath.endsWith(suffix)) {
      return true;
    }
  }
  return false;
}
1348
/**
 * Loads a module from disk and returns the values of all of its exports.
 *
 * `.ts` / `.tsx` files go through a jiti loader that is created on first use
 * and cached in the module-level `jitiLoader`; every other file is loaded with
 * a native dynamic `import()` of its file URL.
 *
 * @param {string} filePath - Path of the module to load.
 * @returns {Promise<unknown[]>} Export values of the loaded module.
 * @throws {Error} When the jiti package exposes no usable factory.
 */
async function loadModuleExports(filePath) {
  const isTypeScript = [".ts", ".tsx"].some((extension) => filePath.endsWith(extension));

  if (!isTypeScript) {
    const nativeModule = await import(pathToFileURL(filePath).href);
    return Object.values(nativeModule);
  }

  if (!jitiLoader) {
    const jitiModule = await import('jiti');
    // Accept either the named factory or the default export.
    const createJiti = jitiModule.createJiti ?? jitiModule.default;
    if (!createJiti) {
      throw new Error("Failed to initialize jiti TypeScript loader");
    }
    jitiLoader = createJiti(import.meta.url, {
      interopDefault: true,
      moduleCache: true
    });
  }

  // Use the loader's `import` method when present, else call the loader itself.
  const tsModule = jitiLoader.import
    ? await jitiLoader.import(filePath)
    : await Promise.resolve(jitiLoader(filePath));
  return Object.values(tsModule);
}
1368
/**
 * Discovers dataset definitions below `config.rootDir`.
 *
 * Every file whose name ends with one of `config.datasetSuffixes` is loaded and
 * each dataset-like export becomes one record.
 *
 * @param {object} config - Discovery settings (`rootDir`, `excludeDirectories`,
 *   `datasetSuffixes`).
 * @returns {Promise<Array<{id: string, filePath: string, dataset: object}>>}
 *   Records with `filePath` relative to `rootDir`.
 */
async function collectDatasetsFromFiles(config) {
  const { rootDir, excludeDirectories, datasetSuffixes } = config;
  const allFiles = await walkDirectory(rootDir, excludeDirectories);

  const pendingPerFile = allFiles
    .filter((candidate) => hasOneSuffix(candidate, datasetSuffixes))
    .map(async (absolutePath) => {
      const moduleExports = await loadModuleExports(absolutePath);
      const filePath = relative(rootDir, absolutePath);
      return moduleExports
        .filter((exported) => isDatasetLike(exported))
        .map((dataset) => ({
          id: toId("dataset", filePath, dataset.getName()),
          filePath,
          dataset
        }));
    });

  const perFile = await Promise.all(pendingPerFile);
  return perFile.flat();
}
1387
/**
 * Discovers evaluator definitions below `config.rootDir`.
 *
 * Every file whose name ends with one of `config.evaluatorSuffixes` is loaded
 * and each evaluator-like export becomes one record.
 *
 * @param {object} config - Discovery settings (`rootDir`, `excludeDirectories`,
 *   `evaluatorSuffixes`).
 * @returns {Promise<Array<{id: string, filePath: string, evaluator: object}>>}
 *   Records with `filePath` relative to `rootDir`.
 */
async function collectEvaluatorsFromFiles(config) {
  const { rootDir, excludeDirectories, evaluatorSuffixes } = config;
  const allFiles = await walkDirectory(rootDir, excludeDirectories);

  const pendingPerFile = allFiles
    .filter((candidate) => hasOneSuffix(candidate, evaluatorSuffixes))
    .map(async (absolutePath) => {
      const moduleExports = await loadModuleExports(absolutePath);
      const filePath = relative(rootDir, absolutePath);
      return moduleExports
        .filter((exported) => isEvaluatorLike(exported))
        .map((evaluator) => ({
          id: toId("evaluator", filePath, evaluator.getName()),
          filePath,
          evaluator
        }));
    });

  const perFile = await Promise.all(pendingPerFile);
  return perFile.flat();
}
1406
/**
 * Discovers test-case definitions below `config.rootDir`.
 *
 * Every file whose name ends with one of `config.testCaseSuffixes` is loaded
 * and each test-case-like export becomes one record.
 *
 * @param {object} config - Discovery settings (`rootDir`, `excludeDirectories`,
 *   `testCaseSuffixes`).
 * @returns {Promise<Array<{id: string, filePath: string, testCase: object}>>}
 *   Records with `filePath` relative to `rootDir`.
 */
async function collectTestCasesFromFiles(config) {
  const { rootDir, excludeDirectories, testCaseSuffixes } = config;
  const allFiles = await walkDirectory(rootDir, excludeDirectories);

  const pendingPerFile = allFiles
    .filter((candidate) => hasOneSuffix(candidate, testCaseSuffixes))
    .map(async (absolutePath) => {
      const moduleExports = await loadModuleExports(absolutePath);
      const filePath = relative(rootDir, absolutePath);
      return moduleExports
        .filter((exported) => isTestCaseLike(exported))
        .map((testCase) => ({
          id: toId("test-case", filePath, testCase.getName()),
          filePath,
          testCase
        }));
    });

  const perFile = await Promise.all(pendingPerFile);
  return perFile.flat();
}
1425
+
1426
+ // src/evals/metric.ts
1427
// Metric definitions registered so far, keyed by metric id.
var registry = new Map();

var Metric = {
  /**
   * Creates a metric definition and stores it in the registry under its id
   * (replacing any earlier definition with the same id).
   *
   * @param {{id: string, name: string, format: Function}} config
   * @returns {object} Definition with `id`, `name`, `format` and a `make(data)`
   *   factory producing `{ id, data }` items.
   */
  of(config) {
    const { id, name, format } = config;
    const definition = {
      id,
      name,
      format,
      make(data) {
        return { id: config.id, data };
      }
    };
    registry.set(config.id, definition);
    return definition;
  }
};
1440
+
1441
+ // src/evals/score.ts
1442
// Score definitions registered so far, keyed by score id.
var registry2 = new Map();

var Score = {
  /**
   * Creates a score definition and stores it in the registry under its id.
   *
   * The returned `make(data, options)` factory produces `{ id, data }` and adds
   * a `passed` field only when `options.definePassed` is supplied and returns
   * something other than `undefined`.
   *
   * @param {{id: string, name: string, displayStrategy: string, format: Function}} config
   * @returns {object} The registered definition.
   */
  of(config) {
    const { id, name, displayStrategy, format } = config;
    const definition = {
      id,
      name,
      displayStrategy,
      format,
      make(data, options) {
        let passed;
        if (options?.definePassed !== void 0) {
          passed = options.definePassed(data);
        }
        const item = { id: config.id, data };
        if (passed !== void 0) {
          item.passed = passed;
        }
        return item;
      }
    };
    registry2.set(config.id, definition);
    return definition;
  }
};

/**
 * Looks up a registered score definition.
 *
 * @param {string} id - Score id.
 * @returns {object|undefined} The definition, or undefined when unknown.
 */
function getScoreById(id) {
  return registry2.get(id);
}
1466
+
1467
+ // src/evals/metrics/standard.ts
1468
// Built-in metric: token usage, rendered as "in:<n> out:<n> cached:<n>".
// Missing counters are treated as 0; cached is input-cached plus output-cached.
Metric.of({
  id: "token-count",
  name: "Tokens",
  format: (data) => {
    const orZero = (count) => count ?? 0;
    const input = orZero(data.input);
    const output = orZero(data.output);
    const cached = orZero(data.inputCached) + orZero(data.outputCached);
    return `in:${input} out:${output} cached:${cached}`;
  }
});

// Built-in metric: latency, rendered as "<ms>ms".
Metric.of({
  id: "latency",
  name: "Latency",
  format: (data) => {
    const { ms } = data;
    return `${ms}ms`;
  }
});
1485
+
1486
+ // src/evals/scores/standard.ts
1487
// Built-in score: numeric value shown as a bar, formatted with two decimals.
Score.of({
  id: "percent",
  name: "Score",
  displayStrategy: "bar",
  format: (data) => {
    const { value } = data;
    return value.toFixed(2);
  }
});

// Built-in score: pass/fail verdict.
Score.of({
  id: "binary",
  name: "Result",
  displayStrategy: "passFail",
  format: (data) => {
    if (data.passed) {
      return "PASSED";
    }
    return "NOT PASSED";
  }
});
1499
+
1500
+ // src/runner/score-utils.ts
1501
/**
 * Extracts a single number from a list of score items.
 *
 * Items are examined in order. For an item whose registered definition uses the
 * "bar" display strategy, a finite `data.value` is returned directly; otherwise
 * the item's data goes through `toNumericScore`. The first number found wins.
 *
 * @param {Array<{id: string, data: unknown}>} scores - Score items to inspect.
 * @returns {number|undefined} The first numeric score, or undefined if none.
 */
function toNumericScoreFromScores(scores) {
  const isFiniteNumber = (candidate) =>
    typeof candidate === "number" && Number.isFinite(candidate);

  for (const item of scores) {
    const definition = getScoreById(item.id);
    const payload = item.data;
    const isBarWithValue =
      definition &&
      definition.displayStrategy === "bar" &&
      typeof payload === "object" &&
      payload !== null &&
      "value" in payload;

    if (isBarWithValue) {
      const barValue = payload.value;
      if (isFiniteNumber(barValue)) {
        return barValue;
      }
    }

    const fallback = toNumericScore(payload);
    if (fallback !== void 0) {
      return fallback;
    }
  }
  return void 0;
}
1517
/**
 * Coerces an arbitrary score payload into one number.
 *
 * - a finite number is returned as-is;
 * - an object with a finite numeric `score` property yields that property;
 * - any other object yields the mean of its finite numeric property values;
 * - everything else (or an object without numeric values) yields undefined.
 *
 * @param {unknown} value - Payload to interpret.
 * @returns {number|undefined} The derived number, if any.
 */
function toNumericScore(value) {
  const isFiniteNumber = (candidate) =>
    typeof candidate === "number" && Number.isFinite(candidate);

  if (isFiniteNumber(value)) {
    return value;
  }
  if (value === null || typeof value !== "object") {
    return void 0;
  }
  if ("score" in value && isFiniteNumber(value.score)) {
    return value.score;
  }

  let total = 0;
  let count = 0;
  for (const entry of Object.values(value)) {
    if (isFiniteNumber(entry)) {
      total += entry;
      count += 1;
    }
  }
  return count === 0 ? void 0 : total / count;
}
1536
+
1537
+ // src/runner/execution.ts
1538
/**
 * Decides whether one evaluator passed for one test case.
 *
 * Precedence:
 * 1. if any score carries a defined `passed` flag, all such flags must be true;
 * 2. else the evaluator's pass criterion, when it has one, decides from `result`;
 * 3. else, when a pass threshold exists, the numeric score derived from
 *    `scores` must exist and be at least the threshold;
 * 4. else the evaluator passes.
 *
 * @param {object} evaluator - Provides `getPassCriterion()` and `getPassThreshold()`.
 * @param {unknown} result - Raw evaluator result handed to the criterion.
 * @param {Array<object>} scores - Normalized score items.
 * @returns {*} The verdict (whatever the criterion returns in case 2, otherwise a boolean).
 */
function computeEvaluatorPassed(evaluator, result, scores) {
  const explicitVerdicts = [];
  for (const score of scores) {
    if ("passed" in score && score.passed !== void 0) {
      explicitVerdicts.push(score.passed);
    }
  }
  if (explicitVerdicts.length > 0) {
    return explicitVerdicts.every((verdict) => verdict === true);
  }

  const criterion = evaluator.getPassCriterion();
  if (criterion) {
    return criterion(result);
  }

  const threshold = evaluator.getPassThreshold();
  if (threshold === void 0) {
    return true;
  }
  const numeric = toNumericScoreFromScores(scores);
  return numeric !== void 0 && numeric >= threshold;
}
1554
/**
 * Normalizes a raw evaluator return value into `{ scores, metrics }`.
 *
 * Non-objects produce `{ scores: [] }`. For objects, `scores` falls back to an
 * empty array and `metrics` to undefined whenever the property is not an array.
 *
 * @param {unknown} result - Value returned by an evaluate function.
 * @returns {{scores: Array, metrics?: Array}} Normalized shape.
 */
function normalizeResult(result) {
  if (result === null || typeof result !== "object") {
    return { scores: [] };
  }
  const { scores, metrics } = result;
  return {
    scores: Array.isArray(scores) ? scores : [],
    metrics: Array.isArray(metrics) ? metrics : void 0
  };
}
1563
/**
 * Current time as an ISO-8601 string with every ":" and "." replaced by "-",
 * which makes it usable inside a file name.
 *
 * @returns {string} e.g. "2024-01-02T03-04-05-678Z".
 */
function nowIsoForFile() {
  const iso = new Date().toISOString();
  return iso.split(/[:.]/).join("-");
}

/**
 * Builds the path of a run's JSONL artifact:
 * `<artifactDirectory>/<datasetId>_<runId>_<timestamp>.jsonl`.
 *
 * @param {string} artifactDirectory - Directory that holds run artifacts.
 * @param {string} datasetId - Id of the dataset being run.
 * @param {string} runId - Id of the run.
 * @returns {string} Joined artifact path.
 */
function createArtifactPath(artifactDirectory, datasetId, runId) {
  const fileName = `${datasetId}_${runId}_${nowIsoForFile()}.jsonl`;
  return join(artifactDirectory, fileName);
}
1572
/**
 * Effect that executes one queued run: every test case is evaluated by every
 * selected evaluator, sequentially.
 *
 * Along the way it updates the run snapshot, publishes `RunStarted`,
 * `TestCaseProgress`, `RunCompleted` and `ArtifactFlushed` events, and offers
 * the progress/completed events to the persistence queue.
 *
 * Evaluator failures (a throwing or rejecting `resolveContext`, evaluate
 * function, or pass computation) are captured per evaluator and recorded as a
 * failed entry with an error message. This used to rely on a `try/catch`
 * around `yield* Effect.promise(...)`. A rejected `Effect.promise` is not
 * thrown into the generator (it is treated as a defect), so that `catch` could
 * not see rejections and the run fiber would die with the snapshot left in
 * "running". The evaluator work is therefore wrapped in one async function
 * that never rejects.
 *
 * @param {object} task - Queued run (`runId`, `testCases`, `evaluators`, `snapshot`).
 * @param {Function} publishEvent - `(event) => Effect` used to broadcast events.
 * @param {object} persistenceQueue - Queue receiving artifact records.
 * @param {Function} updateSnapshot - `(runId, updater) => void`.
 * @returns {object} An Effect that completes when the run has finished.
 */
var executeRunTask = (task, publishEvent, persistenceQueue, updateSnapshot) => Effect.gen(function* () {
  const startedAt = Date.now();
  updateSnapshot(task.runId, (snapshot) => ({
    ...snapshot,
    status: "running",
    startedAt
  }));
  yield* publishEvent({
    type: "RunStarted",
    runId: task.runId,
    startedAt
  });

  let completedTestCases = 0;
  let passedTestCases = 0;
  let failedTestCases = 0;

  for (const testCaseItem of task.testCases) {
    const started = Date.now();
    const evaluatorScores = [];
    let testCaseError;

    for (const { id: evaluatorId, evaluator } of task.evaluators) {
      const evaluateFn = evaluator.getEvaluateFn();
      if (!evaluateFn) {
        continue;
      }
      // All fallible work happens inside the async function so that errors
      // come back as a value rather than as a rejection.
      const outcome = yield* Effect.promise(async () => {
        try {
          const ctx = await evaluator.resolveContext();
          const result = await evaluateFn(testCaseItem.testCase.getInput(), ctx);
          const { scores, metrics } = normalizeResult(result);
          const passed = computeEvaluatorPassed(evaluator, result, scores);
          return { ok: true, entry: { evaluatorId, scores, passed, metrics } };
        } catch (error) {
          return {
            ok: false,
            message: error instanceof Error ? error.message : "Evaluator execution failed"
          };
        }
      });
      if (outcome.ok) {
        evaluatorScores.push(outcome.entry);
      } else {
        testCaseError = outcome.message;
        evaluatorScores.push({
          evaluatorId,
          scores: [],
          passed: false
        });
      }
    }

    const testCasePassed = evaluatorScores.every((s) => s.passed);
    completedTestCases += 1;
    if (testCasePassed) {
      passedTestCases += 1;
    } else {
      failedTestCases += 1;
    }

    const progressEvent = {
      type: "TestCaseProgress",
      runId: task.runId,
      testCaseId: testCaseItem.id,
      testCaseName: testCaseItem.testCase.getName(),
      completedTestCases,
      totalTestCases: task.testCases.length,
      passed: testCasePassed,
      durationMs: Date.now() - started,
      evaluatorScores,
      errorMessage: testCaseError
    };
    updateSnapshot(task.runId, (snapshot) => ({
      ...snapshot,
      completedTestCases,
      passedTestCases,
      failedTestCases
    }));
    yield* publishEvent(progressEvent);
    yield* Queue.offer(persistenceQueue, {
      runId: task.runId,
      artifactPath: task.snapshot.artifactPath,
      payload: progressEvent
    });
  }

  const finishedAt = Date.now();
  const completedEvent = {
    type: "RunCompleted",
    runId: task.runId,
    finishedAt,
    passedTestCases,
    failedTestCases,
    totalTestCases: task.testCases.length,
    artifactPath: task.snapshot.artifactPath
  };
  updateSnapshot(task.runId, (snapshot) => ({
    ...snapshot,
    status: "completed",
    completedTestCases,
    passedTestCases,
    failedTestCases,
    finishedAt
  }));
  yield* publishEvent(completedEvent);
  yield* Queue.offer(persistenceQueue, {
    runId: task.runId,
    artifactPath: task.snapshot.artifactPath,
    payload: completedEvent
  });
  // Published right after the record is enqueued, not after it is written.
  yield* publishEvent({
    type: "ArtifactFlushed",
    runId: task.runId,
    artifactPath: task.snapshot.artifactPath
  });
});
1677
/**
 * Appends `payload` as one JSON line (UTF-8, newline-terminated) to
 * `artifactPath`, creating the parent directory first if necessary.
 *
 * @param {string} artifactPath - Target JSONL file.
 * @param {unknown} payload - Value to serialize.
 * @returns {Promise<void>}
 */
async function appendJsonLine(artifactPath, payload) {
  const parentDirectory = dirname(artifactPath);
  await mkdir(parentDirectory, { recursive: true });
  const line = `${JSON.stringify(payload)}\n`;
  await appendFile(artifactPath, line, "utf8");
}
1682
/**
 * Builds an endlessly repeating Effect that takes one message at a time from
 * `queue` and appends it to the message's artifact file as a JSON line. Each
 * written record carries `runId`, a `ts` timestamp taken at write time, and the
 * fields of the message payload (payload fields win on key clashes).
 *
 * @param {object} queue - Queue of `{ runId, artifactPath, payload }` messages.
 * @returns {object} The worker Effect (intended to be forked).
 */
var createPersistenceWorker = (queue) => {
  const persistNextMessage = Effect.gen(function* () {
    const { artifactPath, runId, payload } = yield* Queue.take(queue);
    yield* Effect.promise(
      () => appendJsonLine(artifactPath, { runId, ts: Date.now(), ...payload })
    );
  });
  return Effect.forever(persistNextMessage);
};
1694
+
1695
+ // src/runner/search.ts
1696
/**
 * Tests `value` against a list of matchers: strings must be equal to the value,
 * regular expressions must match it. An absent or empty list matches everything.
 *
 * `lastIndex` is reset before each regex test so matchers created with the `g`
 * or `y` flag give the same answer on every call.
 *
 * @param {string} value - Candidate (e.g. a tag).
 * @param {Array<string|RegExp>} [matchers] - Accepted values/patterns.
 * @returns {boolean}
 */
function matchesAny(value, matchers) {
  if (!matchers || matchers.length === 0) {
    return true;
  }
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return matcher === value;
    }
    matcher.lastIndex = 0;
    return matcher.test(value);
  });
}

/**
 * Like `matchesAny`, but string matchers match when they occur anywhere inside
 * `value` (substring match) instead of requiring equality.
 *
 * @param {string} value - Candidate path.
 * @param {Array<string|RegExp>} [matchers] - Accepted fragments/patterns.
 * @returns {boolean}
 */
function matchesPath(value, matchers) {
  if (!matchers || matchers.length === 0) {
    return true;
  }
  return matchers.some((matcher) => {
    if (typeof matcher === "string") {
      return value.includes(matcher);
    }
    matcher.lastIndex = 0;
    return matcher.test(value);
  });
}

/**
 * Filters collected test cases by tag and path criteria.
 *
 * A test case is dropped when any of its tags matches `excludedTags` or its
 * file path matches `excludedPaths`. Otherwise it is kept when it satisfies
 * both `includedTags` (at least one tag matches) and `includedPaths`. Absent or
 * empty criteria impose no restriction. An empty exclusion list used to
 * exclude every candidate, because the helpers treat "no matchers" as
 * "matches".
 *
 * @param {Array<object>} all - Records with `testCase.getTags()` and `filePath`.
 * @param {object} [query] - `includedTags`, `excludedTags`, `includedPaths`,
 *   `excludedPaths`; when omitted, `all` is returned unchanged.
 * @returns {Array<object>} The matching records.
 */
function searchCollectedTestCases(all, query) {
  if (!query) {
    return all;
  }
  const hasEntries = (list) => Boolean(list) && list.length > 0;
  const { includedTags, excludedTags, includedPaths, excludedPaths } = query;

  return all.filter((item) => {
    const tags = item.testCase.getTags();
    if (hasEntries(excludedTags) && tags.some((tag) => matchesAny(tag, excludedTags))) {
      return false;
    }
    if (hasEntries(excludedPaths) && matchesPath(item.filePath, excludedPaths)) {
      return false;
    }
    const includedTagsMatch = !hasEntries(includedTags) || tags.some((tag) => matchesAny(tag, includedTags));
    const includedPathsMatch = !hasEntries(includedPaths) || matchesPath(item.filePath, includedPaths);
    return includedTagsMatch && includedPathsMatch;
  });
}
1732
+
1733
+ // src/runner/api.ts
1734
/**
 * Splits a `/source/flags` style string into its parts.
 *
 * @param {string} pattern - Candidate regex literal.
 * @returns {{source: string, flags: string}|undefined} The parts, or undefined
 *   when the string does not start with "/" or has no second "/".
 */
function parseRegexLiteral(pattern) {
  if (!pattern.startsWith("/")) {
    return void 0;
  }
  const lastSlash = pattern.lastIndexOf("/");
  if (lastSlash <= 0) {
    return void 0;
  }
  return {
    source: pattern.slice(1, lastSlash),
    flags: pattern.slice(lastSlash + 1)
  };
}

/**
 * Compiles a user-supplied name pattern into a predicate. Three forms are
 * recognized after trimming:
 * - `/source/flags`: used as a regular expression as written;
 * - a pattern containing `*`: case-insensitive whole-string wildcard match,
 *   where `*` stands for any run of characters and everything else is literal;
 * - anything else: case-insensitive equality.
 *
 * Fixes over the previous version: the predicate resets `lastIndex` before each
 * test, so regex literals with the `g`/`y` flag no longer alternate between
 * true and false, and `?` is now escaped in wildcard patterns instead of acting
 * as a regex quantifier.
 *
 * @param {string} pattern - Pattern text.
 * @returns {(value: string) => boolean} Predicate over names.
 * @throws {SyntaxError} When a regex-literal pattern is not a valid expression.
 */
function createNameMatcher(pattern) {
  const normalizedPattern = pattern.trim();
  const statelessTest = (regex) => (value) => {
    regex.lastIndex = 0;
    return regex.test(value);
  };

  const regexLiteral = parseRegexLiteral(normalizedPattern);
  if (regexLiteral) {
    return statelessTest(new RegExp(regexLiteral.source, regexLiteral.flags));
  }

  if (normalizedPattern.includes("*")) {
    const escaped = normalizedPattern.replace(/[.+?^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*");
    return statelessTest(new RegExp(`^${escaped}$`, "i"));
  }

  const expected = normalizedPattern.toLowerCase();
  return (value) => value.toLowerCase() === expected;
}
1761
/**
 * Creates a runner instance.
 *
 * The `overrides` argument is now forwarded to `withRunnerConfig`; previously
 * it was accepted but dropped, so callers could not customize the runner.
 *
 * @param {object} [overrides] - Partial runner configuration.
 * @returns {object} A new `EffectRunner`.
 */
function createRunner(overrides) {
  return new EffectRunner(withRunnerConfig(overrides));
}
1764
/**
 * Runner built on Effect primitives.
 *
 * It discovers datasets, evaluators and test cases from files, queues runs,
 * executes them on a forked scheduler fiber, writes run records through a
 * forked persistence fiber, and notifies subscribers of run events. Run
 * snapshots are kept in memory, keyed by run id.
 */
var EffectRunner = class {
  /**
   * @param {object} config - Runner configuration (`discovery`, `artifactDirectory`).
   */
  constructor(config) {
    this.eventBus = Effect.runSync(PubSub.unbounded());
    this.runQueue = Effect.runSync(Queue.unbounded());
    this.persistenceQueue = Effect.runSync(Queue.unbounded());
    this.snapshots = new Map();
    this.listeners = new Set();
    this.datasetsById = new Map();
    this.evaluatorsById = new Map();
    // Both background fibers are started as soon as the runner is constructed.
    this.schedulerFiber = Effect.runFork(this.createSchedulerEffect());
    this.persistenceFiber = Effect.runFork(
      createPersistenceWorker(this.persistenceQueue)
    );
    this.config = config;
  }

  /** Rediscovers datasets and replaces the id index with the new results. */
  async collectDatasets() {
    const discovered = await collectDatasetsFromFiles(this.config.discovery);
    this.datasetsById.clear();
    discovered.forEach((entry) => {
      this.datasetsById.set(entry.id, entry);
    });
    return discovered;
  }

  /** Rediscovers evaluators and replaces the id index with the new results. */
  async collectEvaluators() {
    const discovered = await collectEvaluatorsFromFiles(this.config.discovery);
    this.evaluatorsById.clear();
    discovered.forEach((entry) => {
      this.evaluatorsById.set(entry.id, entry);
    });
    return discovered;
  }

  /**
   * Finds the first indexed dataset whose name equals `name`, ignoring case and
   * surrounding whitespace of `name`. Triggers discovery when the index is empty.
   */
  async resolveDatasetByName(name) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    const wanted = name.trim().toLowerCase();
    for (const entry of this.datasetsById.values()) {
      if (entry.dataset.getName().toLowerCase() === wanted) {
        return entry;
      }
    }
    return void 0;
  }

  /**
   * Returns all indexed evaluators whose name satisfies `pattern` (see
   * `createNameMatcher`). Triggers discovery when the index is empty.
   */
  async resolveEvaluatorsByNamePattern(pattern) {
    if (this.evaluatorsById.size === 0) {
      await this.collectEvaluators();
    }
    const nameMatches = createNameMatcher(pattern);
    const candidates = [...this.evaluatorsById.values()];
    return candidates.filter((entry) => nameMatches(entry.evaluator.getName() ?? ""));
  }

  /** Discovers all test cases and filters them with `query`. */
  async searchTestCases(query) {
    const discovered = await collectTestCasesFromFiles(this.config.discovery);
    return searchCollectedTestCases(discovered, query);
  }

  /**
   * Returns the discovered test cases that the given dataset accepts.
   *
   * @throws {Error} When `datasetId` is not in the index.
   */
  async collectDatasetTestCases(datasetId) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    const entry = this.datasetsById.get(datasetId);
    if (!entry) {
      throw new Error(`Unknown dataset: ${datasetId}`);
    }
    const discovered = await collectTestCasesFromFiles(this.config.discovery);
    return discovered.filter(
      (candidate) => entry.dataset.matchesTestCase(candidate.testCase, candidate.filePath)
    );
  }

  /**
   * Queues a run of `request.datasetId` with the evaluators in
   * `request.evaluatorIds` (unknown ids are ignored).
   *
   * Stores a "queued" snapshot, publishes and persists a `RunQueued` event, then
   * hands the task to the scheduler.
   *
   * @returns {Promise<object>} The initial snapshot of the run.
   * @throws {Error} For an unknown dataset or when no requested evaluator is known.
   */
  async runDatasetWith(request) {
    if (this.datasetsById.size === 0) {
      await this.collectDatasets();
    }
    if (this.evaluatorsById.size === 0) {
      await this.collectEvaluators();
    }
    const { datasetId } = request;
    const datasetEntry = this.datasetsById.get(datasetId);
    if (!datasetEntry) {
      throw new Error(`Unknown dataset: ${datasetId}`);
    }

    const selectedEvaluators = [];
    for (const id of request.evaluatorIds) {
      const known = this.evaluatorsById.get(id);
      if (known) {
        selectedEvaluators.push({ id: known.id, evaluator: known.evaluator });
      }
    }
    if (selectedEvaluators.length === 0) {
      throw new Error("No evaluators selected for run");
    }

    const selectedTestCases = await this.collectDatasetTestCases(datasetId);
    const runId = `run-${randomUUID()}`;
    const artifactPath = createArtifactPath(this.config.artifactDirectory, datasetId, runId);
    const totalTestCases = selectedTestCases.length;

    const snapshot = {
      runId,
      datasetId,
      datasetName: datasetEntry.dataset.getName(),
      evaluatorIds: selectedEvaluators.map((item) => item.id),
      queuedAt: Date.now(),
      totalTestCases,
      completedTestCases: 0,
      passedTestCases: 0,
      failedTestCases: 0,
      status: "queued",
      artifactPath
    };
    this.snapshots.set(runId, snapshot);

    const queuedEvent = {
      type: "RunQueued",
      runId,
      datasetId,
      datasetName: datasetEntry.dataset.getName(),
      evaluatorIds: selectedEvaluators.map((item) => item.id),
      totalTestCases,
      artifactPath
    };
    await Effect.runPromise(this.publishEvent(queuedEvent));

    const persistenceRecord = { runId, artifactPath, payload: queuedEvent };
    await Effect.runPromise(Queue.offer(this.persistenceQueue, persistenceRecord));

    const runTask = {
      runId,
      datasetId,
      dataset: datasetEntry.dataset,
      evaluators: selectedEvaluators,
      testCases: selectedTestCases,
      snapshot
    };
    await Effect.runPromise(Queue.offer(this.runQueue, runTask));
    return snapshot;
  }

  /**
   * Registers a listener for run events, optionally limited to one run via
   * `options.runId`.
   *
   * @returns {Function} Call it to remove the listener.
   */
  subscribeRunEvents(listener, options) {
    const subscription = { runId: options?.runId, listener };
    this.listeners.add(subscription);
    return () => {
      this.listeners.delete(subscription);
    };
  }

  /** Returns the current snapshot of a run, or undefined when unknown. */
  getRunSnapshot(runId) {
    return this.snapshots.get(runId);
  }

  /** Returns all snapshots, most recently queued first. */
  getAllRunSnapshots() {
    const all = [...this.snapshots.values()];
    return all.sort((a, b) => b.queuedAt - a.queuedAt);
  }

  /** Interrupts both fibers, then shuts down both queues and the event bus, in that order. */
  async shutdown() {
    const steps = [
      Fiber.interrupt(this.schedulerFiber),
      Fiber.interrupt(this.persistenceFiber),
      Queue.shutdown(this.runQueue),
      Queue.shutdown(this.persistenceQueue),
      PubSub.shutdown(this.eventBus)
    ];
    for (const step of steps) {
      await Effect.runPromise(step);
    }
  }

  /**
   * Builds the scheduler loop: forever take the next task from the run queue
   * and fork its execution, so the loop does not wait for a run to finish.
   */
  createSchedulerEffect() {
    const { runQueue, persistenceQueue } = this;
    const publish = (event) => this.publishEvent(event);
    const update = (runId, updater) => this.updateSnapshot(runId, updater);
    const scheduleNext = Effect.gen(function* () {
      const task = yield* Queue.take(runQueue);
      yield* Effect.fork(executeRunTask(task, publish, persistenceQueue, update));
    });
    return Effect.forever(scheduleNext);
  }

  /** Replaces the snapshot of `runId` with `updater(current)`; unknown ids are ignored. */
  updateSnapshot(runId, updater) {
    const current = this.snapshots.get(runId);
    if (current) {
      this.snapshots.set(runId, updater(current));
    }
  }

  /**
   * Effect that first calls every matching listener synchronously (those
   * without a run filter, or whose filter equals `event.runId`) and then
   * publishes the event on the event bus.
   */
  publishEvent(event) {
    const notifyListeners = Effect.sync(() => {
      this.listeners.forEach((subscription) => {
        if (!subscription.runId || subscription.runId === event.runId) {
          subscription.listener(event);
        }
      });
    });
    return notifyListeners.pipe(
      Effect.flatMap(() => PubSub.publish(this.eventBus, event)),
      Effect.asVoid
    );
  }
};
1958
/**
 * CLI entry point: parses the startup arguments, creates the runner, loads the
 * runner data (falling back to mock data when loading rejects), arranges for
 * the runner to be shut down before exiting on SIGINT/SIGTERM, and starts the
 * full-screen Ink application.
 */
async function main() {
  const cliArguments = process.argv.slice(2);
  const args = parseStartupArgs(cliArguments);
  const runner = createRunner();
  const data = await loadRunnerData(runner).catch(() => loadMockData());

  // Exit with code 0 once shutdown settles, whether or not it succeeded.
  const shutdownThenExit = () => {
    void runner.shutdown().finally(() => process.exit(0));
  };
  for (const signal of ["SIGINT", "SIGTERM"]) {
    process.on(signal, shutdownThenExit);
  }

  const app = jsx(EvalsCliApp, { data, args, runner });
  withFullScreen(app).start();
}

void main();
1973
+ //# sourceMappingURL=out.js.map
1974
+ //# sourceMappingURL=cli.js.map