vedana-backoffice 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1006 @@
1
+ import reflex as rx
2
+
3
+ from vedana_backoffice.states.eval import EvalState, RunSummary
4
+ from vedana_backoffice.states.chat import ChatState
5
+ from vedana_backoffice.ui import app_header
6
+
7
+
8
def _selection_and_actions() -> rx.Component:
    """Build the run controls: run-name input, run/reset buttons, and a reload button.

    Returns:
        A vertical stack whose first row holds the run-name input, the "run
        selected" button and the "Reset selection" button, followed by a
        full-width "Reload data" button.
    """
    run_name_input = rx.input(
        placeholder="Test run name",
        value=EvalState.test_run_name,
        on_change=EvalState.set_test_run_name,
        width="18em",
    )

    # Disabled whenever the state reports that a run cannot be started.
    run_button = rx.button(
        EvalState.selection_label,
        color_scheme="blue",
        on_click=EvalState.run_selected_tests,
        loading=EvalState.is_running,
        disabled=rx.cond(EvalState.can_run, False, True),  # type: ignore[arg-type]
    )

    # Disabled while nothing is selected.
    reset_button = rx.button(
        "Reset selection",
        variant="ghost",
        color_scheme="gray",
        size="1",
        disabled=rx.cond(EvalState.selected_count > 0, False, True),  # type: ignore[arg-type]
        on_click=EvalState.reset_selection,
    )

    controls_row = rx.hstack(
        run_name_input,
        run_button,
        rx.spacer(),
        reset_button,
        spacing="2",
        align="center",
        width="100%",
    )

    reload_button = rx.button(
        "Reload data",
        variant="soft",
        color_scheme="gray",
        size="1",
        on_click=EvalState.load_eval_data,
        loading=EvalState.loading,
        width="100%",
    )

    return rx.vstack(
        controls_row,
        reload_button,
        spacing="3",
        width="100%",
    )
50
+
51
+
52
def _questions_card() -> rx.Component:
    """Build the "Golden QA Dataset" card.

    The card contains a heading, a scenario selector with a refresh button, and
    a vertically scrollable table rendered from
    ``EvalState.eval_gds_rows_with_selection``.
    """

    def _expandable_text(row: dict[str, rx.Var], key: str, clamp: int = 10) -> rx.Component:
        """Render ``row[key]`` in a clickable cell, clamped to ``clamp`` lines unless expanded."""
        # TODO: reuse _expandable_text in both tables
        content = row.get(key, "")
        clamped_style = {
            "display": "-webkit-box",
            "WebkitLineClamp": str(clamp),
            "WebkitBoxOrient": "vertical",
            "overflow": "hidden",
            "textOverflow": "ellipsis",
        }
        full_text = rx.text(content, size="1", white_space="pre-wrap")
        clamped_text = rx.text(
            content,
            size="1",
            white_space="pre-wrap",
            style=clamped_style,
        )
        # Clicking anywhere in the box triggers the row's expand toggle handler.
        toggle = EvalState.toggle_gds_row(row_id=row.get("id", ""))  # type: ignore[arg-type,call-arg,func-returns-value]
        return rx.table.cell(
            rx.box(
                rx.cond(row.get("expanded", False), full_text, clamped_text),
                cursor="pointer",
                on_click=toggle,
            )
        )

    def _row(row: dict[str, rx.Var]) -> rx.Component:
        """Render one dataset row: selection checkbox, three text cells, scenario badge."""
        question_id = row.get("id", "")
        scenario = row.get("question_scenario", "")

        select_cell = rx.table.cell(
            rx.checkbox(
                checked=row.get("selected", False),
                on_change=lambda checked: EvalState.toggle_question_selection(question=question_id, checked=checked),  # type: ignore[arg-type,call-arg,func-returns-value]
            )
        )
        # The badge is only shown for a non-empty scenario; otherwise an empty box.
        scenario_cell = rx.table.cell(
            rx.cond(
                scenario != "",
                rx.badge(
                    scenario,
                    variant="soft",
                    size="1",
                    color_scheme=row.get("scenario_color", "gray"),
                ),
                rx.box(),
            )
        )
        return rx.table.row(
            select_cell,
            _expandable_text(row, "gds_question"),
            _expandable_text(row, "gds_answer"),
            _expandable_text(row, "question_context"),
            scenario_cell,
        )

    refresh_button = rx.tooltip(
        rx.button(
            "↻",
            variant="ghost",
            color_scheme="gray",
            size="1",
            on_click=EvalState.refresh_golden_dataset,
            loading=EvalState.is_running,
        ),
        content="Refresh golden dataset from Grist",
    )
    scenario_controls = rx.hstack(
        rx.text("Scenario", size="1", color="gray"),
        rx.select(
            items=EvalState.available_scenarios,
            value=EvalState.selected_scenario,
            on_change=EvalState.set_scenario,
            width="12em",
        ),
        refresh_button,
        spacing="2",
        align="center",
    )
    title_bar = rx.hstack(
        rx.heading("Golden QA Dataset", size="4"),
        rx.spacer(),
        scenario_controls,
        align="center",
        width="100%",
    )

    select_all_header = rx.table.column_header_cell(
        rx.checkbox(
            checked=EvalState.all_selected,
            on_change=EvalState.toggle_select_all,
        )
    )
    text_headers = [
        rx.table.column_header_cell(title)
        for title in ("Question", "Golden answer", "Context", "Scenario")
    ]
    table = rx.table.root(
        rx.table.header(rx.table.row(select_all_header, *text_headers)),
        rx.table.body(rx.foreach(EvalState.eval_gds_rows_with_selection, _row)),
        variant="surface",
        style={"width": "100%", "tableLayout": "fixed"},
    )
    scrollable_table = rx.scroll_area(
        table,
        type="always",
        scrollbars="vertical",
        style={"flex": "1", "width": "100%", "minHeight": "0"},
    )

    return rx.card(
        rx.vstack(
            title_bar,
            scrollable_table,
            spacing="3",
            style={"height": "100%", "display": "flex", "flexDirection": "column"},
        ),
        padding="1em",
        width="100%",
        style={"maxHeight": "80vh", "display": "flex", "flexDirection": "column"},
    )
166
+
167
+
168
def _judge_card() -> rx.Component:
    """Build the "Judge configuration" card: judge model name plus a prompt-viewer button."""
    model_section = rx.box(
        rx.text("Judge model", weight="medium"),
        rx.text(EvalState.judge_model, size="3"),
        padding_bottom="0.75em",
    )

    # The button stays disabled while the judge prompt id is an empty string.
    view_prompt_button = rx.button(
        "View Judge Prompt",
        variant="soft",
        size="1",
        on_click=EvalState.open_judge_prompt_dialog,
        disabled=rx.cond(EvalState.judge_prompt_id != "", False, True),  # type: ignore[arg-type]
        width="100%",
        margin_top="0.5em",
    )

    body = rx.vstack(
        rx.heading("Judge configuration", size="4"),
        model_section,
        view_prompt_button,
        spacing="1",
        width="100%",
    )
    return rx.card(body, padding="1em", width="100%")
192
+
193
+
194
def _pipeline_card() -> rx.Component:
    """Build the "Pipeline config" card.

    The card has three sections: data model actions (view / refresh), pipeline
    model selection (provider, optional OpenRouter key, model, filtering
    checkbox), and a read-only embeddings summary.
    """
    # --- Data model section -------------------------------------------------
    view_dm_button = rx.button(
        "View Data Model",
        variant="soft",
        size="1",
        on_click=EvalState.open_data_model_dialog,
        # Disabled while the data model id is an empty string.
        disabled=rx.cond(EvalState.dm_id != "", False, True),  # type: ignore[arg-type]
        width="100%",
        margin_top="0.5em",
    )
    # The refresh action and its loading flag come from ChatState, not EvalState.
    refresh_dm_button = rx.button(
        "Refresh Data Model",
        variant="soft",
        size="1",
        on_click=ChatState.reload_data_model,
        loading=ChatState.is_refreshing_dm,
        width="100%",
        margin_top="0.5em",
    )
    data_model_section = rx.box(
        rx.text("Data model", weight="medium"),
        view_dm_button,
        refresh_dm_button,
        width="100%",
        padding_bottom="0.75em",
    )

    # --- Pipeline model section ---------------------------------------------
    provider_select = rx.select(
        items=["openai", "openrouter"],
        value=EvalState.provider,
        on_change=EvalState.set_provider,
        width="100%",
        placeholder="Provider",
    )
    # The key input is only rendered for the "openrouter" provider; it is
    # marked required when no default key is present.
    openrouter_key_input = rx.cond(
        EvalState.provider == "openrouter",
        rx.input(
            placeholder=rx.cond(
                EvalState.default_openrouter_key_present,
                "(Optional) custom OPENROUTER_API_KEY",
                "(Required) OPENROUTER_API_KEY",
            ),
            type="password",
            value=EvalState.custom_openrouter_key,
            on_change=EvalState.set_custom_openrouter_key,
            width="100%",
            required=rx.cond(EvalState.default_openrouter_key_present, False, True),
        ),
    )
    model_select = rx.select(
        items=EvalState.available_models_view,
        value=EvalState.pipeline_model,
        on_change=EvalState.set_pipeline_model,
        width="100%",
        placeholder="Select model",
    )
    filtering_checkbox = rx.checkbox(
        "Filter Data Model",
        checked=EvalState.enable_dm_filtering,
        on_change=EvalState.set_enable_dm_filtering,
        size="2",
    )
    pipeline_section = rx.box(
        rx.text("Pipeline model", weight="medium", width="100%"),
        rx.hstack(
            provider_select,
            openrouter_key_input,
            model_select,
            filtering_checkbox,
            spacing="2",
            align="center",
            wrap="wrap",
            width="100%",
        ),
        padding_bottom="0.75em",
        width="100%",
    )

    # --- Embeddings section -------------------------------------------------
    embeddings_section = rx.box(
        rx.text("Embeddings", weight="medium"),
        rx.text(
            # Fall back to an em dash when no embeddings model is set.
            rx.cond(EvalState.embeddings_model != "", EvalState.embeddings_model, "—"),
            size="3",
        ),
        rx.text(
            EvalState.embeddings_dim_label,
            size="1",
            color="gray",
        ),
    )

    sections = rx.vstack(
        data_model_section,
        pipeline_section,
        embeddings_section,
        spacing="1",
        width="100%",
    )
    return rx.card(
        rx.vstack(
            rx.heading("Pipeline config", size="4"),
            sections,
            spacing="3",
            width="100%",
        ),
        padding="1em",
        width="100%",
    )
289
+
290
+
291
def _tests_card() -> rx.Component:
    """Build the "Test results" card.

    The card shows a header with summary text, a cost badge and three filter /
    sort selectors, a scrollable table rendered from ``EvalState.tests_rows``,
    and a row of pagination controls.
    """

    def _expandable_text(row: dict[str, rx.Var], key: str, clamp: int = 2) -> rx.Component:
        """Render ``row[key]`` in a clickable cell, clamped to ``clamp`` lines unless expanded."""
        content = row.get(key, "")
        clamped_style = {
            "display": "-webkit-box",
            "WebkitLineClamp": str(clamp),
            "WebkitBoxOrient": "vertical",
            "overflow": "hidden",
            "textOverflow": "ellipsis",
            "wordBreak": "break-word",
        }
        full_text = rx.text(
            content,
            size="1",
            white_space="pre-wrap",
            style={"wordBreak": "break-word"},
        )
        clamped_text = rx.text(
            content,
            size="1",
            white_space="pre-wrap",
            style=clamped_style,
        )
        toggle = EvalState.toggle_row_expand(row_id=row.get("row_id", ""))  # type: ignore[arg-type,call-arg,func-returns-value]
        clickable = rx.box(
            rx.cond(row.get("expanded", False), full_text, clamped_text),
            cursor="pointer",
            on_click=toggle,
            style={"minWidth": "0", "width": "100%"},
        )
        return rx.table.cell(clickable, style={"minWidth": "0"})

    def _row(row: dict[str, rx.Var]) -> rx.Component:
        """Render one test result row in the same column order as the table header."""
        status_cell = rx.table.cell(
            rx.badge(
                row.get("test_status", ""),
                color_scheme=row.get("status_color", "gray"),
                variant="soft",
            )
        )
        return rx.table.row(
            rx.table.cell(rx.text(row.get("test_date", ""))),
            _expandable_text(row, "gds_question"),
            rx.table.cell(rx.text(row.get("pipeline_model", ""))),
            _expandable_text(row, "llm_answer"),
            _expandable_text(row, "gds_answer"),
            status_cell,
            rx.table.cell(rx.text(row.get("eval_judge_rating", "—"))),
            _expandable_text(row, "eval_judge_comment"),
        )

    def _filter_select(items, value, placeholder: str, on_change, width: str) -> rx.Component:
        """Build one of the header dropdowns; all share the same set of props."""
        return rx.select(
            items=items,
            value=value,
            placeholder=placeholder,
            on_change=on_change,
            width=width,
        )

    def _pager_button(label: str, handler, is_disabled) -> rx.Component:
        """Build one soft, small pagination button."""
        return rx.button(
            label,
            variant="soft",
            size="1",
            on_click=handler,
            disabled=is_disabled,
        )

    header_bar = rx.hstack(
        rx.heading("Test results", size="4"),
        rx.spacer(),
        rx.text(EvalState.pass_fail_summary, size="2", color="gray"),
        rx.badge(EvalState.cost_label, color_scheme="gray", variant="soft"),
        _filter_select(
            EvalState.tests_sort_options,
            EvalState.selected_tests_sort,
            "Sort",
            EvalState.select_tests_sort,
            "14em",
        ),
        _filter_select(
            EvalState.tests_scenario_options,
            EvalState.selected_tests_scenario,
            "Scenario (All)",
            EvalState.select_tests_scenario,
            "14em",
        ),
        _filter_select(
            EvalState.run_id_options,
            EvalState.selected_run_id,
            "Run id (All)",
            EvalState.select_run,
            "18em",
        ),
        align="center",
        width="100%",
    )

    column_titles = (
        "Run at",
        "Question",
        "Pipeline",
        "Answer",
        "Golden Answer",
        "Status",
        "Rating",
        "Judge comment",
    )
    results_table = rx.table.root(
        rx.table.header(
            rx.table.row(*[rx.table.column_header_cell(title) for title in column_titles])
        ),
        rx.table.body(rx.foreach(EvalState.tests_rows, _row)),
        variant="surface",
        style={
            "width": "100%",
            "maxWidth": "100%",
            "tableLayout": "fixed",
        },
    )
    scrollable_results = rx.scroll_area(
        results_table,
        type="always",
        scrollbars="vertical",
        style={
            "maxHeight": "80vh",
            "width": "100%",
            "maxWidth": "100%",
        },
    )

    # Server-side pagination controls
    no_prev = ~EvalState.tests_has_prev  # type: ignore[operator]
    no_next = ~EvalState.tests_has_next  # type: ignore[operator]
    pager_buttons = rx.hstack(
        _pager_button("⏮", EvalState.tests_first_page, no_prev),
        _pager_button("← Prev", EvalState.tests_prev_page, no_prev),
        rx.text(
            EvalState.tests_page_display,  # type: ignore[arg-type]
            size="2",
            style={"minWidth": "100px", "textAlign": "center"},
        ),
        _pager_button("Next →", EvalState.tests_next_page, no_next),
        _pager_button("⏭", EvalState.tests_last_page, no_next),
        spacing="2",
        align="center",
    )
    pagination_bar = rx.hstack(
        rx.text(EvalState.tests_rows_display, size="2", color="gray"),  # type: ignore[arg-type]
        rx.spacer(),
        pager_buttons,
        align="center",
        width="100%",
    )

    return rx.card(
        rx.vstack(
            header_bar,
            scrollable_results,
            pagination_bar,
            spacing="3",
        ),
        padding="1em",
        width="100%",
    )
453
+
454
+
455
def _compare_card() -> rx.Component:
    """Build the "Compare runs" card: two run selectors, a compare button, and an error callout."""

    def _run_select(value, placeholder: str, on_change) -> rx.Component:
        """Build a full-width run dropdown; both selectors list the same options."""
        return rx.select(
            items=EvalState.run_options_only,
            value=value,
            placeholder=placeholder,
            on_change=on_change,
            width="100%",
        )

    run_pickers = rx.vstack(
        _run_select(EvalState.compare_run_a, "Run A", EvalState.set_compare_run_a),
        _run_select(EvalState.compare_run_b, "Run B", EvalState.set_compare_run_b),
        spacing="2",
        align="center",
        width="100%",
    )

    compare_button = rx.button(
        "Compare",
        on_click=EvalState.compare_runs,
        disabled=~EvalState.can_compare_runs,  # type: ignore[operator]
        loading=EvalState.compare_loading,
        width="100%",
        margin_top="0.5em",
    )

    # A red callout appears only when the error string is non-empty.
    error_notice = rx.cond(
        EvalState.compare_error != "",
        rx.callout(EvalState.compare_error, color_scheme="red", variant="soft"),
        rx.box(),
    )

    return rx.card(
        rx.vstack(
            rx.heading("Compare runs", size="4"),
            run_pickers,
            compare_button,
            error_notice,
            spacing="3",
            width="100%",
        ),
        padding="1em",
        width="100%",
    )
497
+
498
+
499
def _compare_dialog() -> rx.Component:
    """Build the "Run comparison" dialog.

    The dialog stacks, top to bottom: a summary card per run, a config card per
    run, an optional "Differences" card with judge-prompt and data-model diffs,
    the per-question comparison table (or a spinner while loading), and a
    close button. Its open state is bound to ``EvalState.compare_dialog_open``.
    """

    def _stat_block(label: str, summary: RunSummary) -> rx.Component:
        """Render a card of summary badges (pass / fail / rating / cost / time) for one run."""
        avg_time = summary["avg_answer_time_sec"]
        median_time = summary["median_answer_time_sec"]

        # Shows "No tests" instead of the pass count when tests_total is falsy.
        pass_badge = rx.badge(
            rx.text(
                rx.cond(
                    summary["tests_total"],
                    f"Pass: {summary['passed']}",
                    "No tests",
                ),
                size="1",
            ),
            color_scheme="green",
            variant="soft",
        )
        fail_badge = rx.badge(
            f"Fail: {summary['failed']}",
            color_scheme="red",
            variant="soft",
        )
        rating_badge = rx.badge(
            f"Rating: {summary['avg_rating']}",
            color_scheme="blue",
            variant="soft",
        )
        # NOTE(review): `summary` is passed a state var by the caller below;
        # confirm that the `:.3f` format spec is actually honoured when a var
        # (rather than a plain float) is formatted into an f-string.
        cost_badge = rx.badge(
            f"Cost: ${summary['cost_total']:.3f}",
            color_scheme="gray",
            variant="soft",
        )
        time_badge = rx.badge(
            rx.text(
                f"Time (avg/med): {avg_time} / {median_time}",
                size="1",
            ),
            color_scheme="purple",
            variant="soft",
        )

        badges = rx.hstack(
            pass_badge,
            fail_badge,
            rating_badge,
            cost_badge,
            time_badge,
            spacing="2",
            align="center",
        )
        return rx.card(
            rx.vstack(
                rx.text(label, weight="medium"),
                badges,
                spacing="2",
            ),
            padding="0.75em",
            width="100%",
        )

    def _config_block(rows: list[dict[str, rx.Var]]) -> rx.Component:
        """Render a "Config" card with one label/value line per row."""

        def _row(row: dict[str, rx.Var]) -> rx.Component:
            def _line(style: dict[str, str]) -> rx.Component:
                return rx.hstack(
                    rx.text(row["label"], weight="medium", size="1"),
                    rx.spacer(),
                    rx.text(row["value"], size="1"),
                    style=style,
                )

            # Rows flagged with "diff" get an amber background.
            return rx.cond(
                row["diff"],
                _line({"padding": "0.15em 0.25em", "backgroundColor": "var(--amber-3)"}),
                _line({"padding": "0.15em 0.25em"}),
            )

        return rx.card(
            rx.vstack(
                rx.text("Config", weight="medium"),
                rx.foreach(rows, _row),
                spacing="1",
            ),
            padding="0.75em",
            width="100%",
        )

    def _diff_table(title: str, rows: list[dict[str, rx.Var]]) -> rx.Component:
        """Render a two-column, monospace, scrollable diff under ``title``."""

        def _line(row: dict[str, rx.Var]) -> rx.Component:
            def _side(text_key: str, color_key: str) -> rx.Component:
                """Render one side of a diff line; strong rows get bold text and a coloured left border."""
                border = rx.cond(
                    row.get("strong", False),
                    f"2px solid {row.get(color_key, 'inherit')}",
                    "2px solid transparent",
                )
                return rx.box(
                    rx.text(
                        row.get(text_key, ""),
                        size="1",
                        white_space="pre-wrap",
                        weight=rx.cond(row.get("strong", False), "bold", "regular"),
                        color=row.get(color_key, "inherit"),
                    ),
                    style={
                        "fontFamily": "monospace",
                        "padding": "1px 3px",
                        "borderRadius": "4px",
                        "borderLeft": border,
                        "width": "100%",
                    },
                )

            return rx.hstack(
                _side("left", "left_color"),
                _side("right", "right_color"),
                spacing="1",
                width="100%",
            )

        run_labels = rx.hstack(
            rx.text(EvalState.compare_run_label_a, weight="medium", size="1"),
            rx.spacer(),
            rx.text(EvalState.compare_run_label_b, weight="medium", size="1"),
            width="100%",
        )
        lines = rx.scroll_area(
            rx.vstack(
                rx.foreach(rows, _line),
                spacing="1",
                width="100%",
            ),
            type="always",
            scrollbars="vertical",
            style={"maxHeight": "22vh", "padding": "1px"},
        )
        return rx.vstack(
            rx.text(title, weight="medium"),
            run_labels,
            lines,
            spacing="1",
            width="100%",
        )

    def _result_row(row: dict[str, rx.Var]) -> rx.Component:
        """Render one comparison row: question + golden answer, then run A and run B results."""

        def _badge_color(status: str | None) -> rx.Var:
            """
            green for pass, red for fail, grey else
            """
            fail_or_other = rx.cond(status == "fail", "red", "gray")
            return rx.cond(status == "pass", "green", fail_or_other)

        def _answer_block(text: str, tool_calls: str) -> rx.Component:
            """Render the answer text plus a collapsed "Tool calls" accordion when tool calls are non-empty."""
            tool_calls_panel = rx.accordion.root(
                rx.accordion.item(
                    rx.accordion.trigger("Tool calls", style={"fontSize": "12px"}),
                    rx.accordion.content(
                        rx.text(
                            tool_calls,
                            size="1",
                            color="gray",
                            white_space="pre-wrap",
                            style={"wordBreak": "break-word"},
                        )
                    ),
                    value="tool-calls",
                ),
                collapsible=True,
                type="single",
                default_value="",
                variant="ghost",
                style={"width": "100%"},
            )
            return rx.vstack(
                rx.text(text, size="1", white_space="pre-wrap", style={"wordBreak": "break-word"}),
                rx.cond(tool_calls != "", tool_calls_panel, rx.box()),
                spacing="1",
                align="start",
                width="100%",
            )

        def _run_cell(status, rating, answer, tool_calls, comment) -> rx.Component:
            """Render the result cell for a single run."""
            status_line = rx.hstack(
                rx.badge(
                    status,
                    color_scheme=_badge_color(status),  # type: ignore[arg-type]
                    variant="soft",
                ),
                rx.text(f"Rating: {rating}", size="1"),
                align="center",
            )
            return rx.table.cell(
                rx.vstack(
                    status_line,
                    _answer_block(answer, tool_calls),  # type: ignore[arg-type]
                    rx.text(comment, size="1", color="gray"),
                    spacing="1",
                    align="start",
                )
            )

        question_cell = rx.table.cell(
            rx.vstack(
                rx.text(row["question"]),
                rx.text(
                    row["golden_answer"],
                    size="1",
                    color="gray",
                    white_space="pre-wrap",
                    style={"wordBreak": "break-word"},
                ),
                spacing="1",
                align="start",
                width="100%",
            )
        )
        return rx.table.row(
            question_cell,
            _run_cell(
                row["status_a"],
                row["rating_a"],
                row["answer_a"],
                row["tool_calls_a"],
                row["comment_a"],
            ),
            _run_cell(
                row["status_b"],
                row["rating_b"],
                row["answer_b"],
                row["tool_calls_b"],
                row["comment_b"],
            ),
        )

    def _diff_item(title: str, compact, on_toggle, rows, item_value: str) -> rx.Component:
        """Build one accordion item holding a "Show only changes" checkbox and a diff table."""
        return rx.accordion.item(
            rx.accordion.trigger(title),
            rx.accordion.content(
                rx.vstack(
                    rx.checkbox(
                        "Show only changes",
                        default_checked=True,
                        checked=compact,
                        on_change=on_toggle,
                        size="2",
                    ),
                    _diff_table(title, rows),  # type: ignore[arg-type]
                    spacing="1",
                )
            ),
            value=item_value,
        )

    summary_row = rx.hstack(
        _stat_block(EvalState.compare_run_label_a, EvalState.compare_summary_a),  # type: ignore[arg-type]
        _stat_block(EvalState.compare_run_label_b, EvalState.compare_summary_b),  # type: ignore[arg-type]
        spacing="3",
        width="100%",
    )
    config_row = rx.hstack(
        _config_block(EvalState.compare_config_a_rows),  # type: ignore[arg-type]
        _config_block(EvalState.compare_config_b_rows),  # type: ignore[arg-type]
        spacing="3",
        width="100%",
    )

    diff_keys_line = rx.hstack(
        rx.text("Differences:", weight="medium", size="1"),
        rx.box(rx.foreach(EvalState.compare_diff_keys, lambda k: rx.badge(k, variant="soft"))),
        spacing="3",
        align="center",
    )
    diff_accordion = rx.accordion.root(
        _diff_item(
            "Judge prompt diff",
            EvalState.compare_judge_prompt_compact,
            EvalState.set_compare_judge_prompt_compact,
            EvalState.compare_prompt_rows_view,
            "prompt-diff-block",
        ),
        _diff_item(
            "Data model diff",
            EvalState.compare_dm_compact,
            EvalState.set_compare_dm_compact,
            EvalState.compare_dm_rows_view,
            "dm-diff-block",
        ),
        type="multiple",
        collapsible=True,
        variant="outline",
        width="100%",
    )
    # The differences card is rendered only when the diff key list is non-empty.
    differences_section = rx.cond(
        EvalState.compare_diff_keys != [],
        rx.card(
            rx.vstack(
                diff_keys_line,
                diff_accordion,
                spacing="2",
                width="100%",
            ),
            variant="surface",
            width="100%",
        ),
        rx.box(),
    )

    results_table = rx.table.root(
        rx.table.header(
            rx.table.row(
                rx.table.column_header_cell("Question", style={"width": "25%"}),
                rx.table.column_header_cell(
                    EvalState.compare_run_label_a, style={"width": "37.5%"}
                ),
                rx.table.column_header_cell(
                    EvalState.compare_run_label_b, style={"width": "37.5%"}
                ),
            )
        ),
        rx.table.body(rx.foreach(EvalState.compare_rows, _result_row)),
        variant="surface",
        style={"width": "100%", "tableLayout": "fixed"},
    )
    # A spinner replaces the table while the comparison is loading.
    results_section = rx.cond(
        EvalState.compare_loading,
        rx.center(rx.spinner(size="3"), height="200px"),
        rx.scroll_area(
            results_table,
            type="always",
            scrollbars="vertical",
            style={"maxHeight": "60vh"},
        ),
    )

    dialog_body = rx.vstack(
        summary_row,
        config_row,
        differences_section,
        results_section,
        rx.dialog.close(rx.button("Close", variant="soft")),
        spacing="3",
        width="100%",
    )
    return rx.dialog.root(
        rx.dialog.content(
            rx.dialog.title("Run comparison"),
            dialog_body,
            style={"maxWidth": "92vw"},
        ),
        open=EvalState.compare_dialog_open,
        on_open_change=EvalState.set_compare_dialog_open,
    )
853
+
854
+
855
def _judge_prompt_dialog() -> rx.Component:
    """Build the "Judge Prompt" dialog showing the prompt id and the prompt text.

    The dialog's open state is bound to ``EvalState.judge_prompt_dialog_open``.
    """
    prompt_id_line = rx.text(f"Prompt ID: {EvalState.judge_prompt_id}", size="2", color="gray")

    # Show a placeholder message while the prompt text is an empty string.
    prompt_text = rx.cond(
        EvalState.judge_prompt != "",
        EvalState.judge_prompt,
        "Prompt not loaded",
    )
    prompt_panel = rx.box(
        rx.text(prompt_text, size="2"),
        padding="1em",
        border="1px solid var(--gray-6)",
        border_radius="8px",
        style={"maxHeight": "60vh", "overflow": "auto", "whiteSpace": "pre-wrap"},
    )

    close_button = rx.dialog.close(
        rx.button("Close", variant="soft"),
    )

    content = rx.dialog.content(
        rx.dialog.title("Judge Prompt"),
        rx.vstack(
            prompt_id_line,
            prompt_panel,
            close_button,
            spacing="3",
            width="100%",
        ),
        style={"maxWidth": "800px"},
    )
    return rx.dialog.root(
        content,
        open=EvalState.judge_prompt_dialog_open,
        on_open_change=EvalState.set_judge_prompt_dialog_open,
    )
886
+
887
+
888
def _data_model_dialog() -> rx.Component:
    """Build the "Data Model" dialog showing the model id and its description.

    The dialog's open state is bound to ``EvalState.data_model_dialog_open``.
    """
    model_id_line = rx.text(f"Model ID: {EvalState.dm_id}", size="2", color="gray")

    # Show a placeholder message while the description is an empty string.
    description_text = rx.cond(
        EvalState.dm_description != "",
        EvalState.dm_description,
        "Description not loaded",
    )
    description_panel = rx.box(
        rx.text(description_text, size="2"),
        padding="1em",
        border="1px solid var(--gray-6)",
        border_radius="8px",
        style={"maxHeight": "60vh", "overflow": "auto", "whiteSpace": "pre-wrap"},
    )

    close_button = rx.dialog.close(
        rx.button("Close", variant="soft"),
    )

    content = rx.dialog.content(
        rx.dialog.title("Data Model"),
        rx.vstack(
            model_id_line,
            description_panel,
            close_button,
            spacing="3",
            width="100%",
        ),
        style={"maxWidth": "800px"},
    )
    return rx.dialog.root(
        content,
        open=EvalState.data_model_dialog_open,
        on_open_change=EvalState.set_data_model_dialog_open,
    )
919
+
920
+
921
def _status_messages() -> rx.Component:
    """Build the status area: status callout, error callout, and run log card.

    Each of the three parts is rendered conditionally and replaced by an empty
    box when its condition is false.
    """
    # Secondary progress line, shown only when the progress string is non-empty.
    progress_line = rx.cond(
        EvalState.current_question_progress != "",
        rx.text(EvalState.current_question_progress, size="1", color="gray"),  # type: ignore[arg-type]
        rx.box(),
    )
    status_notice = rx.cond(
        EvalState.status_message != "",
        rx.callout(
            rx.vstack(
                rx.text(EvalState.status_message, weight="medium"),
                progress_line,
                spacing="1",
            ),
            color_scheme="green",
            variant="soft",
            width="100%",
        ),
        rx.box(),
    )

    error_notice = rx.cond(
        EvalState.error_message != "",
        rx.callout(EvalState.error_message, color_scheme="red", variant="soft"),
        rx.box(),
    )

    # One text line per entry of EvalState.run_progress, in a fixed-height scroll area.
    log_lines = rx.scroll_area(
        rx.vstack(
            rx.foreach(
                EvalState.run_progress,
                lambda line: rx.text(line, size="1"),
            )
        ),
        type="always",
        scrollbars="vertical",
        style={"height": "160px"},
    )
    run_log = rx.cond(
        EvalState.has_run_progress,
        rx.card(
            rx.vstack(
                rx.heading("Run log", size="3"),
                log_lines,
                spacing="2",
            ),
            padding="1em",
            width="100%",
        ),
        rx.box(),
    )

    return rx.vstack(
        status_notice,
        error_notice,
        run_log,
        spacing="3",
        width="100%",
    )
972
+
973
+
974
def page() -> rx.Component:
    """Assemble the evaluation page.

    Layout: the app header on top, then a two-column grid (``3fr 1fr``) with the
    questions and test-results cards on the left and the configuration, action,
    status and compare panels on the right. The three dialogs are appended
    after the grid.
    """
    header = app_header()

    main_column = rx.vstack(
        _questions_card(),
        _tests_card(),
    )
    side_column = rx.vstack(
        _judge_card(),
        _pipeline_card(),
        _selection_and_actions(),
        _status_messages(),
        _compare_card(),
        spacing="4",
        width="100%",
    )

    layout = rx.grid(
        main_column,
        side_column,
        columns="2",
        spacing="4",
        width="100%",
        style={"gridTemplateColumns": "3fr 1fr", "height": "calc(100vh - 200px)", "minHeight": "700px"},
    )
    content = rx.vstack(
        layout,
        spacing="4",
        width="100%",
    )

    return rx.vstack(
        header,
        content,
        _compare_dialog(),
        _judge_prompt_dialog(),
        _data_model_dialog(),
        align="start",
        spacing="2",
        padding="1.5em",
    )