vedana-backoffice 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1940 @@
1
+ import asyncio
2
+ import difflib
3
+ import hashlib
4
+ import json
5
+ import logging
6
+ import os
7
+ import statistics
8
+ import traceback
9
+ from dataclasses import asdict, dataclass
10
+ from datetime import datetime
11
+ from typing import Any, TypedDict, cast
12
+ from uuid import UUID
13
+
14
+ import reflex as rx
15
+ import requests
16
+ import sqlalchemy as sa
17
+ from datapipe.compute import run_steps
18
+ from jims_core.db import ThreadDB, ThreadEventDB
19
+ from jims_core.llms.llm_provider import LLMProvider
20
+ from jims_core.llms.llm_provider import env_settings as llm_settings
21
+ from jims_core.thread.thread_controller import ThreadController
22
+ from jims_core.util import uuid7
23
+ from pydantic import BaseModel, Field
24
+ from vedana_core.settings import settings as core_settings
25
+ from vedana_etl.app import app as etl_app
26
+
27
+ from vedana_backoffice.states.common import get_vedana_app
28
+ from vedana_backoffice.util import safe_render_value
29
+
30
+
31
+ class QuestionResult(TypedDict):
32
+ status: str
33
+ rating: float | str | None
34
+ comment: str
35
+ answer: str
36
+ tool_calls: str
37
+ golden_answer: str
38
+ thread_id: str
39
+
40
+
41
+ class RunSummary(TypedDict):
42
+ run_id: str
43
+ run_label: str
44
+ tests_total: int
45
+ passed: int
46
+ failed: int
47
+ pass_rate: float
48
+ avg_rating: str # rounded and converted to str
49
+ cost_total: float
50
+ test_run_name: str
51
+ avg_answer_time_sec: float
52
+ median_answer_time_sec: float
53
+
54
+
55
+ @dataclass
56
+ class GraphMeta:
57
+ nodes_by_label: dict[str, int]
58
+ edges_by_type: dict[str, int]
59
+ vector_indexes: list[dict[str, object]]
60
+
61
+
62
+ @dataclass
63
+ class DmMeta:
64
+ dm_id: str = ""
65
+ dm_description: str = ""
66
+
67
+
68
+ @dataclass
69
+ class JudgeMeta:
70
+ judge_model: str = ""
71
+ judge_prompt_id: str = ""
72
+ judge_prompt: str = ""
73
+
74
+
75
+ @dataclass
76
+ class RunConfig:
77
+ pipeline_model: str = ""
78
+ embeddings_model: str = ""
79
+ embeddings_dim: int = 0
80
+
81
+
82
+ @dataclass
83
+ class RunMeta:
84
+ graph: GraphMeta
85
+ judge: JudgeMeta
86
+ run_config: RunConfig
87
+ dm: DmMeta
88
+
89
+
90
+ @dataclass
91
+ class RunData:
92
+ summary: RunSummary
93
+ meta: RunMeta
94
+ config_summary: "RunConfigSummary"
95
+ results: dict[str, QuestionResult]
96
+
97
+
98
+ @dataclass
99
+ class RunConfigSummary:
100
+ test_run_id: str
101
+ test_run_name: str
102
+ pipeline_model: str
103
+ embeddings_model: str
104
+ embeddings_dim: int
105
+ judge_model: str
106
+ judge_prompt_id: str
107
+ judge_prompt_hash: str
108
+ dm_hash: str
109
+ dm_id: str
110
+ graph_nodes: dict[str, int]
111
+ graph_edges: dict[str, int]
112
+ vector_indexes: list[dict[str, object]]
113
+
114
+
115
+ @dataclass
116
+ class DiffLine:
117
+ left: str
118
+ right: str
119
+ op: str
120
+ left_color: str
121
+ right_color: str
122
+ strong: bool
123
+ row_idx: int
124
+ is_change: bool
125
+
126
+
127
+ @dataclass
128
+ class CompareRow:
129
+ question: str
130
+ golden_answer: str
131
+ status_a: str
132
+ rating_a: float | str | None
133
+ comment_a: str
134
+ answer_a: str
135
+ tool_calls_a: str
136
+ status_b: str
137
+ rating_b: float | str | None
138
+ comment_b: str
139
+ answer_b: str
140
+ tool_calls_b: str
141
+
142
+
143
+ DiffRow = dict[str, str | int | bool]
144
+
145
+
146
+ EMPTY_RESULT: QuestionResult = {
147
+ "status": "—",
148
+ "rating": "—",
149
+ "comment": "",
150
+ "answer": "",
151
+ "tool_calls": "",
152
+ "golden_answer": "",
153
+ "thread_id": "",
154
+ }
155
+
156
+ EMPTY_SUMMARY: RunSummary = {
157
+ "run_id": "",
158
+ "run_label": "",
159
+ "tests_total": 0,
160
+ "passed": 0,
161
+ "failed": 0,
162
+ "pass_rate": 0.0,
163
+ "avg_rating": "—",
164
+ "cost_total": 0.0,
165
+ "test_run_name": "",
166
+ "avg_answer_time_sec": 0.0,
167
+ "median_answer_time_sec": 0.0,
168
+ }
169
+
170
+
171
+ class EvalState(rx.State):
172
+ """State holder for evaluation workflow."""
173
+
174
+ loading: bool = False
175
+ error_message: str = ""
176
+ status_message: str = ""
177
+ eval_gds_rows: list[dict[str, Any]] = []
178
+ gds_expanded_rows: list[str] = []
179
+ selected_question_ids: list[str] = []
180
+ test_run_name: str = ""
181
+ selected_scenario: str = "all" # Filter by scenario
182
+ judge_model: str = ""
183
+ judge_prompt_id: str = ""
184
+ judge_prompt: str = ""
185
+ provider: str = "openai"
186
+ pipeline_model: str = core_settings.model
187
+ embeddings_model: str = core_settings.embeddings_model
188
+ embeddings_dim: int = core_settings.embeddings_dim
189
+ custom_openrouter_key: str = ""
190
+ default_openrouter_key_present: bool = bool(os.environ.get("OPENROUTER_API_KEY"))
191
+ enable_dm_filtering: bool = os.environ.get("ENABLE_DM_FILTERING", "").strip().lower() in ("1", "true", "yes")  # avoid bool("false") == True
192
+ _default_models: tuple[str, ...] = (
193
+ "gpt-5.1-chat-latest",
194
+ "gpt-5.1",
195
+ "gpt-5-chat-latest",
196
+ "gpt-5",
197
+ "gpt-5-mini",
198
+ "gpt-5-nano",
199
+ "gpt-4.1",
200
+ "gpt-4.1-mini",
201
+ "gpt-4.1-nano",
202
+ "gpt-4o",
203
+ "gpt-4o-mini",
204
+ "o4-mini",
205
+ )
206
+ openai_models: list[str] = list(set(list(_default_models) + [core_settings.model]))
207
+ openrouter_models: list[str] = []
208
+ available_models: list[str] = list(set(list(_default_models) + [core_settings.model]))
209
+ dm_id: str = ""
210
+ tests_rows: list[dict[str, Any]] = []
211
+ tests_cost_total: float = 0.0
212
+ run_passed: int = 0
213
+ run_failed: int = 0
214
+ selected_run_id: str = ""
215
+ run_id_options: list[str] = []
216
+ run_id_lookup: dict[str, str] = {}
217
+ selected_tests_scenario: str = "All"
218
+ tests_scenario_options: list[str] = ["All"]
219
+ is_running: bool = False
220
+ run_progress: list[str] = []
221
+ max_eval_rows: int = 500
222
+ current_question_index: int = -1 # Track which question is being processed
223
+ total_questions_to_run: int = 0 # Total number of questions in current run
224
+ judge_prompt_dialog_open: bool = False
225
+ data_model_dialog_open: bool = False
226
+ dm_description: str = ""
227
+ # Server-side pagination for tests
228
+ tests_page: int = 0 # 0-indexed current page
229
+ tests_page_size: int = 100 # rows per page
230
+ tests_total_rows: int = 0 # total count
231
+ tests_sort_options: list[str] = ["Sort: Recent", "Sort: Rating"]
232
+ selected_tests_sort: str = "Sort: Recent"
233
+ max_parallel_tests: int = 4
234
+ # Run comparison
235
+ compare_run_a: str = ""
236
+ compare_run_b: str = ""
237
+ compare_dialog_open: bool = False
238
+ compare_loading: bool = False
239
+ compare_error: str = ""
240
+ compare_rows: list[dict[str, Any]] = []
241
+ compare_summary: dict[str, Any] = {}
242
+ compare_summary_a: RunSummary = EMPTY_SUMMARY
243
+ compare_summary_b: RunSummary = EMPTY_SUMMARY
244
+ compare_config_a: dict[str, object] = {}
245
+ compare_config_b: dict[str, object] = {}
246
+ compare_config_a_rows: list[dict[str, object]] = []
247
+ compare_config_b_rows: list[dict[str, object]] = []
248
+ compare_configs: dict[str, Any] = {}
249
+ compare_diff_keys: list[str] = []
250
+ compare_prompt_diff: str = ""
251
+ compare_dm_diff: str = ""
252
+ compare_prompt_full_a: str = ""
253
+ compare_prompt_full_b: str = ""
254
+ compare_dm_full_a: str = ""
255
+ compare_dm_full_b: str = ""
256
+ compare_prompt_diff_rows: list[DiffRow] = []
257
+ compare_dm_diff_rows: list[DiffRow] = []
258
+ compare_judge_prompt_compact: bool = True
259
+ compare_dm_compact: bool = True
260
+
261
+ @rx.var
262
+ def available_scenarios(self) -> list[str]:
263
+ """Get unique scenarios from eval_gds_rows."""
264
+ scenarios = set()
265
+ for row in self.eval_gds_rows:
266
+ scenario = row.get("question_scenario")
267
+ if scenario:
268
+ scenarios.add(str(scenario))
269
+ return ["all"] + sorted(scenarios)
270
+
271
+ @rx.var
272
+ def eval_gds_rows_with_selection(self) -> list[dict[str, Any]]:
273
+ selected = set(self.selected_question_ids)
274
+ expanded = set(self.gds_expanded_rows)
275
+ rows: list[dict[str, Any]] = []
276
+ for row in self.eval_gds_rows:
277
+ # Apply scenario filter
278
+ if self.selected_scenario != "all":
279
+ scenario = row.get("question_scenario")
280
+ if str(scenario) != self.selected_scenario:
281
+ continue
282
+ enriched = dict(row)
283
+ enriched["selected"] = row.get("id") in selected
284
+ enriched["expanded"] = row.get("id") in expanded
285
+ # Add scenario color for badge display
286
+ scenario_val = str(row.get("question_scenario", ""))
287
+ enriched["scenario_color"] = self._scenario_color(scenario_val)
288
+ rows.append(enriched)
289
+ return rows
290
+
291
+ @rx.var
292
+ def selected_count(self) -> int:
293
+ return len(self.selected_question_ids or [])
294
+
295
+ @rx.var
296
+ def selection_label(self) -> str:
297
+ total = len(self.eval_gds_rows_with_selection) # Use filtered count
298
+ if total == 0:
299
+ return "No questions available"
300
+ if self.selected_count == 0:
301
+ return f"Select tests to run ({self.selected_count}/{total})"
302
+ return f"Run selected ({self.selected_count}/{total})"
303
+
304
+ @rx.var
305
+ def all_selected(self) -> bool:
306
+ rows = len(self.eval_gds_rows_with_selection) # Use filtered count
307
+ return rows > 0 and rows == self.selected_count
308
+
309
+ @rx.var
310
+ def can_run(self) -> bool:
311
+ return (self.selected_count > 0) and (not self.is_running)
312
+
313
+ @rx.var
314
+ def cost_label(self) -> str:
315
+ if self.tests_cost_total > 0:
316
+ return f"${self.tests_cost_total:.4f}"
317
+ return "Cost data unavailable"
318
+
319
+ @rx.var
320
+ def tests_row_count(self) -> int:
321
+ return len(self.tests_rows or [])
322
+
323
+ @rx.var
324
+ def tests_row_count_str(self) -> str:
325
+ return f"{self.tests_row_count} rows" if self.tests_row_count else "No records"
326
+
327
+ @rx.var
328
+ def has_run_progress(self) -> bool:
329
+ return len(self.run_progress or []) > 0
330
+
331
+ @rx.var
332
+ def pass_fail_summary(self) -> str:
333
+ return f"{self.run_passed} pass / {self.run_failed} fail"
334
+
335
+ @rx.var
336
+ def current_question_progress(self) -> str:
337
+ """Display current question progress."""
338
+ if self.total_questions_to_run == 0:
339
+ return ""
340
+ current = self.current_question_index + 1
341
+ return f"Processing question {current} of {self.total_questions_to_run}"
342
+
343
+ @rx.var
344
+ def embeddings_dim_label(self) -> str:
345
+ return f"{self.embeddings_dim} dims" if self.embeddings_dim > 0 else ""
346
+
347
+ @rx.var
348
+ def run_options_only(self) -> list[str]:
349
+ """Run options excluding the 'All' placeholder."""
350
+ return [opt for opt in (self.run_id_options or []) if opt != "All"]
351
+
352
+ @rx.var
353
+ def can_compare_runs(self) -> bool:
354
+ """Enable compare when both runs are selected and distinct."""
355
+ return (
356
+ bool(self.compare_run_a)
357
+ and bool(self.compare_run_b)
358
+ and self.compare_run_a != self.compare_run_b
359
+ and not self.compare_loading
360
+ )
361
+
362
+ @rx.var
363
+ def available_models_view(self) -> list[str]:
364
+ return self.available_models
365
+
366
+ def toggle_question_selection(self, question: str, checked: bool) -> None:
367
+ question = str(question or "").strip()
368
+ if not question:
369
+ return
370
+ current = list(self.selected_question_ids or [])
371
+ if checked:
372
+ if question not in current:
373
+ current.append(question)
374
+ else:
375
+ current = [q for q in current if q != question]
376
+ self.selected_question_ids = current
377
+
378
+ def toggle_select_all(self, checked: bool) -> None:
379
+ if not checked:
380
+ self.selected_question_ids = []
381
+ return
382
+ # Only select from filtered rows
383
+ ids = [str(row.get("id", "") or "").strip() for row in self.eval_gds_rows_with_selection if row.get("id")]
384
+ self.selected_question_ids = [qid for qid in ids if qid] # Filter out empty IDs
385
+
386
+ def reset_selection(self) -> None:
387
+ self.selected_question_ids = []
388
+ self.status_message = ""
389
+
390
+ def open_judge_prompt_dialog(self) -> None:
391
+ self.judge_prompt_dialog_open = True
392
+
393
+ def close_judge_prompt_dialog(self) -> None:
394
+ self.judge_prompt_dialog_open = False
395
+
396
+ def set_judge_prompt_dialog_open(self, open: bool) -> None:
397
+ self.judge_prompt_dialog_open = open
398
+
399
+ def open_data_model_dialog(self) -> None:
400
+ self.data_model_dialog_open = True
401
+
402
+ def close_data_model_dialog(self) -> None:
403
+ self.data_model_dialog_open = False
404
+
405
+ def set_data_model_dialog_open(self, open: bool) -> None:
406
+ self.data_model_dialog_open = open
407
+
408
+ def set_scenario(self, value: str) -> None:
409
+ """Set the scenario filter and prune invalid selections."""
410
+ self.selected_scenario = str(value or "all")
411
+ if self.selected_scenario == "all":
412
+ self._prune_selection()
413
+ return
414
+
415
+ # Drop selections that are not in the chosen scenario
416
+ allowed_ids = {
417
+ str(row.get("id"))
418
+ for row in self.eval_gds_rows
419
+ if str(row.get("question_scenario", "")) == self.selected_scenario
420
+ }
421
+ self.selected_question_ids = [q for q in (self.selected_question_ids or []) if q in allowed_ids]
422
+
423
+ def set_test_run_name(self, value: str) -> None:
424
+ """Set user-provided test run name."""
425
+ self.test_run_name = str(value or "").strip()
426
+
427
+ def set_pipeline_model(self, value: str) -> None:
428
+ if value in self.available_models:
429
+ self.pipeline_model = value
430
+
431
+ def set_custom_openrouter_key(self, value: str) -> None:
432
+ self.custom_openrouter_key = str(value or "").strip()
433
+ # optional: could refetch models with the override; keep static to avoid extra calls
434
+
435
+ def set_enable_dm_filtering(self, value: bool) -> None:
436
+ self.enable_dm_filtering = value
437
+
438
+ def set_provider(self, value: str) -> None:
439
+ self.provider = str(value or "openai")
440
+ if self.provider == "openrouter" and not self.openrouter_models:
441
+ self.fetch_openrouter_models()
442
+ self._sync_available_models()
443
+
444
+ def set_compare_run_a(self, value: str) -> None:
445
+ self.compare_run_a = str(value or "").strip()
446
+
447
+ def set_compare_run_b(self, value: str) -> None:
448
+ self.compare_run_b = str(value or "").strip()
449
+
450
+ def set_compare_dialog_open(self, open: bool) -> None:
451
+ self.compare_dialog_open = open
452
+
453
+ def _prune_selection(self) -> None:
454
+ # Validate against all rows (not filtered) to keep selections valid across filter changes
455
+ valid = {str(row.get("id")) for row in self.eval_gds_rows if row.get("id")}
456
+ self.selected_question_ids = [q for q in (self.selected_question_ids or []) if q in valid]
457
+
458
+ def _filter_chat_capable(self, models: list[dict[str, Any]]) -> list[str]:
459
+ result: list[str] = []
460
+ for m in models:
461
+ model_id = str(m.get("id", "")).strip()
462
+ if not model_id:
463
+ continue
464
+
465
+ architecture = m.get("architecture", {}) or {}
466
+ has_chat = False
467
+ if "text" in architecture.get("input_modalities", []) and "text" in architecture.get(
468
+ "output_modalities", []
469
+ ):
470
+ has_chat = True
471
+
472
+ has_tools = "tools" in (m.get("supported_parameters") or [])
473
+
474
+ if has_chat and has_tools:
475
+ result.append(model_id)
476
+
477
+ return result
478
+
479
+ def fetch_openrouter_models(self) -> None:
480
+ try:
481
+ resp = requests.get(
482
+ f"{llm_settings.openrouter_api_base_url}/models",
483
+ # headers={"Authorization": f"Bearer {openrouter_api_key}"}, # actually works without a token as well
484
+ timeout=10,
485
+ )
486
+ resp.raise_for_status()
487
+ payload = resp.json()
488
+ models = payload.get("data", [])
489
+ parsed = self._filter_chat_capable(models)
490
+ except Exception as exc: # pragma: no cover - best effort
491
+ logging.warning(f"Failed to fetch OpenRouter models: {exc}")
492
+ parsed = []
493
+ self.openrouter_models = sorted(list(parsed))
494
+
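For context, `_filter_chat_capable` above keeps only models whose `architecture` advertises text input and output and whose `supported_parameters` include "tools". A minimal standalone sketch of the same filter against a hand-written payload (field names mirror the code above; the sample model id is made up):

    def filter_chat_capable(models: list[dict]) -> list[str]:
        kept: list[str] = []
        for m in models:
            model_id = str(m.get("id", "")).strip()
            arch = m.get("architecture") or {}
            has_chat = "text" in arch.get("input_modalities", []) and "text" in arch.get("output_modalities", [])
            has_tools = "tools" in (m.get("supported_parameters") or [])
            if model_id and has_chat and has_tools:
                kept.append(model_id)
        return kept

    sample = [{
        "id": "example/chat-model",  # hypothetical id, for illustration only
        "architecture": {"input_modalities": ["text"], "output_modalities": ["text"]},
        "supported_parameters": ["tools"],
    }]
    assert filter_chat_capable(sample) == ["example/chat-model"]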
495
+ def _sync_available_models(self) -> None:
496
+ if self.provider == "openrouter":
497
+ models = self.openrouter_models
498
+ if not models:
499
+ self.provider = "openai"
500
+ models = self.openai_models
501
+ else:
502
+ models = self.openai_models
503
+
504
+ self.available_models = list(models)
505
+ if self.pipeline_model not in self.available_models and self.available_models:
506
+ self.pipeline_model = self.available_models[0]
507
+
508
+ def _resolved_pipeline_model(self) -> str:
509
+ provider = self.provider or "openai"
510
+ return f"{provider}/{self.pipeline_model}"
511
+
512
+ def get_eval_gds_from_grist(self):
513
+ # Run datapipe step to refresh eval_gds from Grist first
514
+ step = next((s for s in etl_app.steps if s._name == "get_eval_gds_from_grist"), None)
515
+ if step is not None:
516
+ try:
517
+ run_steps(etl_app.ds, [step])
518
+ except Exception as exc:
519
+ logging.exception(f"Failed to run get_eval_gds_from_grist: {exc}")
520
+
521
+ async def _load_eval_questions(self) -> None:
522
+ vedana_app = await get_vedana_app()
523
+
524
+ stmt = sa.text(
525
+ f"""
526
+ SELECT gds_question, gds_answer, question_context, question_scenario
527
+ FROM "eval_gds"
528
+ ORDER BY gds_question
529
+ LIMIT {int(self.max_eval_rows)}
530
+ """
531
+ )
532
+
533
+ async with vedana_app.sessionmaker() as session:
534
+ result = await session.execute(stmt)
535
+ rs = result.mappings().all()
536
+
537
+ rows: list[dict[str, Any]] = []
538
+ for rec in rs:
539
+ question = str(safe_render_value(rec.get("gds_question")) or "").strip()
540
+ rows.append(
541
+ {
542
+ "id": question,
543
+ "gds_question": question,
544
+ "gds_answer": safe_render_value(rec.get("gds_answer")),
545
+ "question_context": safe_render_value(rec.get("question_context")),
546
+ "question_scenario": safe_render_value(rec.get("question_scenario")),
547
+ }
548
+ )
549
+ self.eval_gds_rows = rows
550
+ self.gds_expanded_rows = []
551
+ self._prune_selection()
552
+
553
+ async def _load_judge_config(self) -> None:
554
+ self.judge_model = core_settings.judge_model
555
+ self.judge_prompt_id = ""
556
+ self.judge_prompt = ""
557
+
558
+ vedana_app = await get_vedana_app()
559
+ dm_pt = await vedana_app.data_model.prompt_templates()
560
+ judge_prompt = dm_pt.get("eval_judge_prompt")
561
+
562
+ if judge_prompt:
563
+ self.judge_prompt_id = hashlib.sha256(judge_prompt.encode("utf-8")).hexdigest()
565
+ self.judge_prompt = judge_prompt
566
+
567
+ async def _load_pipeline_config(self) -> None:
568
+ vedana_app = await get_vedana_app()
569
+ dm = vedana_app.data_model
570
+ self.dm_description = await dm.to_text_descr()
571
+ self.dm_id = hashlib.sha256(self.dm_description.encode("utf-8")).hexdigest()
573
+
574
+ def _status_color(self, status: str) -> str:
575
+ if status == "pass":
576
+ return "green"
577
+ if status == "fail":
578
+ return "red"
579
+ return "gray"
580
+
581
+ def _scenario_color(self, scenario: str) -> str:
582
+ """Assign a consistent color to each unique scenario value."""
583
+ if not scenario:
584
+ return "gray"
585
+ color_schemes = [
586
+ "blue",
587
+ "green",
588
+ "purple",
589
+ "pink",
590
+ "indigo",
591
+ "cyan",
592
+ "amber",
593
+ "lime",
594
+ "emerald",
595
+ "teal",
596
+ "sky",
597
+ "violet",
598
+ "fuchsia",
599
+ "rose",
600
+ "orange",
601
+ "slate",
602
+ ]
603
+ hash_val = hash(str(scenario))
604
+ color_idx = abs(hash_val) % len(color_schemes)
605
+ return color_schemes[color_idx]
606
+
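One caveat with the scheme above: Python's built-in hash() for strings is salted per interpreter process (PYTHONHASHSEED), so a scenario keeps its badge color within a session but may get a different one after a restart. If cross-restart stability were needed, a digest-based index would do it; a minimal sketch, not part of the package:

    import hashlib

    def stable_color_index(scenario: str, n_colors: int) -> int:
        # md5 is fine here: we only need a deterministic spread, not security
        digest = hashlib.md5(scenario.encode("utf-8")).digest()
        return int.from_bytes(digest[:4], "big") % n_colors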
607
+ async def _load_tests(self) -> None:
608
+ """Build test results table directly from JIMS threads (source=eval)."""
609
+ vedana_app = await get_vedana_app()
610
+
611
+ async with vedana_app.sessionmaker() as session:
612
+ run_rows = (
613
+ await session.execute(
614
+ sa.select(ThreadDB.contact_id, ThreadDB.thread_config)
615
+ .where(ThreadDB.thread_config.contains({"source": "eval"}))
616
+ .order_by(ThreadDB.created_at.desc())
617
+ )
618
+ ).all()
619
+ scenario_rows = (
620
+ (
621
+ await session.execute(
622
+ sa.select(ThreadDB.thread_config).where(ThreadDB.thread_config.contains({"source": "eval"}))
623
+ )
624
+ )
625
+ .scalars()
626
+ .all()
627
+ )
628
+
629
+ seen = set()
630
+ ordered_runs: list[tuple[str, dict[str, Any]]] = []
631
+ for rid, cfg in run_rows:
632
+ if rid not in seen:
633
+ ordered_runs.append((rid, cfg))
634
+ seen.add(rid)
635
+
636
+ lookup: dict[str, str] = {}
637
+ labels: list[str] = []
638
+ for rid, cfg in ordered_runs:
639
+ base_label = self._format_run_label_with_name(rid, cfg)
640
+ label = base_label
641
+ if label in lookup:
642
+ label = f"{base_label} ({rid})"
643
+ lookup[label] = rid
644
+ labels.append(label)
645
+
646
+ self.run_id_lookup = lookup
647
+ self.run_id_options = ["All", *labels]
648
+ if not self.selected_run_id:
649
+ self.selected_run_id = labels[0] if labels else "All"
650
+ if self.selected_tests_sort not in self.tests_sort_options:
651
+ self.selected_tests_sort = "Sort: Recent"
652
+
653
+ # Scenario options
654
+ scenarios = [
655
+ str(cfg.get("question_scenario"))
656
+ for cfg in scenario_rows
657
+ if isinstance(cfg, dict) and cfg.get("question_scenario") not in (None, "", "None")
658
+ ]
659
+ scen_seen = set()
660
+ scen_labels = []
661
+ for sc in scenarios:
662
+ if sc not in scen_seen:
663
+ scen_labels.append(sc)
664
+ scen_seen.add(sc)
665
+ self.tests_scenario_options = ["All", *scen_labels]
666
+ if self.selected_tests_scenario not in self.tests_scenario_options:
667
+ self.selected_tests_scenario = "All"
668
+
669
+ eval_result_subq = (
670
+ sa.select(
671
+ ThreadEventDB.thread_id.label("thread_id"),
672
+ ThreadEventDB.event_data.label("eval_data"),
673
+ ThreadEventDB.created_at.label("eval_created_at"),
674
+ sa.func.row_number()
675
+ .over(partition_by=ThreadEventDB.thread_id, order_by=ThreadEventDB.created_at.desc())
676
+ .label("rn"),
677
+ )
678
+ .where(ThreadEventDB.event_type == "eval.result")
679
+ .subquery()
680
+ )
681
+
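The subquery above is the usual "latest row per group" pattern: row_number() partitioned by thread and ordered by created_at descending, so rn == 1 marks each thread's most recent eval.result event. Roughly the SQL SQLAlchemy emits (illustrative only; the table and column names come from the jims_core models, so treat them as assumptions):

    # Illustrative only -- the real SQL is generated by SQLAlchemy from ThreadEventDB.
    LATEST_EVAL_RESULT_SQL = """
    SELECT thread_id,
           event_data AS eval_data,
           created_at AS eval_created_at,
           row_number() OVER (PARTITION BY thread_id ORDER BY created_at DESC) AS rn
    FROM thread_events            -- table name assumed
    WHERE event_type = 'eval.result'
    """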
682
+ # Base query for eval threads
683
+ base_threads = sa.select(ThreadDB).where(ThreadDB.thread_config.contains({"source": "eval"}))
684
+ if self.selected_run_id and self.selected_run_id != "All":
685
+ selected_raw = self.run_id_lookup.get(self.selected_run_id)
686
+ if selected_raw:
687
+ base_threads = base_threads.where(ThreadDB.contact_id == selected_raw)
688
+ if self.selected_tests_scenario and self.selected_tests_scenario != "All":
689
+ base_threads = base_threads.where(
690
+ ThreadDB.thread_config.contains({"question_scenario": self.selected_tests_scenario})
691
+ )
692
+
693
+ count_q = sa.select(sa.func.count()).select_from(base_threads.subquery())
694
+ self.tests_total_rows = int((await session.execute(count_q)).scalar_one())
695
+
696
+ offset = self.tests_page * self.tests_page_size
697
+ threads_q = base_threads
698
+
699
+ selected_sort = self.selected_tests_sort or "Sort: Recent"
700
+ rating_expr = None
701
+ if selected_sort in ("Sort: Rating"):
702
+ threads_q = threads_q.join(
703
+ eval_result_subq,
704
+ sa.and_(eval_result_subq.c.thread_id == ThreadDB.thread_id, eval_result_subq.c.rn == 1),
705
+ isouter=True,
706
+ )
707
+ rating_expr = sa.cast(eval_result_subq.c.eval_data["eval_judge_rating"].astext, sa.Integer)
708
+
709
+ if selected_sort == "Sort: Rating" and rating_expr is not None:
710
+ threads_q = threads_q.order_by(sa.desc(rating_expr), ThreadDB.created_at.desc())
711
+ else:
712
+ threads_q = threads_q.order_by(ThreadDB.created_at.desc())
713
+
714
+ threads_q = threads_q.limit(self.tests_page_size).offset(offset)
715
+ page_threads = (await session.execute(threads_q)).scalars().all()
716
+
717
+ if not page_threads:
718
+ self.tests_rows = []
719
+ self.tests_cost_total = 0.0
720
+ self.run_passed = 0
721
+ self.run_failed = 0
722
+ return
723
+
724
+ thread_ids = [t.thread_id for t in page_threads]
725
+ ev_stmt = (
726
+ sa.select(ThreadEventDB)
727
+ .where(ThreadEventDB.thread_id.in_(thread_ids))
728
+ .order_by(ThreadEventDB.created_at)
729
+ )
730
+ events_res = (await session.execute(ev_stmt)).scalars().all()
731
+
732
+ events_by_thread: dict[UUID, list[ThreadEventDB]] = {}
733
+ for ev in events_res:
734
+ events_by_thread.setdefault(ev.thread_id, []).append(ev)
735
+
736
+ rows: list[dict[str, Any]] = []
737
+ passed = 0
738
+ failed = 0
739
+ cost_total = 0.0
740
+ for thread in page_threads:
741
+ cfg = thread.thread_config or {}
742
+ evs = events_by_thread.get(thread.thread_id, [])
743
+ answer = ""
744
+ status = "—"
745
+ judge_comment = ""
746
+ rating_label = "—"
747
+ run_label = self._format_run_label_with_name(thread.contact_id, cfg)
748
+ test_date = run_label
749
+
750
+ for ev in evs:
751
+ if ev.event_type == "comm.assistant_message":
752
+ answer = str(ev.event_data.get("content", ""))
753
+ elif ev.event_type == "rag.query_processed":
754
+ tech = ev.event_data.get("technical_info", {}) if isinstance(ev.event_data, dict) else {}
755
+ model_stats = tech.get("model_stats") if isinstance(tech, dict) else {}
756
+ if isinstance(model_stats, dict):
757
+ for stats in model_stats.values():
758
+ if isinstance(stats, dict):
759
+ cost_val = stats.get("requests_cost")
760
+ try:
761
+ if cost_val is not None:
762
+ cost_total += float(cost_val)
763
+ except (TypeError, ValueError):
764
+ pass
765
+ elif ev.event_type == "eval.result":
766
+ status = ev.event_data.get("test_status", status)
767
+ judge_comment = ev.event_data.get("eval_judge_comment", judge_comment)
768
+ rating_label = str(ev.event_data.get("eval_judge_rating", rating_label))
769
+ test_date = self._format_run_label(ev.event_data.get("test_date", test_date))
770
+
771
+ if status == "pass":
772
+ passed += 1
773
+ elif status == "fail":
774
+ failed += 1
775
+
776
+ rows.append(
777
+ {
778
+ "row_id": str(thread.thread_id),
779
+ "expanded": False,
780
+ "test_date": safe_render_value(test_date),
781
+ "gds_question": safe_render_value(cfg.get("gds_question")),
782
+ "llm_answer": safe_render_value(answer),
783
+ "gds_answer": safe_render_value(cfg.get("gds_answer")),
784
+ "pipeline_model": safe_render_value(cfg.get("pipeline_model")),
785
+ "test_status": status or "—",
786
+ "status_color": self._status_color(status),
787
+ "eval_judge_comment": safe_render_value(judge_comment),
788
+ "eval_judge_rating": rating_label,
789
+ }
790
+ )
791
+
792
+ self.tests_rows = rows
793
+ self.tests_cost_total = cost_total
794
+ self.run_passed = passed
795
+ self.run_failed = failed
796
+
797
+ @rx.event(background=True) # type: ignore[operator]
798
+ async def tests_next_page(self):
799
+ """Load the next page of tests."""
800
+ async with self:
801
+ max_page = (self.tests_total_rows - 1) // self.tests_page_size if self.tests_total_rows > 0 else 0
802
+ if self.tests_page < max_page:
803
+ self.tests_page += 1
804
+ await self._load_tests()
805
+ yield
806
+
807
+ @rx.event(background=True) # type: ignore[operator]
808
+ async def tests_prev_page(self):
809
+ """Load the previous page of tests."""
810
+ async with self:
811
+ if self.tests_page > 0:
812
+ self.tests_page -= 1
813
+ await self._load_tests()
814
+ yield
815
+
816
+ @rx.event(background=True) # type: ignore[operator]
817
+ async def tests_first_page(self):
818
+ """Jump to the first page."""
819
+ async with self:
820
+ if self.tests_page != 0:
821
+ self.tests_page = 0
822
+ await self._load_tests()
823
+ yield
824
+
825
+ @rx.event(background=True) # type: ignore[operator]
826
+ async def tests_last_page(self):
827
+ """Jump to the last page."""
828
+ async with self:
829
+ max_page = (self.tests_total_rows - 1) // self.tests_page_size if self.tests_total_rows > 0 else 0
830
+ if self.tests_page != max_page:
831
+ self.tests_page = max_page
832
+ await self._load_tests()
833
+ yield
834
+
835
+ @rx.event(background=True) # type: ignore[operator]
836
+ async def select_run(self, value: str):
837
+ """Update selected run id and reload tests."""
838
+ async with self:
839
+ self.selected_run_id = str(value or "")
840
+ self.tests_page = 0
841
+ await self._load_tests()
842
+ yield
843
+
844
+ @rx.event(background=True) # type: ignore[operator]
845
+ async def select_tests_scenario(self, value: str):
846
+ """Update scenario filter for tests and reload."""
847
+ async with self:
848
+ self.selected_tests_scenario = str(value or "All")
849
+ self.tests_page = 0
850
+ await self._load_tests()
851
+ yield
852
+
853
+ @rx.event(background=True) # type: ignore[operator]
854
+ async def select_tests_sort(self, value: str):
855
+ """Update sorting for tests and reload."""
856
+ async with self:
857
+ self.selected_tests_sort = str(value or "Sort: Recent")
858
+ self.tests_page = 0
859
+ await self._load_tests()
860
+ yield
861
+
862
+ def _resolve_run_contact(self, label: str) -> str:
863
+ """Translate UI label to contact_id; fallback to provided label."""
864
+ if not label:
865
+ return ""
866
+ return self.run_id_lookup.get(label, label)
867
+
868
+ def set_compare_judge_prompt_compact(self, checked: bool) -> None:
869
+ """Toggle compact diff view for prompt diff."""
870
+ self.compare_judge_prompt_compact = bool(checked)
871
+
872
+ def set_compare_dm_compact(self, checked: bool) -> None:
873
+ """Toggle compact diff view for data model diff."""
874
+ self.compare_dm_compact = bool(checked)
875
+
876
+ def compare_runs(self):
877
+ """Connecting button with a background task. Used to trigger animations properly."""
878
+ if self.compare_loading:
879
+ return
880
+ if not self.compare_run_a or not self.compare_run_b or self.compare_run_a == self.compare_run_b:
881
+ self.compare_error = "Select two different runs to compare."
882
+ self.compare_dialog_open = True
883
+ return
884
+
885
+ run_a_id = self._resolve_run_contact(self.compare_run_a)
886
+ run_b_id = self._resolve_run_contact(self.compare_run_b)
887
+ if not run_a_id or not run_b_id:
888
+ self.compare_error = "Unable to resolve selected runs."
889
+ self.compare_dialog_open = True
890
+ return
891
+
892
+ self.compare_loading = True
893
+ self.compare_error = ""
894
+ self.compare_dialog_open = True
895
+ yield
896
+ yield EvalState.compare_runs_background(run_a_id, run_b_id)
897
+
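The two yields above follow the usual Reflex pattern: the bare yield pushes the loading flag to the client right away, and the second yield chains into the background handler, which must re-acquire the state (async with self) before mutating it. A stripped-down sketch of the same pattern (class and field names are illustrative, not from this package):

    import reflex as rx

    class DemoState(rx.State):
        loading: bool = False

        def start(self):
            self.loading = True
            yield                      # flush the updated loading flag to the client
            yield DemoState.finish     # then hand off to the background task

        @rx.event(background=True)
        async def finish(self):
            async with self:           # background handlers must lock the state to mutate it
                self.loading = False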
898
+ @rx.event(background=True) # type: ignore[operator]
899
+ async def compare_runs_background(self, run_a_id: str, run_b_id: str):
900
+ try:
901
+ vedana_app = await get_vedana_app()
902
+ async with vedana_app.sessionmaker() as session:
903
+ threads_res = (
904
+ (
905
+ await session.execute(
906
+ sa.select(ThreadDB)
907
+ .where(
908
+ ThreadDB.thread_config.contains({"source": "eval"}),
909
+ ThreadDB.contact_id.in_([run_a_id, run_b_id]),
910
+ )
911
+ .order_by(ThreadDB.created_at.desc())
912
+ )
913
+ )
914
+ .scalars()
915
+ .all()
916
+ )
917
+
918
+ if not threads_res:
919
+ async with self:
920
+ self.compare_error = "No threads found for selected runs."
921
+ return
922
+
923
+ thread_ids = [t.thread_id for t in threads_res]
924
+ events_res = (
925
+ (
926
+ await session.execute(
927
+ sa.select(ThreadEventDB)
928
+ .where(ThreadEventDB.thread_id.in_(thread_ids))
929
+ .order_by(ThreadEventDB.created_at)
930
+ )
931
+ )
932
+ .scalars()
933
+ .all()
934
+ )
935
+
936
+ events_by_thread: dict[UUID, list[ThreadEventDB]] = {}
937
+ for ev in events_res:
938
+ events_by_thread.setdefault(ev.thread_id, []).append(ev)
939
+
940
+ threads_a = [t for t in threads_res if t.contact_id == run_a_id]
941
+ threads_b = [t for t in threads_res if t.contact_id == run_b_id]
942
+
943
+ async with self:
944
+ run_a_data = self._collect_run_data(run_a_id, threads_a, events_by_thread)
945
+ run_b_data = self._collect_run_data(run_b_id, threads_b, events_by_thread)
946
+
947
+ # Align questions across both runs
948
+ run_a_results = run_a_data.results
949
+ run_b_results = run_b_data.results
950
+ all_questions = set(run_a_results.keys()) | set(run_b_results.keys())
951
+
952
+ aligned_rows: list[CompareRow] = []
953
+ for q in sorted(all_questions):
954
+ ra = run_a_results.get(q, EMPTY_RESULT)
955
+ rb = run_b_results.get(q, EMPTY_RESULT)
956
+ aligned_rows.append(
957
+ CompareRow(
958
+ question=q,
959
+ golden_answer=ra["golden_answer"] or rb["golden_answer"],
960
+ status_a=ra["status"],
961
+ rating_a=ra["rating"],
962
+ comment_a=ra["comment"],
963
+ answer_a=ra["answer"],
964
+ tool_calls_a=ra["tool_calls"],
965
+ status_b=rb["status"],
966
+ rating_b=rb["rating"],
967
+ comment_b=rb["comment"],
968
+ answer_b=rb["answer"],
969
+ tool_calls_b=rb["tool_calls"],
970
+ )
971
+ )
972
+
973
+ cfg_a = asdict(run_a_data.config_summary)
974
+ cfg_b = asdict(run_b_data.config_summary)
975
+ async with self:
976
+ diff_keys = self._diff_config_keys(cfg_a, cfg_b)
977
+
978
+ # Prompt and data model diffs
979
+ meta_a = run_a_data.meta
980
+ meta_b = run_b_data.meta
981
+ prompt_a = meta_a.judge.judge_prompt
982
+ prompt_b = meta_b.judge.judge_prompt
983
+
984
+ dm_a_str = meta_a.dm.dm_description
985
+ dm_b_str = meta_b.dm.dm_description
986
+
987
+ prompt_diff = "\n".join(difflib.unified_diff(prompt_a.splitlines(), prompt_b.splitlines(), lineterm=""))
988
+ dm_diff = "\n".join(difflib.unified_diff(dm_a_str.splitlines(), dm_b_str.splitlines(), lineterm=""))
989
+ async with self:
990
+ prompt_diff_rows = self._build_side_by_side_diff(prompt_a, prompt_b)
991
+ dm_diff_rows = self._build_side_by_side_diff(dm_a_str, dm_b_str)
992
+
993
+ self.compare_summary_a = run_a_data.summary
994
+ self.compare_summary_b = run_b_data.summary
995
+ self.compare_diff_keys = diff_keys
996
+ self.compare_rows = [asdict(r) for r in aligned_rows]
997
+ self.compare_prompt_diff = prompt_diff
998
+ self.compare_dm_diff = dm_diff
999
+ self.compare_prompt_full_a = prompt_a
1000
+ self.compare_prompt_full_b = prompt_b
1001
+ self.compare_dm_full_a = dm_a_str
1002
+ self.compare_dm_full_b = dm_b_str
1003
+ self.compare_prompt_diff_rows = prompt_diff_rows
1004
+ self.compare_dm_diff_rows = dm_diff_rows
1005
+ self.compare_config_a = cfg_a
1006
+ self.compare_config_b = cfg_b
1007
+ self.compare_config_a_rows = self._config_rows(cfg_a, diff_keys)
1008
+ self.compare_config_b_rows = self._config_rows(cfg_b, diff_keys)
1009
+ except Exception as e:
1010
+ async with self:
1011
+ self.compare_error = f"Failed to compare runs: {e}"
1012
+ finally:
1013
+ async with self:
1014
+ self.compare_loading = False
1015
+ yield
1016
+
1017
+ def toggle_gds_row(self, row_id: str) -> None:
1018
+ """Toggle expansion for a golden dataset row."""
1019
+ row_id = str(row_id or "")
1020
+ if not row_id:
1021
+ return
1022
+ current = set(self.gds_expanded_rows or [])
1023
+ if row_id in current:
1024
+ current.remove(row_id)
1025
+ else:
1026
+ current.add(row_id)
1027
+ self.gds_expanded_rows = list(current)
1028
+
1029
+ def toggle_row_expand(self, row_id: str) -> None:
1030
+ """Toggle expansion state for a result row."""
1031
+ row_id = str(row_id or "")
1032
+ if not row_id:
1033
+ return
1034
+ updated = []
1035
+ for row in self.tests_rows or []:
1036
+ if str(row.get("row_id")) == row_id:
1037
+ new_row = dict(row)
1038
+ new_row["expanded"] = not bool(row.get("expanded"))
1039
+ updated.append(new_row)
1040
+ else:
1041
+ updated.append(row)
1042
+ self.tests_rows = updated
1043
+
1044
+ @rx.var
1045
+ def tests_page_display(self) -> str:
1046
+ """Current page display (1-indexed for users)."""
1047
+ total_pages = (self.tests_total_rows - 1) // self.tests_page_size + 1 if self.tests_total_rows > 0 else 1
1048
+ return f"Page {self.tests_page + 1} of {total_pages}"
1049
+
1050
+ @rx.var
1051
+ def tests_rows_display(self) -> str:
1052
+ """Display range of rows being shown."""
1053
+ if self.tests_total_rows == 0:
1054
+ return "No rows"
1055
+ start = self.tests_page * self.tests_page_size + 1
1056
+ end = min(start + self.tests_page_size - 1, self.tests_total_rows)
1057
+ return f"Rows {start}-{end} of {self.tests_total_rows}"
1058
+
1059
+ @rx.var
1060
+ def tests_has_next(self) -> bool:
1061
+ """Whether there's a next page."""
1062
+ max_page = (self.tests_total_rows - 1) // self.tests_page_size if self.tests_total_rows > 0 else 0
1063
+ return self.tests_page < max_page
1064
+
1065
+ @rx.var
1066
+ def tests_has_prev(self) -> bool:
1067
+ """Whether there's a previous page."""
1068
+ return self.tests_page > 0
1069
+
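The pagination vars above all derive from the same 0-indexed arithmetic; a quick worked example with the default page size of 100 (values chosen for illustration):

    page_size = 100
    total_rows = 250
    max_page = (total_rows - 1) // page_size          # 2 -> valid pages are 0, 1, 2
    page = 2                                          # last page, 0-indexed
    start = page * page_size + 1                      # 201
    end = min(start + page_size - 1, total_rows)      # 250
    assert (start, end, max_page) == (201, 250, 2)    # rendered as "Rows 201-250 of 250", "Page 3 of 3"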
1070
+ def _append_progress(self, message: str) -> None:
1071
+ stamp = datetime.now().strftime("%H:%M:%S")
1072
+ self.run_progress = [*self.run_progress[-20:], f"[{stamp}] {message}"]
1073
+
1074
+ def _record_val(self, rec: Any, key: str) -> Any:
1075
+ """Best-effort extractor for neo4j / sqlalchemy records."""
1076
+ if isinstance(rec, dict):
1077
+ return rec.get(key)
1078
+ getter = getattr(rec, "get", None)
1079
+ if callable(getter):
1080
+ try:
1081
+ return getter(key)
1082
+ except Exception:
1083
+ pass
1084
+ try:
1085
+ return rec[key] # type: ignore[index]
1086
+ except Exception:
1087
+ pass
1088
+ data_fn = getattr(rec, "data", None)
1089
+ if callable(data_fn):
1090
+ try:
1091
+ data = data_fn()
1092
+ if isinstance(data, dict):
1093
+ return data.get(key)
1094
+ except Exception:
1095
+ return None
1096
+ return None
1097
+
1098
+ async def _collect_graph_metadata(self, graph) -> dict[str, Any]:
1099
+ """Collect node/edge counts and vector index info from the graph."""
1100
+ meta: dict[str, Any] = {"nodes_by_label": {}, "edges_by_type": {}, "vector_indexes": []}
1101
+
1102
+ try:
1103
+ node_res = await graph.execute_ro_cypher_query(
1104
+ "MATCH (n) UNWIND labels(n) AS lbl RETURN lbl, count(*) AS cnt"
1105
+ )
1106
+ for rec in node_res:
1107
+ lbl = str(self._record_val(rec, "lbl") or "")
1108
+ cnt_val = self._record_val(rec, "cnt")
1109
+ try:
1110
+ cnt = int(cnt_val)
1111
+ except Exception:
1112
+ cnt = None
1113
+ if lbl and cnt is not None:
1114
+ meta["nodes_by_label"][lbl] = cnt
1115
+ except Exception as exc:
1116
+ meta["nodes_error"] = str(exc)
1117
+
1118
+ try:
1119
+ edge_res = await graph.execute_ro_cypher_query(
1120
+ "MATCH ()-[r]->() RETURN type(r) AS rel_type, count(*) AS cnt"
1121
+ )
1122
+ for rec in edge_res:
1123
+ rel = str(self._record_val(rec, "rel_type") or "")
1124
+ cnt_val = self._record_val(rec, "cnt")
1125
+ try:
1126
+ cnt = int(cnt_val)
1127
+ except Exception:
1128
+ cnt = None
1129
+ if rel and cnt is not None:
1130
+ meta["edges_by_type"][rel] = cnt
1131
+ except Exception as exc:
1132
+ meta["edges_error"] = str(exc)
1133
+
1134
+ try:
1135
+ res = await graph.driver.execute_query("CALL vector_search.show_index_info() YIELD * RETURN *")
1136
+ for rec in res.records:
1137
+ row = {}
1138
+ try:
1139
+ for key in rec.keys():
1140
+ row[key] = rec.get(key)
1141
+ except Exception:
1142
+ row = {}
1143
+ if row:
1144
+ meta["vector_indexes"].append(row)
1145
+ except Exception as exc:
1146
+ meta["vector_indexes_error"] = str(exc)
1147
+
1148
+ return meta
1149
+
1150
+ def _build_data_model_meta(self) -> dict[str, Any]:
1151
+ # dm_json = vedana_app.data_model.to_json()
1152
+ return {
1153
+ # "dm_json": dm_json, # may get used later
1154
+ "dm_id": self.dm_id,
1155
+ "dm_description": self.dm_description,
1156
+ }
1157
+
1158
+ async def _build_eval_meta_payload(self, vedana_app, test_run_id: str, test_run_name: str) -> dict[str, Any]:
1159
+ """Build a single eval.meta payload shared across threads for a run."""
1160
+ graph_meta = await self._collect_graph_metadata(vedana_app.graph)
1161
+ data_model_meta = self._build_data_model_meta()
1162
+ judge_meta = JudgeMeta(
1163
+ judge_model=self.judge_model,
1164
+ judge_prompt_id=self.judge_prompt_id,
1165
+ judge_prompt=self.judge_prompt,
1166
+ )
1167
+ run_config = RunConfig(
1168
+ pipeline_model=self._resolved_pipeline_model(),
1169
+ embeddings_model=self.embeddings_model,
1170
+ embeddings_dim=self.embeddings_dim,
1171
+ )
1172
+ return {
1173
+ "meta_version": 1,
1174
+ "test_run": test_run_id,
1175
+ "test_run_name": test_run_name,
1176
+ "run_config": asdict(run_config),
1177
+ "judge": asdict(judge_meta),
1178
+ "graph": graph_meta,
1179
+ "data_model": data_model_meta,
1180
+ }
1181
+
1182
+ def _format_tool_calls(self, technical_info: dict[str, Any]) -> str:
1183
+ """Flatten VTS/Cypher info into a text blob for judge/storage."""
1184
+ if not isinstance(technical_info, dict):
1185
+ return ""
1186
+ vts = technical_info.get("vts_queries") or []
1187
+ cypher = technical_info.get("cypher_queries") or []
1188
+ vts_s = "\n".join([str(v) for v in vts]) if isinstance(vts, list) else ""
1189
+ cypher_s = "\n".join([str(c) for c in cypher]) if isinstance(cypher, list) else ""
1190
+ return "\n---\n".join(part for part in [vts_s, cypher_s] if part).strip()
1191
+
1192
+ def _format_run_label(self, contact_id: str | None) -> str:
1193
+ """
1194
+ Convert run id like 'eval:20251208-214017' -> '2025-12-08 21:40:17'.
1195
+ Falls back to the raw value if parsing fails.
1196
+ """
1197
+ raw = str(contact_id or "").strip()
1198
+ if raw.startswith("eval:") and len(raw) >= 18:
1199
+ ts = raw.removeprefix("eval:")
1200
+ try:
1201
+ dt = datetime.strptime(ts, "%Y%m%d-%H%M%S")
1202
+ return dt.strftime("%Y-%m-%d %H:%M:%S")
1203
+ except Exception:
1204
+ return raw
1205
+ return raw
1206
+
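A worked example of the label conversion above, using the run id format from the docstring:

    from datetime import datetime

    raw = "eval:20251208-214017"
    ts = raw.removeprefix("eval:")
    label = datetime.strptime(ts, "%Y%m%d-%H%M%S").strftime("%Y-%m-%d %H:%M:%S")
    assert label == "2025-12-08 21:40:17"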
1207
+ def _format_run_label_with_name(self, contact_id: str | None, cfg: dict[str, Any] | None) -> str:
1208
+ """
1209
+ Prefer user-provided test_run_name, fallback to formatted timestamp label.
1210
+ """
1211
+ name = cfg["test_run_name"] if isinstance(cfg, dict) and "test_run_name" in cfg else ""
1212
+ base = self._format_run_label(contact_id)
1213
+ if name:
1214
+ return f"{name} — {base}"
1215
+ return base
1216
+
1217
+ def _normalize_diff_val(self, val: Any) -> Any:
1218
+ """Normalize values for diffing."""
1219
+ if isinstance(val, (dict, list)):
1220
+ try:
1221
+ return json.dumps(val, sort_keys=True)
1222
+ except Exception:
1223
+ return str(val)
1224
+ return val
1225
+
1226
+ def _build_side_by_side_diff(self, a_text: str, b_text: str) -> list[DiffRow]:
1227
+ """Produce side-by-side diff rows with color hints (text color, no background)."""
1228
+ a_lines = a_text.splitlines()
1229
+ b_lines = b_text.splitlines()
1230
+ sm = difflib.SequenceMatcher(None, a_lines, b_lines)
1231
+ rows: list[DiffLine] = []
1232
+ for tag, i1, i2, j1, j2 in sm.get_opcodes():
1233
+ if tag == "equal":
1234
+ for al, bl in zip(a_lines[i1:i2], b_lines[j1:j2]):
1235
+ rows.append(
1236
+ DiffLine(
1237
+ left=al,
1238
+ right=bl,
1239
+ op=tag,
1240
+ left_color="inherit",
1241
+ right_color="inherit",
1242
+ strong=False,
1243
+ row_idx=0,
1244
+ is_change=False,
1245
+ )
1246
+ )
1247
+ elif tag == "replace":
1248
+ max_len = max(i2 - i1, j2 - j1)
1249
+ for k in range(max_len):
1250
+ al = a_lines[i1 + k] if i1 + k < i2 else ""
1251
+ bl = b_lines[j1 + k] if j1 + k < j2 else ""
1252
+ rows.append(
1253
+ DiffLine(
1254
+ left=al,
1255
+ right=bl,
1256
+ op=tag,
1257
+ left_color="var(--indigo-11)",
1258
+ right_color="var(--indigo-11)",
1259
+ strong=True,
1260
+ row_idx=0,
1261
+ is_change=True,
1262
+ )
1263
+ )
1264
+ elif tag == "delete":
1265
+ for al in a_lines[i1:i2]:
1266
+ rows.append(
1267
+ DiffLine(
1268
+ left=al,
1269
+ right="",
1270
+ op=tag,
1271
+ left_color="var(--red-11)",
1272
+ right_color="inherit",
1273
+ strong=True,
1274
+ row_idx=0,
1275
+ is_change=True,
1276
+ )
1277
+ )
1278
+ elif tag == "insert":
1279
+ for bl in b_lines[j1:j2]:
1280
+ rows.append(
1281
+ DiffLine(
1282
+ left="",
1283
+ right=bl,
1284
+ op=tag,
1285
+ left_color="inherit",
1286
+ right_color="var(--green-11)",
1287
+ strong=True,
1288
+ row_idx=0,
1289
+ is_change=True,
1290
+ )
1291
+ )
1292
+ for idx, row in enumerate(rows):
1293
+ row.row_idx = idx
1294
+ row.is_change = row.op != "equal"
1295
+ return [cast(DiffRow, asdict(r)) for r in rows]
1296
+
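The method above walks difflib.SequenceMatcher opcodes and pairs lines up per tag (equal/replace/delete/insert). A self-contained illustration of what get_opcodes yields for two small inputs:

    import difflib

    a = ["alpha", "beta", "gamma"]
    b = ["alpha", "BETA", "gamma", "delta"]
    for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(None, a, b).get_opcodes():
        print(tag, a[i1:i2], b[j1:j2])
    # equal   ['alpha'] ['alpha']
    # replace ['beta']  ['BETA']
    # equal   ['gamma'] ['gamma']
    # insert  []        ['delta']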
1297
+ def _diff_config_keys(self, cfg_a: dict[str, object], cfg_b: dict[str, object]) -> list[str]:
1298
+ """Return keys whose normalized values differ."""
1299
+ keys = set(cfg_a.keys()) | set(cfg_b.keys())
1300
+ diffs: list[str] = []
1301
+ for key in keys:
1302
+ left = cfg_a[key] if key in cfg_a else None
1303
+ right = cfg_b[key] if key in cfg_b else None
1304
+ if self._normalize_diff_val(left) != self._normalize_diff_val(right):
1305
+ diffs.append(key)
1306
+ return sorted(diffs)
1307
+
1308
+ def _summarize_config_for_display(self, meta: RunMeta, cfg_fallback: dict[str, Any]) -> RunConfigSummary:
1309
+ judge = meta.judge
1310
+ dm = meta.dm
1311
+ graph = meta.graph
1312
+ run_cfg = meta.run_config
1313
+
1314
+ pipeline_model_val = (
1315
+ run_cfg.pipeline_model if run_cfg.pipeline_model else cfg_fallback.get("pipeline_model", "")
1316
+ )
1317
+ embeddings_model_val = (
1318
+ run_cfg.embeddings_model if run_cfg.embeddings_model else cfg_fallback.get("embeddings_model", "")
1319
+ )
1320
+ embeddings_dim_val = run_cfg.embeddings_dim if run_cfg.embeddings_dim else cfg_fallback.get("embeddings_dim", 0)
1321
+ pipeline_model = str(pipeline_model_val)
1322
+ embeddings_model = str(embeddings_model_val)
1323
+ embeddings_dim = self._to_int(embeddings_dim_val)
1324
+
1325
+ judge_model = judge.judge_model or cfg_fallback.get("judge_model", "")
1326
+ judge_prompt_id = judge.judge_prompt_id or cfg_fallback.get("judge_prompt_id", "")
1327
+
1328
+ judge_prompt_hash = hashlib.sha256(judge.judge_prompt.encode("utf-8")).hexdigest() if judge.judge_prompt else ""
1329
+
1330
+ test_run_id = cfg_fallback.get("test_run", "")
1331
+ test_run_name = cfg_fallback.get("test_run_name", "")
1332
+
1333
+ return RunConfigSummary(
1334
+ test_run_id=test_run_id,
1335
+ test_run_name=test_run_name,
1336
+ pipeline_model=pipeline_model,
1337
+ embeddings_model=embeddings_model,
1338
+ embeddings_dim=embeddings_dim,
1339
+ judge_model=judge_model,
1340
+ judge_prompt_id=judge_prompt_id,
1341
+ judge_prompt_hash=judge_prompt_hash,
1342
+ dm_hash=str(cfg_fallback.get("dm_hash", "")),
1343
+ dm_id=dm.dm_id,
1344
+ graph_nodes=graph.nodes_by_label,
1345
+ graph_edges=graph.edges_by_type,
1346
+ vector_indexes=graph.vector_indexes,
1347
+ )
1348
+
1349
+ @rx.var
1350
+ def compare_prompt_rows_view(self) -> list[DiffRow]:
1351
+ rows = self.compare_prompt_diff_rows or []
1352
+ if not self.compare_judge_prompt_compact:
1353
+ return rows
1354
+ change_idxs = [self._to_int(cast(int | float | str | None, r["row_idx"])) for r in rows if r.get("is_change")]
1355
+ window = 4
1356
+ return [
1357
+ r
1358
+ for r in rows
1359
+ if r.get("is_change")
1360
+ or any(abs(self._to_int(cast(int | float | str | None, r["row_idx"])) - ci) <= window for ci in change_idxs)
1361
+ ]
1362
+
1363
+ @rx.var
1364
+ def compare_dm_rows_view(self) -> list[DiffRow]:
1365
+ rows = self.compare_dm_diff_rows or []
1366
+ if not self.compare_dm_compact:
1367
+ return rows
1368
+ change_idxs = [self._to_int(cast(int | float | str | None, r["row_idx"])) for r in rows if r.get("is_change")]
1369
+ window = 4
1370
+ return [
1371
+ r
1372
+ for r in rows
1373
+ if r.get("is_change")
1374
+ or any(abs(self._to_int(cast(int | float | str | None, r["row_idx"])) - ci) <= window for ci in change_idxs)
1375
+ ]
1376
+
1377
+ @rx.var
1378
+ def compare_run_label_a(self) -> str:
1379
+ name = self.compare_summary_a.get("test_run_name", "")
1380
+ if isinstance(name, str) and name.strip():
1381
+ other = self.compare_summary_b.get("test_run_name", "")
1382
+ if isinstance(other, str) and other.strip() and other == name:
1383
+ return f"{self.compare_summary_a.get('run_label', '')}"
1384
+ return name
1385
+ label = self.compare_summary_a.get("run_label", "")
1386
+ return str(label) if label else "Run A"
1387
+
1388
+ @rx.var
1389
+ def compare_run_label_b(self) -> str:
1390
+ name = self.compare_summary_b.get("test_run_name", "")
1391
+ if isinstance(name, str) and name.strip():
1392
+ other = self.compare_summary_a.get("test_run_name", "")
1393
+ if isinstance(other, str) and other.strip() and other == name:
1394
+ return f"{self.compare_summary_b.get('run_label', '')}"
1395
+ return name
1396
+ label = self.compare_summary_b.get("run_label", "")
1397
+ return str(label) if label else "Run B"
1398
+
1399
+ def _extract_eval_meta(self, events: list[ThreadEventDB]) -> dict[str, Any]:
1400
+ """Return first eval.meta event data, if any."""
1401
+ for ev in events:
1402
+ if ev.event_type == "eval.meta" and isinstance(ev.event_data, dict):
1403
+ return ev.event_data
1404
+ return {}
1405
+
1406
+ def _to_int(self, val: int | float | str | None, default: int = 0) -> int:
1407
+ if val is None:
1408
+ return default
1409
+ try:
1410
+ narrowed: int | float | str = cast(int | float | str, val)
1411
+ return int(narrowed)
1412
+ except Exception:
1413
+ return default
1414
+
1415
+ def _parse_run_meta(self, meta: dict[str, Any], cfg_fallback: dict[str, Any]) -> RunMeta:
1416
+ graph_src = meta["graph"] if "graph" in meta and isinstance(meta["graph"], dict) else {}
1417
+ judge_src = meta["judge"] if "judge" in meta and isinstance(meta["judge"], dict) else {}
1418
+ dm_src = meta["data_model"] if "data_model" in meta and isinstance(meta["data_model"], dict) else {}
1419
+ run_cfg_src = meta["run_config"] if "run_config" in meta and isinstance(meta["run_config"], dict) else {}
1420
+
1421
+ graph = GraphMeta(
1422
+ nodes_by_label=graph_src["nodes_by_label"] if "nodes_by_label" in graph_src else {},
1423
+ edges_by_type=graph_src["edges_by_type"] if "edges_by_type" in graph_src else {},
1424
+ vector_indexes=graph_src["vector_indexes"] if "vector_indexes" in graph_src else [],
1425
+ )
1426
+ judge = JudgeMeta(
1427
+ judge_model=str(judge_src["judge_model"])
1428
+ if "judge_model" in judge_src
1429
+ else str(cfg_fallback.get("judge_model", "")),
1430
+ judge_prompt_id=str(judge_src["judge_prompt_id"])
1431
+ if "judge_prompt_id" in judge_src
1432
+ else str(cfg_fallback.get("judge_prompt_id", "")),
1433
+ judge_prompt=str(judge_src["judge_prompt"]) if "judge_prompt" in judge_src else "",
1434
+ )
1435
+ embeddings_dim_src = (
1436
+ run_cfg_src["embeddings_dim"] if "embeddings_dim" in run_cfg_src else cfg_fallback.get("embeddings_dim", 0)
1437
+ )
1438
+ run_config = RunConfig(
1439
+ pipeline_model=str(run_cfg_src["pipeline_model"])
1440
+ if "pipeline_model" in run_cfg_src
1441
+ else str(cfg_fallback.get("pipeline_model", "")),
1442
+ embeddings_model=str(run_cfg_src["embeddings_model"])
1443
+ if "embeddings_model" in run_cfg_src
1444
+ else str(cfg_fallback.get("embeddings_model", "")),
1445
+ embeddings_dim=self._to_int(embeddings_dim_src),
1446
+ )
1447
+ dm = DmMeta(
1448
+ dm_id=str(dm_src["dm_id"]) if "dm_id" in dm_src else str(cfg_fallback.get("dm_id", "")),
1449
+ dm_description=str(dm_src["dm_description"]) if "dm_description" in dm_src else "",
1450
+ )
1451
+ return RunMeta(graph=graph, judge=judge, run_config=run_config, dm=dm)
1452
+
1453
+ def _collect_run_data(
1454
+ self,
1455
+ run_id: str,
1456
+ threads: list[ThreadDB],
1457
+ events_by_thread: dict[UUID, list[ThreadEventDB]],
1458
+ ) -> RunData:
1459
+ """Aggregate results, meta, and stats for a single run."""
1460
+ results_by_question: dict[str, QuestionResult] = {}
1461
+ meta_sample: dict[str, Any] = {}
1462
+ cfg_sample: dict[str, Any] = {}
1463
+ total = 0
1464
+ passed = 0
1465
+ failed = 0
1466
+ cost_total = 0.0
1467
+ ratings: list[float] = []
1468
+ answer_times: list[float] = []
1469
+
1470
+ for thread in threads:
1471
+ cfg = thread.thread_config or {}
1472
+ if not cfg_sample:
1473
+ cfg_sample = cfg
1474
+ evs = events_by_thread.get(thread.thread_id, [])
1475
+ if not meta_sample:
1476
+ meta_sample = self._extract_eval_meta(evs)
1477
+
1478
+ answer = ""
1479
+ tool_calls = ""
1480
+ status = "—"
1481
+ judge_comment = ""
1482
+ rating_val: float | None = None
+ user_ts: datetime | None = None  # set when a comm.user_message event is seen below
+ answer_ts: datetime | None = None  # set when a comm.assistant_message event is seen below
1483
+ question_text = str(cfg["gds_question"]) if "gds_question" in cfg else ""
1484
+ golden_answer = str(cfg["gds_answer"]) if "gds_answer" in cfg else ""
1485
+
1486
+ for ev in evs:
1487
+ if not isinstance(ev.event_data, dict):
1488
+ continue
1489
+ if ev.event_type == "comm.user_message":
1490
+ user_ts = ev.created_at
1491
+ if ev.event_type == "comm.assistant_message":
1492
+ if "content" in ev.event_data:
1493
+ answer = str(ev.event_data["content"])
1494
+ answer_ts = ev.created_at
1495
+ elif ev.event_type == "rag.query_processed":
1496
+ tech = (
1497
+ cast(dict[str, Any], ev.event_data["technical_info"])
1498
+ if "technical_info" in ev.event_data
1499
+ else {}
1500
+ )
1501
+ model_stats = cast(dict[str, Any], tech["model_stats"]) if "model_stats" in tech else {}
1502
+ for stats in model_stats.values():
1503
+ if isinstance(stats, dict):
1504
+ cost_val = stats["requests_cost"] if "requests_cost" in stats else None
1505
+ try:
1506
+ if cost_val is not None:
1507
+ cost_total += float(cost_val)
1508
+ except (TypeError, ValueError):
1509
+ pass
1510
+ elif ev.event_type == "eval.result":
1511
+ status = str(ev.event_data["test_status"]) if "test_status" in ev.event_data else status
1512
+ judge_comment = (
1513
+ str(ev.event_data["eval_judge_comment"])
1514
+ if "eval_judge_comment" in ev.event_data
1515
+ else judge_comment
1516
+ )
1517
+ answer = str(ev.event_data["llm_answer"]) if "llm_answer" in ev.event_data else answer
1518
+ tool_calls = str(ev.event_data["tool_calls"]) if "tool_calls" in ev.event_data else tool_calls
1519
+ golden_answer = str(ev.event_data["gds_answer"]) if "gds_answer" in ev.event_data else golden_answer
1520
+ rating_label = ev.event_data["eval_judge_rating"] if "eval_judge_rating" in ev.event_data else None
1521
+ try:
1522
+ rating_val = float(rating_label) if rating_label is not None else None
1523
+ except Exception:
1524
+ rating_val = None
1525
+ if "gds_question" in ev.event_data:
1526
+ question_text = str(ev.event_data["gds_question"])
1527
+
1528
+ total += 1
1529
+ if status == "pass":
1530
+ passed += 1
1531
+ elif status == "fail":
1532
+ failed += 1
1533
+
1534
+ if user_ts and answer_ts:
1535
+ try:
1536
+ delta = (answer_ts - user_ts).total_seconds()
1537
+ if delta >= 0:
1538
+ answer_times.append(delta)
1539
+ except Exception:
1540
+ pass
1541
+
1542
+ if rating_val is not None:
1543
+ ratings.append(rating_val)
1544
+
1545
+ key = str(question_text or f"question-{total}")
1546
+ results_by_question[key] = QuestionResult(
1547
+ status=status or "—",
1548
+ rating=rating_val if rating_val is not None else "—",
1549
+ comment=safe_render_value(judge_comment),
1550
+ answer=safe_render_value(answer),
1551
+ tool_calls=safe_render_value(tool_calls),
1552
+ golden_answer=safe_render_value(golden_answer),
1553
+ thread_id=str(thread.thread_id),
1554
+ )
1555
+
1556
+ avg_rating = sum(ratings) / len(ratings) if ratings else 0.0
1557
+ summary: RunSummary = {
1558
+ "run_id": run_id,
1559
+ "run_label": self._format_run_label_with_name(run_id, meta_sample or cfg_sample),
1560
+ "tests_total": total,
1561
+ "passed": passed,
1562
+ "failed": failed,
1563
+ "pass_rate": (passed / total) if total else 0.0,
1564
+ "avg_rating": str(round(avg_rating, 2) if ratings else "—"),
1565
+ "cost_total": round(cost_total, 3),
1566
+ "test_run_name": meta_sample["test_run_name"]
1567
+ if "test_run_name" in meta_sample
1568
+ else (cfg_sample["test_run_name"] if "test_run_name" in cfg_sample else ""),
1569
+ "avg_answer_time_sec": round(sum(answer_times) / len(answer_times), 2) if answer_times else 0.0,
1570
+ "median_answer_time_sec": round(statistics.median(answer_times), 2) if answer_times else 0.0,
1571
+ }
1572
+
1573
+ run_meta = self._parse_run_meta(meta_sample, cfg_sample)
1574
+ config_summary = self._summarize_config_for_display(run_meta, cfg_sample)
1575
+
1576
+ return RunData(
1577
+ summary=summary,
1578
+ meta=run_meta,
1579
+ config_summary=config_summary,
1580
+ results=results_by_question,
1581
+ )
1582
+
1583
+ def _config_rows(self, cfg: dict[str, object], diff_keys: list[str]) -> list[dict[str, object]]:
1584
+ def _as_text(val: object) -> str:
1585
+ if isinstance(val, (dict, list)):
1586
+ try:
1587
+ return json.dumps(val, ensure_ascii=False)
1588
+ except Exception:
1589
+ return str(val)
1590
+ return str(val)
1591
+
1592
+ rows: list[dict[str, object]] = []
1593
+ for label, key in [
1594
+ ("Pipeline model", "pipeline_model"),
1595
+ ("Embeddings", "embeddings_model"),
1596
+ ("Embedding dims", "embeddings_dim"),
1597
+ ("Judge model", "judge_model"),
1598
+ ("Graph nodes", "graph_nodes"),
1599
+ ("Graph edges", "graph_edges"),
1600
+ # ("Vector indexes", "vector_indexes"), # takes a lot of space
1601
+ ]:
1602
+ val = _as_text(cfg[key]) if key in cfg else "—"
1603
+ rows.append({"label": label, "value": val, "diff": key in diff_keys})
1604
+ return rows
1605
+
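+ # Example of a single row produced above (hypothetical value):
+ #   {"label": "Pipeline model", "value": "gpt-4o-mini", "diff": False}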
1606
+ def _build_thread_config(
1607
+ self, question_row: dict[str, Any], test_run_id: str, test_run_name: str
1608
+ ) -> dict[str, Any]:
1609
+ """Pack metadata into thread_config so runs are traceable in JIMS."""
1610
+ resolved_model = self._resolved_pipeline_model()
1611
+ return {
1612
+ "interface": "reflex-eval",
1613
+ "source": "eval",
1614
+ "test_run": test_run_id,
1615
+ "test_run_name": test_run_name,
1616
+ "gds_question": question_row.get("gds_question"),
1617
+ "gds_answer": question_row.get("gds_answer"),
1618
+ "question_context": question_row.get("question_context"),
1619
+ "question_scenario": question_row.get("question_scenario"),
1620
+ "judge_model": self.judge_model,
1621
+ "judge_prompt_id": self.judge_prompt_id,
1622
+ "pipeline_model": resolved_model,
1623
+ "pipeline_provider": self.provider,
1624
+ "embeddings_model": self.embeddings_model,
1625
+ "embeddings_dim": self.embeddings_dim,
1626
+ "dm_id": self.dm_id,
1627
+ }
1628
+
1629
+ async def _run_question_thread(
1630
+ self,
1631
+ vedana_app,
1632
+ question_row: dict[str, Any],
1633
+ test_run_name: str,
1634
+ test_run_id: str,
1635
+ eval_meta_base: dict[str, Any],
1636
+ ) -> tuple[str, str, dict[str, Any]]:
1637
+ """Create a JIMS thread, post the question, run pipeline, return answer + tech info."""
1638
+ thread_id = uuid7()
1639
+ ctl = await ThreadController.new_thread(
1640
+ vedana_app.sessionmaker,
1641
+ contact_id=test_run_id,
1642
+ thread_id=thread_id,
1643
+ thread_config=self._build_thread_config(question_row, test_run_id, test_run_name),
1644
+ )
1645
+
1646
+ try:
1647
+ meta_payload = {
1648
+ **eval_meta_base,
1649
+ "question": {
1650
+ "gds_question": question_row.get("gds_question"),
1651
+ "gds_answer": question_row.get("gds_answer"),
1652
+ "question_context": question_row.get("question_context"),
1653
+ "question_scenario": question_row.get("question_scenario"),
1654
+ },
1655
+ }
1656
+ await ctl.store_event_dict(uuid7(), "eval.meta", meta_payload)
1657
+ except Exception as exc:
1658
+ logging.warning(f"Failed to store eval.meta for thread {thread_id}: {exc}")
1659
+
1660
+ question_text = str(question_row.get("gds_question", "") or "")
1661
+ q_ctx = str(question_row.get("question_context", "") or "").strip()
1662
+ user_query = f"{question_text} {q_ctx}".strip()
1663
+
1664
+ await ctl.store_user_message(uuid7(), user_query)
1665
+ pipeline = vedana_app.pipeline
1666
+ resolved_model = self._resolved_pipeline_model()
1667
+ pipeline.model = resolved_model
1668
+ pipeline.enable_filtering = self.enable_dm_filtering
1669
+
1670
+ ctx = await ctl.make_context()
1671
+ if self.provider == "openrouter" and self.custom_openrouter_key:
1672
+ ctx.llm.model_api_key = self.custom_openrouter_key
1673
+
1674
+ events = await ctl.run_pipeline_with_context(pipeline, ctx)
1675
+
1676
+ answer: str = ""
1677
+ technical_info: dict[str, Any] = {}
1678
+ for ev in events:
1679
+ if ev.event_type == "comm.assistant_message":
1680
+ answer = str(ev.event_data.get("content", ""))
1681
+ elif ev.event_type == "rag.query_processed":
1682
+ technical_info = dict(ev.event_data.get("technical_info", {}))
1683
+
1684
+ return str(thread_id), answer, technical_info
1685
+
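+ # Minimal usage sketch (assumes an initialized vedana_app and one golden-dataset row;
+ # it mirrors the flow used in run_selected_tests_background below):
+ #   thread_id, answer, tech = await self._run_question_thread(vedana_app, row, name, run_id, meta)
+ #   status, comment, rating = await self._judge_answer(row, answer, self._format_tool_calls(tech))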
1686
+ async def _judge_answer(self, question_row: dict[str, Any], answer: str, tool_calls: str) -> tuple[str, str, int]:
1687
+ """Judge model answer with current judge prompt/model and rating."""
1688
+ judge_prompt = self.judge_prompt
1689
+ if not judge_prompt:
1690
+ return "fail", "Judge prompt not loaded", 0
1691
+
1692
+ provider = LLMProvider()  # TODO: reuse a single LLMProvider per thread
1693
+ judge_model = self.judge_model
1694
+ if judge_model:
1695
+ try:
1696
+ provider.set_model(judge_model)
1697
+ except Exception:
1698
+ logging.warning(f"Failed to set judge model {judge_model}")
1699
+
1700
+ class JudgeResult(BaseModel):
1701
+ test_status: str = Field(description="pass / fail")
1702
+ comment: str = Field(description="justification and hints")
1703
+ errors: str | list[str] | None = Field(default=None, description="Text description of errors found")
1704
+ rating: int = Field(description="Numeric rating between 0 (worst) and 10 (best)")
1705
+
1706
+ user_msg = (
1707
+ f"Golden answer:\n{question_row.get('gds_answer', '')}\n\n"
1708
+ f"Expected context (if any):\n{question_row.get('question_context', '')}\n\n"
1709
+ f"Model answer:\n{answer}\n\n"
1710
+ f"Technical info (for reference):\n{tool_calls}\n\n"
1711
+ "Return test_status (pass/fail), a helpful comment, optional errors list/text, "
1712
+ "and rating as an integer number between 0 and 10 (10 = best possible answer)."
1713
+ )
1714
+
1715
+ try:
1716
+ res = await provider.chat_completion_structured(
1717
+ [
1718
+ {"role": "system", "content": judge_prompt},
1719
+ {"role": "user", "content": user_msg},
1720
+ ],
1721
+ JudgeResult,
1722
+ ) # type: ignore[arg-type]
1723
+ except Exception as e:
1724
+ logging.exception(f"Judge failed for question '{question_row.get('gds_question')}': {e}")
1725
+ return "fail", f"Judge failed: {e}", 0
1726
+
1727
+ if res is None:
1728
+ return "fail", "", 0
1729
+
1730
+ try:
1731
+ rating = int(res.rating)
1732
+ except Exception:
1733
+ rating = 0
1734
+
1735
+ return res.test_status or "fail", res.comment or "", rating
1736
+
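+ # Illustrative structured output expected from the judge (hypothetical values):
+ #   JudgeResult(test_status="pass", comment="Matches the golden answer.", errors=None, rating=8)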
1737
+ async def _store_eval_result_event(self, thread_id: str, result_row: dict[str, Any]) -> None:
1738
+ """Persist eval result as a thread event for thread-based history. todo why is this needed?"""
1739
+ try:
1740
+ tid = UUID(str(thread_id))
1741
+ except Exception:
1742
+ return
1743
+
1744
+ vedana_app = await get_vedana_app()
1745
+ ctl = await ThreadController.from_thread_id(vedana_app.sessionmaker, tid)
1746
+ if ctl is None:
1747
+ return
1748
+
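+ # The "eval.result" event stored below carries the same keys (llm_answer, tool_calls,
+ # gds_answer, eval_judge_rating, eval_judge_comment, gds_question, ...) that the
+ # run aggregation earlier in this module looks up in event_data.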
1749
+ data = dict(result_row)
1750
+ data.pop("thread_id", None)
1751
+ await ctl.store_event_dict(uuid7(), "eval.result", data)
1752
+
1753
+ def run_selected_tests(self):
1754
+ """Trigger test run - validates, sets loading state and starts background task."""
1755
+ if self.is_running:
1756
+ return
1757
+
1758
+ # Validation
1759
+ selection = [str(q) for q in (self.selected_question_ids or []) if str(q)]
1760
+ if not selection:
1761
+ self.error_message = "Select at least one question to run tests."
1762
+ return
1763
+ if not self.judge_prompt:
1764
+ self.error_message = "Judge prompt not loaded. Refresh judge config first."
1765
+ return
1766
+ if self.provider == "openrouter":
1767
+ key = (self.custom_openrouter_key or os.environ.get("OPENROUTER_API_KEY") or "").strip()
1768
+ if not key:
1769
+ self.error_message = "OPENROUTER_API_KEY is required for OpenRouter provider."
1770
+ return
1771
+
1772
+ test_run_name = self.test_run_name.strip() or ""
1773
+
1774
+ # Initialize run state
1775
+ self.is_running = True
1776
+ self.current_question_index = -1
1777
+ self.total_questions_to_run = len(selection)
1778
+ self.status_message = f"Evaluation run '{test_run_name}' for {len(selection)} question(s)…"
1779
+ self.run_progress = []
1780
+ self.error_message = ""
1781
+ yield
1782
+ yield EvalState.run_selected_tests_background()
1783
+
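+ # Reflex pattern: the plain handler above mutates state and yields so the client renders
+ # the loading state, then yields the background event; the background task below must
+ # re-acquire the state with `async with self` before mutating it.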
1784
+ @rx.event(background=True) # type: ignore[operator]
1785
+ async def run_selected_tests_background(self):
1786
+ async with self:
1787
+ selection = [str(q) for q in (self.selected_question_ids or []) if str(q)]
1788
+
1789
+ question_map = {}
1790
+ for row in self.eval_gds_rows:
1791
+ if row:
1792
+ row_id = str(row.get("id", ""))
1793
+ if row_id:
1794
+ question_map[row_id] = row
1795
+
1796
+ test_run_ts = datetime.now().strftime("%Y%m%d-%H%M%S")
1797
+ test_run_id = f"eval:{test_run_ts}"
1798
+ test_run_name = self.test_run_name.strip() or ""
1799
+ resolved_pipeline_model = self._resolved_pipeline_model()
1800
+
1801
+ vedana_app = await get_vedana_app()
1802
+ async with self:
1803
+ eval_meta_base = await self._build_eval_meta_payload(vedana_app, test_run_id, test_run_name)
1804
+ max_parallel = max(1, int(self.max_parallel_tests or 1))
1805
+
1806
+ sem = asyncio.Semaphore(max_parallel)
1807
+
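+ # Each selected question gets its own task; the semaphore above caps concurrent pipeline
+ # runs at max_parallel, and completed results are consumed with asyncio.as_completed below.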
1808
+ async def _run_one(question: str) -> dict[str, Any]:
1809
+ async with sem:
1810
+ row = question_map.get(str(question or "").strip())
1811
+ if row is None:
1812
+ return {"question": question, "status": None, "error": "not found"}
1813
+ try:
1814
+ thread_id, answer, tech = await self._run_question_thread(
1815
+ vedana_app, row, test_run_name, test_run_id, eval_meta_base
1816
+ )
1817
+ tool_calls = self._format_tool_calls(tech)
1818
+ async with self:
1819
+ status, comment, rating = await self._judge_answer(row, answer, tool_calls)
1820
+
1821
+ result_row = {
1822
+ "judge_model": self.judge_model,
1823
+ "judge_prompt_id": self.judge_prompt_id,
1824
+ "dm_id": self.dm_id,
1825
+ "pipeline_model": resolved_pipeline_model,
1826
+ "embeddings_model": self.embeddings_model,
1827
+ "embeddings_dim": self.embeddings_dim,
1828
+ "test_run_id": test_run_id,
1829
+ "test_run_name": test_run_name,
1830
+ "gds_question": row.get("gds_question"),
1831
+ "question_context": row.get("question_context"),
1832
+ "gds_answer": row.get("gds_answer"),
1833
+ "llm_answer": answer,
1834
+ "tool_calls": tool_calls,
1835
+ "test_status": status,
1836
+ "eval_judge_comment": comment,
1837
+ "eval_judge_rating": rating,
1838
+ "test_date": test_run_ts,
1839
+ "thread_id": thread_id,
1840
+ }
1841
+
1842
+ await self._store_eval_result_event(thread_id, result_row)
1843
+ return {"question": question, "status": status, "rating": rating, "error": None}
1844
+ except Exception as exc:
1845
+ logging.error(f"Failed for '{question}': {exc}", exc_info=True)
1846
+ return {"question": question, "status": None, "error": str(exc)}
1847
+
1848
+ async with self:
1849
+ self._append_progress(f"Queued {len(selection)} question(s) with up to {max_parallel} parallel worker(s)")
1850
+ tasks = [asyncio.create_task(_run_one(question)) for question in selection]
1851
+
1852
+ completed = 0
1853
+ for future in asyncio.as_completed(tasks):
1854
+ res = await future
1855
+ completed += 1
1856
+ async with self:
1857
+ self.current_question_index = completed - 1
1858
+ question = res.get("question", "")
1859
+ if res.get("error"):
1860
+ msg = f"Failed for '{question}': {res.get('error')}"
1861
+ else:
1862
+ status = res.get("status", "")
1863
+ rating = res.get("rating", 0.0)
1864
+ msg = f"Completed: '{question}' (status: {status}, rating: {rating})"
1865
+ self._append_progress(msg)
1866
+ self.status_message = f"Completed {completed} of {len(selection)} question(s)"
1867
+ yield # Yield after each completion to update UI
1868
+
1869
+ # Finalize
1870
+ async with self:
1871
+ self.status_message = f"Evaluation complete: {completed} of {len(selection)} question(s) processed"
1872
+ self.current_question_index = -1
1873
+ self.total_questions_to_run = 0
1874
+
1875
+ # Reload data to show new test results
1876
+ try:
1877
+ yield EvalState.load_eval_data_background()
1878
+ except Exception as e:
1879
+ logging.warning(f"Failed to reload eval data after test run: {e}")
1880
+
1881
+ async with self:
1882
+ self.is_running = False
1883
+ yield
1884
+
1885
+ def refresh_golden_dataset(self):
1886
+ """Connecting button with a background task. Used to trigger animations properly."""
1887
+ if self.is_running:
1888
+ return
1889
+ self.status_message = "Refreshing golden dataset from Grist…"
1890
+ self.error_message = ""
1891
+ self.is_running = True
1892
+ yield
1893
+ yield EvalState.refresh_golden_dataset_background()
1894
+
1895
+ @rx.event(background=True) # type: ignore[operator]
1896
+ async def refresh_golden_dataset_background(self):
1897
+ try:
1898
+ await asyncio.to_thread(self.get_eval_gds_from_grist)
1899
+ async with self:
1900
+ self._append_progress("Golden dataset refreshed from Grist")
1901
+ await self._load_eval_questions()
1902
+ self.status_message = "Golden dataset refreshed successfully"
1903
+ except Exception as e:
1904
+ async with self:
1905
+ self.error_message = f"Failed to refresh golden dataset: {e}"
1906
+ logging.error(f"Failed to refresh golden dataset: {e}", exc_info=True)
1907
+ finally:
1908
+ async with self:
1909
+ self.is_running = False
1910
+ yield
1911
+
1912
+ def load_eval_data(self):
1913
+ """Connecting button with a background task. Used to trigger animations properly."""
1914
+ if self.loading:
1915
+ return
1916
+ self.loading = True
1917
+ self.error_message = ""
1918
+ self.status_message = ""
1919
+ self.tests_page = 0 # Reset to first page
1920
+ yield
1921
+ yield EvalState.load_eval_data_background()
1922
+
1923
+ @rx.event(background=True) # type: ignore[operator]
1924
+ async def load_eval_data_background(self):
1925
+ try:
1926
+ async with self:
1927
+ await asyncio.to_thread(self.fetch_openrouter_models)
1928
+ self._sync_available_models()
1929
+ await self._load_eval_questions()
1930
+ self.tests_page_size = max(1, len(self.eval_gds_rows) * 2)
1931
+ await self._load_judge_config()
1932
+ await self._load_pipeline_config()
1933
+ await self._load_tests()
1934
+ except Exception as e:
1935
+ async with self:
1936
+ self.error_message = f"Failed to load eval data: {e} {traceback.format_exc()}"
1937
+ finally:
1938
+ async with self:
1939
+ self.loading = False
1940
+ yield