supervertaler-1.9.163-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. Supervertaler.py +48473 -0
  2. modules/__init__.py +10 -0
  3. modules/ai_actions.py +964 -0
  4. modules/ai_attachment_manager.py +343 -0
  5. modules/ai_file_viewer_dialog.py +210 -0
  6. modules/autofingers_engine.py +466 -0
  7. modules/cafetran_docx_handler.py +379 -0
  8. modules/config_manager.py +469 -0
  9. modules/database_manager.py +1911 -0
  10. modules/database_migrations.py +417 -0
  11. modules/dejavurtf_handler.py +779 -0
  12. modules/document_analyzer.py +427 -0
  13. modules/docx_handler.py +689 -0
  14. modules/encoding_repair.py +319 -0
  15. modules/encoding_repair_Qt.py +393 -0
  16. modules/encoding_repair_ui.py +481 -0
  17. modules/feature_manager.py +350 -0
  18. modules/figure_context_manager.py +340 -0
  19. modules/file_dialog_helper.py +148 -0
  20. modules/find_replace.py +164 -0
  21. modules/find_replace_qt.py +457 -0
  22. modules/glossary_manager.py +433 -0
  23. modules/image_extractor.py +188 -0
  24. modules/keyboard_shortcuts_widget.py +571 -0
  25. modules/llm_clients.py +1211 -0
  26. modules/llm_leaderboard.py +737 -0
  27. modules/llm_superbench_ui.py +1401 -0
  28. modules/local_llm_setup.py +1104 -0
  29. modules/model_update_dialog.py +381 -0
  30. modules/model_version_checker.py +373 -0
  31. modules/mqxliff_handler.py +638 -0
  32. modules/non_translatables_manager.py +743 -0
  33. modules/pdf_rescue_Qt.py +1822 -0
  34. modules/pdf_rescue_tkinter.py +909 -0
  35. modules/phrase_docx_handler.py +516 -0
  36. modules/project_home_panel.py +209 -0
  37. modules/prompt_assistant.py +357 -0
  38. modules/prompt_library.py +689 -0
  39. modules/prompt_library_migration.py +447 -0
  40. modules/quick_access_sidebar.py +282 -0
  41. modules/ribbon_widget.py +597 -0
  42. modules/sdlppx_handler.py +874 -0
  43. modules/setup_wizard.py +353 -0
  44. modules/shortcut_manager.py +932 -0
  45. modules/simple_segmenter.py +128 -0
  46. modules/spellcheck_manager.py +727 -0
  47. modules/statuses.py +207 -0
  48. modules/style_guide_manager.py +315 -0
  49. modules/superbench_ui.py +1319 -0
  50. modules/superbrowser.py +329 -0
  51. modules/supercleaner.py +600 -0
  52. modules/supercleaner_ui.py +444 -0
  53. modules/superdocs.py +19 -0
  54. modules/superdocs_viewer_qt.py +382 -0
  55. modules/superlookup.py +252 -0
  56. modules/tag_cleaner.py +260 -0
  57. modules/tag_manager.py +351 -0
  58. modules/term_extractor.py +270 -0
  59. modules/termbase_entry_editor.py +842 -0
  60. modules/termbase_import_export.py +488 -0
  61. modules/termbase_manager.py +1060 -0
  62. modules/termview_widget.py +1176 -0
  63. modules/theme_manager.py +499 -0
  64. modules/tm_editor_dialog.py +99 -0
  65. modules/tm_manager_qt.py +1280 -0
  66. modules/tm_metadata_manager.py +545 -0
  67. modules/tmx_editor.py +1461 -0
  68. modules/tmx_editor_qt.py +2784 -0
  69. modules/tmx_generator.py +284 -0
  70. modules/tracked_changes.py +900 -0
  71. modules/trados_docx_handler.py +430 -0
  72. modules/translation_memory.py +715 -0
  73. modules/translation_results_panel.py +2134 -0
  74. modules/translation_services.py +282 -0
  75. modules/unified_prompt_library.py +659 -0
  76. modules/unified_prompt_manager_qt.py +3951 -0
  77. modules/voice_commands.py +920 -0
  78. modules/voice_dictation.py +477 -0
  79. modules/voice_dictation_lite.py +249 -0
  80. supervertaler-1.9.163.dist-info/METADATA +906 -0
  81. supervertaler-1.9.163.dist-info/RECORD +85 -0
  82. supervertaler-1.9.163.dist-info/WHEEL +5 -0
  83. supervertaler-1.9.163.dist-info/entry_points.txt +2 -0
  84. supervertaler-1.9.163.dist-info/licenses/LICENSE +21 -0
  85. supervertaler-1.9.163.dist-info/top_level.txt +2 -0
modules/superbench_ui.py
@@ -0,0 +1,1319 @@
1
+ """
2
+ Superbench - Qt UI Components
3
+ ==============================
4
+
5
+ PyQt6 user interface for LLM translation benchmarking.
6
+
7
+ Features:
8
+ - Test dataset selection
9
+ - Model selection (checkboxes)
10
+ - Benchmark execution with progress
11
+ - Results table with comparison
12
+ - Summary statistics panel
13
+ - Export functionality
14
+
15
+ Author: Michael Beijer
16
+ License: MIT
17
+ """
18
+
19
+ from PyQt6.QtWidgets import (
20
+ QWidget, QVBoxLayout, QHBoxLayout, QGroupBox, QLabel,
21
+ QPushButton, QComboBox, QCheckBox, QTableWidget, QTableWidgetItem,
22
+ QProgressBar, QTextEdit, QSplitter, QHeaderView, QMessageBox,
23
+ QFileDialog, QRadioButton, QSpinBox
24
+ )
25
+ from PyQt6.QtCore import Qt, QThread, pyqtSignal, QPointF
26
+ from PyQt6.QtGui import QColor, QFont, QPainter, QPen
27
+ from PyQt6.QtWidgets import QStyleOptionButton
28
+ from typing import List, Optional, Dict
29
+ import json
30
+ from pathlib import Path
31
+
32
+ try:
33
+ from modules.llm_leaderboard import (
34
+ LLMLeaderboard, TestDataset, ModelConfig, BenchmarkResult,
35
+ create_sample_datasets, create_dataset_from_project, CHRF_AVAILABLE
36
+ )
37
+ except ImportError:
38
+ from llm_leaderboard import (
39
+ LLMLeaderboard, TestDataset, ModelConfig, BenchmarkResult,
40
+ create_sample_datasets, create_dataset_from_project, CHRF_AVAILABLE
41
+ )
42
+
43
+
44
+ class CheckmarkCheckBox(QCheckBox):
45
+ """Custom checkbox with green background and white checkmark when checked"""
46
+
47
+ def __init__(self, text="", parent=None):
48
+ super().__init__(text, parent)
49
+ self.setCheckable(True)
50
+ self.setEnabled(True)
51
+ self.setStyleSheet("""
52
+ QCheckBox {
53
+ font-size: 9pt;
54
+ spacing: 6px;
55
+ }
56
+ QCheckBox::indicator {
57
+ width: 18px;
58
+ height: 18px;
59
+ border: 2px solid #999;
60
+ border-radius: 3px;
61
+ background-color: white;
62
+ }
63
+ QCheckBox::indicator:checked {
64
+ background-color: #4CAF50;
65
+ border-color: #4CAF50;
66
+ }
67
+ QCheckBox::indicator:hover {
68
+ border-color: #666;
69
+ }
70
+ QCheckBox::indicator:checked:hover {
71
+ background-color: #45a049;
72
+ border-color: #45a049;
73
+ }
74
+ """)
75
+
76
+ def paintEvent(self, event):
77
+ """Override paint event to draw white checkmark when checked"""
78
+ super().paintEvent(event)
79
+
80
+ if self.isChecked():
81
+ opt = QStyleOptionButton()
82
+ self.initStyleOption(opt)
83
+ indicator_rect = self.style().subElementRect(
84
+ self.style().SubElement.SE_CheckBoxIndicator,
85
+ opt,
86
+ self
87
+ )
88
+
89
+ if indicator_rect.isValid():
90
+ # Draw white checkmark
91
+ painter = QPainter(self)
92
+ try:
93
+ painter.setRenderHint(QPainter.RenderHint.Antialiasing)
94
+ pen_width = max(2.0, min(indicator_rect.width(), indicator_rect.height()) * 0.12)
95
+ painter.setPen(QPen(QColor(255, 255, 255), pen_width, Qt.PenStyle.SolidLine, Qt.PenCapStyle.RoundCap, Qt.PenJoinStyle.RoundJoin))
96
+ painter.setBrush(QColor(255, 255, 255))
97
+
98
+ # Draw checkmark (✓ shape)
99
+ x = indicator_rect.x()
100
+ y = indicator_rect.y()
101
+ w = indicator_rect.width()
102
+ h = indicator_rect.height()
103
+
104
+ # Add padding
105
+ padding = min(w, h) * 0.15
106
+ x += padding
107
+ y += padding
108
+ w -= padding * 2
109
+ h -= padding * 2
110
+
111
+ # Checkmark path
112
+ check_x1 = x + w * 0.10
113
+ check_y1 = y + h * 0.50
114
+ check_x2 = x + w * 0.35
115
+ check_y2 = y + h * 0.70
116
+ check_x3 = x + w * 0.90
117
+ check_y3 = y + h * 0.25
118
+
119
+ # Draw checkmark lines
120
+ painter.drawLine(QPointF(check_x2, check_y2), QPointF(check_x3, check_y3))
121
+ painter.drawLine(QPointF(check_x1, check_y1), QPointF(check_x2, check_y2))
122
+ finally:
123
+ painter.end()
124
+
125
+
126
+ class CustomRadioButton(QRadioButton):
127
+ """Custom radio button with square indicator, green when checked, white checkmark"""
128
+
129
+ def __init__(self, text="", parent=None):
130
+ super().__init__(text, parent)
131
+ self.setCheckable(True)
132
+ self.setEnabled(True)
133
+ self.setStyleSheet("""
134
+ QRadioButton {
135
+ font-size: 9pt;
136
+ spacing: 6px;
137
+ }
138
+ QRadioButton::indicator {
139
+ width: 18px;
140
+ height: 18px;
141
+ border: 2px solid #999;
142
+ border-radius: 3px;
143
+ background-color: white;
144
+ }
145
+ QRadioButton::indicator:checked {
146
+ background-color: #4CAF50;
147
+ border-color: #4CAF50;
148
+ }
149
+ QRadioButton::indicator:hover {
150
+ border-color: #666;
151
+ }
152
+ QRadioButton::indicator:checked:hover {
153
+ background-color: #45a049;
154
+ border-color: #45a049;
155
+ }
156
+ """)
157
+
158
+ def paintEvent(self, event):
159
+ """Override paint event to draw white checkmark when checked"""
160
+ super().paintEvent(event)
161
+
162
+ if self.isChecked():
163
+ opt = QStyleOptionButton()
164
+ self.initStyleOption(opt)
165
+ indicator_rect = self.style().subElementRect(
166
+ self.style().SubElement.SE_RadioButtonIndicator,
167
+ opt,
168
+ self
169
+ )
170
+
171
+ if indicator_rect.isValid():
172
+ # Draw white checkmark
173
+ painter = QPainter(self)
174
+ try:
175
+ painter.setRenderHint(QPainter.RenderHint.Antialiasing)
176
+ pen_width = max(2.0, min(indicator_rect.width(), indicator_rect.height()) * 0.12)
177
+ painter.setPen(QPen(QColor(255, 255, 255), pen_width, Qt.PenStyle.SolidLine, Qt.PenCapStyle.RoundCap, Qt.PenJoinStyle.RoundJoin))
178
+ painter.setBrush(QColor(255, 255, 255))
179
+
180
+ # Draw checkmark (✓ shape)
181
+ x = indicator_rect.x()
182
+ y = indicator_rect.y()
183
+ w = indicator_rect.width()
184
+ h = indicator_rect.height()
185
+
186
+ # Add padding
187
+ padding = min(w, h) * 0.15
188
+ x += padding
189
+ y += padding
190
+ w -= padding * 2
191
+ h -= padding * 2
192
+
193
+ # Checkmark path
194
+ check_x1 = x + w * 0.10
195
+ check_y1 = y + h * 0.50
196
+ check_x2 = x + w * 0.35
197
+ check_y2 = y + h * 0.70
198
+ check_x3 = x + w * 0.90
199
+ check_y3 = y + h * 0.25
200
+
201
+ # Draw checkmark lines
202
+ painter.drawLine(QPointF(check_x2, check_y2), QPointF(check_x3, check_y3))
203
+ painter.drawLine(QPointF(check_x1, check_y1), QPointF(check_x2, check_y2))
204
+ finally:
205
+ painter.end()
206
+
207
+
208
+ class BenchmarkThread(QThread):
209
+ """Background thread for running benchmarks without blocking UI"""
210
+
211
+ progress_update = pyqtSignal(int, int, str) # current, total, message
212
+ finished = pyqtSignal(list) # results
213
+ error = pyqtSignal(str) # error message
214
+
215
+ def __init__(self, leaderboard: LLMLeaderboard, dataset: TestDataset, models: List[ModelConfig]):
216
+ super().__init__()
217
+ self.leaderboard = leaderboard
218
+ self.dataset = dataset
219
+ self.models = models
220
+
221
+ def run(self):
222
+ """Run benchmark in background thread"""
223
+ try:
224
+ print(f"[BENCHMARK THREAD] Starting benchmark with {len(self.models)} models on {len(self.dataset.segments)} segments")
225
+ results = self.leaderboard.run_benchmark(
226
+ self.dataset,
227
+ self.models,
228
+ progress_callback=self._on_progress
229
+ )
230
+ print(f"[BENCHMARK THREAD] Benchmark completed with {len(results)} results")
231
+ self.finished.emit(results)
232
+ print(f"[BENCHMARK THREAD] Finished signal emitted successfully")
233
+ except Exception as e:
234
+ print(f"[BENCHMARK THREAD] ERROR: {str(e)}")
235
+ import traceback
236
+ print(f"[BENCHMARK THREAD] TRACEBACK:\n{traceback.format_exc()}")
237
+ self.error.emit(str(e))
238
+
239
+ def _on_progress(self, current: int, total: int, message: str):
240
+ """Forward progress updates to main thread"""
241
+ try:
242
+ self.progress_update.emit(current, total, message)
243
+ except Exception as e:
244
+ print(f"[BENCHMARK THREAD] Progress update failed: {str(e)}")
245
+
246
+
247
+ class LLMLeaderboardUI(QWidget):
248
+ """Main UI widget for Superbench"""
249
+
250
+ def __init__(self, parent=None, llm_client_factory=None):
251
+ super().__init__(parent)
252
+ self.parent_app = parent
253
+ self.llm_client_factory = llm_client_factory
254
+ self.leaderboard = None
255
+ self.benchmark_thread = None
256
+ self.current_results = []
257
+
258
+ # Load sample datasets
259
+ self.datasets = create_sample_datasets()
260
+ self.current_dataset = self.datasets[0] if self.datasets else None
261
+ self.project_dataset = None
262
+ self.project_metadata = None
263
+
264
+ self.init_ui()
265
+
266
+ def init_ui(self):
267
+ """Initialize the user interface"""
268
+ layout = QVBoxLayout()
269
+ layout.setSpacing(5) # Tighter spacing for consistency
270
+ layout.setContentsMargins(10, 10, 10, 10)
271
+
272
+ # Header (matches TMX Editor / AutoFingers / PDF Rescue style)
273
+ header = QLabel("📊 Superbench")
274
+ header.setStyleSheet("font-size: 16pt; font-weight: bold; color: #1976D2;")
275
+ layout.addWidget(header, 0) # 0 = no stretch, stays compact
276
+
277
+ # Description box (matches TMX Editor / AutoFingers / PDF Rescue style)
278
+ description = QLabel(
279
+ "LLM Translation Quality Benchmarking System - A Supervertaler Module.\n"
280
+ "Compare translation quality, speed, and cost across multiple LLM providers."
281
+ )
282
+ description.setWordWrap(True)
283
+ description.setStyleSheet("color: #666; padding: 5px; background-color: #E3F2FD; border-radius: 3px;")
284
+ layout.addWidget(description, 0)
285
+
286
+ # Spacing after description
287
+ layout.addSpacing(10)
288
+
289
+ # Top section: Dataset and Model selection
290
+ top_widget = self._create_top_section()
291
+ layout.addWidget(top_widget)
292
+
293
+ # Progress bar
294
+ self.progress_bar = QProgressBar()
295
+ self.progress_bar.setVisible(False)
296
+ layout.addWidget(self.progress_bar)
297
+
298
+ # Status label
299
+ self.status_label = QLabel("Ready")
300
+ self.status_label.setStyleSheet("color: #666; font-size: 9pt;")
301
+ layout.addWidget(self.status_label)
302
+
303
+ # Splitter for results and log
304
+ splitter = QSplitter(Qt.Orientation.Vertical)
305
+
306
+ # Results table
307
+ self.results_table = self._create_results_table()
308
+ splitter.addWidget(self.results_table)
309
+
310
+ # Summary panel
311
+ self.summary_panel = self._create_summary_panel()
312
+ splitter.addWidget(self.summary_panel)
313
+
314
+ # Log output
315
+ self.log_output = QTextEdit()
316
+ self.log_output.setReadOnly(True)
317
+ self.log_output.setMaximumHeight(150)
318
+ self.log_output.setPlaceholderText("Benchmark log will appear here...")
319
+ splitter.addWidget(self.log_output)
320
+
321
+ splitter.setStretchFactor(0, 3) # Results table gets most space
322
+ splitter.setStretchFactor(1, 1) # Summary panel medium space
323
+ splitter.setStretchFactor(2, 1) # Log output smallest
324
+
325
+ layout.addWidget(splitter)
326
+
327
+ self.setLayout(layout)
328
+
329
+ def _create_top_section(self) -> QWidget:
330
+ """Create dataset selection and model selection section"""
331
+ widget = QWidget()
332
+ layout = QHBoxLayout()
333
+
334
+ # Left: Dataset selection
335
+ dataset_group = QGroupBox("Test Dataset")
336
+ dataset_layout = QVBoxLayout()
337
+
338
+ # Radio buttons for dataset source
339
+ self.predefined_radio = CustomRadioButton("Pre-defined Datasets")
340
+ self.predefined_radio.setChecked(True)
341
+ self.predefined_radio.toggled.connect(self._on_dataset_source_changed)
342
+ dataset_layout.addWidget(self.predefined_radio)
343
+
344
+ # Pre-defined datasets dropdown
345
+ self.dataset_combo = QComboBox()
346
+ for ds in self.datasets:
347
+ self.dataset_combo.addItem(f"{ds.name} ({len(ds.segments)} segments)", ds)
348
+ self.dataset_combo.currentIndexChanged.connect(self._on_dataset_changed)
349
+ dataset_layout.addWidget(self.dataset_combo)
350
+
351
+ dataset_layout.addSpacing(10)
352
+
353
+ # Current Project option
354
+ self.project_radio = CustomRadioButton("Current Project")
355
+ self.project_radio.toggled.connect(self._on_dataset_source_changed)
356
+ dataset_layout.addWidget(self.project_radio)
357
+
358
+ # Project dataset controls (initially hidden)
359
+ self.project_controls_widget = QWidget()
360
+ project_controls_layout = QVBoxLayout()
361
+ project_controls_layout.setContentsMargins(20, 0, 0, 0) # Indent
362
+
363
+ # Sample size
364
+ sample_size_layout = QHBoxLayout()
365
+ sample_size_layout.addWidget(QLabel("Sample size:"))
366
+ self.sample_size_spin = QSpinBox()
367
+ self.sample_size_spin.setRange(1, 50)
368
+ self.sample_size_spin.setValue(10)
369
+ self.sample_size_spin.setToolTip("Number of segments to sample from project")
370
+ sample_size_layout.addWidget(self.sample_size_spin)
371
+ sample_size_layout.addStretch()
372
+ project_controls_layout.addLayout(sample_size_layout)
373
+
374
+ # Sampling method
375
+ method_layout = QHBoxLayout()
376
+ method_layout.addWidget(QLabel("Method:"))
377
+ self.sampling_method_combo = QComboBox()
378
+ self.sampling_method_combo.addItems(["Smart Sampling", "Random", "Evenly Spaced"])
379
+ self.sampling_method_combo.setToolTip(
380
+ "Smart: 30% begin, 40% middle, 30% end\n"
381
+ "Random: Random selection\n"
382
+ "Evenly Spaced: Every Nth segment"
383
+ )
384
+ method_layout.addWidget(self.sampling_method_combo)
385
+ method_layout.addStretch()
386
+ project_controls_layout.addLayout(method_layout)
387
+
388
+ # Project status info
389
+ self.project_status_label = QLabel("Project status: No project loaded")
390
+ self.project_status_label.setStyleSheet("color: #666; font-size: 9pt; padding: 5px;")
391
+ self.project_status_label.setWordWrap(True)
392
+ project_controls_layout.addWidget(self.project_status_label)
393
+
394
+ # Create dataset button
395
+ self.create_dataset_button = QPushButton("📊 Create Test Dataset from Project")
396
+ self.create_dataset_button.clicked.connect(self._on_create_project_dataset)
397
+ self.create_dataset_button.setEnabled(False)
398
+ project_controls_layout.addWidget(self.create_dataset_button)
399
+
400
+ self.project_controls_widget.setLayout(project_controls_layout)
401
+ self.project_controls_widget.setVisible(False)
402
+ dataset_layout.addWidget(self.project_controls_widget)
403
+
404
+ dataset_layout.addStretch()
405
+ dataset_group.setLayout(dataset_layout)
406
+ layout.addWidget(dataset_group)
407
+
408
+ # Update project status on init
409
+ self._update_project_status()
410
+
411
+ # Right: Model selection
412
+ model_group = QGroupBox("Model Selection")
413
+ model_layout = QVBoxLayout()
414
+
415
+ model_layout.addWidget(QLabel("Select models to test:"))
416
+
417
+ # OpenAI models
418
+ self.openai_checkbox = CheckmarkCheckBox("OpenAI (GPT-4o)")
419
+ self.openai_checkbox.setChecked(True)
420
+ model_layout.addWidget(self.openai_checkbox)
421
+
422
+ self.openai_model_combo = QComboBox()
423
+ self.openai_model_combo.addItems([
424
+ "gpt-4o",
425
+ "gpt-4o-mini",
426
+ "gpt-5"
427
+ ])
428
+ self.openai_model_combo.setEnabled(True)
429
+ model_layout.addWidget(self.openai_model_combo)
430
+
431
+ # Claude models
432
+ self.claude_checkbox = CheckmarkCheckBox("Claude (Sonnet 4.5)")
433
+ self.claude_checkbox.setChecked(True)
434
+ model_layout.addWidget(self.claude_checkbox)
435
+
436
+ self.claude_model_combo = QComboBox()
437
+ self.claude_model_combo.addItems([
438
+ "claude-sonnet-4-5-20250929",
439
+ "claude-haiku-4-5-20251001",
440
+ "claude-opus-4-1-20250805"
441
+ ])
442
+ self.claude_model_combo.setEnabled(True)
443
+ model_layout.addWidget(self.claude_model_combo)
444
+
445
+ # Gemini models
446
+ self.gemini_checkbox = CheckmarkCheckBox("Gemini (2.5 Flash)")
447
+ self.gemini_checkbox.setChecked(True)
448
+ model_layout.addWidget(self.gemini_checkbox)
449
+
450
+ self.gemini_model_combo = QComboBox()
451
+ self.gemini_model_combo.addItems([
452
+ "gemini-2.5-flash",
453
+ "gemini-2.5-flash-lite",
454
+ "gemini-2.5-pro"
455
+ ])
456
+ self.gemini_model_combo.setEnabled(True)
457
+ model_layout.addWidget(self.gemini_model_combo)
458
+
459
+ model_layout.addStretch()
460
+
461
+ # Run button
462
+ self.run_button = QPushButton("🚀 Run Benchmark")
463
+ self.run_button.setStyleSheet("font-weight: bold; padding: 8px;")
464
+ self.run_button.clicked.connect(self._on_run_benchmark)
465
+ model_layout.addWidget(self.run_button)
466
+
467
+ # Cancel button
468
+ self.cancel_button = QPushButton("Cancel")
469
+ self.cancel_button.setEnabled(False)
470
+ self.cancel_button.clicked.connect(self._on_cancel_benchmark)
471
+ model_layout.addWidget(self.cancel_button)
472
+
473
+ # Export button
474
+ self.export_button = QPushButton("📊 Export Results...")
475
+ self.export_button.setEnabled(False)
476
+ self.export_button.clicked.connect(self._on_export_results)
477
+ model_layout.addWidget(self.export_button)
478
+
479
+ model_group.setLayout(model_layout)
480
+ layout.addWidget(model_group)
481
+
482
+ widget.setLayout(layout)
483
+ return widget
484
+
485
+ def _create_results_table(self) -> QTableWidget:
486
+ """Create results comparison table"""
487
+ table = QTableWidget()
488
+ table.setColumnCount(6)
489
+ table.setHorizontalHeaderLabels([
490
+ "Segment", "Source Text", "Model", "Translation", "Speed (ms)", "Quality"
491
+ ])
492
+
493
+ # Set column widths
494
+ header = table.horizontalHeader()
495
+ header.setSectionResizeMode(0, QHeaderView.ResizeMode.ResizeToContents)
496
+ header.setSectionResizeMode(1, QHeaderView.ResizeMode.Stretch)
497
+ header.setSectionResizeMode(2, QHeaderView.ResizeMode.ResizeToContents)
498
+ header.setSectionResizeMode(3, QHeaderView.ResizeMode.Stretch)
499
+ header.setSectionResizeMode(4, QHeaderView.ResizeMode.ResizeToContents)
500
+ header.setSectionResizeMode(5, QHeaderView.ResizeMode.ResizeToContents)
501
+
502
+ table.setAlternatingRowColors(True)
503
+ table.setEditTriggers(QTableWidget.EditTrigger.NoEditTriggers)
504
+ table.setSelectionBehavior(QTableWidget.SelectionBehavior.SelectRows)
505
+
506
+ return table
507
+
508
+ def _create_summary_panel(self) -> QWidget:
509
+ """Create summary statistics panel"""
510
+ widget = QGroupBox("Summary Statistics")
511
+ layout = QVBoxLayout()
512
+
513
+ self.summary_table = QTableWidget()
514
+ self.summary_table.setColumnCount(5)
515
+ self.summary_table.setHorizontalHeaderLabels([
516
+ "Model", "Avg Speed (ms)", "Avg Quality", "Success", "Errors"
517
+ ])
518
+
519
+ header = self.summary_table.horizontalHeader()
520
+ header.setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
521
+
522
+ self.summary_table.setMaximumHeight(200)
523
+ self.summary_table.setAlternatingRowColors(True)
524
+ self.summary_table.setEditTriggers(QTableWidget.EditTrigger.NoEditTriggers)
525
+
526
+ layout.addWidget(self.summary_table)
527
+ widget.setLayout(layout)
528
+
529
+ return widget
530
+
531
+ def _on_dataset_changed(self, index: int):
532
+ """Handle dataset selection change"""
533
+ self.current_dataset = self.dataset_combo.itemData(index)
534
+ self.log(f"Selected dataset: {self.current_dataset.name}")
535
+
536
+ def _on_dataset_source_changed(self):
537
+ """Handle radio button toggle between predefined and project datasets"""
538
+ print(f"[LLM DEBUG] _on_dataset_source_changed() called")
539
+ is_project = self.project_radio.isChecked()
540
+ print(f"[LLM DEBUG] is_project: {is_project}")
541
+ self.dataset_combo.setEnabled(not is_project)
542
+ self.project_controls_widget.setVisible(is_project)
543
+
544
+ if is_project:
545
+ # Update project status when switching to project mode
546
+ print(f"[LLM DEBUG] Calling _update_project_status() from _on_dataset_source_changed")
547
+ self._update_project_status()
548
+
549
+ # Switch to project dataset if available
550
+ if self.project_dataset:
551
+ self.current_dataset = self.project_dataset
552
+ self.log(f"Using project dataset: {self.current_dataset.name}")
553
+ else:
554
+ self.current_dataset = None
555
+ else:
556
+ # Switch back to predefined dataset
557
+ self.current_dataset = self.dataset_combo.currentData()
558
+ if self.current_dataset:
559
+ self.log(f"Using predefined dataset: {self.current_dataset.name}")
560
+
561
+ def _update_project_status(self):
562
+ """Update the project status label based on loaded project"""
563
+ try:
564
+ # Debug to console (always print)
565
+ print(f"[LLM DEBUG] _update_project_status() called")
566
+ print(f"[LLM DEBUG] parent_app exists: {self.parent_app is not None}")
567
+ except Exception as e:
568
+ print(f"[LLM DEBUG] ERROR in _update_project_status: {e}")
569
+ import traceback
570
+ traceback.print_exc()
571
+
572
+ if self.parent_app:
573
+ print(f"[LLM DEBUG] parent_app has 'current_project': {hasattr(self.parent_app, 'current_project')}")
574
+ if hasattr(self.parent_app, 'current_project'):
575
+ print(f"[LLM DEBUG] current_project is not None: {self.parent_app.current_project is not None}")
576
+ if self.parent_app.current_project:
577
+ print(f"[LLM DEBUG] project.segments exists: {hasattr(self.parent_app.current_project, 'segments')}")
578
+ if hasattr(self.parent_app.current_project, 'segments'):
579
+ print(f"[LLM DEBUG] project.segments length: {len(self.parent_app.current_project.segments) if self.parent_app.current_project.segments else 0}")
580
+
581
+ if not self.parent_app or not hasattr(self.parent_app, 'current_project') or not self.parent_app.current_project:
582
+ self.project_status_label.setText("⚠️ No project loaded")
583
+ self.project_status_label.setStyleSheet("color: #FF6600; font-size: 9pt; padding: 5px;")
584
+ self.create_dataset_button.setEnabled(False)
585
+ return
586
+
587
+ project = self.parent_app.current_project
588
+ total_segs = len(project.segments) if project.segments else 0
589
+ print(f"[LLM DEBUG] Project has {total_segs} segments")
590
+
591
+ if total_segs == 0:
592
+ self.project_status_label.setText("⚠️ Project has no segments")
593
+ self.project_status_label.setStyleSheet("color: #FF6600; font-size: 9pt; padding: 5px;")
594
+ self.create_dataset_button.setEnabled(False)
595
+ return
596
+
597
+ # Count translated segments
598
+ translated = sum(1 for seg in project.segments if seg.target and seg.target.strip())
599
+ pct = (translated / total_segs * 100) if total_segs > 0 else 0
600
+
601
+ status_html = f"""
602
+ <b>Project Status:</b><br>
603
+ • Total segments: {total_segs}<br>
604
+ • Translated: {translated} ({pct:.1f}%)<br>
605
+ """
606
+
607
+ if translated == 0:
608
+ status_html += "<br>⚠️ <b>No translations yet</b><br>"
609
+ status_html += "Quality scoring unavailable<br>"
610
+ status_html += "Will compare: Speed, Cost, Outputs"
611
+ self.project_status_label.setStyleSheet("color: #FF6600; font-size: 9pt; padding: 5px; background: #FFF8E1; border-radius: 3px;")
612
+ elif translated < total_segs:
613
+ status_html += f"<br>✓ Quality scoring available for {translated} segments"
614
+ self.project_status_label.setStyleSheet("color: #0066CC; font-size: 9pt; padding: 5px; background: #E3F2FD; border-radius: 3px;")
615
+ else:
616
+ status_html += "<br>✓ Quality scoring available (fully translated)"
617
+ self.project_status_label.setStyleSheet("color: #00AA00; font-size: 9pt; padding: 5px; background: #E8F5E9; border-radius: 3px;")
618
+
619
+ self.project_status_label.setText(status_html)
620
+ self.create_dataset_button.setEnabled(True)
621
+
622
+ def _on_create_project_dataset(self):
623
+ """Create test dataset from current project"""
624
+ if not self.parent_app or not hasattr(self.parent_app, 'current_project') or not self.parent_app.current_project:
625
+ QMessageBox.warning(self, "Error", "No project loaded")
626
+ return
627
+
628
+ project = self.parent_app.current_project
629
+ sample_size = self.sample_size_spin.value()
630
+
631
+ # Map combo box text to method name
632
+ method_map = {
633
+ "Smart Sampling": "smart",
634
+ "Random": "random",
635
+ "Evenly Spaced": "evenly_spaced"
636
+ }
637
+ sampling_method = method_map.get(self.sampling_method_combo.currentText(), "smart")
638
+
639
+ try:
640
+ # Create dataset
641
+ self.project_dataset, self.project_metadata = create_dataset_from_project(
642
+ project,
643
+ sample_size=sample_size,
644
+ sampling_method=sampling_method,
645
+ require_targets=False
646
+ )
647
+
648
+ self.current_dataset = self.project_dataset
649
+
650
+ # Log creation
651
+ meta = self.project_metadata
652
+ self.log(f"Created project dataset: {self.project_dataset.name}")
653
+ self.log(f" • Sampled {meta['sampled_segments']} segments from {meta['total_segments']} total")
654
+ self.log(f" • Method: {sampling_method}")
655
+ self.log(f" • References available: {meta['segments_with_references']}/{meta['sampled_segments']}")
656
+
657
+ if meta['quality_scoring_available']:
658
+ self.log(f" • ✓ Quality scoring enabled")
659
+ else:
660
+ self.log(f" • ⚠️ Quality scoring disabled (no reference translations)")
661
+
662
+ # Update button text
663
+ self.create_dataset_button.setText(f"✓ Dataset Created ({len(self.project_dataset.segments)} segments)")
664
+ self.create_dataset_button.setStyleSheet("background-color: #4CAF50; color: white; font-weight: bold;")
665
+
666
+ QMessageBox.information(
667
+ self,
668
+ "Dataset Created",
669
+ f"Successfully created test dataset with {meta['sampled_segments']} segments.\n\n"
670
+ f"Quality scoring: {'Enabled' if meta['quality_scoring_available'] else 'Disabled (no references)'}\n"
671
+ f"Ready to benchmark!"
672
+ )
673
+
674
+ except Exception as e:
675
+ QMessageBox.critical(self, "Error", f"Failed to create dataset:\n{str(e)}")
676
+ self.log(f"ERROR creating project dataset: {str(e)}")
677
+
678
+ def _on_run_benchmark(self):
679
+ """Start benchmark execution"""
680
+ if not self.llm_client_factory:
681
+ QMessageBox.warning(self, "Error", "LLM client factory not available")
682
+ return
683
+
684
+ if not self.current_dataset:
685
+ QMessageBox.warning(self, "Error", "No dataset selected")
686
+ return
687
+
688
+ # Get selected models
689
+ models = self._get_selected_models()
690
+ if not models:
691
+ QMessageBox.warning(self, "Error", "Please select at least one model to test")
692
+ return
693
+
694
+ # Confirm if sacrebleu not available
695
+ if not CHRF_AVAILABLE:
696
+ reply = QMessageBox.question(
697
+ self,
698
+ "Quality Scoring Unavailable",
699
+ "sacrebleu library is not installed. Quality scores will not be calculated.\n\n"
700
+ "Continue anyway?",
701
+ QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No
702
+ )
703
+ if reply == QMessageBox.StandardButton.No:
704
+ return
705
+
706
+ # Clear previous results
707
+ self.results_table.setRowCount(0)
708
+ self.summary_table.setRowCount(0)
709
+ self.log_output.clear()
710
+ self.current_results = []
711
+
712
+ # Update UI state
713
+ self.run_button.setEnabled(False)
714
+ self.cancel_button.setEnabled(True)
715
+ self.export_button.setEnabled(False)
716
+ self.progress_bar.setVisible(True)
717
+ self.progress_bar.setValue(0)
718
+ self.status_label.setText("Running benchmark...")
719
+
720
+ # Create leaderboard instance
721
+ self.leaderboard = LLMLeaderboard(self.llm_client_factory, self.log)
722
+
723
+ # Start benchmark in background thread
724
+ self.benchmark_thread = BenchmarkThread(self.leaderboard, self.current_dataset, models)
725
+ self.benchmark_thread.progress_update.connect(self._on_progress_update)
726
+ self.benchmark_thread.finished.connect(self._on_benchmark_finished)
727
+ self.benchmark_thread.error.connect(self._on_benchmark_error)
728
+ self.benchmark_thread.start()
729
+
730
+ def _on_cancel_benchmark(self):
731
+ """Cancel running benchmark"""
732
+ if self.leaderboard:
733
+ self.leaderboard.cancel_benchmark()
734
+ self.log("⚠️ Cancelling benchmark...")
735
+
736
+ def _on_progress_update(self, current: int, total: int, message: str):
737
+ """Update progress bar and status"""
738
+ self.progress_bar.setMaximum(total)
739
+ self.progress_bar.setValue(current)
740
+ self.status_label.setText(f"{message} ({current}/{total})")
741
+
742
+ def _on_benchmark_finished(self, results: List[BenchmarkResult]):
743
+ """Handle benchmark completion"""
744
+ try:
745
+ print(f"[UI] _on_benchmark_finished called with {len(results)} results")
746
+ self.current_results = results
747
+
748
+ # Update UI state
749
+ print(f"[UI] Updating UI state...")
750
+ self.run_button.setEnabled(True)
751
+ self.cancel_button.setEnabled(False)
752
+ self.export_button.setEnabled(True)
753
+ self.progress_bar.setVisible(False)
754
+ self.status_label.setText(f"✅ Benchmark complete: {len(results)} results")
755
+ print(f"[UI] UI state updated")
756
+
757
+ # Populate results table
758
+ print(f"[UI] Populating results table...")
759
+ self._populate_results_table(results)
760
+ print(f"[UI] Results table populated")
761
+
762
+ # Populate summary table
763
+ print(f"[UI] Populating summary table...")
764
+ self._populate_summary_table()
765
+ print(f"[UI] Summary table populated")
766
+
767
+ self.log("✅ Benchmark finished successfully")
768
+ print(f"[UI] _on_benchmark_finished completed successfully")
769
+ except Exception as e:
770
+ print(f"[UI] ERROR in _on_benchmark_finished: {str(e)}")
771
+ import traceback
772
+ print(f"[UI] TRACEBACK:\n{traceback.format_exc()}")
773
+ self.log(f"❌ Error displaying results: {str(e)}")
774
+ QMessageBox.critical(self, "Display Error", f"Benchmark completed but failed to display results:\n\n{str(e)}")
775
+
776
+ def _on_benchmark_error(self, error_msg: str):
777
+ """Handle benchmark error"""
778
+ self.run_button.setEnabled(True)
779
+ self.cancel_button.setEnabled(False)
780
+ self.progress_bar.setVisible(False)
781
+ self.status_label.setText("❌ Benchmark failed")
782
+
783
+ QMessageBox.critical(self, "Benchmark Error", f"An error occurred:\n\n{error_msg}")
784
+ self.log(f"❌ Error: {error_msg}")
785
+
786
+ def _populate_results_table(self, results: List[BenchmarkResult]):
787
+ """Populate results table with benchmark data"""
788
+ # Group results by segment
789
+ segments_dict = {}
790
+ for result in results:
791
+ if result.segment_id not in segments_dict:
792
+ segments_dict[result.segment_id] = []
793
+ segments_dict[result.segment_id].append(result)
794
+
795
+ # Populate table
796
+ row = 0
797
+ for segment_id in sorted(segments_dict.keys()):
798
+ segment_results = segments_dict[segment_id]
799
+
800
+ # Get source text from dataset
801
+ source_text = ""
802
+ for seg in self.current_dataset.segments:
803
+ if seg.id == segment_id:
804
+ source_text = seg.source
805
+ break
806
+
807
+ # Truncate source text for display
808
+ if len(source_text) > 80:
809
+ source_text = source_text[:77] + "..."
810
+
811
+ for result in segment_results:
812
+ self.results_table.insertRow(row)
813
+
814
+ # Segment ID
815
+ self.results_table.setItem(row, 0, QTableWidgetItem(str(segment_id)))
816
+
817
+ # Source text
818
+ self.results_table.setItem(row, 1, QTableWidgetItem(source_text))
819
+
820
+ # Model name
821
+ self.results_table.setItem(row, 2, QTableWidgetItem(result.model_name))
822
+
823
+ # Translation output
824
+ output_text = result.output if result.output else f"ERROR: {result.error}"
825
+ if len(output_text) > 100:
826
+ output_text = output_text[:97] + "..."
827
+ item = QTableWidgetItem(output_text)
828
+ if result.error:
829
+ item.setForeground(QColor("red"))
830
+ self.results_table.setItem(row, 3, item)
831
+
832
+ # Speed
833
+ speed_item = QTableWidgetItem(f"{result.latency_ms:.0f}")
834
+ self.results_table.setItem(row, 4, speed_item)
835
+
836
+ # Quality
837
+ if result.quality_score is not None:
838
+ quality_item = QTableWidgetItem(f"{result.quality_score:.1f}")
839
+ self.results_table.setItem(row, 5, quality_item)
840
+ else:
841
+ self.results_table.setItem(row, 5, QTableWidgetItem("—"))
842
+
843
+ row += 1
844
+
845
+ def _populate_summary_table(self):
846
+ """Populate summary statistics table"""
847
+ if not self.leaderboard:
848
+ return
849
+
850
+ summary = self.leaderboard.get_summary_stats()
851
+
852
+ self.summary_table.setRowCount(len(summary))
853
+ row = 0
854
+
855
+ for model_name, stats in summary.items():
856
+ # Model name
857
+ self.summary_table.setItem(row, 0, QTableWidgetItem(model_name))
858
+
859
+ # Avg speed
860
+ avg_speed = stats["avg_latency_ms"]
861
+ speed_item = QTableWidgetItem(f"{avg_speed:.0f}")
862
+ self.summary_table.setItem(row, 1, speed_item)
863
+
864
+ # Avg quality
865
+ avg_quality = stats["avg_quality_score"]
866
+ if avg_quality is not None:
867
+ quality_item = QTableWidgetItem(f"{avg_quality:.1f}")
868
+ self.summary_table.setItem(row, 2, quality_item)
869
+ else:
870
+ self.summary_table.setItem(row, 2, QTableWidgetItem("—"))
871
+
872
+ # Success count
873
+ self.summary_table.setItem(row, 3, QTableWidgetItem(str(stats["success_count"])))
874
+
875
+ # Error count
876
+ error_item = QTableWidgetItem(str(stats["error_count"]))
877
+ if stats["error_count"] > 0:
878
+ error_item.setForeground(QColor("red"))
879
+ self.summary_table.setItem(row, 4, error_item)
880
+
881
+ row += 1
882
+
883
+ def _on_export_results(self):
884
+ """Export results to file (JSON or Excel)"""
885
+ if not self.current_results:
886
+ QMessageBox.warning(self, "No Results", "No benchmark results to export")
887
+ return
888
+
889
+ # Generate filename with dataset info - sanitize for Windows filesystem
890
+ dataset_name = self.current_dataset.name.replace(" ", "_").replace("→", "-")
891
+ # Remove invalid filename characters: < > : " / \ | ? *
892
+ invalid_chars = '<>:"/\\|?*'
893
+ for char in invalid_chars:
894
+ dataset_name = dataset_name.replace(char, "_")
895
+ from datetime import datetime
896
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
897
+ default_filename = f"LLM_Leaderboard_{dataset_name}_{timestamp}.xlsx"
898
+
899
+ # Ask user for file path and format
900
+ filepath, selected_filter = QFileDialog.getSaveFileName(
901
+ self,
902
+ "Export Benchmark Results",
903
+ default_filename,
904
+ "Excel Files (*.xlsx);;JSON Files (*.json);;All Files (*)"
905
+ )
906
+
907
+ if not filepath:
908
+ return
909
+
910
+ try:
911
+ # Determine export format from selected filter or file extension
912
+ if "Excel" in selected_filter or filepath.endswith('.xlsx'):
913
+ self._export_to_excel(filepath)
914
+ else:
915
+ self._export_to_json(filepath)
916
+
917
+ QMessageBox.information(self, "Export Complete", f"Results exported to:\n{filepath}")
918
+ self.log(f"OK Results exported to {filepath}")
919
+
920
+ except Exception as e:
921
+ QMessageBox.critical(self, "Export Error", f"Failed to export results:\n{str(e)}")
922
+ self.log(f"ERROR Export error: {e}")
923
+
924
+ def _export_to_json(self, filepath: str):
925
+ """Export results to JSON file"""
926
+ export_data = self.leaderboard.export_to_dict()
927
+ with open(filepath, 'w', encoding='utf-8') as f:
928
+ json.dump(export_data, f, indent=2, ensure_ascii=False)
929
+
930
+ def _export_to_excel(self, filepath: str):
931
+ """Export results to Excel file with title sheet, detailed results, and summary"""
932
+ from openpyxl import Workbook
933
+ from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
934
+ from openpyxl.utils import get_column_letter
935
+ from datetime import datetime
936
+
937
+ wb = Workbook()
938
+
939
+ # === TITLE/INFO SHEET ===
940
+ ws_info = wb.active
941
+ ws_info.title = "About"
942
+
943
+ # Title with emoji (matches UI header style)
944
+ ws_info['A1'] = "📊 Superbench"
945
+ ws_info['A1'].font = Font(size=24, bold=True, color="1976D2") # Blue color matching UI
946
+ ws_info.merge_cells('A1:D1')
947
+
948
+ # Subtitle (matches UI description style)
949
+ ws_info['A2'] = "Translation Quality Benchmarking System"
950
+ ws_info['A2'].font = Font(size=12, italic=True, color="666666")
951
+ ws_info.merge_cells('A2:D2')
952
+
953
+ # Supervertaler module branding (matches standardized naming)
954
+ ws_info['A3'] = "A Supervertaler Module"
955
+ ws_info['A3'].font = Font(size=11, color="0066CC", underline="single")
956
+ ws_info['A3'].hyperlink = "https://supervertaler.com/"
957
+ ws_info.merge_cells('A3:D3')
958
+
959
+ # Spacing
960
+ ws_info.row_dimensions[4].height = 20
961
+
962
+ # Benchmark Info
963
+ info_header_font = Font(bold=True, size=11)
964
+ info_label_font = Font(size=10, color="666666")
965
+ info_value_font = Font(size=10)
966
+
967
+ row = 5
968
+ ws_info[f'A{row}'] = "BENCHMARK INFORMATION"
969
+ ws_info[f'A{row}'].font = Font(bold=True, size=12, color="366092")
970
+ row += 1
971
+
972
+ # Dataset info
973
+ ws_info[f'A{row}'] = "Test Dataset:"
974
+ ws_info[f'A{row}'].font = info_label_font
975
+ ws_info[f'B{row}'] = self.current_dataset.name
976
+ ws_info[f'B{row}'].font = info_value_font
977
+ row += 1
978
+
979
+ ws_info[f'A{row}'] = "Description:"
980
+ ws_info[f'A{row}'].font = info_label_font
981
+ ws_info[f'B{row}'] = self.current_dataset.description
982
+ ws_info[f'B{row}'].font = info_value_font
983
+ row += 1
984
+
985
+ ws_info[f'A{row}'] = "Segments Tested:"
986
+ ws_info[f'A{row}'].font = info_label_font
987
+ ws_info[f'B{row}'] = len(self.current_dataset.segments)
988
+ ws_info[f'B{row}'].font = info_value_font
989
+ row += 1
990
+
991
+ ws_info[f'A{row}'] = "Date & Time:"
992
+ ws_info[f'A{row}'].font = info_label_font
993
+ ws_info[f'B{row}'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
994
+ ws_info[f'B{row}'].font = info_value_font
995
+ row += 1
996
+
997
+ # Models tested
998
+ ws_info[f'A{row}'] = "Models Tested:"
999
+ ws_info[f'A{row}'].font = info_label_font
1000
+ models_tested = set(r.model_name for r in self.current_results)
1001
+ ws_info[f'B{row}'] = ", ".join(models_tested)
1002
+ ws_info[f'B{row}'].font = info_value_font
1003
+ row += 2
1004
+
1005
+ # Explanation section
1006
+ ws_info[f'A{row}'] = "UNDERSTANDING THE RESULTS"
1007
+ ws_info[f'A{row}'].font = Font(bold=True, size=12, color="366092")
1008
+ row += 1
1009
+
1010
+ explanations = [
1011
+ ("Quality Score (chrF++):", "Character-level metric measuring translation accuracy. Higher is better (0-100). Scores above 80 indicate excellent quality."),
1012
+ ("Speed (ms):", "Translation time in milliseconds. Lower is better. Typical range: 1000-5000ms per segment."),
1013
+ ("Success Count:", "Number of segments successfully translated without errors."),
1014
+ ("Error Count:", "Number of failed translations. Should be 0 for production use."),
1015
+ ("Color Coding:", "Green highlighting indicates the best performer in each category (quality/speed).")
1016
+ ]
1017
+
1018
+ for label, explanation in explanations:
1019
+ ws_info[f'A{row}'] = label
1020
+ ws_info[f'A{row}'].font = Font(bold=True, size=10)
1021
+ ws_info[f'B{row}'] = explanation
1022
+ ws_info[f'B{row}'].font = Font(size=10)
1023
+ ws_info[f'B{row}'].alignment = Alignment(wrap_text=True)
1024
+ row += 1
1025
+
1026
+ row += 1
1027
+ ws_info[f'A{row}'] = "NAVIGATION"
1028
+ ws_info[f'A{row}'].font = Font(bold=True, size=12, color="366092")
1029
+ row += 1
1030
+
1031
+ navigation_items = [
1032
+ ("Summary Tab:", "View aggregated statistics and compare models side-by-side."),
1033
+ ("Results Tab:", "View detailed translation output for each segment and model."),
1034
+ ]
1035
+
1036
+ for label, description in navigation_items:
1037
+ ws_info[f'A{row}'] = label
1038
+ ws_info[f'A{row}'].font = Font(bold=True, size=10)
1039
+ ws_info[f'B{row}'] = description
1040
+ ws_info[f'B{row}'].font = Font(size=10)
1041
+ row += 1
1042
+
1043
+ # Column widths
1044
+ ws_info.column_dimensions['A'].width = 20
1045
+ ws_info.column_dimensions['B'].width = 80
1046
+ ws_info.column_dimensions['C'].width = 15
1047
+ ws_info.column_dimensions['D'].width = 15
1048
+
1049
+ # === SUMMARY SHEET ===
1050
+ ws_summary = wb.create_sheet("Summary")
1051
+
1052
+ # Header row
1053
+ summary_headers = ["Model", "Provider", "Model ID", "Avg Speed (ms)", "Avg Quality (chrF++)",
1054
+ "Success Count", "Error Count", "Total Tests"]
1055
+ ws_summary.append(summary_headers)
1056
+
1057
+ # Format header row
1058
+ header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
1059
+ header_font = Font(color="FFFFFF", bold=True)
1060
+ for col_num, _ in enumerate(summary_headers, 1):
1061
+ cell = ws_summary.cell(1, col_num)
1062
+ cell.fill = header_fill
1063
+ cell.font = header_font
1064
+ cell.alignment = Alignment(horizontal="center", vertical="center")
1065
+
1066
+ # Get summary statistics
1067
+ summary_stats = self.leaderboard.get_summary_stats()
1068
+
1069
+ # Populate summary data
1070
+ row_num = 2
1071
+ for model_name, stats in summary_stats.items():
1072
+ # Extract provider and model ID from results
1073
+ provider = ""
1074
+ model_id = ""
1075
+ for result in self.current_results:
1076
+ if result.model_name == model_name:
1077
+ provider = result.provider
1078
+ model_id = result.model_id
1079
+ break
1080
+
1081
+ total_tests = stats["success_count"] + stats["error_count"]
1082
+
1083
+ ws_summary.cell(row_num, 1, model_name)
1084
+ ws_summary.cell(row_num, 2, provider)
1085
+ ws_summary.cell(row_num, 3, model_id)
1086
+ ws_summary.cell(row_num, 4, f"{stats['avg_latency_ms']:.0f}" if stats['avg_latency_ms'] else "")
1087
+ ws_summary.cell(row_num, 5, f"{stats['avg_quality_score']:.2f}" if stats['avg_quality_score'] else "")
1088
+ ws_summary.cell(row_num, 6, stats["success_count"])
1089
+ ws_summary.cell(row_num, 7, stats["error_count"])
1090
+ ws_summary.cell(row_num, 8, total_tests)
1091
+
1092
+ # Highlight best quality score
1093
+ if stats['avg_quality_score']:
1094
+ quality_cell = ws_summary.cell(row_num, 5)
1095
+ best_quality = max(s['avg_quality_score'] for s in summary_stats.values() if s['avg_quality_score'])
1096
+ if stats['avg_quality_score'] == best_quality:
1097
+ quality_cell.fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
1098
+ quality_cell.font = Font(bold=True)
1099
+
1100
+ # Highlight best speed
1101
+ if stats['avg_latency_ms']:
1102
+ speed_cell = ws_summary.cell(row_num, 4)
1103
+ best_speed = min(s['avg_latency_ms'] for s in summary_stats.values() if s['avg_latency_ms'])
1104
+ if stats['avg_latency_ms'] == best_speed:
1105
+ speed_cell.fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
1106
+ speed_cell.font = Font(bold=True)
1107
+
1108
+ row_num += 1
1109
+
1110
+ # Auto-size summary columns
1111
+ ws_summary.column_dimensions['A'].width = 25
1112
+ ws_summary.column_dimensions['B'].width = 15
1113
+ ws_summary.column_dimensions['C'].width = 30
1114
+ ws_summary.column_dimensions['D'].width = 18
1115
+ ws_summary.column_dimensions['E'].width = 20
1116
+ ws_summary.column_dimensions['F'].width = 15
1117
+ ws_summary.column_dimensions['G'].width = 15
1118
+ ws_summary.column_dimensions['H'].width = 15
1119
+
1120
+ # === RESULTS SHEET (New Segment-Grouped Format) ===
1121
+ ws_results = wb.create_sheet("Results")
1122
+
1123
+ # Group results by segment
1124
+ segments_dict = {}
1125
+ for result in self.current_results:
1126
+ if result.segment_id not in segments_dict:
1127
+ segments_dict[result.segment_id] = []
1128
+ segments_dict[result.segment_id].append(result)
1129
+
1130
+ # Get list of all models tested
1131
+ all_models = []
1132
+ model_seen = set()
1133
+ for result in self.current_results:
1134
+ if result.model_name not in model_seen:
1135
+ all_models.append(result.model_name)
1136
+ model_seen.add(result.model_name)
1137
+
1138
+ # Styling
1139
+ header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
1140
+ header_font = Font(color="FFFFFF", bold=True)
1141
+ segment_header_fill = PatternFill(start_color="E3F2FD", end_color="E3F2FD", fill_type="solid")
1142
+ segment_header_font = Font(bold=True, size=11)
1143
+ label_font = Font(bold=True, size=10)
1144
+ best_quality_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
1145
+
1146
+ # Model-specific background colors (alternating for visual clarity)
1147
+ model_colors = {
1148
+ "GPT-4o": PatternFill(start_color="FFCCCC", end_color="FFCCCC", fill_type="solid"), # Stronger pink/salmon
1149
+ "Claude Sonnet 4.5": PatternFill(start_color="CCFFCC", end_color="CCFFCC", fill_type="solid"), # Stronger light green
1150
+ "Gemini 2.5 Flash": PatternFill(start_color="CCDDFF", end_color="CCDDFF", fill_type="solid"), # Stronger light blue
1151
+ }
1152
+ # Default colors for other models
1153
+ default_model_colors = [
1154
+ PatternFill(start_color="FFF4E6", end_color="FFF4E6", fill_type="solid"), # Light orange
1155
+ PatternFill(start_color="F3E5F5", end_color="F3E5F5", fill_type="solid"), # Light purple
1156
+ PatternFill(start_color="E8F5E9", end_color="E8F5E9", fill_type="solid"), # Light green-2
1157
+ ]
1158
+
1159
+ row_num = 1
1160
+
1161
+ # Process each segment
1162
+ for segment_id in sorted(segments_dict.keys()):
1163
+ segment_results = segments_dict[segment_id]
1164
+
1165
+ # Get source and reference text from dataset
1166
+ source_text = ""
1167
+ reference_text = ""
1168
+ for seg in self.current_dataset.segments:
1169
+ if seg.id == segment_id:
1170
+ source_text = seg.source
1171
+ reference_text = seg.reference
1172
+ break
1173
+
1174
+ # Segment header row (spans columns A-B)
1175
+ ws_results.merge_cells(f'A{row_num}:B{row_num}')
1176
+ segment_header_cell = ws_results.cell(row_num, 1, f"Segment {segment_id}")
1177
+ segment_header_cell.fill = segment_header_fill
1178
+ segment_header_cell.font = segment_header_font
1179
+ segment_header_cell.alignment = Alignment(horizontal="left", vertical="center")
1180
+ row_num += 1
1181
+
1182
+ # Source row
1183
+ ws_results.cell(row_num, 1, "Source:").font = label_font
1184
+ ws_results.cell(row_num, 2, source_text)
1185
+ ws_results.cell(row_num, 2).alignment = Alignment(wrap_text=True, vertical="top")
1186
+ row_num += 1
1187
+
1188
+ # Reference row (if available)
1189
+ if reference_text:
1190
+ ws_results.cell(row_num, 1, "Reference:").font = label_font
1191
+ ws_results.cell(row_num, 2, reference_text)
1192
+ ws_results.cell(row_num, 2).alignment = Alignment(wrap_text=True, vertical="top")
1193
+ row_num += 1
1194
+
1195
+ # Find best quality score for this segment (if available)
1196
+ best_quality = None
1197
+ if reference_text: # Only if we have references
1198
+ quality_scores = [r.quality_score for r in segment_results if r.quality_score is not None]
1199
+ if quality_scores:
1200
+ best_quality = max(quality_scores)
1201
+
1202
+ # Model output rows
1203
+ for idx, result in enumerate(segment_results):
1204
+ # Model name label
1205
+ model_cell = ws_results.cell(row_num, 1, result.model_name)
1206
+ model_cell.font = label_font
1207
+
1208
+ # Translation output
1209
+ output_cell = ws_results.cell(row_num, 2, result.output if result.output else result.error if result.error else "")
1210
+ output_cell.alignment = Alignment(wrap_text=True, vertical="top")
1211
+
1212
+ # Apply model-specific background color first
1213
+ if result.model_name in model_colors:
1214
+ model_cell.fill = model_colors[result.model_name]
1215
+ output_cell.fill = model_colors[result.model_name]
1216
+ else:
1217
+ # Use alternating default colors for unknown models
1218
+ color_idx = idx % len(default_model_colors)
1219
+ model_cell.fill = default_model_colors[color_idx]
1220
+ output_cell.fill = default_model_colors[color_idx]
1221
+
1222
+ # Override with best quality highlight (green wins over model color)
1223
+ if best_quality and result.quality_score == best_quality:
1224
+ model_cell.fill = best_quality_fill
1225
+ output_cell.fill = best_quality_fill
1226
+
1227
+ # Override with error highlight (red wins over everything)
1228
+ if result.error:
1229
+ error_fill = PatternFill(start_color="FFCCCC", end_color="FFCCCC", fill_type="solid")
1230
+ model_cell.fill = error_fill
1231
+ output_cell.fill = error_fill
1232
+
1233
+ row_num += 1
1234
+
1235
+ # Add blank row between segments
1236
+ row_num += 1
1237
+
1238
+ # Set column widths
1239
+ ws_results.column_dimensions['A'].width = 20 # Model name column
1240
+ ws_results.column_dimensions['B'].width = 80 # Text column (wider for readability)
1241
+
1242
+ # Save workbook
1243
+ wb.save(filepath)
1244
+
1245
+ def _get_selected_models(self) -> List[ModelConfig]:
1246
+ """Get list of selected models from UI"""
1247
+ models = []
1248
+
1249
+ # Map model IDs to friendly display names
1250
+ model_names = {
1251
+ # OpenAI
1252
+ "gpt-4o": "GPT-4o",
1253
+ "gpt-4o-mini": "GPT-4o Mini",
1254
+ "gpt-5": "GPT-5 (Reasoning)",
1255
+
1256
+ # Claude
1257
+ "claude-sonnet-4-5-20250929": "Claude Sonnet 4.5",
1258
+ "claude-haiku-4-5-20251001": "Claude Haiku 4.5",
1259
+ "claude-opus-4-1-20250805": "Claude Opus 4.1",
1260
+
1261
+ # Gemini
1262
+ "gemini-2.5-flash": "Gemini 2.5 Flash",
1263
+ "gemini-2.5-flash-lite": "Gemini 2.5 Flash Lite",
1264
+ "gemini-2.5-pro": "Gemini 2.5 Pro",
1265
+ "gemini-2.0-flash-exp": "Gemini 2.0 Flash (Exp)"
1266
+ }
1267
+
1268
+ if self.openai_checkbox.isChecked():
1269
+ model_id = self.openai_model_combo.currentText()
1270
+ models.append(ModelConfig(
1271
+ name=model_names.get(model_id, model_id),
1272
+ provider="openai",
1273
+ model_id=model_id
1274
+ ))
1275
+
1276
+ if self.claude_checkbox.isChecked():
1277
+ model_id = self.claude_model_combo.currentText()
1278
+ models.append(ModelConfig(
1279
+ name=model_names.get(model_id, model_id),
1280
+ provider="claude",
1281
+ model_id=model_id
1282
+ ))
1283
+
1284
+ if self.gemini_checkbox.isChecked():
1285
+ model_id = self.gemini_model_combo.currentText()
1286
+ models.append(ModelConfig(
1287
+ name=model_names.get(model_id, model_id),
1288
+ provider="gemini",
1289
+ model_id=model_id
1290
+ ))
1291
+
1292
+ return models
1293
+
1294
+ def log(self, message: str):
1295
+ """Append message to log output and auto-scroll to bottom"""
1296
+ self.log_output.append(message)
1297
+ # Auto-scroll to bottom to show latest messages
1298
+ scrollbar = self.log_output.verticalScrollBar()
1299
+ scrollbar.setValue(scrollbar.maximum())
1300
+
1301
+
1302
+ # For standalone testing
1303
+ if __name__ == "__main__":
1304
+ import sys
1305
+ from PyQt6.QtWidgets import QApplication
1306
+
1307
+ app = QApplication(sys.argv)
1308
+
1309
+ # Mock LLM client factory for testing UI
1310
+ def mock_llm_factory(provider, model):
1311
+ print(f"Mock: Creating {provider} client with model {model}")
1312
+ return None
1313
+
1314
+ window = LLMLeaderboardUI(llm_client_factory=mock_llm_factory)
1315
+ window.setWindowTitle("Superbench")
1316
+ window.resize(1200, 800)
1317
+ window.show()
1318
+
1319
+ sys.exit(app.exec())
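For reference only (not part of the packaged file above): a minimal sketch of how the widget shown in this diff might be embedded in a host application. The LLMLeaderboardUI constructor and the (provider, model) factory signature are taken from the code above; the factory body is a placeholder, since the actual client construction lives in modules/llm_clients.py and is not shown in this diff.

import sys
from PyQt6.QtWidgets import QApplication, QMainWindow
from modules.superbench_ui import LLMLeaderboardUI  # module path per the file list above

def client_factory(provider: str, model: str):
    # Placeholder: a host application would return a configured LLM client here
    # (the concrete client classes live in modules/llm_clients.py, not shown in this diff).
    return None

if __name__ == "__main__":
    app = QApplication(sys.argv)
    win = QMainWindow()
    win.setWindowTitle("Superbench (embedded)")
    # LLMLeaderboardUI takes an optional parent and an llm_client_factory callable.
    win.setCentralWidget(LLMLeaderboardUI(llm_client_factory=client_factory))
    win.resize(1200, 800)
    win.show()
    sys.exit(app.exec())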